1#
2# Copyright (C) 2014 Connor Abbott
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24#    Connor Abbott (cwabbott0@gmail.com)
25
26import re
27
28# Class that represents all the information we have about the opcode
29# NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - output_size and every entry of input_sizes are integers in the
        range 0..4; when output_size is non-zero, no input size may be 0
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types, one per entry of input_sizes
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.

      Raises AssertionError when an argument has the wrong type or an
      out-of-range size.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      # Validate every entry, not just the first one: a stray non-int size
      # or non-str type further down the list would otherwise go unnoticed.
      assert all(isinstance(size, int) for size in input_sizes)
      assert isinstance(input_types, list)
      assert all(isinstance(ty, str) for ty in input_types)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 4
      for size in input_sizes:
         assert 0 <= size <= 4
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
92
# Shorthand variables for the type-name strings passed to the opcode helpers.

# Unsized base types.
tbool = "bool"
tint = "int"
tuint = "uint"
tfloat = "float"

# Types with an explicit bit size.
tbool1 = "bool1"
tbool32 = "bool32"
tuint16 = "uint16"
tint32 = "int32"
tuint32 = "uint32"
tfloat32 = "float32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
107
# Splits a type string into a base type ("type" group) and an optional
# trailing bit size ("bits" group).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def type_has_size(type_):
    """Return True if the type string carries an explicit bit size.

    Raises AssertionError if the string does not start with a known base
    type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None

def type_size(type_):
    """Return the bit size encoded in the type string as an int.

    Raises AssertionError if the string is not a recognised type or has no
    bit size.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)

def type_sizes(type_):
    """Return the list of bit sizes the type string can stand for.

    A sized type yields a one-element list; the unsized 'bool' and 'float'
    have their own lists, and every other unsized type gets the integer
    list.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    unsized = {
        'bool': [1, 32],
        'float': [16, 32, 64],
    }
    return unsized.get(type_, [1, 8, 16, 32, 64])

def type_base_type(type_):
    """Return the base type ('int', 'uint', 'float' or 'bool') of the string.

    Raises AssertionError if the string does not start with a known base
    type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return parsed.group('type')
136
# Algebraic property tags.  Each one ends in a space so that several tags
# can be joined with "+" into a single space-separated string.
associative = "associative "
commutative = "commutative "

# Registry of every opcode defined so far, keyed by opcode name.
opcodes = dict()
142
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
   """Build an Opcode from the arguments and store it in opcodes[name].

   The name must not have been registered before; a duplicate trips the
   assertion.
   """
   assert name not in opcodes
   new_op = Opcode(name, output_size, output_type, input_sizes, input_types,
                   is_conversion, algebraic_properties, const_expr)
   opcodes[name] = new_op
149
def unop_convert(name, out_type, in_type, const_expr):
   """Register a one-source opcode whose output type may differ from its
   source type.  Output and source sizes are both 0, it is not flagged as a
   conversion and it has no algebraic properties.
   """
   opcode(name, output_size=0, output_type=out_type,
          input_sizes=[0], input_types=[in_type],
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr)
152
def unop(name, ty, const_expr):
   """Register a one-source opcode whose output and source share the type
   ty.  Output and source sizes are both 0, and it has no algebraic
   properties.
   """
   opcode(name, output_size=0, output_type=ty,
          input_sizes=[0], input_types=[ty],
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr)
155
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Register a one-source opcode with explicitly given output and source
   sizes.  It is not flagged as a conversion and has no algebraic
   properties.
   """
   opcode(name, output_size=output_size, output_type=output_type,
          input_sizes=[input_size], input_types=[input_type],
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr)
160
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Register the 2-, 3- and 4-component variants of a horizontal reduction.

   The opcodes are named name + "2", name + "3" and name + "4" and are
   created through unop_horiz.

   - prereduce_expr is a format string with a {src} field; it is applied to
     each component of src0 and the result is wrapped in parentheses
   - reduce_expr is a format string with {src0} and {src1} fields that
     combines two partial results
   - final_expr is a format string with a {src} field that receives the
     parenthesized reduction
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   # One parenthesized, pre-reduced term per source component.
   x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                 for comp in "xyzw"]

   xy = combine(x, y)
   by_width = [
      (2, xy),
      (3, combine(xy, z)),
      # The 4-wide form pairs (x, y) and (z, w) before combining the pairs.
      (4, combine(xy, combine(z, w))),
   ]
   for width, reduced in by_width:
      unop_horiz(name + str(width), output_size, output_type, width,
                 input_type, final_expr.format(src="(" + reduced + ")"))
179
def unop_numeric_convert(name, out_type, in_type, const_expr):
   """Register a one-source opcode that is flagged as a type conversion.
   Output and source sizes are both 0, and it has no algebraic properties.
   """
   opcode(name, output_size=0, output_type=out_type,
          input_sizes=[0], input_types=[in_type],
          is_conversion=True, algebraic_properties="",
          const_expr=const_expr)
182
# fmov and imov are plain copies of their source.  They are separate opcodes
# because they differ in which modifiers they support and in what the negate
# modifier means; otherwise they are identical.
unop(name="fmov", ty=tfloat, const_expr="src0")
unop(name="imov", ty=tint, const_expr="src0")

unop(name="ineg", ty=tint, const_expr="-src0")
unop(name="fneg", ty=tfloat, const_expr="-src0")
unop(name="inot", ty=tint, const_expr="~src0") # invert every bit of the integer
unop(name="fnot", ty=tfloat,
     const_expr="bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : "
                "((src0 == 0.0f) ? 1.0f : 0.0f)")
unop(name="fsign", ty=tfloat,
     const_expr="bit_size == 64 ? "
                "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
                "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))")
unop(name="isign", ty=tint,
     const_expr="(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop(name="iabs", ty=tint, const_expr="(src0 < 0) ? -src0 : src0")
unop(name="fabs", ty=tfloat, const_expr="fabs(src0)")
unop(name="fsat", ty=tfloat,
     const_expr="bit_size == 64 ? "
                "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
                "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))")
unop(name="frcp", ty=tfloat,
     const_expr="bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop(name="frsq", ty=tfloat,
     const_expr="bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop(name="fsqrt", ty=tfloat,
     const_expr="bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop(name="fexp2", ty=tfloat, const_expr="exp2f(src0)")
unop(name="flog2", ty=tfloat, const_expr="log2f(src0)")
207
# Generate all of the numeric conversion opcodes.  Each opcode is named
# <first letter of source type>2<first letter of destination type><bit size>,
# for example "i2f32".
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that each source base type converts to.
   dst_types = {
      tbool: [tfloat, tint],
      tint: [tfloat, tint, tbool],
      tuint: [tfloat, tuint],
      tfloat: [tint, tuint, tfloat, tbool],
   }[src_t]

   for dst_t in dst_types:
      for bit_size in type_sizes(dst_t):
         sized_dst_t = dst_t + str(bit_size)
         base_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size)
         if (src_t, dst_t, bit_size) == (tfloat, tfloat, 16):
            # Float to 16-bit float also gets the "_rtne" and "_rtz"
            # suffixed variants in addition to the unsuffixed opcode.
            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
               unop_numeric_convert(base_name + rnd_mode, sized_dst_t,
                                    src_t, "src0")
         else:
            # Conversions to bool test the source against zero.
            conv_expr = "src0 != 0" if dst_t == tbool else "src0"
            unop_numeric_convert(base_name, sized_dst_t, src_t, conv_expr)
231
232
# Unary floating-point rounding operations.  The constant expressions pick
# the double or the float flavour of the C function based on bit_size.

for _c_func in ("trunc", "ceil", "floor"):
   unop("f" + _c_func, tfloat,
        "bit_size == 64 ? {0}(src0) : {0}f(src0)".format(_c_func))
unop(name="ffract", ty=tfloat,
     const_expr="src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop(name="fround_even", ty=tfloat,
     const_expr="bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop(name="fquantize2f16", ty=tfloat,
     const_expr="(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
                "_mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.

for _c_func in ("sin", "cos"):
   unop("f" + _c_func, tfloat,
        "bit_size == 64 ? {0}(src0) : {0}f(src0)".format(_c_func))

# dfrexp
unop_convert(name="frexp_exp", out_type=tint32, in_type=tfloat,
             const_expr="frexp(src0, &dst);")
unop_convert(name="frexp_sig", out_type=tfloat, in_type=tfloat,
             const_expr="int n; dst = frexp(src0, &n);")

# Partial derivatives.  The derivative of a constant is 0, so every variant
# folds to "0.0".

for _deriv in ("fddx", "fddy",
               "fddx_fine", "fddy_fine",
               "fddx_coarse", "fddy_coarse"):
   unop(_deriv, tfloat, "0.0")
263
264
265# Floating point pack and unpack operations.
266
def pack_2x16(fmt):
   """Register "pack_<fmt>_2x16": two float32 components packed into one
   uint32 via pack_<fmt>_1x16, with src0.x in the low 16 bits and src0.y in
   the high 16 bits.
   """
   op_name = "pack_" + fmt + "_2x16"
   const_expr = (
      "\n"
      "dst.x = (uint32_t) pack_{0}_1x16(src0.x);\n"
      "dst.x |= ((uint32_t) pack_{0}_1x16(src0.y)) << 16;\n"
   ).format(fmt)
   unop_horiz(op_name, 1, tuint32, 2, tfloat32, const_expr)
272
def pack_4x8(fmt):
   """Register "pack_<fmt>_4x8": four float32 components packed into one
   uint32 via pack_<fmt>_1x8, with src0.x in the lowest byte and src0.w in
   the highest.
   """
   op_name = "pack_" + fmt + "_4x8"
   const_expr = (
      "\n"
      "dst.x = (uint32_t) pack_{0}_1x8(src0.x);\n"
      "dst.x |= ((uint32_t) pack_{0}_1x8(src0.y)) << 8;\n"
      "dst.x |= ((uint32_t) pack_{0}_1x8(src0.z)) << 16;\n"
      "dst.x |= ((uint32_t) pack_{0}_1x8(src0.w)) << 24;\n"
   ).format(fmt)
   unop_horiz(op_name, 1, tuint32, 4, tfloat32, const_expr)
280
def unpack_2x16(fmt):
   """Register "unpack_<fmt>_2x16": one uint32 unpacked into two float32
   components via unpack_<fmt>_1x16.  dst.x comes from the low 16 bits of
   src0.x and dst.y from the high 16 bits.
   """
   # The high half has to be shifted *down* before the uint16_t cast;
   # shifting left by 16 leaves only zero bits in the low half, so dst.y
   # would always be unpacked from 0.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
286
def unpack_4x8(fmt):
   """Register "unpack_<fmt>_4x8": one uint32 unpacked into four float32
   components via unpack_<fmt>_1x8, with dst.x taken from the lowest byte
   and dst.w from the highest.
   """
   op_name = "unpack_" + fmt + "_4x8"
   const_expr = (
      "\n"
      "dst.x = unpack_{0}_1x8((uint8_t)(src0.x & 0xff));\n"
      "dst.y = unpack_{0}_1x8((uint8_t)((src0.x >> 8) & 0xff));\n"
      "dst.z = unpack_{0}_1x8((uint8_t)((src0.x >> 16) & 0xff));\n"
      "dst.w = unpack_{0}_1x8((uint8_t)(src0.x >> 24));\n"
   ).format(fmt)
   unop_horiz(op_name, 4, tfloat32, 1, tuint32, const_expr)
294
295
# Instantiate the pack opcodes: snorm and unorm come in both 2x16 and 4x8
# layouts, half only as 2x16.
for _fmt in ("snorm", "unorm"):
   pack_2x16(_fmt)
   pack_4x8(_fmt)
pack_2x16("half")

# The unpack opcodes mirror the pack set.
for _fmt in ("snorm", "unorm"):
   unpack_2x16(_fmt)
   unpack_4x8(_fmt)
unpack_2x16("half")
306
# Integer packing: combine several narrower components into one wider value.
unop_horiz(name="pack_uvec2_to_uint", output_size=1, output_type=tuint32,
           input_size=2, input_type=tuint32, const_expr="""
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz(name="pack_uvec4_to_uint", output_size=1, output_type=tuint32,
           input_size=4, input_type=tuint32, const_expr="""
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz(name="pack_32_2x16", output_size=1, output_type=tuint32,
           input_size=2, input_type=tuint16,
           const_expr="dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz(name="pack_64_2x32", output_size=1, output_type=tuint64,
           input_size=2, input_type=tuint32,
           const_expr="dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz(name="pack_64_4x16", output_size=1, output_type=tuint64,
           input_size=4, input_type=tuint16,
           const_expr="dst.x = src0.x | ((uint64_t)src0.y << 16) | "
                      "((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
326
# Integer unpacking: split one wide component into several narrower ones.
# Every source here has a single component, so all fields are taken from
# src0.x.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# dst.w must also come from src0.x: the source has only one component, so
# there is no src0.w to read.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")
335
# Lowered floating point unpacking operations.  The _split_x opcodes produce
# the low part of the source and the _split_y opcodes the high part.

unop_convert(name="unpack_half_2x16_split_x", out_type=tfloat32,
             in_type=tuint32,
             const_expr="unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert(name="unpack_half_2x16_split_y", out_type=tfloat32,
             in_type=tuint32,
             const_expr="unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert(name="unpack_32_2x16_split_x", out_type=tuint16,
             in_type=tuint32, const_expr="src0")
unop_convert(name="unpack_32_2x16_split_y", out_type=tuint16,
             in_type=tuint32, const_expr="src0 >> 16")

unop_convert(name="unpack_64_2x32_split_x", out_type=tuint32,
             in_type=tuint64, const_expr="src0")
unop_convert(name="unpack_64_2x32_split_y", out_type=tuint32,
             in_type=tuint64, const_expr="src0 >> 32")
349
# Bit operations, part of ARB_gpu_shader5.

# Reverses the order of the 32 bits of the source.
unop(name="bitfield_reverse", ty=tuint32, const_expr="""
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")

# Counts the set bits of the source.
unop_convert(name="bit_count", out_type=tuint32, in_type=tuint,
             const_expr="""
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit, or -1 when no bit is set.
unop_convert(name="ufind_msb", out_type=tint32, in_type=tuint,
             const_expr="""
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant: the highest 1 bit for non-negative sources and the highest
# 0 bit for negative ones, or -1 when there is none.
unop(name="ifind_msb", ty=tint32, const_expr="""
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit, or -1 when no bit is set.
unop_convert(name="find_lsb", out_type=tint32, in_type=tint,
             const_expr="""
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
400
401
# One fnoise opcode for every combination of output size i and source size j
# in 1..4; all of them constant-fold to 0.0f.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz(name="fnoise%d_%d" % (i, j), output_size=i,
                 output_type=tfloat, input_size=j, input_type=tfloat,
                 const_expr="0.0f")
405
406
# AMD_gcn_shader extended instructions

# Computes the two face coordinates for a 3-component direction.  The axis
# with the largest magnitude is the major axis.
unop_horiz(name="cube_face_coord", output_size=2, output_type=tfloat32,
           input_size=3, input_type=tfloat32, const_expr="""
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

# Produces the face index 0..5, chosen from the major axis and its sign.
unop_horiz(name="cube_face_index", output_size=1, output_type=tfloat32,
           input_size=3, input_type=tfloat32, const_expr="""
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
441
442
def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Register a two-source opcode whose sources share in_type and whose
   output type is out_type.  All sizes are 0 and it is not flagged as a
   conversion.
   """
   opcode(name, output_size=0, output_type=out_type,
          input_sizes=[0, 0], input_types=[in_type, in_type],
          is_conversion=False, algebraic_properties=alg_props,
          const_expr=const_expr)
446
def binop(name, ty, alg_props, const_expr):
   """Register a two-source opcode whose sources and output all have the
   type ty.
   """
   binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
449
def binop_compare(name, ty, alg_props, const_expr):
   """Register a two-source opcode with sources of type ty and a bool1
   output.
   """
   binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
452
def binop_compare32(name, ty, alg_props, const_expr):
   """Register a two-source opcode with sources of type ty and a bool32
   output.
   """
   binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
455
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Register a two-source opcode with explicitly given sizes and types for
   the output and for each source.  It is not flagged as a conversion and
   has no algebraic properties.
   """
   sizes = [src1_size, src2_size]
   types = [src1_type, src2_type]
   opcode(name, output_size=out_size, output_type=out_type,
          input_sizes=sizes, input_types=types,
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr)
460
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
   """Register the 2-, 3- and 4-component variants of a two-source
   horizontal reduction.

   The opcodes are named name + "2", name + "3" and name + "4"; both sources
   have the same width and type, and each opcode is tagged commutative.

   - prereduce_expr is a format string with {src0} and {src1} fields; it is
     applied to matching components of the two sources and the result is
     wrapped in parentheses
   - reduce_expr is a format string with {src0} and {src1} fields that
     combines two partial results
   - final_expr is a format string with a {src} field that receives the
     parenthesized reduction
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   # One parenthesized, pre-reduced term per component pair.
   x, y, z, w = ["(" + prereduce_expr.format(src0="src0." + comp,
                                             src1="src1." + comp) + ")"
                 for comp in "xyzw"]

   xy = combine(x, y)
   by_width = [
      (2, xy),
      (3, combine(xy, z)),
      # The 4-wide form pairs (x, y) and (z, w) before combining the pairs.
      (4, combine(xy, combine(z, w))),
   ]
   for width, reduced in by_width:
      opcode(name + str(width), output_size, output_type,
             [width, width], [src_type, src_type], False, commutative,
             final_expr.format(src="(" + reduced + ")"))
482
# Addition, saturating addition/subtraction and plain subtraction.
binop(name="fadd", ty=tfloat, alg_props=commutative + associative,
      const_expr="src0 + src1")
binop(name="iadd", ty=tint, alg_props=commutative + associative,
      const_expr="src0 + src1")
binop(name="iadd_sat", ty=tint, alg_props=commutative, const_expr="""
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
binop(name="uadd_sat", ty=tuint, alg_props=commutative,
      const_expr="(src0 + src1) < src0 ? "
                 "MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
binop(name="isub_sat", ty=tint, alg_props="", const_expr="""
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
binop(name="usub_sat", ty=tuint, alg_props="",
      const_expr="src0 < src1 ? 0 : src0 - src1")

binop(name="fsub", ty=tfloat, alg_props="", const_expr="src0 - src1")
binop(name="isub", ty=tint, alg_props="", const_expr="src0 - src1")
501
binop(name="fmul", ty=tfloat, alg_props=commutative + associative,
      const_expr="src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop(name="imul", ty=tint, alg_props=commutative + associative,
      const_expr="src0 * src1")

# Generate a 64-bit result from two 32-bit quantities
binop_convert(name="imul_2x32_64", out_type=tint64, in_type=tint32,
              alg_props=commutative,
              const_expr="(int64_t)src0 * (int64_t)src1")
binop_convert(name="umul_2x32_64", out_type=tuint64, in_type=tuint32,
              alg_props=commutative,
              const_expr="(uint64_t)src0 * (uint64_t)src1")
511
# High half of the signed integer multiply.  The 64-bit case goes through
# ubm_mul_u32arr; narrower sizes widen to 64 bits and shift by bit_size.
binop(name="imul_high", ty=tint, alg_props=commutative, const_expr="""
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# High half of the unsigned integer multiply, structured like imul_high.
binop(name="umul_high", ty=tuint, alg_props=commutative, const_expr="""
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")
552
# Division.  The integer forms fold a division by zero to 0.
binop(name="fdiv", ty=tfloat, alg_props="", const_expr="src0 / src1")
binop(name="idiv", ty=tint, alg_props="",
      const_expr="src1 == 0 ? 0 : (src0 / src1)")
binop(name="udiv", ty=tuint, alg_props="",
      const_expr="src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert(name="uadd_carry", out_type=tuint, in_type=tuint,
              alg_props=commutative, const_expr="src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert(name="usub_borrow", out_type=tuint, in_type=tuint,
              alg_props="", const_expr="src0 < src1")
566
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
#
# The signed and the unsigned opcode share the same expression.
binop(name="ihadd", ty=tint, alg_props=commutative,
      const_expr="(src0 & src1) + ((src0 ^ src1) >> 1)")
binop(name="uhadd", ty=tuint, alg_props=commutative,
      const_expr="(src0 & src1) + ((src0 ^ src1) >> 1)")
579
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)      >> 1)
#
# Note the subtraction: adding the shifted xor instead gives wrong results,
# e.g. rhadd(3, 0) must be 2, but (3 | 0) + ((3 ^ 0) >> 1) is 4.
binop("irhadd", tint, commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
592
# Unsigned modulus; a zero divisor folds to 0.
binop(name="umod", ty=tuint, alg_props="",
      const_expr="src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop(name="irem", ty=tint, alg_props="",
      const_expr="src1 == 0 ? 0 : src0 % src1")
binop(name="imod", ty=tint, alg_props="", const_expr=(
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)"))
binop(name="fmod", ty=tfloat, alg_props="",
      const_expr="src0 - src1 * floorf(src0 / src1)")
binop(name="frem", ty=tfloat, alg_props="",
      const_expr="src0 - src1 * truncf(src0 / src1)")
609
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

# (name, source type, algebraic properties, constant expression)
_comparisons = (
   ("flt", tfloat, "", "src0 < src1"),
   ("fge", tfloat, "", "src0 >= src1"),
   ("feq", tfloat, commutative, "src0 == src1"),
   ("fne", tfloat, commutative, "src0 != src1"),
   ("ilt", tint, "", "src0 < src1"),
   ("ige", tint, "", "src0 >= src1"),
   ("ieq", tint, commutative, "src0 == src1"),
   ("ine", tint, commutative, "src0 != src1"),
   ("ult", tuint, "", "src0 < src1"),
   ("uge", tuint, "", "src0 >= src1"),
)

# First the bool1 forms, then the same list again as bool32 forms whose
# names carry a "32" suffix.
for _cmp_name, _cmp_ty, _cmp_props, _cmp_expr in _comparisons:
   binop_compare(_cmp_name, _cmp_ty, _cmp_props, _cmp_expr)
for _cmp_name, _cmp_ty, _cmp_props, _cmp_expr in _comparisons:
   binop_compare32(_cmp_name + "32", _cmp_ty, _cmp_props, _cmp_expr)
637
# integer-aware GLSL-style comparisons that compare floats and ints
#
# For each boolean flavour (bool1 with prefix "b", bool32 with prefix "b32")
# and each source kind ("f" for float, "i" for int) there is an "all equal"
# and an "any not equal" reduction.

for _bool_prefix, _bool_ty in (("b", tbool1), ("b32", tbool32)):
   for _src_letter, _src_ty in (("f", tfloat), ("i", tint)):
      binop_reduce(_bool_prefix + "all_" + _src_letter + "equal",
                   1, _bool_ty, _src_ty, "{src0} == {src1}",
                   "{src0} && {src1}", "{src}")
      binop_reduce(_bool_prefix + "any_" + _src_letter + "nequal",
                   1, _bool_ty, _src_ty, "{src0} != {src1}",
                   "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce(name="fall_equal", output_size=1, output_type=tfloat32,
             src_type=tfloat32, prereduce_expr="{src0} == {src1}",
             reduce_expr="{src0} && {src1}",
             final_expr="{src} ? 1.0f : 0.0f")
binop_reduce(name="fany_nequal", output_size=1, output_type=tfloat32,
             src_type=tfloat32, prereduce_expr="{src0} != {src1}",
             reduce_expr="{src0} || {src1}",
             final_expr="{src} ? 1.0f : 0.0f")
664
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with the unsized float type while its three
# siblings use float32 -- confirm whether that difference is intentional.

binop(name="slt", ty=tfloat32, alg_props="",
      const_expr="(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop(name="sge", ty=tfloat, alg_props="",
      const_expr="(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop(name="seq", ty=tfloat32, alg_props=commutative,
      const_expr="(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop(name="sne", ty=tfloat32, alg_props=commutative,
      const_expr="(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
672
# SPIR-V shifts are undefined for shift operands >= the bit size, but SM5
# shifts are defined to use only the least significant bits of the shift
# count.  The NIR definition follows the SM5 specification, so the count is
# masked with (bit width - 1).  The shift count is always a uint32.
opcode("ishl", output_size=0, output_type=tint,
       input_sizes=[0, 0], input_types=[tint, tuint32],
       is_conversion=False, algebraic_properties="",
       const_expr="src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", output_size=0, output_type=tint,
       input_sizes=[0, 0], input_types=[tint, tuint32],
       is_conversion=False, algebraic_properties="",
       const_expr="src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", output_size=0, output_type=tuint,
       input_sizes=[0, 0], input_types=[tuint, tuint32],
       is_conversion=False, algebraic_properties="",
       const_expr="src0 >> (src1 & (sizeof(src0) * 8 - 1))")
682
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.

for _logic_name, _c_operator in (("iand", "&"), ("ior", "|"), ("ixor", "^")):
   binop(_logic_name, tuint, commutative + associative,
         "src0 " + _c_operator + " src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop(name="fand", ty=tfloat32, alg_props=commutative,
      const_expr="((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop(name="for", ty=tfloat32, alg_props=commutative,
      const_expr="((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop(name="fxor", ty=tfloat32, alg_props=commutative,
      const_expr="(src0 != 0.0f && src1 == 0.0f) || "
                 "(src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
705
# Dot products.  The plain form has a 1-component result and the
# "_replicated" form a 4-component result; both use the same expression.
for _dot_name, _dot_out_size in (("fdot", 1), ("fdot_replicated", 4)):
   binop_reduce(_dot_name, _dot_out_size, tfloat, tfloat,
                "{src0} * {src1}", "{src0} + {src1}", "{src}")

# fdph: 3-component dot product plus the w component of the second source.
for _dph_name, _dph_out_size in (("fdph", 1), ("fdph_replicated", 4)):
   opcode(_dph_name, _dph_out_size, tfloat, [3, 4], [tfloat, tfloat],
          False, "",
          "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.
binop(name="fmin", ty=tfloat, alg_props="", const_expr="fminf(src0, src1)")
binop(name="imin", ty=tint, alg_props=commutative + associative,
      const_expr="src1 > src0 ? src0 : src1")
binop(name="umin", ty=tuint, alg_props=commutative + associative,
      const_expr="src1 > src0 ? src0 : src1")
binop(name="fmax", ty=tfloat, alg_props="", const_expr="fmaxf(src0, src1)")
binop(name="imax", ty=tint, alg_props=commutative + associative,
      const_expr="src1 > src0 ? src1 : src0")
binop(name="umax", ty=tuint, alg_props=commutative + associative,
      const_expr="src1 > src0 ? src1 : src0")
723
# The opcodes below treat each 32-bit source as four 8-bit lanes and loop
# over the lanes in steps of 8 bits.

# Saturated vector add for 4 8bit ints.
binop(name="usadd_4x8", ty=tint32, alg_props=commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop(name="ussub_4x8", ty=tint32, alg_props="", const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop(name="umin_4x8", ty=tint32, alg_props=commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop(name="umax_4x8", ty=tint32, alg_props=commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop(name="umul_unorm_4x8", ty=tint32, alg_props=commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
768
# Power.  64-bit operands must go through the double-precision pow() and
# everything else through powf(), matching the other bit_size-dependent
# opcodes in this file (fsqrt, fsin, ...).  The two calls used to be swapped.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
770
# Two-source pack opcodes: the first source supplies the low part of the
# result and the second source the high part.
binop_horiz(name="pack_half_2x16_split", out_size=1, out_type=tuint32,
            src1_size=1, src1_type=tfloat32,
            src2_size=1, src2_type=tfloat32,
            const_expr="pack_half_1x16(src0.x) | "
                       "(pack_half_1x16(src1.x) << 16)")

binop_convert(name="pack_64_2x32_split", out_type=tuint64, in_type=tuint32,
              alg_props="", const_expr="src0 | ((uint64_t)src1 << 32)")

binop_convert(name="pack_32_2x16_split", out_type=tuint32, in_type=tuint16,
              alg_props="", const_expr="src0 | ((uint32_t)src1 << 16)")
779
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments is 32.
#
# src0 is the number of mask bits and src1 the bit offset; the result is a
# mask of `bits` ones shifted left by `offset`.  The expression returns 0 for
# every input it treats as undefined: a negative value, a value above 31, or
# offset + bits exceeding 32.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")
790
# ldexp: scales the float src0 by two to the power of the 32-bit integer src1.
# The expression uses ldexp() for 64-bit sources and ldexpf() otherwise, then
# replaces any result that is not a normal number with a zero carrying the
# sign of src0.
# NOTE(review): isnormal() is also false for infinities and NaN, so those
# results are replaced by zero as well, not only denormals -- confirm intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
797
# Combines the first component of each input to make a 2-component vector.
# Both inputs are registered as 1-component tuint sources and the output as a
# 2-component tuint value.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")
804
# Byte extraction
#
# src1 selects which byte of src0 to return: src0 is shifted right by
# src1 * 8 bits and the low byte is cast to uint8_t (extract_u8) or int8_t
# (extract_i8) before being stored in the destination.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
#
# Same scheme with 16-bit words: src0 is shifted right by src1 * 16 bits and
# the low word is cast to uint16_t or int16_t.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
812
813
def triop(name, ty, const_expr):
   """Register a three-source opcode whose sources and result share one type.

   - name is the opcode name handed to opcode()
   - ty is used as the output type and as the type of all three inputs
   - const_expr is passed through to opcode() unchanged

   The output size and every input size are passed as 0, the opcode is not
   marked as a conversion and it gets no algebraic properties.
   """
   source_sizes = [0] * 3
   source_types = [ty] * 3
   opcode(name, 0, ty, source_sizes, source_types, False, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-source opcode with explicit output and source sizes.

   - name is the opcode name handed to opcode()
   - output_size is the size passed for the result
   - src1_size, src2_size and src3_size are the sizes passed for the three
     inputs, in that order
   - const_expr is passed through to opcode() unchanged

   The result and all three inputs are typed tuint; the opcode is not marked
   as a conversion and it gets no algebraic properties.
   """
   source_sizes = [src1_size, src2_size, src3_size]
   source_types = [tuint] * len(source_sizes)
   opcode(name, output_size, tuint, source_sizes, source_types, False, "",
          const_expr)
820
# Fused multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1, with src2 as the weight applied
# to src1 (and 1 - src2 applied to src0).
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
824
# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
#
# fcsel is the floating-point version: it is registered for 32-bit floats
# only and yields src1 when src0 compares unequal to 0.0f, src2 otherwise.


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
833
# 3 way min/max/med
#
# min3/max3 nest the two-operand minimum/maximum.  med3 picks the middle
# value: it clamps src2 from above by max(src0, src1) and then from below by
# min(src0, src1).
# NOTE(review): the float variants call the single-precision fminf()/fmaxf()
# although the type is the unsized tfloat -- confirm this is acceptable for
# 64-bit sources.
triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")

triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
846
# Boolean conditional select: yields src1 when the boolean src0 is true and
# src2 otherwise.  The two opcodes differ only in the type of the condition:
# tbool1 for bcsel and tbool32 for b32csel.  The selected values are typed
# tuint.
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
851
# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  With an empty
# mask the base is returned unchanged.  Otherwise `insert` is shifted left
# once for every trailing zero bit of the mask, so its bit 0 lines up with the
# lowest set mask bit, and the result takes the masked bits from the shifted
# insert and all other bits from the base.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")
866
# SM5 ubfe/ibfe assembly
#
# Both opcodes extract `bits` bits of src0 starting at bit `offset`
# (src1 = offset, src2 = bits):
#   - bits == 0 yields 0;
#   - a negative offset or bit count is treated as undefined and yields 0;
#   - when the field ends below bit 32, the field is shifted up so that it
#     ends at bit 31 and then shifted back down by 32 - bits;
#   - otherwise the base is simply shifted right by offset.
# ubfe works on an unsigned base; ibfe works on a signed int base, so its
# right shifts are expected to replicate the sign bit of the field.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
896
# GLSL bitfieldExtract()
#
# Unsigned variant: src0 is the base, src1 the offset and src2 the bit count.
# A zero bit count yields 0, and so does any input the expression treats as
# undefined (negative offset or count, or offset + bits above 32).  Otherwise
# the base is shifted right by offset and masked to `bits` bits; the mask is
# built from a 64-bit one so that bits == 32 does not overflow the shift.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed bitfield extract: src0 is the base, src1 the offset and src2 the bit
# count.  A zero bit count yields 0, and so does a negative offset or count or
# a field reaching past bit 32.  Otherwise the field is first shifted left so
# that its top bit lands in bit 31, then shifted right by 32 - bits so that it
# ends at bit 0 with its top bit replicated into the upper bits.  The right
# shift amount must be 32 - bits (as in ibfe); shifting by offset would only
# be correct when offset + bits == 32.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
922
# Combines the first component of each input to make a 3-component vector.
# Registered through triop_horiz with an output size of 3 and three
# 1-component sources.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
930
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-source opcode with explicit output and source sizes.

   - name is the opcode name handed to opcode()
   - output_size is the size passed for the result
   - src1_size through src4_size are the sizes passed for the four inputs,
     in that order
   - const_expr is passed through to opcode() unchanged

   The result and all four inputs are typed tuint; the opcode is not marked
   as a conversion and it gets no algebraic properties.
   """
   source_sizes = [src1_size, src2_size, src3_size, src4_size]
   source_types = [tuint] * len(source_sizes)
   opcode(name, output_size, tuint, source_sizes, source_types, False, "",
          const_expr)
937
# Bitfield insert: src0 is the base, src1 the value to insert, src2 the offset
# and src3 the bit count.  A zero bit count returns the base unchanged; a
# negative offset or count, or a field reaching past bit 32, yields 0.
# Otherwise a mask of `bits` ones at `offset` is built (from a 64-bit one so
# that bits == 32 does not overflow the shift) and the result takes the masked
# bits from the shifted insert value and all other bits from the base.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
951
# Combines the first component of each input to make a 4-component vector.
# Registered through quadop_horiz with an output size of 4 and four
# 1-component sources.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
958
959
960