1# 2# Copyright (C) 2014 Connor Abbott 3# 4# Permission is hereby granted, free of charge, to any person obtaining a 5# copy of this software and associated documentation files (the "Software"), 6# to deal in the Software without restriction, including without limitation 7# the rights to use, copy, modify, merge, publish, distribute, sublicense, 8# and/or sell copies of the Software, and to permit persons to whom the 9# Software is furnished to do so, subject to the following conditions: 10# 11# The above copyright notice and this permission notice (including the next 12# paragraph) shall be included in all copies or substantial portions of the 13# Software. 14# 15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21# IN THE SOFTWARE. 
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)

import re

# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Validate every element, not just the first one; this also keeps an
        # empty input list from blowing up with an IndexError.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(type_, str) for type_ in input_types)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            # A sized output may not be combined with an unsized (0) input.
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr

# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool32 = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Splits a type string into its base type and an optional trailing bit size.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def type_has_size(type_):
    """Return True if the type string carries an explicit bit size."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('bits') is not None

def type_size(type_):
    """Return the bit size of a sized type string as an int."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    assert m.group('bits') is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(m.group('bits'))

def type_sizes(type_):
    """Return the list of bit sizes to generate for the given type string.

    A sized type yields just its own size; otherwise the list depends on
    whether the type is 'bool', 'float', or anything else.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    elif type_ == 'bool':
        return [1, 32]
    elif type_ == 'float':
        return [16, 32, 64]
    else:
        return [1, 8, 16, 32, 64]

def type_base_type(type_):
    """Return the base type ('int', 'uint', 'float' or 'bool') of a type string."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('type')

# The trailing space lets the two properties be joined with "+".
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Create an Opcode and register it in the global opcodes dictionary.

    Asserts that no opcode with the same name has been registered yet.
    """
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, is_conversion, algebraic_properties,
                           const_expr)

def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode whose output type differs from its input."""
    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)

def unop(name, ty, const_expr):
    """Register a one-source opcode whose input and output share one type."""
    opcode(name, 0, ty, [0], [ty], False, "", const_expr)

def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-source opcode with explicit output and input sizes."""
    opcode(name, output_size, output_type, [input_size], [input_type],
           False, "", const_expr)

def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a unary reduction.

    prereduce_expr is applied to each component ({src}), reduce_expr combines
    two partial results ({src0}, {src1}) and final_expr wraps the overall
    result ({src}).  The opcodes are named name + "2", "3" and "4".
    """
    def prereduce(src):
        return "(" + prereduce_expr.format(src=src) + ")"
    def final(src):
        return final_expr.format(src="(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    src0 = prereduce("src0.x")
    src1 = prereduce("src0.y")
    src2 = prereduce("src0.z")
    src3 = prereduce("src0.w")
    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               final(reduce_(src0, src1)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               final(reduce_(reduce_(src0, src1), src2)))
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))

def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a one-source opcode flagged as a type conversion."""
    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)

# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")

# Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat, tbool]:
    if src_t == tbool:
        dst_types = [tfloat, tint]
    elif src_t == tint:
        dst_types = [tfloat, tint, tbool]
    elif src_t == tuint:
        dst_types = [tfloat, tuint]
    elif src_t == tfloat:
        dst_types = [tint, tuint, tfloat, tbool]

    for dst_t in dst_types:
        for bit_size in type_sizes(dst_t):
            if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
                # float -> float16 also gets explicit rounding-mode variants.
                rnd_modes = ['_rtne', '_rtz', '']
                for rnd_mode in rnd_modes:
                    unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
                                                                bit_size, rnd_mode),
                                         dst_t + str(bit_size), src_t, "src0")
            else:
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                                     dst_t + str(bit_size), src_t, conv_expr)


# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.
unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")


# Floating point pack and unpack operations.

def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components into one uint32.

    The "fmt" token inside the C template is substituted with the format
    name, so the expression calls pack_<fmt>_1x16().
    """
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))

def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components into one uint32.

    The "fmt" token inside the C template is substituted with the format
    name, so the expression calls pack_<fmt>_1x8().
    """
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))

def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 into two float32 components.

    dst.x comes from the low 16 bits and dst.y from the high 16 bits.
    """
    # The high half must be shifted *down* before truncating to 16 bits;
    # shifting left by 16 and casting to uint16_t always yields zero.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))

def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 into four float32 components."""
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# The source is a single-component 64-bit value, so every output component
# must be taken from src0.x (src0.w is not a valid input component here).
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")


# One fnoise opcode per (output size, input size) pair; all fold to 0.0f.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")


# AMD_gcn_shader extended instructions
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x / ma + 0.5;
dst.y = dst.y / ma + 0.5;
""")

unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")


def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-source opcode whose sources share in_type."""
    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
           False, alg_props, const_expr)

def binop(name, ty, alg_props, const_expr):
    """Register a two-source opcode whose sources and result share one type."""
    binop_convert(name, ty, ty, alg_props, const_expr)

def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-source comparison producing a bool1 result."""
    binop_convert(name, tbool1, ty, alg_props, const_expr)

def binop_compare32(name, ty, alg_props, const_expr):
    """Register a two-source comparison producing a bool32 result."""
    binop_convert(name, tbool32, ty, alg_props, const_expr)

def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-source opcode with explicit output and source sizes."""
    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
           False, "", const_expr)

def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a binary reduction.

    prereduce_expr combines matching components of the two sources
    ({src0}, {src1}), reduce_expr folds two partial results together and
    final_expr wraps the overall result ({src}).  All variants are
    registered as commutative and named name + "2", "3" and "4".
    """
    def final(src):
        return final_expr.format(src= "(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    def prereduce(src0, src1):
        return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
    src0 = prereduce("src0.x", "src1.x")
    src1 = prereduce("src0.y", "src1.y")
    src2 = prereduce("src0.z", "src1.z")
    src3 = prereduce("src0.w", "src1.w")
    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], False, commutative,
           final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], False, commutative,
           final(reduce_(reduce_(src0, src1), src2)))
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], False, commutative,
           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))

binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("iadd_sat", tint, commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
binop("uadd_sat", tuint, commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")

# Generate 64 bit result from 2 32 bits quantity
binop_convert("imul_2x32_64", tint64, tint32, commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
binop("imul_high", tint, commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((int64_t)src0 * (int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =   (x & y)    + (x & ~y) +   (x & y)    + (~x & y)
#       = 2 * (x & y)  + (x & ~y)                + (~x & y)
#       = ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              = (x & y) + ((x ^ y) >> 1)
binop("ihadd", tint, commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y)  - (~x & y) +      (x | y)  - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
#
# Note the subtraction: e.g. rhadd(3, 0) = (3 | 0) - ((3 ^ 0) >> 1) = 2.
binop("irhadd", tint, commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")
binop_compare32("flt32", tfloat, "", "src0 < src1")
binop_compare32("fge32", tfloat, "", "src0 >= src1")
binop_compare32("feq32", tfloat, commutative, "src0 == src1")
binop_compare32("fne32", tfloat, commutative, "src0 != src1")
binop_compare32("ilt32", tint, "", "src0 < src1")
binop_compare32("ige32", tint, "", "src0 >= src1")
binop_compare32("ieq32", tint, commutative, "src0 == src1")
binop_compare32("ine32", tint, commutative, "src0 != src1")
binop_compare32("ult32", tuint, "", "src0 < src1")
binop_compare32("uge32", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce("ball_fequal",  1, tbool1, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool1, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool1, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool1, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

binop_reduce("b32all_fequal",  1, tbool32, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_fnequal", 1, tbool32, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("b32all_iequal",  1, tbool32, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("b32any_inequal", 1, tbool32, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal

# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# Use the double-precision pow() for 64-bit values and powf() otherwise,
# matching how the other bit_size-dependent math opcodes pick their variant.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")


def triop(name, ty, const_expr):
    """Register a three-source opcode whose sources and result share one type."""
    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-source uint opcode with explicit component sizes."""
    opcode(name, output_size, tuint,
           [src1_size, src2_size, src3_size],
           [tuint, tuint, tuint], False, "", const_expr)

triop("ffma", tfloat, "src0 * src1 + src2")

triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")

# 3 way min/max/med
triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")

triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")

opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")

# SM5 bfi assembly
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# The left shift moves the top bit of the field to bit 31; the arithmetic
# right shift by (32 - bits) then brings the field down to bit 0 while
# replicating its sign bit.  Shifting right by "offset" would leave the
# field misaligned.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-source uint opcode with explicit component sizes."""
    opcode(name, output_size, tuint,
           [src1_size, src2_size, src3_size, src4_size],
           [tuint, tuint, tuint, tuint],
           False, "", const_expr)

opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")