nir_opcodes.py revision 01e04c3f
1# 2# Copyright (C) 2014 Connor Abbott 3# 4# Permission is hereby granted, free of charge, to any person obtaining a 5# copy of this software and associated documentation files (the "Software"), 6# to deal in the Software without restriction, including without limitation 7# the rights to use, copy, modify, merge, publish, distribute, sublicense, 8# and/or sell copies of the Software, and to permit persons to whom the 9# Software is furnished to do so, subject to the following conditions: 10# 11# The above copyright notice and this permission notice (including the next 12# paragraph) shall be included in all copies or substantial portions of the 13# Software. 14# 15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21# IN THE SOFTWARE. 
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)


# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_sizes[0], int)
      assert isinstance(input_types, list)
      assert isinstance(input_types[0], str)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 4
      for size in input_sizes:
         assert 0 <= size <= 4
         # A sized output may not be combined with an unsized (0) input.
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr

# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tuint16 = "uint16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
   """Create an Opcode and register it in the global opcodes dictionary.

   Asserts that no opcode with the same name has been registered before.
   """
   assert name not in opcodes
   opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                          input_types, algebraic_properties, const_expr)

def unop_convert(name, out_type, in_type, const_expr):
   """Register a one-input opcode (sizes 0) with distinct in/out types."""
   opcode(name, 0, out_type, [0], [in_type], "", const_expr)

def unop(name, ty, const_expr):
   """Register a one-input opcode (sizes 0) whose input and output share ty."""
   opcode(name, 0, ty, [0], [ty], "", const_expr)

def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Register a one-input opcode with explicit input and output sizes."""
   opcode(name, output_size, output_type, [input_size], [input_type], "",
          const_expr)

def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Register name2, name3 and name4: reductions over 2, 3 and 4 components.

   prereduce_expr is formatted once per component with {src}, reduce_expr
   combines two partial results through {src0} and {src1}, and final_expr
   wraps the fully reduced (parenthesized) value through {src}.
   """
   def prereduce(src):
      return "(" + prereduce_expr.format(src=src) + ")"
   def final(src):
      return final_expr.format(src="(" + src + ")")
   def reduce_(src0, src1):
      return reduce_expr.format(src0=src0, src1=src1)
   src0 = prereduce("src0.x")
   src1 = prereduce("src0.y")
   src2 = prereduce("src0.z")
   src3 = prereduce("src0.w")
   unop_horiz(name + "2", output_size, output_type, 2, input_type,
              final(reduce_(src0, src1)))
   unop_horiz(name + "3", output_size, output_type, 3, input_type,
              final(reduce_(reduce_(src0, src1), src2)))
   unop_horiz(name + "4", output_size, output_type, 4, input_type,
              final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))


# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")

# Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat]:
   if src_t in (tint, tuint):
      dst_types = [tfloat, src_t]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat]

   for dst_t in dst_types:
      if dst_t == tfloat:
         bit_sizes = [16, 32, 64]
      else:
         bit_sizes = [8, 16, 32, 64]
      for bit_size in bit_sizes:
         # float -> float16 additionally gets explicit rounding-mode variants.
         if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
               unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
                                                   bit_size, rnd_mode),
                            dst_t + str(bit_size), src_t, "src0")
         else:
            unop_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                         dst_t + str(bit_size), src_t, "src0")

# We'll hand-code the to/from bool conversion opcodes.  Because bool doesn't
# have multiple bit-sizes, we can always infer the size from the other type.
unop_convert("f2b", tbool, tfloat, "src0 != 0.0")
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2f", tfloat, tbool, "src0 ? 1.0 : 0.0")
unop_convert("b2i", tint, tbool, "src0 ? 1 : 0")


# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert("frexp_exp", tint32, tfloat64, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat64, tfloat64, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")


# Floating point pack and unpack operations.

def pack_2x16(fmt):
   """Register pack_<fmt>_2x16: two float32 components into one uint32."""
   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))

def pack_4x8(fmt):
   """Register pack_<fmt>_4x8: four float32 components into one uint32."""
   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))

def unpack_2x16(fmt):
   """Register unpack_<fmt>_2x16: one uint32 into two float32 components.

   dst.x comes from the low 16 bits and dst.y from the high 16 bits.  The
   high half must be shifted *down*; shifting left and then truncating to
   uint16_t would always yield zero.
   """
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))

def unpack_4x8(fmt):
   """Register unpack_<fmt>_4x8: one uint32 into four float32 components."""
   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
        (src0.y << 8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# The source is a single 64-bit component, so every destination channel is
# carved out of src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.


unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")


for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")


# AMD_gcn_shader extended instructions
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")

unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")


def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Register a two-input opcode (sizes 0); both inputs share in_type."""
   opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)

def binop(name, ty, alg_props, const_expr):
   """Register a two-input opcode whose inputs and output all share ty."""
   binop_convert(name, ty, ty, alg_props, const_expr)

def binop_compare(name, ty, alg_props, const_expr):
   """Register a two-input comparison: inputs of type ty, output tbool."""
   binop_convert(name, tbool, ty, alg_props, const_expr)

def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Register a two-input opcode with explicit sizes and types per source."""
   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
          "", const_expr)

def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
   """Register name2, name3 and name4: pairwise reductions of two vectors.

   prereduce_expr is formatted once per component pair with {src0} and
   {src1}, reduce_expr combines two partial results through {src0} and
   {src1}, and final_expr wraps the fully reduced (parenthesized) value
   through {src}.  All three opcodes are registered as commutative.
   """
   def final(src):
      return final_expr.format(src= "(" + src + ")")
   def reduce_(src0, src1):
      return reduce_expr.format(src0=src0, src1=src1)
   def prereduce(src0, src1):
      return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
   src0 = prereduce("src0.x", "src1.x")
   src1 = prereduce("src0.y", "src1.y")
   src2 = prereduce("src0.z", "src1.z")
   src3 = prereduce("src0.w", "src1.w")
   opcode(name + "2", output_size, output_type,
          [2, 2], [src_type, src_type], commutative,
          final(reduce_(src0, src1)))
   opcode(name + "3", output_size, output_type,
          [3, 3], [src_type, src_type], commutative,
          final(reduce_(reduce_(src0, src1), src2)))
   opcode(name + "4", output_size, output_type,
          [4, 4], [src_type, src_type], commutative,
          final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))

binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# Use the double-precision pow() for 64-bit and powf() otherwise, following
# the same "bit_size == 64 ? <double fn> : <float fn>" pattern as the other
# floating-point opcodes in this file.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")


def triop(name, ty, const_expr):
   """Register a three-input opcode (sizes 0); inputs and output share ty."""
   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-input tuint opcode with explicit sizes per source."""
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size],
          [tuint, tuint, tuint], "", const_expr)

triop("ffma", tfloat, "src0 * src1 + src2")

triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")

# 3 way min/max/med
triop("fmin3", tfloat, "fminf(src0, fminf(src1, src2))")
triop("imin3", tint, "MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, "MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, "fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, "MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, "MAX2(src0, MAX2(src1, src2))")

triop("fmed3", tfloat, "fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint, "MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")

opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# The left shift moves the top bit of the field to bit 31; the arithmetic
# right shift by (32 - bits) then brings the field down to bit 0 while
# sign-extending it, the same shift pair ibfe uses above.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-input opcode whose output and sources are all tuint.

   Each source gets its own explicit size; no algebraic properties are set.
   """
   source_sizes = [src1_size, src2_size, src3_size, src4_size]
   source_types = [tuint] * len(source_sizes)
   opcode(name, output_size, tuint, source_sizes, source_types, "",
          const_expr)

# Inserts the low `bits` bits of `insert` into `base` starting at `offset`;
# out-of-range offset/bits combinations produce 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

