/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all the basic arithmetic operations we care about
 * (most notably min/max and saturated operations), and it is often necessary
 * to resort to machine-specific intrinsics directly. The functions here hide
 * all these implementation details from the other modules.
 *
 * We also do simple expression simplification here, for several reasons:
 * - it is very easy, given that we have all the necessary information readily
 *   available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1]
 *   range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include <llvm/Config/llvm-config.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b).
 * No checks for the special-case values a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_min_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.min.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.min.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.min.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.min.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.min.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vminfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminub";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vminuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vminsw";
         }
      }
   }

   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is a NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of a NaN by
       * default, so we need special code to handle those cases.
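       * (Illustration: min(a, b) via the intrinsic already yields b when a is
       * a NaN, which is the desired "other operand"; only the case where b is
       * the NaN needs the isnan(b)/select fix-up below, returning a instead.)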
175 */ 176 if (util_get_cpu_caps()->has_sse && type.floating && 177 nan_behavior == GALLIVM_NAN_RETURN_OTHER) { 178 LLVMValueRef isnan, min; 179 min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 180 type, 181 intr_size, a, b); 182 isnan = lp_build_isnan(bld, b); 183 return lp_build_select(bld, isnan, a, min); 184 } else { 185 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 186 type, 187 intr_size, a, b); 188 } 189 } 190 191 if (type.floating) { 192 switch (nan_behavior) { 193 case GALLIVM_NAN_RETURN_OTHER: { 194 LLVMValueRef isnan = lp_build_isnan(bld, a); 195 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 196 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 197 return lp_build_select(bld, cond, a, b); 198 } 199 break; 200 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN: 201 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b); 202 return lp_build_select(bld, cond, a, b); 203 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN: 204 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a); 205 return lp_build_select(bld, cond, b, a); 206 case GALLIVM_NAN_BEHAVIOR_UNDEFINED: 207 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 208 return lp_build_select(bld, cond, a, b); 209 break; 210 default: 211 assert(0); 212 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 213 return lp_build_select(bld, cond, a, b); 214 } 215 } else { 216 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b); 217 return lp_build_select(bld, cond, a, b); 218 } 219} 220 221 222LLVMValueRef 223lp_build_fmuladd(LLVMBuilderRef builder, 224 LLVMValueRef a, 225 LLVMValueRef b, 226 LLVMValueRef c) 227{ 228 LLVMTypeRef type = LLVMTypeOf(a); 229 assert(type == LLVMTypeOf(b)); 230 assert(type == LLVMTypeOf(c)); 231 232 char intrinsic[32]; 233 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type); 234 LLVMValueRef args[] = { a, b, c }; 235 return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0); 236} 237 238 239/** 240 * Generate max(a, b) 241 * No checks for special case values of a or b = 1 or 0 are done. 242 * NaN's are handled according to the behavior specified by the 243 * nan_behavior argument. 
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_get_cpu_caps()->has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_get_cpu_caps()->has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (util_get_cpu_caps()->has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
   }

   if (intrinsic) {
      if (util_get_cpu_caps()->has_sse && type.floating &&
          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
         LLVMValueRef isnan, max;
         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         isnan = lp_build_isnan(bld, b);
         return lp_build_select(bld, isnan, a, max);
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld,
PIPE_FUNC_GREATER, a, b); 365 return lp_build_select(bld, cond, a, b); 366 } 367} 368 369 370/** 371 * Generate 1 - a, or ~a depending on bld->type. 372 */ 373LLVMValueRef 374lp_build_comp(struct lp_build_context *bld, 375 LLVMValueRef a) 376{ 377 LLVMBuilderRef builder = bld->gallivm->builder; 378 const struct lp_type type = bld->type; 379 380 assert(lp_check_value(type, a)); 381 382 if(a == bld->one) 383 return bld->zero; 384 if(a == bld->zero) 385 return bld->one; 386 387 if(type.norm && !type.floating && !type.fixed && !type.sign) { 388 if(LLVMIsConstant(a)) 389 return LLVMConstNot(a); 390 else 391 return LLVMBuildNot(builder, a, ""); 392 } 393 394 if(LLVMIsConstant(a)) 395 if (type.floating) 396 return LLVMConstFSub(bld->one, a); 397 else 398 return LLVMConstSub(bld->one, a); 399 else 400 if (type.floating) 401 return LLVMBuildFSub(builder, bld->one, a, ""); 402 else 403 return LLVMBuildSub(builder, bld->one, a, ""); 404} 405 406 407/** 408 * Generate a + b 409 */ 410LLVMValueRef 411lp_build_add(struct lp_build_context *bld, 412 LLVMValueRef a, 413 LLVMValueRef b) 414{ 415 LLVMBuilderRef builder = bld->gallivm->builder; 416 const struct lp_type type = bld->type; 417 LLVMValueRef res; 418 419 assert(lp_check_value(type, a)); 420 assert(lp_check_value(type, b)); 421 422 if (a == bld->zero) 423 return b; 424 if (b == bld->zero) 425 return a; 426 if (a == bld->undef || b == bld->undef) 427 return bld->undef; 428 429 if (type.norm) { 430 const char *intrinsic = NULL; 431 432 if (!type.sign && (a == bld->one || b == bld->one)) 433 return bld->one; 434 435 if (!type.floating && !type.fixed) { 436 if (LLVM_VERSION_MAJOR >= 8) { 437 char intrin[32]; 438 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat"; 439 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); 440 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); 441 } 442 if (type.width * type.length == 128) { 443 if (util_get_cpu_caps()->has_sse2) { 444 if (type.width == 8) 445 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; 446 if (type.width == 16) 447 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; 448 } else if (util_get_cpu_caps()->has_altivec) { 449 if (type.width == 8) 450 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; 451 if (type.width == 16) 452 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; 453 } 454 } 455 if (type.width * type.length == 256) { 456 if (util_get_cpu_caps()->has_avx2) { 457 if (type.width == 8) 458 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; 459 if (type.width == 16) 460 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; 461 } 462 } 463 } 464 465 if (intrinsic) 466 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 467 } 468 469 if(type.norm && !type.floating && !type.fixed) { 470 if (type.sign) { 471 uint64_t sign = (uint64_t)1 << (type.width - 1); 472 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1); 473 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign); 474 /* a_clamp_max is the maximum a for positive b, 475 a_clamp_min is the minimum a for negative b. 
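            (Illustrative note: for 8-bit signed values and b = 100, a is
            first clamped to at most 127 - 100 = 27, so the plain add below
            cannot overflow past 127.)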
*/ 476 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 477 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 478 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min); 479 } 480 } 481 482 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 483 if (type.floating) 484 res = LLVMConstFAdd(a, b); 485 else 486 res = LLVMConstAdd(a, b); 487 else 488 if (type.floating) 489 res = LLVMBuildFAdd(builder, a, b, ""); 490 else 491 res = LLVMBuildAdd(builder, a, b, ""); 492 493 /* clamp to ceiling of 1.0 */ 494 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 495 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 496 497 if (type.norm && !type.floating && !type.fixed) { 498 if (!type.sign) { 499 /* 500 * newer llvm versions no longer support the intrinsics, but recognize 501 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit 502 * code, it is important we match the pattern llvm uses (and pray llvm 503 * doesn't change it - and hope they decide on the same pattern for 504 * all backends supporting it...). 505 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to 506 * interfere with llvm's ability to recognize the pattern but seems 507 * a bit brittle. 508 * NOTE: llvm 9+ always uses (non arch specific) intrinsic. 509 */ 510 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res); 511 res = lp_build_select(bld, overflowed, 512 LLVMConstAllOnes(bld->int_vec_type), res); 513 } 514 } 515 516 /* XXX clamp to floor of -1 or 0??? */ 517 518 return res; 519} 520 521 522/** Return the scalar sum of the elements of a. 523 * Should avoid this operation whenever possible. 524 */ 525LLVMValueRef 526lp_build_horizontal_add(struct lp_build_context *bld, 527 LLVMValueRef a) 528{ 529 LLVMBuilderRef builder = bld->gallivm->builder; 530 const struct lp_type type = bld->type; 531 LLVMValueRef index, res; 532 unsigned i, length; 533 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; 534 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; 535 LLVMValueRef vecres, elem2; 536 537 assert(lp_check_value(type, a)); 538 539 if (type.length == 1) { 540 return a; 541 } 542 543 assert(!bld->type.norm); 544 545 /* 546 * for byte vectors can do much better with psadbw. 547 * Using repeated shuffle/adds here. Note with multiple vectors 548 * this can be done more efficiently as outlined in the intel 549 * optimization manual. 550 * Note: could cause data rearrangement if used with smaller element 551 * sizes. 
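    * (Illustrative trace: for a 4-wide vector {a0,a1,a2,a3} the loop below
    * first forms {a0+a2, a1+a3}; the remaining two elements are then
    * extracted and added to produce the scalar sum.)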
552 */ 553 554 vecres = a; 555 length = type.length / 2; 556 while (length > 1) { 557 LLVMValueRef vec1, vec2; 558 for (i = 0; i < length; i++) { 559 shuffles1[i] = lp_build_const_int32(bld->gallivm, i); 560 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); 561 } 562 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, 563 LLVMConstVector(shuffles1, length), ""); 564 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, 565 LLVMConstVector(shuffles2, length), ""); 566 if (type.floating) { 567 vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); 568 } 569 else { 570 vecres = LLVMBuildAdd(builder, vec1, vec2, ""); 571 } 572 length = length >> 1; 573 } 574 575 /* always have vector of size 2 here */ 576 assert(length == 1); 577 578 index = lp_build_const_int32(bld->gallivm, 0); 579 res = LLVMBuildExtractElement(builder, vecres, index, ""); 580 index = lp_build_const_int32(bld->gallivm, 1); 581 elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); 582 583 if (type.floating) 584 res = LLVMBuildFAdd(builder, res, elem2, ""); 585 else 586 res = LLVMBuildAdd(builder, res, elem2, ""); 587 588 return res; 589} 590 591/** 592 * Return the horizontal sums of 4 float vectors as a float4 vector. 593 * This uses the technique as outlined in Intel Optimization Manual. 594 */ 595static LLVMValueRef 596lp_build_horizontal_add4x4f(struct lp_build_context *bld, 597 LLVMValueRef src[4]) 598{ 599 struct gallivm_state *gallivm = bld->gallivm; 600 LLVMBuilderRef builder = gallivm->builder; 601 LLVMValueRef shuffles[4]; 602 LLVMValueRef tmp[4]; 603 LLVMValueRef sumtmp[2], shuftmp[2]; 604 605 /* lower half of regs */ 606 shuffles[0] = lp_build_const_int32(gallivm, 0); 607 shuffles[1] = lp_build_const_int32(gallivm, 1); 608 shuffles[2] = lp_build_const_int32(gallivm, 4); 609 shuffles[3] = lp_build_const_int32(gallivm, 5); 610 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], 611 LLVMConstVector(shuffles, 4), ""); 612 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], 613 LLVMConstVector(shuffles, 4), ""); 614 615 /* upper half of regs */ 616 shuffles[0] = lp_build_const_int32(gallivm, 2); 617 shuffles[1] = lp_build_const_int32(gallivm, 3); 618 shuffles[2] = lp_build_const_int32(gallivm, 6); 619 shuffles[3] = lp_build_const_int32(gallivm, 7); 620 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], 621 LLVMConstVector(shuffles, 4), ""); 622 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], 623 LLVMConstVector(shuffles, 4), ""); 624 625 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); 626 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); 627 628 shuffles[0] = lp_build_const_int32(gallivm, 0); 629 shuffles[1] = lp_build_const_int32(gallivm, 2); 630 shuffles[2] = lp_build_const_int32(gallivm, 4); 631 shuffles[3] = lp_build_const_int32(gallivm, 6); 632 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 633 LLVMConstVector(shuffles, 4), ""); 634 635 shuffles[0] = lp_build_const_int32(gallivm, 1); 636 shuffles[1] = lp_build_const_int32(gallivm, 3); 637 shuffles[2] = lp_build_const_int32(gallivm, 5); 638 shuffles[3] = lp_build_const_int32(gallivm, 7); 639 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 640 LLVMConstVector(shuffles, 4), ""); 641 642 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); 643} 644 645 646/* 647 * partially horizontally add 2-4 float vectors with length nx4, 648 * i.e. 
only four adjacent values in each vector will be added, 649 * assuming values are really grouped in 4 which also determines 650 * output order. 651 * 652 * Return a vector of the same length as the initial vectors, 653 * with the excess elements (if any) being undefined. 654 * The element order is independent of number of input vectors. 655 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 656 * the output order thus will be 657 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef 658 */ 659LLVMValueRef 660lp_build_hadd_partial4(struct lp_build_context *bld, 661 LLVMValueRef vectors[], 662 unsigned num_vecs) 663{ 664 struct gallivm_state *gallivm = bld->gallivm; 665 LLVMBuilderRef builder = gallivm->builder; 666 LLVMValueRef ret_vec; 667 LLVMValueRef tmp[4]; 668 const char *intrinsic = NULL; 669 670 assert(num_vecs >= 2 && num_vecs <= 4); 671 assert(bld->type.floating); 672 673 /* only use this with at least 2 vectors, as it is sort of expensive 674 * (depending on cpu) and we always need two horizontal adds anyway, 675 * so a shuffle/add approach might be better. 676 */ 677 678 tmp[0] = vectors[0]; 679 tmp[1] = vectors[1]; 680 681 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; 682 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0]; 683 684 if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 && 685 bld->type.length == 4) { 686 intrinsic = "llvm.x86.sse3.hadd.ps"; 687 } 688 else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 && 689 bld->type.length == 8) { 690 intrinsic = "llvm.x86.avx.hadd.ps.256"; 691 } 692 if (intrinsic) { 693 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic, 694 lp_build_vec_type(gallivm, bld->type), 695 tmp[0], tmp[1]); 696 if (num_vecs > 2) { 697 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic, 698 lp_build_vec_type(gallivm, bld->type), 699 tmp[2], tmp[3]); 700 } 701 else { 702 tmp[1] = tmp[0]; 703 } 704 return lp_build_intrinsic_binary(builder, intrinsic, 705 lp_build_vec_type(gallivm, bld->type), 706 tmp[0], tmp[1]); 707 } 708 709 if (bld->type.length == 4) { 710 ret_vec = lp_build_horizontal_add4x4f(bld, tmp); 711 } 712 else { 713 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4]; 714 unsigned j; 715 unsigned num_iter = bld->type.length / 4; 716 struct lp_type parttype = bld->type; 717 parttype.length = 4; 718 for (j = 0; j < num_iter; j++) { 719 LLVMValueRef partsrc[4]; 720 unsigned i; 721 for (i = 0; i < 4; i++) { 722 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4); 723 } 724 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc); 725 } 726 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter); 727 } 728 return ret_vec; 729} 730 731/** 732 * Generate a - b 733 */ 734LLVMValueRef 735lp_build_sub(struct lp_build_context *bld, 736 LLVMValueRef a, 737 LLVMValueRef b) 738{ 739 LLVMBuilderRef builder = bld->gallivm->builder; 740 const struct lp_type type = bld->type; 741 LLVMValueRef res; 742 743 assert(lp_check_value(type, a)); 744 assert(lp_check_value(type, b)); 745 746 if (b == bld->zero) 747 return a; 748 if (a == bld->undef || b == bld->undef) 749 return bld->undef; 750 if (a == b) 751 return bld->zero; 752 753 if (type.norm) { 754 const char *intrinsic = NULL; 755 756 if (!type.sign && b == bld->one) 757 return bld->zero; 758 759 if (!type.floating && !type.fixed) { 760 if (LLVM_VERSION_MAJOR >= 8) { 761 char intrin[32]; 762 intrinsic = type.sign ? 
"llvm.ssub.sat" : "llvm.usub.sat"; 763 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); 764 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); 765 } 766 if (type.width * type.length == 128) { 767 if (util_get_cpu_caps()->has_sse2) { 768 if (type.width == 8) 769 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; 770 if (type.width == 16) 771 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; 772 } else if (util_get_cpu_caps()->has_altivec) { 773 if (type.width == 8) 774 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; 775 if (type.width == 16) 776 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; 777 } 778 } 779 if (type.width * type.length == 256) { 780 if (util_get_cpu_caps()->has_avx2) { 781 if (type.width == 8) 782 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b"; 783 if (type.width == 16) 784 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w"; 785 } 786 } 787 } 788 789 if (intrinsic) 790 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 791 } 792 793 if(type.norm && !type.floating && !type.fixed) { 794 if (type.sign) { 795 uint64_t sign = (uint64_t)1 << (type.width - 1); 796 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1); 797 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign); 798 /* a_clamp_max is the maximum a for negative b, 799 a_clamp_min is the minimum a for positive b. */ 800 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 801 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 802 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max); 803 } else { 804 /* 805 * This must match llvm pattern for saturated unsigned sub. 806 * (lp_build_max_simple actually does the job with its current 807 * definition but do it explicitly here.) 808 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to 809 * interfere with llvm's ability to recognize the pattern but seems 810 * a bit brittle. 811 * NOTE: llvm 9+ always uses (non arch specific) intrinsic. 812 */ 813 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 814 a = lp_build_select(bld, no_ov, a, b); 815 } 816 } 817 818 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 819 if (type.floating) 820 res = LLVMConstFSub(a, b); 821 else 822 res = LLVMConstSub(a, b); 823 else 824 if (type.floating) 825 res = LLVMBuildFSub(builder, a, b, ""); 826 else 827 res = LLVMBuildSub(builder, a, b, ""); 828 829 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 830 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 831 832 return res; 833} 834 835 836 837/** 838 * Normalized multiplication. 
 *
 * There are several approaches (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL
 *     criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16-bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for, or
 *     rounding must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division, instead of truncating the
 *     result use rounding in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results (e.g. t = 255*255 = 65025 gives
 *     (65025 + 254 + 0x80) >> 8 = 255).
 *
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16-bit normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == bld->zero)
      return bld->zero;
   if (a == bld->one)
      return b;
   if (b == bld->zero)
      return bld->zero;
   if (b == bld->one)
      return a;
   if (a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
      struct lp_type wide_type = lp_wider_type(type);
      LLVMValueRef al, ah, bl, bh, abl, abh, ab;
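
      /*
       * Note (for clarity): widen each half of the vectors (e.g. 8-bit ->
       * 16-bit), do the normalized multiply at the wider width, then pack
       * the two halves back down to the original width.
       */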
965 966 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah); 967 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh); 968 969 /* PMULLW, PSRLW, PADDW */ 970 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl); 971 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh); 972 973 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh); 974 975 return ab; 976 } 977 978 if(type.fixed) 979 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 980 else 981 shift = NULL; 982 983 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 984 if (type.floating) 985 res = LLVMConstFMul(a, b); 986 else 987 res = LLVMConstMul(a, b); 988 if(shift) { 989 if(type.sign) 990 res = LLVMConstAShr(res, shift); 991 else 992 res = LLVMConstLShr(res, shift); 993 } 994 } 995 else { 996 if (type.floating) 997 res = LLVMBuildFMul(builder, a, b, ""); 998 else 999 res = LLVMBuildMul(builder, a, b, ""); 1000 if(shift) { 1001 if(type.sign) 1002 res = LLVMBuildAShr(builder, res, shift, ""); 1003 else 1004 res = LLVMBuildLShr(builder, res, shift, ""); 1005 } 1006 } 1007 1008 return res; 1009} 1010 1011/* 1012 * Widening mul, valid for 32x32 bit -> 64bit only. 1013 * Result is low 32bits, high bits returned in res_hi. 1014 * 1015 * Emits code that is meant to be compiled for the host CPU. 1016 */ 1017LLVMValueRef 1018lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, 1019 LLVMValueRef a, 1020 LLVMValueRef b, 1021 LLVMValueRef *res_hi) 1022{ 1023 struct gallivm_state *gallivm = bld->gallivm; 1024 LLVMBuilderRef builder = gallivm->builder; 1025 1026 assert(bld->type.width == 32); 1027 assert(bld->type.floating == 0); 1028 assert(bld->type.fixed == 0); 1029 assert(bld->type.norm == 0); 1030 1031 /* 1032 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces 1033 * for x86 simd is atrocious (even if the high bits weren't required), 1034 * trying to handle real 64bit inputs (which of course can't happen due 1035 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but 1036 * apparently llvm does not recognize this widening mul). This includes 6 1037 * (instead of 2) pmuludq plus extra adds and shifts 1038 * The same story applies to signed mul, albeit fixing this requires sse41. 1039 * https://llvm.org/bugs/show_bug.cgi?id=30845 1040 * So, whip up our own code, albeit only for length 4 and 8 (which 1041 * should be good enough)... 1042 * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern 1043 * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle 1044 * for signed), which the fallback code does not, without this llvm 1045 * will likely still produce atrocious code. 
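    * (Summary of the fallback below: pmuludq/pmuldq multiply the even 32-bit
    * lanes of their inputs into full 64-bit products; the odd lanes are
    * shuffled into even position and multiplied the same way, and the two
    * sets of 64-bit results are then re-interleaved into separate low-half
    * and high-half 32-bit vectors.)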
1046 */ 1047 if (LLVM_VERSION_MAJOR < 7 && 1048 (bld->type.length == 4 || bld->type.length == 8) && 1049 ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) || 1050 util_get_cpu_caps()->has_sse4_1)) { 1051 const char *intrinsic = NULL; 1052 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd; 1053 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec; 1054 struct lp_type type_wide = lp_wider_type(bld->type); 1055 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide); 1056 unsigned i; 1057 for (i = 0; i < bld->type.length; i += 2) { 1058 shuf[i] = lp_build_const_int32(gallivm, i+1); 1059 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 1060 } 1061 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1062 aeven = a; 1063 beven = b; 1064 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, ""); 1065 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, ""); 1066 1067 if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) { 1068 if (bld->type.sign) { 1069 intrinsic = "llvm.x86.avx2.pmul.dq"; 1070 } else { 1071 intrinsic = "llvm.x86.avx2.pmulu.dq"; 1072 } 1073 muleven = lp_build_intrinsic_binary(builder, intrinsic, 1074 wider_type, aeven, beven); 1075 mulodd = lp_build_intrinsic_binary(builder, intrinsic, 1076 wider_type, aodd, bodd); 1077 } 1078 else { 1079 /* for consistent naming look elsewhere... */ 1080 if (bld->type.sign) { 1081 intrinsic = "llvm.x86.sse41.pmuldq"; 1082 } else { 1083 intrinsic = "llvm.x86.sse2.pmulu.dq"; 1084 } 1085 /* 1086 * XXX If we only have AVX but not AVX2 this is a pain. 1087 * lp_build_intrinsic_binary_anylength() can't handle it 1088 * (due to src and dst type not being identical). 1089 */ 1090 if (bld->type.length == 8) { 1091 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi; 1092 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi; 1093 LLVMValueRef muleven2[2], mulodd2[2]; 1094 struct lp_type type_wide_half = type_wide; 1095 LLVMTypeRef wtype_half; 1096 type_wide_half.length = 2; 1097 wtype_half = lp_build_vec_type(gallivm, type_wide_half); 1098 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4); 1099 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4); 1100 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4); 1101 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4); 1102 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4); 1103 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4); 1104 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4); 1105 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4); 1106 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic, 1107 wtype_half, aevenlo, bevenlo); 1108 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic, 1109 wtype_half, aoddlo, boddlo); 1110 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic, 1111 wtype_half, aevenhi, bevenhi); 1112 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic, 1113 wtype_half, aoddhi, boddhi); 1114 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2); 1115 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2); 1116 1117 } 1118 else { 1119 muleven = lp_build_intrinsic_binary(builder, intrinsic, 1120 wider_type, aeven, beven); 1121 mulodd = lp_build_intrinsic_binary(builder, intrinsic, 1122 wider_type, aodd, bodd); 1123 } 1124 } 1125 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, ""); 1126 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, ""); 1127 1128 for (i = 0; i < bld->type.length; i += 2) { 1129 shuf[i] = lp_build_const_int32(gallivm, i + 1); 1130 
shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length); 1131 } 1132 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1133 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); 1134 1135 for (i = 0; i < bld->type.length; i += 2) { 1136 shuf[i] = lp_build_const_int32(gallivm, i); 1137 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length); 1138 } 1139 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1140 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); 1141 } 1142 else { 1143 return lp_build_mul_32_lohi(bld, a, b, res_hi); 1144 } 1145} 1146 1147 1148/* 1149 * Widening mul, valid for <= 32 (8, 16, 32) -> 64 1150 * Result is low N bits, high bits returned in res_hi. 1151 * 1152 * Emits generic code. 1153 */ 1154LLVMValueRef 1155lp_build_mul_32_lohi(struct lp_build_context *bld, 1156 LLVMValueRef a, 1157 LLVMValueRef b, 1158 LLVMValueRef *res_hi) 1159{ 1160 struct gallivm_state *gallivm = bld->gallivm; 1161 LLVMBuilderRef builder = gallivm->builder; 1162 LLVMValueRef tmp, shift, res_lo; 1163 struct lp_type type_tmp; 1164 LLVMTypeRef wide_type, narrow_type; 1165 1166 type_tmp = bld->type; 1167 narrow_type = lp_build_vec_type(gallivm, type_tmp); 1168 if (bld->type.width < 32) 1169 type_tmp.width = 32; 1170 else 1171 type_tmp.width *= 2; 1172 wide_type = lp_build_vec_type(gallivm, type_tmp); 1173 shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width); 1174 1175 if (bld->type.sign) { 1176 a = LLVMBuildSExt(builder, a, wide_type, ""); 1177 b = LLVMBuildSExt(builder, b, wide_type, ""); 1178 } else { 1179 a = LLVMBuildZExt(builder, a, wide_type, ""); 1180 b = LLVMBuildZExt(builder, b, wide_type, ""); 1181 } 1182 tmp = LLVMBuildMul(builder, a, b, ""); 1183 1184 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1185 1186 /* Since we truncate anyway, LShr and AShr are equivalent. */ 1187 tmp = LLVMBuildLShr(builder, tmp, shift, ""); 1188 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1189 1190 return res_lo; 1191} 1192 1193 1194/* a * b + c */ 1195LLVMValueRef 1196lp_build_mad(struct lp_build_context *bld, 1197 LLVMValueRef a, 1198 LLVMValueRef b, 1199 LLVMValueRef c) 1200{ 1201 const struct lp_type type = bld->type; 1202 if (type.floating) { 1203 return lp_build_fmuladd(bld->gallivm->builder, a, b, c); 1204 } else { 1205 return lp_build_add(bld, lp_build_mul(bld, a, b), c); 1206 } 1207} 1208 1209 1210/** 1211 * Small vector x scale multiplication optimization. 1212 */ 1213LLVMValueRef 1214lp_build_mul_imm(struct lp_build_context *bld, 1215 LLVMValueRef a, 1216 int b) 1217{ 1218 LLVMBuilderRef builder = bld->gallivm->builder; 1219 LLVMValueRef factor; 1220 1221 assert(lp_check_value(bld->type, a)); 1222 1223 if(b == 0) 1224 return bld->zero; 1225 1226 if(b == 1) 1227 return a; 1228 1229 if(b == -1) 1230 return lp_build_negate(bld, a); 1231 1232 if(b == 2 && bld->type.floating) 1233 return lp_build_add(bld, a, a); 1234 1235 if(util_is_power_of_two_or_zero(b)) { 1236 unsigned shift = ffs(b) - 1; 1237 1238 if(bld->type.floating) { 1239#if 0 1240 /* 1241 * Power of two multiplication by directly manipulating the exponent. 1242 * 1243 * XXX: This might not be always faster, it will introduce a small error 1244 * for multiplication by zero, and it will produce wrong results 1245 * for Inf and NaN. 
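       * (Background: for 32-bit floats the mantissa field is 23 bits wide,
       * so adding shift << 23 to the bit pattern increments the biased
       * exponent by 'shift', i.e. multiplies a finite non-zero value by
       * 2**shift.)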
1246 */ 1247 unsigned mantissa = lp_mantissa(bld->type); 1248 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 1249 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 1250 a = LLVMBuildAdd(builder, a, factor, ""); 1251 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 1252 return a; 1253#endif 1254 } 1255 else { 1256 factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 1257 return LLVMBuildShl(builder, a, factor, ""); 1258 } 1259 } 1260 1261 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 1262 return lp_build_mul(bld, a, factor); 1263} 1264 1265 1266/** 1267 * Generate a / b 1268 */ 1269LLVMValueRef 1270lp_build_div(struct lp_build_context *bld, 1271 LLVMValueRef a, 1272 LLVMValueRef b) 1273{ 1274 LLVMBuilderRef builder = bld->gallivm->builder; 1275 const struct lp_type type = bld->type; 1276 1277 assert(lp_check_value(type, a)); 1278 assert(lp_check_value(type, b)); 1279 1280 if(a == bld->zero) 1281 return bld->zero; 1282 if(a == bld->one && type.floating) 1283 return lp_build_rcp(bld, b); 1284 if(b == bld->zero) 1285 return bld->undef; 1286 if(b == bld->one) 1287 return a; 1288 if(a == bld->undef || b == bld->undef) 1289 return bld->undef; 1290 1291 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 1292 if (type.floating) 1293 return LLVMConstFDiv(a, b); 1294 else if (type.sign) 1295 return LLVMConstSDiv(a, b); 1296 else 1297 return LLVMConstUDiv(a, b); 1298 } 1299 1300 /* fast rcp is disabled (just uses div), so makes no sense to try that */ 1301 if(FALSE && 1302 ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || 1303 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) && 1304 type.floating) 1305 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 1306 1307 if (type.floating) 1308 return LLVMBuildFDiv(builder, a, b, ""); 1309 else if (type.sign) 1310 return LLVMBuildSDiv(builder, a, b, ""); 1311 else 1312 return LLVMBuildUDiv(builder, a, b, ""); 1313} 1314 1315 1316/** 1317 * Linear interpolation helper. 1318 * 1319 * @param normalized whether we are interpolating normalized values, 1320 * encoded in normalized integers, twice as wide. 1321 * 1322 * @sa http://www.stereopsis.com/doubleblend.html 1323 */ 1324static inline LLVMValueRef 1325lp_build_lerp_simple(struct lp_build_context *bld, 1326 LLVMValueRef x, 1327 LLVMValueRef v0, 1328 LLVMValueRef v1, 1329 unsigned flags) 1330{ 1331 unsigned half_width = bld->type.width/2; 1332 LLVMBuilderRef builder = bld->gallivm->builder; 1333 LLVMValueRef delta; 1334 LLVMValueRef res; 1335 1336 assert(lp_check_value(bld->type, x)); 1337 assert(lp_check_value(bld->type, v0)); 1338 assert(lp_check_value(bld->type, v1)); 1339 1340 delta = lp_build_sub(bld, v1, v0); 1341 1342 if (bld->type.floating) { 1343 assert(flags == 0); 1344 return lp_build_mad(bld, x, delta, v0); 1345 } 1346 1347 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) { 1348 if (!bld->type.sign) { 1349 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) { 1350 /* 1351 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the 1352 * most-significant-bit to the lowest-significant-bit, so that 1353 * later we can just divide by 2**n instead of 2**n - 1. 1354 */ 1355 1356 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1)); 1357 } 1358 1359 /* (x * delta) >> n */ 1360 /* 1361 * For this multiply, higher internal precision is required to pass CTS, 1362 * the most efficient path to that is pmulhrsw on ssse3 and above. 
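          * (Note: pmulhrsw computes round((x * y) / 2**15); shifting delta
          * left by 7 below therefore gives round((x * delta) / 2**8), i.e.
          * the (x * delta) >> n step with rounding instead of truncation.)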
1363 * This could be opencoded on other arches if conformance was required. 1364 */ 1365 if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) { 1366 res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7)); 1367 res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff)); 1368 } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) { 1369 res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7)); 1370 res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff)); 1371 } else { 1372 res = lp_build_mul(bld, x, delta); 1373 res = lp_build_shr_imm(bld, res, half_width); 1374 } 1375 } else { 1376 /* 1377 * The rescaling trick above doesn't work for signed numbers, so 1378 * use the 2**n - 1 divison approximation in lp_build_mul_norm 1379 * instead. 1380 */ 1381 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1382 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta); 1383 } 1384 } else { 1385 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1386 res = lp_build_mul(bld, x, delta); 1387 } 1388 1389 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) { 1390 /* 1391 * At this point both res and v0 only use the lower half of the bits, 1392 * the rest is zero. Instead of add / mask, do add with half wide type. 1393 */ 1394 struct lp_type narrow_type; 1395 struct lp_build_context narrow_bld; 1396 1397 memset(&narrow_type, 0, sizeof narrow_type); 1398 narrow_type.sign = bld->type.sign; 1399 narrow_type.width = bld->type.width/2; 1400 narrow_type.length = bld->type.length*2; 1401 1402 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type); 1403 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, ""); 1404 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, ""); 1405 res = lp_build_add(&narrow_bld, v0, res); 1406 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 1407 } else { 1408 res = lp_build_add(bld, v0, res); 1409 1410 if (bld->type.fixed) { 1411 /* 1412 * We need to mask out the high order bits when lerping 8bit 1413 * normalized colors stored on 16bits 1414 */ 1415 /* XXX: This step is necessary for lerping 8bit colors stored on 1416 * 16bits, but it will be wrong for true fixed point use cases. 1417 * Basically we need a more powerful lp_type, capable of further 1418 * distinguishing the values interpretation from the value storage. 1419 */ 1420 LLVMValueRef low_bits; 1421 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1); 1422 res = LLVMBuildAnd(builder, res, low_bits, ""); 1423 } 1424 } 1425 1426 return res; 1427} 1428 1429 1430/** 1431 * Linear interpolation. 1432 */ 1433LLVMValueRef 1434lp_build_lerp(struct lp_build_context *bld, 1435 LLVMValueRef x, 1436 LLVMValueRef v0, 1437 LLVMValueRef v1, 1438 unsigned flags) 1439{ 1440 const struct lp_type type = bld->type; 1441 LLVMValueRef res; 1442 1443 assert(lp_check_value(type, x)); 1444 assert(lp_check_value(type, v0)); 1445 assert(lp_check_value(type, v1)); 1446 1447 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED)); 1448 1449 if (type.norm) { 1450 struct lp_type wide_type; 1451 struct lp_build_context wide_bld; 1452 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 1453 1454 assert(type.length >= 2); 1455 1456 /* 1457 * Create a wider integer type, enough to hold the 1458 * intermediate result of the multiplication. 
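       * (For example, a 16 x unorm8 vector is processed here as two
       * 8 x 16-bit halves.)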
1459 */ 1460 memset(&wide_type, 0, sizeof wide_type); 1461 wide_type.sign = type.sign; 1462 wide_type.width = type.width*2; 1463 wide_type.length = type.length/2; 1464 1465 lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 1466 1467 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh); 1468 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 1469 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 1470 1471 /* 1472 * Lerp both halves. 1473 */ 1474 1475 flags |= LP_BLD_LERP_WIDE_NORMALIZED; 1476 1477 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags); 1478 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags); 1479 1480 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh); 1481 } else { 1482 res = lp_build_lerp_simple(bld, x, v0, v1, flags); 1483 } 1484 1485 return res; 1486} 1487 1488 1489/** 1490 * Bilinear interpolation. 1491 * 1492 * Values indices are in v_{yx}. 1493 */ 1494LLVMValueRef 1495lp_build_lerp_2d(struct lp_build_context *bld, 1496 LLVMValueRef x, 1497 LLVMValueRef y, 1498 LLVMValueRef v00, 1499 LLVMValueRef v01, 1500 LLVMValueRef v10, 1501 LLVMValueRef v11, 1502 unsigned flags) 1503{ 1504 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags); 1505 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags); 1506 return lp_build_lerp(bld, y, v0, v1, flags); 1507} 1508 1509 1510LLVMValueRef 1511lp_build_lerp_3d(struct lp_build_context *bld, 1512 LLVMValueRef x, 1513 LLVMValueRef y, 1514 LLVMValueRef z, 1515 LLVMValueRef v000, 1516 LLVMValueRef v001, 1517 LLVMValueRef v010, 1518 LLVMValueRef v011, 1519 LLVMValueRef v100, 1520 LLVMValueRef v101, 1521 LLVMValueRef v110, 1522 LLVMValueRef v111, 1523 unsigned flags) 1524{ 1525 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags); 1526 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags); 1527 return lp_build_lerp(bld, z, v0, v1, flags); 1528} 1529 1530 1531/** 1532 * Generate min(a, b) 1533 * Do checks for special cases but not for nans. 1534 */ 1535LLVMValueRef 1536lp_build_min(struct lp_build_context *bld, 1537 LLVMValueRef a, 1538 LLVMValueRef b) 1539{ 1540 assert(lp_check_value(bld->type, a)); 1541 assert(lp_check_value(bld->type, b)); 1542 1543 if(a == bld->undef || b == bld->undef) 1544 return bld->undef; 1545 1546 if(a == b) 1547 return a; 1548 1549 if (bld->type.norm) { 1550 if (!bld->type.sign) { 1551 if (a == bld->zero || b == bld->zero) { 1552 return bld->zero; 1553 } 1554 } 1555 if(a == bld->one) 1556 return b; 1557 if(b == bld->one) 1558 return a; 1559 } 1560 1561 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1562} 1563 1564/** 1565 * Generate min(a, b) 1566 * NaN's are handled according to the behavior specified by the 1567 * nan_behavior argument. 
1568 */ 1569LLVMValueRef 1570lp_build_min_ext(struct lp_build_context *bld, 1571 LLVMValueRef a, 1572 LLVMValueRef b, 1573 enum gallivm_nan_behavior nan_behavior) 1574{ 1575 assert(lp_check_value(bld->type, a)); 1576 assert(lp_check_value(bld->type, b)); 1577 1578 if(a == bld->undef || b == bld->undef) 1579 return bld->undef; 1580 1581 if(a == b) 1582 return a; 1583 1584 if (bld->type.norm) { 1585 if (!bld->type.sign) { 1586 if (a == bld->zero || b == bld->zero) { 1587 return bld->zero; 1588 } 1589 } 1590 if(a == bld->one) 1591 return b; 1592 if(b == bld->one) 1593 return a; 1594 } 1595 1596 return lp_build_min_simple(bld, a, b, nan_behavior); 1597} 1598 1599/** 1600 * Generate max(a, b) 1601 * Do checks for special cases, but NaN behavior is undefined. 1602 */ 1603LLVMValueRef 1604lp_build_max(struct lp_build_context *bld, 1605 LLVMValueRef a, 1606 LLVMValueRef b) 1607{ 1608 assert(lp_check_value(bld->type, a)); 1609 assert(lp_check_value(bld->type, b)); 1610 1611 if(a == bld->undef || b == bld->undef) 1612 return bld->undef; 1613 1614 if(a == b) 1615 return a; 1616 1617 if(bld->type.norm) { 1618 if(a == bld->one || b == bld->one) 1619 return bld->one; 1620 if (!bld->type.sign) { 1621 if (a == bld->zero) { 1622 return b; 1623 } 1624 if (b == bld->zero) { 1625 return a; 1626 } 1627 } 1628 } 1629 1630 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1631} 1632 1633 1634/** 1635 * Generate max(a, b) 1636 * Checks for special cases. 1637 * NaN's are handled according to the behavior specified by the 1638 * nan_behavior argument. 1639 */ 1640LLVMValueRef 1641lp_build_max_ext(struct lp_build_context *bld, 1642 LLVMValueRef a, 1643 LLVMValueRef b, 1644 enum gallivm_nan_behavior nan_behavior) 1645{ 1646 assert(lp_check_value(bld->type, a)); 1647 assert(lp_check_value(bld->type, b)); 1648 1649 if(a == bld->undef || b == bld->undef) 1650 return bld->undef; 1651 1652 if(a == b) 1653 return a; 1654 1655 if(bld->type.norm) { 1656 if(a == bld->one || b == bld->one) 1657 return bld->one; 1658 if (!bld->type.sign) { 1659 if (a == bld->zero) { 1660 return b; 1661 } 1662 if (b == bld->zero) { 1663 return a; 1664 } 1665 } 1666 } 1667 1668 return lp_build_max_simple(bld, a, b, nan_behavior); 1669} 1670 1671/** 1672 * Generate clamp(a, min, max) 1673 * NaN behavior (for any of a, min, max) is undefined. 1674 * Do checks for special cases. 1675 */ 1676LLVMValueRef 1677lp_build_clamp(struct lp_build_context *bld, 1678 LLVMValueRef a, 1679 LLVMValueRef min, 1680 LLVMValueRef max) 1681{ 1682 assert(lp_check_value(bld->type, a)); 1683 assert(lp_check_value(bld->type, min)); 1684 assert(lp_check_value(bld->type, max)); 1685 1686 a = lp_build_min(bld, a, max); 1687 a = lp_build_max(bld, a, min); 1688 return a; 1689} 1690 1691 1692/** 1693 * Generate clamp(a, 0, 1) 1694 * A NaN will get converted to zero. 
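 * (Note: the max below uses GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN, which
 * yields its second operand, 0, when a is NaN; the subsequent min against 1
 * then operates on an ordinary zero.)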
1695 */ 1696LLVMValueRef 1697lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld, 1698 LLVMValueRef a) 1699{ 1700 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 1701 a = lp_build_min(bld, a, bld->one); 1702 return a; 1703} 1704 1705 1706/** 1707 * Generate abs(a) 1708 */ 1709LLVMValueRef 1710lp_build_abs(struct lp_build_context *bld, 1711 LLVMValueRef a) 1712{ 1713 LLVMBuilderRef builder = bld->gallivm->builder; 1714 const struct lp_type type = bld->type; 1715 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1716 1717 assert(lp_check_value(type, a)); 1718 1719 if(!type.sign) 1720 return a; 1721 1722 if(type.floating) { 1723 char intrinsic[32]; 1724 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); 1725 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 1726 } 1727 1728 if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) { 1729 switch(type.width) { 1730 case 8: 1731 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 1732 case 16: 1733 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 1734 case 32: 1735 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 1736 } 1737 } 1738 else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) { 1739 switch(type.width) { 1740 case 8: 1741 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); 1742 case 16: 1743 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a); 1744 case 32: 1745 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a); 1746 } 1747 } 1748 1749 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero), 1750 a, LLVMBuildNeg(builder, a, "")); 1751} 1752 1753 1754LLVMValueRef 1755lp_build_negate(struct lp_build_context *bld, 1756 LLVMValueRef a) 1757{ 1758 LLVMBuilderRef builder = bld->gallivm->builder; 1759 1760 assert(lp_check_value(bld->type, a)); 1761 1762 if (bld->type.floating) 1763 a = LLVMBuildFNeg(builder, a, ""); 1764 else 1765 a = LLVMBuildNeg(builder, a, ""); 1766 1767 return a; 1768} 1769 1770 1771/** Return -1, 0 or +1 depending on the sign of a */ 1772LLVMValueRef 1773lp_build_sgn(struct lp_build_context *bld, 1774 LLVMValueRef a) 1775{ 1776 LLVMBuilderRef builder = bld->gallivm->builder; 1777 const struct lp_type type = bld->type; 1778 LLVMValueRef cond; 1779 LLVMValueRef res; 1780 1781 assert(lp_check_value(type, a)); 1782 1783 /* Handle non-zero case */ 1784 if(!type.sign) { 1785 /* if not zero then sign must be positive */ 1786 res = bld->one; 1787 } 1788 else if(type.floating) { 1789 LLVMTypeRef vec_type; 1790 LLVMTypeRef int_type; 1791 LLVMValueRef mask; 1792 LLVMValueRef sign; 1793 LLVMValueRef one; 1794 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 1795 1796 int_type = lp_build_int_vec_type(bld->gallivm, type); 1797 vec_type = lp_build_vec_type(bld->gallivm, type); 1798 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 1799 1800 /* Take the sign bit and add it to 1 constant */ 1801 sign = LLVMBuildBitCast(builder, a, int_type, ""); 1802 sign = LLVMBuildAnd(builder, sign, mask, ""); 1803 one = LLVMConstBitCast(bld->one, int_type); 1804 res = LLVMBuildOr(builder, sign, one, ""); 1805 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1806 } 1807 else 1808 { 1809 /* signed int/norm/fixed point */ 1810 /* could use psign 
with sse3 and appropriate vectors here */ 1811 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 1812 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 1813 res = lp_build_select(bld, cond, bld->one, minus_one); 1814 } 1815 1816 /* Handle zero */ 1817 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 1818 res = lp_build_select(bld, cond, bld->zero, res); 1819 1820 return res; 1821} 1822 1823 1824/** 1825 * Set the sign of float vector 'a' according to 'sign'. 1826 * If sign==0, return abs(a). 1827 * If sign==1, return -abs(a); 1828 * Other values for sign produce undefined results. 1829 */ 1830LLVMValueRef 1831lp_build_set_sign(struct lp_build_context *bld, 1832 LLVMValueRef a, LLVMValueRef sign) 1833{ 1834 LLVMBuilderRef builder = bld->gallivm->builder; 1835 const struct lp_type type = bld->type; 1836 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1837 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1838 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 1839 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1840 ~((unsigned long long) 1 << (type.width - 1))); 1841 LLVMValueRef val, res; 1842 1843 assert(type.floating); 1844 assert(lp_check_value(type, a)); 1845 1846 /* val = reinterpret_cast<int>(a) */ 1847 val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1848 /* val = val & mask */ 1849 val = LLVMBuildAnd(builder, val, mask, ""); 1850 /* sign = sign << shift */ 1851 sign = LLVMBuildShl(builder, sign, shift, ""); 1852 /* res = val | sign */ 1853 res = LLVMBuildOr(builder, val, sign, ""); 1854 /* res = reinterpret_cast<float>(res) */ 1855 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1856 1857 return res; 1858} 1859 1860 1861/** 1862 * Convert vector of (or scalar) int to vector of (or scalar) float. 
1863 */ 1864LLVMValueRef 1865lp_build_int_to_float(struct lp_build_context *bld, 1866 LLVMValueRef a) 1867{ 1868 LLVMBuilderRef builder = bld->gallivm->builder; 1869 const struct lp_type type = bld->type; 1870 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1871 1872 assert(type.floating); 1873 1874 return LLVMBuildSIToFP(builder, a, vec_type, ""); 1875} 1876 1877static boolean 1878arch_rounding_available(const struct lp_type type) 1879{ 1880 if ((util_get_cpu_caps()->has_sse4_1 && 1881 (type.length == 1 || type.width*type.length == 128)) || 1882 (util_get_cpu_caps()->has_avx && type.width*type.length == 256) || 1883 (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512)) 1884 return TRUE; 1885 else if ((util_get_cpu_caps()->has_altivec && 1886 (type.width == 32 && type.length == 4))) 1887 return TRUE; 1888 else if (util_get_cpu_caps()->has_neon) 1889 return TRUE; 1890 1891 return FALSE; 1892} 1893 1894enum lp_build_round_mode 1895{ 1896 LP_BUILD_ROUND_NEAREST = 0, 1897 LP_BUILD_ROUND_FLOOR = 1, 1898 LP_BUILD_ROUND_CEIL = 2, 1899 LP_BUILD_ROUND_TRUNCATE = 3 1900}; 1901 1902static inline LLVMValueRef 1903lp_build_iround_nearest_sse2(struct lp_build_context *bld, 1904 LLVMValueRef a) 1905{ 1906 LLVMBuilderRef builder = bld->gallivm->builder; 1907 const struct lp_type type = bld->type; 1908 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 1909 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 1910 const char *intrinsic; 1911 LLVMValueRef res; 1912 1913 assert(type.floating); 1914 /* using the double precision conversions is a bit more complicated */ 1915 assert(type.width == 32); 1916 1917 assert(lp_check_value(type, a)); 1918 assert(util_get_cpu_caps()->has_sse2); 1919 1920 /* This is relying on MXCSR rounding mode, which should always be nearest. 
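    * With the default MXCSR setting (round-to-nearest-even) the cvtss2si /
    * cvtps2dq conversions used here behave roughly like a scalar
    * (int)rintf(a) in the default FP environment (a sketch, assuming the
    * rounding mode has not been changed elsewhere).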
*/ 1921 if (type.length == 1) { 1922 LLVMTypeRef vec_type; 1923 LLVMValueRef undef; 1924 LLVMValueRef arg; 1925 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 1926 1927 vec_type = LLVMVectorType(bld->elem_type, 4); 1928 1929 intrinsic = "llvm.x86.sse.cvtss2si"; 1930 1931 undef = LLVMGetUndef(vec_type); 1932 1933 arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 1934 1935 res = lp_build_intrinsic_unary(builder, intrinsic, 1936 ret_type, arg); 1937 } 1938 else { 1939 if (type.width* type.length == 128) { 1940 intrinsic = "llvm.x86.sse2.cvtps2dq"; 1941 } 1942 else { 1943 assert(type.width*type.length == 256); 1944 assert(util_get_cpu_caps()->has_avx); 1945 1946 intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; 1947 } 1948 res = lp_build_intrinsic_unary(builder, intrinsic, 1949 ret_type, a); 1950 } 1951 1952 return res; 1953} 1954 1955 1956/* 1957 */ 1958static inline LLVMValueRef 1959lp_build_round_altivec(struct lp_build_context *bld, 1960 LLVMValueRef a, 1961 enum lp_build_round_mode mode) 1962{ 1963 LLVMBuilderRef builder = bld->gallivm->builder; 1964 const struct lp_type type = bld->type; 1965 const char *intrinsic = NULL; 1966 1967 assert(type.floating); 1968 1969 assert(lp_check_value(type, a)); 1970 assert(util_get_cpu_caps()->has_altivec); 1971 1972 (void)type; 1973 1974 switch (mode) { 1975 case LP_BUILD_ROUND_NEAREST: 1976 intrinsic = "llvm.ppc.altivec.vrfin"; 1977 break; 1978 case LP_BUILD_ROUND_FLOOR: 1979 intrinsic = "llvm.ppc.altivec.vrfim"; 1980 break; 1981 case LP_BUILD_ROUND_CEIL: 1982 intrinsic = "llvm.ppc.altivec.vrfip"; 1983 break; 1984 case LP_BUILD_ROUND_TRUNCATE: 1985 intrinsic = "llvm.ppc.altivec.vrfiz"; 1986 break; 1987 } 1988 1989 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 1990} 1991 1992static inline LLVMValueRef 1993lp_build_round_arch(struct lp_build_context *bld, 1994 LLVMValueRef a, 1995 enum lp_build_round_mode mode) 1996{ 1997 if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) { 1998 LLVMBuilderRef builder = bld->gallivm->builder; 1999 const struct lp_type type = bld->type; 2000 const char *intrinsic_root; 2001 char intrinsic[32]; 2002 2003 assert(type.floating); 2004 assert(lp_check_value(type, a)); 2005 (void)type; 2006 2007 switch (mode) { 2008 case LP_BUILD_ROUND_NEAREST: 2009 intrinsic_root = "llvm.nearbyint"; 2010 break; 2011 case LP_BUILD_ROUND_FLOOR: 2012 intrinsic_root = "llvm.floor"; 2013 break; 2014 case LP_BUILD_ROUND_CEIL: 2015 intrinsic_root = "llvm.ceil"; 2016 break; 2017 case LP_BUILD_ROUND_TRUNCATE: 2018 intrinsic_root = "llvm.trunc"; 2019 break; 2020 default: 2021 unreachable("unhandled lp_build_round_mode"); 2022 } 2023 2024 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type); 2025 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2026 } 2027 else /* (util_get_cpu_caps()->has_altivec) */ 2028 return lp_build_round_altivec(bld, a, mode); 2029} 2030 2031/** 2032 * Return the integer part of a float (vector) value (== round toward zero). 2033 * The returned value is a float (vector). 
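 *
 * When no native rounding instruction is available, the fallback below goes
 * through fptosi/sitofp and keeps the original value whenever |a| > 2^24
 * (such floats are already integral, and this also covers NaN/Inf).
 * Roughly, as a scalar sketch:
 *
 *   r = (float)(int)a;
 *   result = fabsf(a) > 0x1p24f ? a : r;
 *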
2034 * Ex: trunc(-1.5) = -1.0 2035 */ 2036LLVMValueRef 2037lp_build_trunc(struct lp_build_context *bld, 2038 LLVMValueRef a) 2039{ 2040 LLVMBuilderRef builder = bld->gallivm->builder; 2041 const struct lp_type type = bld->type; 2042 2043 assert(type.floating); 2044 assert(lp_check_value(type, a)); 2045 2046 if (type.width == 16) { 2047 char intrinsic[64]; 2048 lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type); 2049 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2050 } 2051 2052 if (arch_rounding_available(type)) { 2053 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE); 2054 } 2055 else { 2056 const struct lp_type type = bld->type; 2057 struct lp_type inttype; 2058 struct lp_build_context intbld; 2059 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2060 LLVMValueRef trunc, res, anosign, mask; 2061 LLVMTypeRef int_vec_type = bld->int_vec_type; 2062 LLVMTypeRef vec_type = bld->vec_type; 2063 2064 inttype = type; 2065 inttype.floating = 0; 2066 lp_build_context_init(&intbld, bld->gallivm, inttype); 2067 2068 /* round by truncation */ 2069 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2070 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc"); 2071 2072 /* mask out sign bit */ 2073 anosign = lp_build_abs(bld, a); 2074 /* 2075 * mask out all values if anosign > 2^24 2076 * This should work both for large ints (all rounding is no-op for them 2077 * because such floats are always exact) as well as special cases like 2078 * NaNs, Infs (taking advantage of the fact they use max exponent). 2079 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2080 */ 2081 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2082 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2083 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2084 return lp_build_select(bld, mask, a, res); 2085 } 2086} 2087 2088 2089/** 2090 * Return float (vector) rounded to nearest integer (vector). The returned 2091 * value is a float (vector). 
2092 * Ex: round(0.9) = 1.0 2093 * Ex: round(-1.5) = -2.0 2094 */ 2095LLVMValueRef 2096lp_build_round(struct lp_build_context *bld, 2097 LLVMValueRef a) 2098{ 2099 LLVMBuilderRef builder = bld->gallivm->builder; 2100 const struct lp_type type = bld->type; 2101 2102 assert(type.floating); 2103 assert(lp_check_value(type, a)); 2104 2105 if (type.width == 16) { 2106 char intrinsic[64]; 2107 lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type); 2108 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2109 } 2110 2111 if (arch_rounding_available(type)) { 2112 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); 2113 } 2114 else { 2115 const struct lp_type type = bld->type; 2116 struct lp_type inttype; 2117 struct lp_build_context intbld; 2118 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2119 LLVMValueRef res, anosign, mask; 2120 LLVMTypeRef int_vec_type = bld->int_vec_type; 2121 LLVMTypeRef vec_type = bld->vec_type; 2122 2123 inttype = type; 2124 inttype.floating = 0; 2125 lp_build_context_init(&intbld, bld->gallivm, inttype); 2126 2127 res = lp_build_iround(bld, a); 2128 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 2129 2130 /* mask out sign bit */ 2131 anosign = lp_build_abs(bld, a); 2132 /* 2133 * mask out all values if anosign > 2^24 2134 * This should work both for large ints (all rounding is no-op for them 2135 * because such floats are always exact) as well as special cases like 2136 * NaNs, Infs (taking advantage of the fact they use max exponent). 2137 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2138 */ 2139 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2140 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2141 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2142 return lp_build_select(bld, mask, a, res); 2143 } 2144} 2145 2146 2147/** 2148 * Return floor of float (vector), result is a float (vector) 2149 * Ex: floor(1.1) = 1.0 2150 * Ex: floor(-1.1) = -2.0 2151 */ 2152LLVMValueRef 2153lp_build_floor(struct lp_build_context *bld, 2154 LLVMValueRef a) 2155{ 2156 LLVMBuilderRef builder = bld->gallivm->builder; 2157 const struct lp_type type = bld->type; 2158 2159 assert(type.floating); 2160 assert(lp_check_value(type, a)); 2161 2162 if (arch_rounding_available(type)) { 2163 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR); 2164 } 2165 else { 2166 const struct lp_type type = bld->type; 2167 struct lp_type inttype; 2168 struct lp_build_context intbld; 2169 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2170 LLVMValueRef trunc, res, anosign, mask; 2171 LLVMTypeRef int_vec_type = bld->int_vec_type; 2172 LLVMTypeRef vec_type = bld->vec_type; 2173 2174 if (type.width != 32) { 2175 char intrinsic[32]; 2176 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type); 2177 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2178 } 2179 2180 assert(type.width == 32); /* might want to handle doubles at some point */ 2181 2182 inttype = type; 2183 inttype.floating = 0; 2184 lp_build_context_init(&intbld, bld->gallivm, inttype); 2185 2186 /* round by truncation */ 2187 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2188 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc"); 2189 2190 if (type.sign) { 2191 LLVMValueRef tmp; 2192 2193 /* 2194 * fix values if rounding is wrong (for non-special cases) 2195 * - this is the case if trunc > a 2196 */ 2197 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, 
res, a); 2198 /* tmp = trunc > a ? 1.0 : 0.0 */ 2199 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, ""); 2200 tmp = lp_build_and(&intbld, mask, tmp); 2201 tmp = LLVMBuildBitCast(builder, tmp, vec_type, ""); 2202 res = lp_build_sub(bld, res, tmp); 2203 } 2204 2205 /* mask out sign bit */ 2206 anosign = lp_build_abs(bld, a); 2207 /* 2208 * mask out all values if anosign > 2^24 2209 * This should work both for large ints (all rounding is no-op for them 2210 * because such floats are always exact) as well as special cases like 2211 * NaNs, Infs (taking advantage of the fact they use max exponent). 2212 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2213 */ 2214 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2215 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2216 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2217 return lp_build_select(bld, mask, a, res); 2218 } 2219} 2220 2221 2222/** 2223 * Return ceiling of float (vector), returning float (vector). 2224 * Ex: ceil( 1.1) = 2.0 2225 * Ex: ceil(-1.1) = -1.0 2226 */ 2227LLVMValueRef 2228lp_build_ceil(struct lp_build_context *bld, 2229 LLVMValueRef a) 2230{ 2231 LLVMBuilderRef builder = bld->gallivm->builder; 2232 const struct lp_type type = bld->type; 2233 2234 assert(type.floating); 2235 assert(lp_check_value(type, a)); 2236 2237 if (arch_rounding_available(type)) { 2238 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL); 2239 } 2240 else { 2241 const struct lp_type type = bld->type; 2242 struct lp_type inttype; 2243 struct lp_build_context intbld; 2244 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2245 LLVMValueRef trunc, res, anosign, mask, tmp; 2246 LLVMTypeRef int_vec_type = bld->int_vec_type; 2247 LLVMTypeRef vec_type = bld->vec_type; 2248 2249 if (type.width != 32) { 2250 char intrinsic[32]; 2251 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type); 2252 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2253 } 2254 2255 assert(type.width == 32); /* might want to handle doubles at some point */ 2256 2257 inttype = type; 2258 inttype.floating = 0; 2259 lp_build_context_init(&intbld, bld->gallivm, inttype); 2260 2261 /* round by truncation */ 2262 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2263 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc"); 2264 2265 /* 2266 * fix values if rounding is wrong (for non-special cases) 2267 * - this is the case if trunc < a 2268 */ 2269 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a); 2270 /* tmp = trunc < a ? 1.0 : 0.0 */ 2271 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, ""); 2272 tmp = lp_build_and(&intbld, mask, tmp); 2273 tmp = LLVMBuildBitCast(builder, tmp, vec_type, ""); 2274 res = lp_build_add(bld, trunc, tmp); 2275 2276 /* mask out sign bit */ 2277 anosign = lp_build_abs(bld, a); 2278 /* 2279 * mask out all values if anosign > 2^24 2280 * This should work both for large ints (all rounding is no-op for them 2281 * because such floats are always exact) as well as special cases like 2282 * NaNs, Infs (taking advantage of the fact they use max exponent). 2283 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 
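       * The lower bound works because every float >= 2^24 is already an
       * integer, so returning the input unmodified there is exact; the
       * upper bound keeps every value that is not masked out inside int32
       * range, so the fptosi above stays well defined.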
2284 */ 2285 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2286 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2287 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2288 return lp_build_select(bld, mask, a, res); 2289 } 2290} 2291 2292 2293/** 2294 * Return fractional part of 'a' computed as a - floor(a) 2295 * Typically used in texture coord arithmetic. 2296 */ 2297LLVMValueRef 2298lp_build_fract(struct lp_build_context *bld, 2299 LLVMValueRef a) 2300{ 2301 assert(bld->type.floating); 2302 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 2303} 2304 2305 2306/** 2307 * Prevent returning 1.0 for very small negative values of 'a' by clamping 2308 * against 0.99999(9). (Will also return that value for NaNs.) 2309 */ 2310static inline LLVMValueRef 2311clamp_fract(struct lp_build_context *bld, LLVMValueRef fract) 2312{ 2313 LLVMValueRef max; 2314 2315 /* this is the largest number smaller than 1.0 representable as float */ 2316 max = lp_build_const_vec(bld->gallivm, bld->type, 2317 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); 2318 return lp_build_min_ext(bld, fract, max, 2319 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 2320} 2321 2322 2323/** 2324 * Same as lp_build_fract, but guarantees that the result is always smaller 2325 * than one. Will also return the smaller-than-one value for infs, NaNs. 2326 */ 2327LLVMValueRef 2328lp_build_fract_safe(struct lp_build_context *bld, 2329 LLVMValueRef a) 2330{ 2331 return clamp_fract(bld, lp_build_fract(bld, a)); 2332} 2333 2334 2335/** 2336 * Return the integer part of a float (vector) value (== round toward zero). 2337 * The returned value is an integer (vector). 2338 * Ex: itrunc(-1.5) = -1 2339 */ 2340LLVMValueRef 2341lp_build_itrunc(struct lp_build_context *bld, 2342 LLVMValueRef a) 2343{ 2344 LLVMBuilderRef builder = bld->gallivm->builder; 2345 const struct lp_type type = bld->type; 2346 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 2347 2348 assert(type.floating); 2349 assert(lp_check_value(type, a)); 2350 2351 return LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2352} 2353 2354 2355/** 2356 * Return float (vector) rounded to nearest integer (vector). The returned 2357 * value is an integer (vector). 
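 *
 * Without a native rounding instruction the fallback below adds a
 * sign-adjusted constant just below 0.5 and truncates. Roughly, as a
 * scalar sketch:
 *
 *   half = nextafterf(0.5f, 0.0f);
 *   res  = (int)(a >= 0.0f ? a + half : a - half);
 *
 * (0.5 is nudged down one ulp so that inputs just below 0.5 are not pushed
 * up to 1.0 by the float add.)
 *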
2358 * Ex: iround(0.9) = 1 2359 * Ex: iround(-1.5) = -2 2360 */ 2361LLVMValueRef 2362lp_build_iround(struct lp_build_context *bld, 2363 LLVMValueRef a) 2364{ 2365 LLVMBuilderRef builder = bld->gallivm->builder; 2366 const struct lp_type type = bld->type; 2367 LLVMTypeRef int_vec_type = bld->int_vec_type; 2368 LLVMValueRef res; 2369 2370 assert(type.floating); 2371 2372 assert(lp_check_value(type, a)); 2373 2374 if ((util_get_cpu_caps()->has_sse2 && 2375 ((type.width == 32) && (type.length == 1 || type.length == 4))) || 2376 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) { 2377 return lp_build_iround_nearest_sse2(bld, a); 2378 } 2379 if (arch_rounding_available(type)) { 2380 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); 2381 } 2382 else { 2383 LLVMValueRef half; 2384 2385 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0)); 2386 2387 if (type.sign) { 2388 LLVMTypeRef vec_type = bld->vec_type; 2389 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 2390 (unsigned long long)1 << (type.width - 1)); 2391 LLVMValueRef sign; 2392 2393 /* get sign bit */ 2394 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 2395 sign = LLVMBuildAnd(builder, sign, mask, ""); 2396 2397 /* sign * 0.5 */ 2398 half = LLVMBuildBitCast(builder, half, int_vec_type, ""); 2399 half = LLVMBuildOr(builder, sign, half, ""); 2400 half = LLVMBuildBitCast(builder, half, vec_type, ""); 2401 } 2402 2403 res = LLVMBuildFAdd(builder, a, half, ""); 2404 } 2405 2406 res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 2407 2408 return res; 2409} 2410 2411 2412/** 2413 * Return floor of float (vector), result is an int (vector) 2414 * Ex: ifloor(1.1) = 1.0 2415 * Ex: ifloor(-1.1) = -2.0 2416 */ 2417LLVMValueRef 2418lp_build_ifloor(struct lp_build_context *bld, 2419 LLVMValueRef a) 2420{ 2421 LLVMBuilderRef builder = bld->gallivm->builder; 2422 const struct lp_type type = bld->type; 2423 LLVMTypeRef int_vec_type = bld->int_vec_type; 2424 LLVMValueRef res; 2425 2426 assert(type.floating); 2427 assert(lp_check_value(type, a)); 2428 2429 res = a; 2430 if (type.sign) { 2431 if (arch_rounding_available(type)) { 2432 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR); 2433 } 2434 else { 2435 struct lp_type inttype; 2436 struct lp_build_context intbld; 2437 LLVMValueRef trunc, itrunc, mask; 2438 2439 assert(type.floating); 2440 assert(lp_check_value(type, a)); 2441 2442 inttype = type; 2443 inttype.floating = 0; 2444 lp_build_context_init(&intbld, bld->gallivm, inttype); 2445 2446 /* round by truncation */ 2447 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2448 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc"); 2449 2450 /* 2451 * fix values if rounding is wrong (for non-special cases) 2452 * - this is the case if trunc > a 2453 * The results of doing this with NaNs, very large values etc. 2454 * are undefined but this seems to be the case anyway. 2455 */ 2456 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a); 2457 /* cheapie minus one with mask since the mask is minus one / zero */ 2458 return lp_build_add(&intbld, itrunc, mask); 2459 } 2460 } 2461 2462 /* round to nearest (toward zero) */ 2463 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res"); 2464 2465 return res; 2466} 2467 2468 2469/** 2470 * Return ceiling of float (vector), returning int (vector). 
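 *
 * Like lp_build_ifloor, the fallback path truncates first and then corrects
 * the result with the comparison mask directly: the mask is ~0 (i.e. -1) or
 * 0 per channel, so ifloor is itrunc + (trunc > a ? -1 : 0) and iceil is
 * itrunc - (trunc < a ? -1 : 0).
 *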
2471 * Ex: iceil( 1.1) = 2 2472 * Ex: iceil(-1.1) = -1 2473 */ 2474LLVMValueRef 2475lp_build_iceil(struct lp_build_context *bld, 2476 LLVMValueRef a) 2477{ 2478 LLVMBuilderRef builder = bld->gallivm->builder; 2479 const struct lp_type type = bld->type; 2480 LLVMTypeRef int_vec_type = bld->int_vec_type; 2481 LLVMValueRef res; 2482 2483 assert(type.floating); 2484 assert(lp_check_value(type, a)); 2485 2486 if (arch_rounding_available(type)) { 2487 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL); 2488 } 2489 else { 2490 struct lp_type inttype; 2491 struct lp_build_context intbld; 2492 LLVMValueRef trunc, itrunc, mask; 2493 2494 assert(type.floating); 2495 assert(lp_check_value(type, a)); 2496 2497 inttype = type; 2498 inttype.floating = 0; 2499 lp_build_context_init(&intbld, bld->gallivm, inttype); 2500 2501 /* round by truncation */ 2502 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2503 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc"); 2504 2505 /* 2506 * fix values if rounding is wrong (for non-special cases) 2507 * - this is the case if trunc < a 2508 * The results of doing this with NaNs, very large values etc. 2509 * are undefined but this seems to be the case anyway. 2510 */ 2511 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a); 2512 /* cheapie plus one with mask since the mask is minus one / zero */ 2513 return lp_build_sub(&intbld, itrunc, mask); 2514 } 2515 2516 /* round to nearest (toward zero) */ 2517 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res"); 2518 2519 return res; 2520} 2521 2522 2523/** 2524 * Combined ifloor() & fract(). 2525 * 2526 * Preferred to calling the functions separately, as it will ensure that the 2527 * strategy (floor() vs ifloor()) that results in less redundant work is used. 2528 */ 2529void 2530lp_build_ifloor_fract(struct lp_build_context *bld, 2531 LLVMValueRef a, 2532 LLVMValueRef *out_ipart, 2533 LLVMValueRef *out_fpart) 2534{ 2535 LLVMBuilderRef builder = bld->gallivm->builder; 2536 const struct lp_type type = bld->type; 2537 LLVMValueRef ipart; 2538 2539 assert(type.floating); 2540 assert(lp_check_value(type, a)); 2541 2542 if (arch_rounding_available(type)) { 2543 /* 2544 * floor() is easier. 2545 */ 2546 2547 ipart = lp_build_floor(bld, a); 2548 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 2549 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart"); 2550 } 2551 else { 2552 /* 2553 * ifloor() is easier. 2554 */ 2555 2556 *out_ipart = lp_build_ifloor(bld, a); 2557 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart"); 2558 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 2559 } 2560} 2561 2562 2563/** 2564 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is 2565 * always smaller than one. 
2566 */ 2567void 2568lp_build_ifloor_fract_safe(struct lp_build_context *bld, 2569 LLVMValueRef a, 2570 LLVMValueRef *out_ipart, 2571 LLVMValueRef *out_fpart) 2572{ 2573 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); 2574 *out_fpart = clamp_fract(bld, *out_fpart); 2575} 2576 2577 2578LLVMValueRef 2579lp_build_sqrt(struct lp_build_context *bld, 2580 LLVMValueRef a) 2581{ 2582 LLVMBuilderRef builder = bld->gallivm->builder; 2583 const struct lp_type type = bld->type; 2584 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2585 char intrinsic[32]; 2586 2587 assert(lp_check_value(type, a)); 2588 2589 assert(type.floating); 2590 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type); 2591 2592 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2593} 2594 2595 2596/** 2597 * Do one Newton-Raphson step to improve reciprocate precision: 2598 * 2599 * x_{i+1} = x_i + x_i * (1 - a * x_i) 2600 * 2601 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or 2602 * +/-Inf, giving NaN instead. Certain applications rely on this behavior, 2603 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's 2604 * halo. It would be necessary to clamp the argument to prevent this. 2605 * 2606 * See also: 2607 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division 2608 * - http://softwarecommunity.intel.com/articles/eng/1818.htm 2609 */ 2610static inline LLVMValueRef 2611lp_build_rcp_refine(struct lp_build_context *bld, 2612 LLVMValueRef a, 2613 LLVMValueRef rcp_a) 2614{ 2615 LLVMBuilderRef builder = bld->gallivm->builder; 2616 LLVMValueRef neg_a; 2617 LLVMValueRef res; 2618 2619 neg_a = LLVMBuildFNeg(builder, a, ""); 2620 res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one); 2621 res = lp_build_fmuladd(builder, res, rcp_a, rcp_a); 2622 2623 return res; 2624} 2625 2626 2627LLVMValueRef 2628lp_build_rcp(struct lp_build_context *bld, 2629 LLVMValueRef a) 2630{ 2631 LLVMBuilderRef builder = bld->gallivm->builder; 2632 const struct lp_type type = bld->type; 2633 2634 assert(lp_check_value(type, a)); 2635 2636 if(a == bld->zero) 2637 return bld->undef; 2638 if(a == bld->one) 2639 return bld->one; 2640 if(a == bld->undef) 2641 return bld->undef; 2642 2643 assert(type.floating); 2644 2645 if(LLVMIsConstant(a)) 2646 return LLVMConstFDiv(bld->one, a); 2647 2648 /* 2649 * We don't use RCPPS because: 2650 * - it only has 10bits of precision 2651 * - it doesn't even get the reciprocate of 1.0 exactly 2652 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf 2653 * - for recent processors the benefit over DIVPS is marginal, a case 2654 * dependent 2655 * 2656 * We could still use it on certain processors if benchmarks show that the 2657 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for 2658 * particular uses that require less workarounds. 
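    *
    * (If it were enabled, each lp_build_rcp_refine step roughly doubles the
    * number of correct mantissa bits of the estimate, so a single
    * Newton-Raphson iteration already gets close to full single precision.)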
2659 */ 2660 2661 if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || 2662 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){ 2663 const unsigned num_iterations = 0; 2664 LLVMValueRef res; 2665 unsigned i; 2666 const char *intrinsic = NULL; 2667 2668 if (type.length == 4) { 2669 intrinsic = "llvm.x86.sse.rcp.ps"; 2670 } 2671 else { 2672 intrinsic = "llvm.x86.avx.rcp.ps.256"; 2673 } 2674 2675 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2676 2677 for (i = 0; i < num_iterations; ++i) { 2678 res = lp_build_rcp_refine(bld, a, res); 2679 } 2680 2681 return res; 2682 } 2683 2684 return LLVMBuildFDiv(builder, bld->one, a, ""); 2685} 2686 2687 2688/** 2689 * Do one Newton-Raphson step to improve rsqrt precision: 2690 * 2691 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) 2692 * 2693 * See also Intel 64 and IA-32 Architectures Optimization Manual. 2694 */ 2695static inline LLVMValueRef 2696lp_build_rsqrt_refine(struct lp_build_context *bld, 2697 LLVMValueRef a, 2698 LLVMValueRef rsqrt_a) 2699{ 2700 LLVMBuilderRef builder = bld->gallivm->builder; 2701 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5); 2702 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0); 2703 LLVMValueRef res; 2704 2705 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, ""); 2706 res = LLVMBuildFMul(builder, a, res, ""); 2707 res = LLVMBuildFSub(builder, three, res, ""); 2708 res = LLVMBuildFMul(builder, rsqrt_a, res, ""); 2709 res = LLVMBuildFMul(builder, half, res, ""); 2710 2711 return res; 2712} 2713 2714 2715/** 2716 * Generate 1/sqrt(a). 2717 * Result is undefined for values < 0, infinity for +0. 2718 */ 2719LLVMValueRef 2720lp_build_rsqrt(struct lp_build_context *bld, 2721 LLVMValueRef a) 2722{ 2723 const struct lp_type type = bld->type; 2724 2725 assert(lp_check_value(type, a)); 2726 2727 assert(type.floating); 2728 2729 /* 2730 * This should be faster but all denormals will end up as infinity. 2731 */ 2732 if (0 && lp_build_fast_rsqrt_available(type)) { 2733 const unsigned num_iterations = 1; 2734 LLVMValueRef res; 2735 unsigned i; 2736 2737 /* rsqrt(1.0) != 1.0 here */ 2738 res = lp_build_fast_rsqrt(bld, a); 2739 2740 if (num_iterations) { 2741 /* 2742 * Newton-Raphson will result in NaN instead of infinity for zero, 2743 * and NaN instead of zero for infinity. 2744 * Also, need to ensure rsqrt(1.0) == 1.0. 2745 * All numbers smaller than FLT_MIN will result in +infinity 2746 * (rsqrtps treats all denormals as zero). 2747 */ 2748 LLVMValueRef cmp; 2749 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN); 2750 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY); 2751 2752 for (i = 0; i < num_iterations; ++i) { 2753 res = lp_build_rsqrt_refine(bld, a, res); 2754 } 2755 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min); 2756 res = lp_build_select(bld, cmp, inf, res); 2757 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf); 2758 res = lp_build_select(bld, cmp, bld->zero, res); 2759 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one); 2760 res = lp_build_select(bld, cmp, bld->one, res); 2761 } 2762 2763 return res; 2764 } 2765 2766 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2767} 2768 2769/** 2770 * If there's a fast (inaccurate) rsqrt instruction available 2771 * (caller may want to avoid to call rsqrt_fast if it's not available, 2772 * i.e. 
for calculating x^0.5 it may do rsqrt_fast(x) * x but if 2773 * unavailable it would result in sqrt/div/mul so obviously 2774 * much better to just call sqrt, skipping both div and mul). 2775 */ 2776boolean 2777lp_build_fast_rsqrt_available(struct lp_type type) 2778{ 2779 assert(type.floating); 2780 2781 if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) || 2782 (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) { 2783 return true; 2784 } 2785 return false; 2786} 2787 2788 2789/** 2790 * Generate 1/sqrt(a). 2791 * Result is undefined for values < 0, infinity for +0. 2792 * Precision is limited, only ~10 bits guaranteed 2793 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0). 2794 */ 2795LLVMValueRef 2796lp_build_fast_rsqrt(struct lp_build_context *bld, 2797 LLVMValueRef a) 2798{ 2799 LLVMBuilderRef builder = bld->gallivm->builder; 2800 const struct lp_type type = bld->type; 2801 2802 assert(lp_check_value(type, a)); 2803 2804 if (lp_build_fast_rsqrt_available(type)) { 2805 const char *intrinsic = NULL; 2806 2807 if (type.length == 4) { 2808 intrinsic = "llvm.x86.sse.rsqrt.ps"; 2809 } 2810 else { 2811 intrinsic = "llvm.x86.avx.rsqrt.ps.256"; 2812 } 2813 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2814 } 2815 else { 2816 debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__); 2817 } 2818 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2819} 2820 2821 2822/** 2823 * Generate sin(a) or cos(a) using polynomial approximation. 2824 * TODO: it might be worth recognizing sin and cos using same source 2825 * (i.e. d3d10 sincos opcode). Obviously doing both at the same time 2826 * would be way cheaper than calculating (nearly) everything twice... 2827 * Not sure it's common enough to be worth bothering however, scs 2828 * opcode could also benefit from calculating both though. 
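 *
 * The code below follows the usual cephes-style SSE approach, roughly:
 * - take |a| and scale by 4/pi to find the octant the angle falls in
 * - subtract the matching multiple of pi/2 using the split constants
 *   DP1/DP2/DP3 for extra precision (argument reduction)
 * - evaluate either the sine or the cosine minimax polynomial on the
 *   reduced argument, selected per channel with a bit mask
 * - patch the sign bit back in, and force NaN for non-finite inputs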
2829 */ 2830static LLVMValueRef 2831lp_build_sin_or_cos(struct lp_build_context *bld, 2832 LLVMValueRef a, 2833 boolean cos) 2834{ 2835 struct gallivm_state *gallivm = bld->gallivm; 2836 LLVMBuilderRef b = gallivm->builder; 2837 struct lp_type int_type = lp_int_type(bld->type); 2838 2839 /* 2840 * take the absolute value, 2841 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 2842 */ 2843 2844 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 2845 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 2846 2847 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 2848 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 2849 2850 /* 2851 * scale by 4/Pi 2852 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 2853 */ 2854 2855 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 2856 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 2857 2858 /* 2859 * store the integer part of y in mm0 2860 * emm2 = _mm_cvttps_epi32(y); 2861 */ 2862 2863 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 2864 2865 /* 2866 * j=(j+1) & (~1) (see the cephes sources) 2867 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 2868 */ 2869 2870 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 2871 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 2872 /* 2873 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 2874 */ 2875 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 2876 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 2877 2878 /* 2879 * y = _mm_cvtepi32_ps(emm2); 2880 */ 2881 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 2882 2883 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2884 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 2885 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 2886 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000); 2887 2888 /* 2889 * Argument used for poly selection and sign bit determination 2890 * is different for sin vs. cos. 2891 */ 2892 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") : 2893 emm2_and; 2894 2895 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4, 2896 LLVMBuildNot(b, emm2_2, ""), ""), 2897 const_29, "sign_bit") : 2898 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si, 2899 LLVMBuildShl(b, emm2_add, 2900 const_29, ""), ""), 2901 sign_mask, "sign_bit"); 2902 2903 /* 2904 * get the polynom selection mask 2905 * there is one polynom for 0 <= x <= Pi/4 2906 * and another one for Pi/4<x<=Pi/2 2907 * Both branches will be computed. 
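    * (Evaluating both polynomials and picking the result with a bit mask
    * keeps the code branchless, which is what we want for SIMD since
    * different channels may need different polynomials.)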
2908 * 2909 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 2910 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 2911 */ 2912 2913 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3"); 2914 LLVMValueRef poly_mask = lp_build_compare(gallivm, 2915 int_type, PIPE_FUNC_EQUAL, 2916 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 2917 2918 /* 2919 * _PS_CONST(minus_cephes_DP1, -0.78515625); 2920 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 2921 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 2922 */ 2923 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 2924 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 2925 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 2926 2927 /* 2928 * The magic pass: "Extended precision modular arithmetic" 2929 * x = ((x - y * DP1) - y * DP2) - y * DP3; 2930 */ 2931 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs); 2932 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1); 2933 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2); 2934 2935 /* 2936 * Evaluate the first polynom (0 <= x <= Pi/4) 2937 * 2938 * z = _mm_mul_ps(x,x); 2939 */ 2940 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 2941 2942 /* 2943 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 2944 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 2945 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 2946 */ 2947 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005); 2948 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 2949 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 2950 2951 /* 2952 * y = *(v4sf*)_ps_coscof_p0; 2953 * y = _mm_mul_ps(y, z); 2954 */ 2955 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1); 2956 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2); 2957 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 2958 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 2959 2960 2961 /* 2962 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 2963 * y = _mm_sub_ps(y, tmp); 2964 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 2965 */ 2966 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 2967 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 2968 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 2969 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 2970 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 2971 2972 /* 2973 * _PS_CONST(sincof_p0, -1.9515295891E-4); 2974 * _PS_CONST(sincof_p1, 8.3321608736E-3); 2975 * _PS_CONST(sincof_p2, -1.6666654611E-1); 2976 */ 2977 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 2978 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 2979 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 2980 2981 /* 2982 * Evaluate the second polynom (Pi/4 <= x <= 0) 2983 * 2984 * y2 = *(v4sf*)_ps_sincof_p0; 2985 * y2 = _mm_mul_ps(y2, z); 2986 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 2987 * y2 = _mm_mul_ps(y2, z); 2988 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 2989 * y2 = _mm_mul_ps(y2, z); 2990 * y2 = _mm_mul_ps(y2, x); 2991 * y2 = _mm_add_ps(y2, x); 2992 */ 2993 2994 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1); 2995 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2); 2996 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 2997 LLVMValueRef y2_9 = 
lp_build_fmuladd(b, y2_7, x_3, x_3); 2998 2999 /* 3000 * select the correct result from the two polynoms 3001 * xmm3 = poly_mask; 3002 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 3003 * y = _mm_andnot_ps(xmm3, y); 3004 * y = _mm_or_ps(y,y2); 3005 */ 3006 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 3007 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 3008 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 3009 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv"); 3010 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 3011 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine"); 3012 3013 /* 3014 * update the sign 3015 * y = _mm_xor_ps(y, sign_bit); 3016 */ 3017 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign"); 3018 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 3019 3020 LLVMValueRef isfinite = lp_build_isfinite(bld, a); 3021 3022 /* clamp output to be within [-1, 1] */ 3023 y_result = lp_build_clamp(bld, y_result, 3024 lp_build_const_vec(bld->gallivm, bld->type, -1.f), 3025 lp_build_const_vec(bld->gallivm, bld->type, 1.f)); 3026 /* If a is -inf, inf or NaN then return NaN */ 3027 y_result = lp_build_select(bld, isfinite, y_result, 3028 lp_build_const_vec(bld->gallivm, bld->type, NAN)); 3029 return y_result; 3030} 3031 3032 3033/** 3034 * Generate sin(a) 3035 */ 3036LLVMValueRef 3037lp_build_sin(struct lp_build_context *bld, 3038 LLVMValueRef a) 3039{ 3040 const struct lp_type type = bld->type; 3041 3042 if (type.width == 16) { 3043 LLVMBuilderRef builder = bld->gallivm->builder; 3044 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3045 char intrinsic[32]; 3046 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type); 3047 LLVMValueRef args[] = { a }; 3048 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); 3049 } 3050 3051 return lp_build_sin_or_cos(bld, a, FALSE); 3052} 3053 3054 3055/** 3056 * Generate cos(a) 3057 */ 3058LLVMValueRef 3059lp_build_cos(struct lp_build_context *bld, 3060 LLVMValueRef a) 3061{ 3062 const struct lp_type type = bld->type; 3063 3064 if (type.width == 16) { 3065 LLVMBuilderRef builder = bld->gallivm->builder; 3066 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3067 char intrinsic[32]; 3068 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type); 3069 LLVMValueRef args[] = { a }; 3070 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); 3071 } 3072 3073 return lp_build_sin_or_cos(bld, a, TRUE); 3074} 3075 3076 3077/** 3078 * Generate pow(x, y) 3079 */ 3080LLVMValueRef 3081lp_build_pow(struct lp_build_context *bld, 3082 LLVMValueRef x, 3083 LLVMValueRef y) 3084{ 3085 /* TODO: optimize the constant case */ 3086 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3087 LLVMIsConstant(x) && LLVMIsConstant(y)) { 3088 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3089 __FUNCTION__); 3090 } 3091 3092 LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f)); 3093 LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y)); 3094 3095 res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res); 3096 return res; 3097} 3098 3099 3100/** 3101 * Generate exp(x) 3102 */ 3103LLVMValueRef 3104lp_build_exp(struct lp_build_context *bld, 3105 LLVMValueRef x) 3106{ 3107 /* log2(e) = 1/log(2) */ 3108 LLVMValueRef log2e 
= lp_build_const_vec(bld->gallivm, bld->type, 3109 1.4426950408889634); 3110 3111 assert(lp_check_value(bld->type, x)); 3112 3113 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 3114} 3115 3116 3117/** 3118 * Generate log(x) 3119 * Behavior is undefined with infs, 0s and nans 3120 */ 3121LLVMValueRef 3122lp_build_log(struct lp_build_context *bld, 3123 LLVMValueRef x) 3124{ 3125 /* log(2) */ 3126 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3127 0.69314718055994529); 3128 3129 assert(lp_check_value(bld->type, x)); 3130 3131 return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 3132} 3133 3134/** 3135 * Generate log(x) that handles edge cases (infs, 0s and nans) 3136 */ 3137LLVMValueRef 3138lp_build_log_safe(struct lp_build_context *bld, 3139 LLVMValueRef x) 3140{ 3141 /* log(2) */ 3142 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3143 0.69314718055994529); 3144 3145 assert(lp_check_value(bld->type, x)); 3146 3147 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x)); 3148} 3149 3150 3151/** 3152 * Generate polynomial. 3153 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 3154 */ 3155LLVMValueRef 3156lp_build_polynomial(struct lp_build_context *bld, 3157 LLVMValueRef x, 3158 const double *coeffs, 3159 unsigned num_coeffs) 3160{ 3161 const struct lp_type type = bld->type; 3162 LLVMValueRef even = NULL, odd = NULL; 3163 LLVMValueRef x2; 3164 unsigned i; 3165 3166 assert(lp_check_value(bld->type, x)); 3167 3168 /* TODO: optimize the constant case */ 3169 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3170 LLVMIsConstant(x)) { 3171 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3172 __FUNCTION__); 3173 } 3174 3175 /* 3176 * Calculate odd and even terms seperately to decrease data dependency 3177 * Ex: 3178 * c[0] + x^2 * c[2] + x^4 * c[4] ... 3179 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ... 
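    *
    * Each half is evaluated as a Horner chain in x^2 using fused
    * multiply-adds, and the two halves are then combined as even + x * odd.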
3180 */ 3181 x2 = lp_build_mul(bld, x, x); 3182 3183 for (i = num_coeffs; i--; ) { 3184 LLVMValueRef coeff; 3185 3186 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 3187 3188 if (i % 2 == 0) { 3189 if (even) 3190 even = lp_build_mad(bld, x2, even, coeff); 3191 else 3192 even = coeff; 3193 } else { 3194 if (odd) 3195 odd = lp_build_mad(bld, x2, odd, coeff); 3196 else 3197 odd = coeff; 3198 } 3199 } 3200 3201 if (odd) 3202 return lp_build_mad(bld, odd, x, even); 3203 else if (even) 3204 return even; 3205 else 3206 return bld->undef; 3207} 3208 3209 3210/** 3211 * Minimax polynomial fit of 2**x, in range [0, 1[ 3212 */ 3213const double lp_build_exp2_polynomial[] = { 3214#if EXP_POLY_DEGREE == 5 3215 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */ 3216 0.693153073200168932794, 3217 0.240153617044375388211, 3218 0.0558263180532956664775, 3219 0.00898934009049466391101, 3220 0.00187757667519147912699 3221#elif EXP_POLY_DEGREE == 4 3222 1.00000259337069434683, 3223 0.693003834469974940458, 3224 0.24144275689150793076, 3225 0.0520114606103070150235, 3226 0.0135341679161270268764 3227#elif EXP_POLY_DEGREE == 3 3228 0.999925218562710312959, 3229 0.695833540494823811697, 3230 0.226067155427249155588, 3231 0.0780245226406372992967 3232#elif EXP_POLY_DEGREE == 2 3233 1.00172476321474503578, 3234 0.657636275736077639316, 3235 0.33718943461968720704 3236#else 3237#error 3238#endif 3239}; 3240 3241 3242LLVMValueRef 3243lp_build_exp2(struct lp_build_context *bld, 3244 LLVMValueRef x) 3245{ 3246 LLVMBuilderRef builder = bld->gallivm->builder; 3247 const struct lp_type type = bld->type; 3248 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3249 LLVMValueRef ipart = NULL; 3250 LLVMValueRef fpart = NULL; 3251 LLVMValueRef expipart = NULL; 3252 LLVMValueRef expfpart = NULL; 3253 LLVMValueRef res = NULL; 3254 3255 if (type.floating && type.width == 16) { 3256 char intrinsic[32]; 3257 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type); 3258 LLVMValueRef args[] = { x }; 3259 return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0); 3260 } 3261 3262 assert(lp_check_value(bld->type, x)); 3263 3264 /* TODO: optimize the constant case */ 3265 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3266 LLVMIsConstant(x)) { 3267 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3268 __FUNCTION__); 3269 } 3270 3271 assert(type.floating && type.width == 32); 3272 3273 /* We want to preserve NaN and make sure than for exp2 if x > 128, 3274 * the result is INF and if it's smaller than -126.9 the result is 0 */ 3275 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x, 3276 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3277 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), 3278 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3279 3280 /* ipart = floor(x) */ 3281 /* fpart = x - ipart */ 3282 lp_build_ifloor_fract(bld, x, &ipart, &fpart); 3283 3284 /* expipart = (float) (1 << ipart) */ 3285 expipart = LLVMBuildAdd(builder, ipart, 3286 lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3287 expipart = LLVMBuildShl(builder, expipart, 3288 lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3289 expipart = LLVMBuildBitCast(builder, expipart, vec_type, ""); 3290 3291 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 3292 ARRAY_SIZE(lp_build_exp2_polynomial)); 3293 3294 res = LLVMBuildFMul(builder, expipart, expfpart, ""); 3295 3296 return res; 3297} 3298 3299 3300 3301/** 3302 * 
Extract the exponent of a IEEE-754 floating point value. 3303 * 3304 * Optionally apply an integer bias. 3305 * 3306 * Result is an integer value with 3307 * 3308 * ifloor(log2(x)) + bias 3309 */ 3310LLVMValueRef 3311lp_build_extract_exponent(struct lp_build_context *bld, 3312 LLVMValueRef x, 3313 int bias) 3314{ 3315 LLVMBuilderRef builder = bld->gallivm->builder; 3316 const struct lp_type type = bld->type; 3317 unsigned mantissa = lp_mantissa(type); 3318 LLVMValueRef res; 3319 3320 assert(type.floating); 3321 3322 assert(lp_check_value(bld->type, x)); 3323 3324 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3325 3326 res = LLVMBuildLShr(builder, x, 3327 lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 3328 res = LLVMBuildAnd(builder, res, 3329 lp_build_const_int_vec(bld->gallivm, type, 255), ""); 3330 res = LLVMBuildSub(builder, res, 3331 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 3332 3333 return res; 3334} 3335 3336 3337/** 3338 * Extract the mantissa of the a floating. 3339 * 3340 * Result is a floating point value with 3341 * 3342 * x / floor(log2(x)) 3343 */ 3344LLVMValueRef 3345lp_build_extract_mantissa(struct lp_build_context *bld, 3346 LLVMValueRef x) 3347{ 3348 LLVMBuilderRef builder = bld->gallivm->builder; 3349 const struct lp_type type = bld->type; 3350 unsigned mantissa = lp_mantissa(type); 3351 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 3352 (1ULL << mantissa) - 1); 3353 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 3354 LLVMValueRef res; 3355 3356 assert(lp_check_value(bld->type, x)); 3357 3358 assert(type.floating); 3359 3360 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3361 3362 /* res = x / 2**ipart */ 3363 res = LLVMBuildAnd(builder, x, mantmask, ""); 3364 res = LLVMBuildOr(builder, res, one, ""); 3365 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 3366 3367 return res; 3368} 3369 3370 3371 3372/** 3373 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[ 3374 * These coefficients can be generate with 3375 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 3376 */ 3377const double lp_build_log2_polynomial[] = { 3378#if LOG_POLY_DEGREE == 5 3379 2.88539008148777786488L, 3380 0.961796878841293367824L, 3381 0.577058946784739859012L, 3382 0.412914355135828735411L, 3383 0.308591899232910175289L, 3384 0.352376952300281371868L, 3385#elif LOG_POLY_DEGREE == 4 3386 2.88539009343309178325L, 3387 0.961791550404184197881L, 3388 0.577440339438736392009L, 3389 0.403343858251329912514L, 3390 0.406718052498846252698L, 3391#elif LOG_POLY_DEGREE == 3 3392 2.88538959748872753838L, 3393 0.961932915889597772928L, 3394 0.571118517972136195241L, 3395 0.493997535084709500285L, 3396#else 3397#error 3398#endif 3399}; 3400 3401/** 3402 * See http://www.devmaster.net/forums/showthread.php?p=43580 3403 * http://en.wikipedia.org/wiki/Logarithm#Calculation 3404 * http://www.nezumi.demon.co.uk/consult/logx.htm 3405 * 3406 * If handle_edge_cases is true the function will perform computations 3407 * to match the required D3D10+ behavior for each of the edge cases. 
3408 * That means that if input is: 3409 * - less than zero (to and including -inf) then NaN will be returned 3410 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned 3411 * - +infinity, then +infinity will be returned 3412 * - NaN, then NaN will be returned 3413 * 3414 * Those checks are fairly expensive so if you don't need them make sure 3415 * handle_edge_cases is false. 3416 */ 3417void 3418lp_build_log2_approx(struct lp_build_context *bld, 3419 LLVMValueRef x, 3420 LLVMValueRef *p_exp, 3421 LLVMValueRef *p_floor_log2, 3422 LLVMValueRef *p_log2, 3423 boolean handle_edge_cases) 3424{ 3425 LLVMBuilderRef builder = bld->gallivm->builder; 3426 const struct lp_type type = bld->type; 3427 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3428 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 3429 3430 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000); 3431 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff); 3432 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 3433 3434 LLVMValueRef i = NULL; 3435 LLVMValueRef y = NULL; 3436 LLVMValueRef z = NULL; 3437 LLVMValueRef exp = NULL; 3438 LLVMValueRef mant = NULL; 3439 LLVMValueRef logexp = NULL; 3440 LLVMValueRef p_z = NULL; 3441 LLVMValueRef res = NULL; 3442 3443 if (bld->type.width == 16) { 3444 char intrinsic[32]; 3445 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type); 3446 LLVMValueRef args[] = { x }; 3447 if (p_log2) 3448 *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0); 3449 return; 3450 } 3451 3452 assert(lp_check_value(bld->type, x)); 3453 3454 if(p_exp || p_floor_log2 || p_log2) { 3455 /* TODO: optimize the constant case */ 3456 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3457 LLVMIsConstant(x)) { 3458 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3459 __FUNCTION__); 3460 } 3461 3462 assert(type.floating && type.width == 32); 3463 3464 /* 3465 * We don't explicitly handle denormalized numbers. They will yield a 3466 * result in the neighbourhood of -127, which appears to be adequate 3467 * enough. 
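       *
       * The decomposition here writes x = 2^exp * mant with mant in [1, 2),
       * so log2(x) = exp + log2(mant). log2(mant) is approximated via
       * y = (mant - 1) / (mant + 1) and the minimax polynomial P, using
       * log2(mant) ~= y * P(y^2) (an atanh-style expansion), which is what
       * the "y * P(z) + logexp" step below computes.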
3468 */ 3469 3470 i = LLVMBuildBitCast(builder, x, int_vec_type, ""); 3471 3472 /* exp = (float) exponent(x) */ 3473 exp = LLVMBuildAnd(builder, i, expmask, ""); 3474 } 3475 3476 if(p_floor_log2 || p_log2) { 3477 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3478 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3479 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, ""); 3480 } 3481 3482 if (p_log2) { 3483 /* mant = 1 + (float) mantissa(x) */ 3484 mant = LLVMBuildAnd(builder, i, mantmask, ""); 3485 mant = LLVMBuildOr(builder, mant, one, ""); 3486 mant = LLVMBuildBitCast(builder, mant, vec_type, ""); 3487 3488 /* y = (mant - 1) / (mant + 1) */ 3489 y = lp_build_div(bld, 3490 lp_build_sub(bld, mant, bld->one), 3491 lp_build_add(bld, mant, bld->one) 3492 ); 3493 3494 /* z = y^2 */ 3495 z = lp_build_mul(bld, y, y); 3496 3497 /* compute P(z) */ 3498 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial, 3499 ARRAY_SIZE(lp_build_log2_polynomial)); 3500 3501 /* y * P(z) + logexp */ 3502 res = lp_build_mad(bld, y, p_z, logexp); 3503 3504 if (type.floating && handle_edge_cases) { 3505 LLVMValueRef negmask, infmask, zmask; 3506 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x, 3507 lp_build_const_vec(bld->gallivm, type, 0.0f)); 3508 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, 3509 lp_build_const_vec(bld->gallivm, type, 0.0f)); 3510 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x, 3511 lp_build_const_vec(bld->gallivm, type, INFINITY)); 3512 3513 /* If x is qual to inf make sure we return inf */ 3514 res = lp_build_select(bld, infmask, 3515 lp_build_const_vec(bld->gallivm, type, INFINITY), 3516 res); 3517 /* If x is qual to 0, return -inf */ 3518 res = lp_build_select(bld, zmask, 3519 lp_build_const_vec(bld->gallivm, type, -INFINITY), 3520 res); 3521 /* If x is nan or less than 0, return nan */ 3522 res = lp_build_select(bld, negmask, 3523 lp_build_const_vec(bld->gallivm, type, NAN), 3524 res); 3525 } 3526 } 3527 3528 if (p_exp) { 3529 exp = LLVMBuildBitCast(builder, exp, vec_type, ""); 3530 *p_exp = exp; 3531 } 3532 3533 if (p_floor_log2) 3534 *p_floor_log2 = logexp; 3535 3536 if (p_log2) 3537 *p_log2 = res; 3538} 3539 3540 3541/* 3542 * log2 implementation which doesn't have special code to 3543 * handle edge cases (-inf, 0, inf, NaN). It's faster but 3544 * the results for those cases are undefined. 3545 */ 3546LLVMValueRef 3547lp_build_log2(struct lp_build_context *bld, 3548 LLVMValueRef x) 3549{ 3550 LLVMValueRef res; 3551 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE); 3552 return res; 3553} 3554 3555/* 3556 * Version of log2 which handles all edge cases. 3557 * Look at documentation of lp_build_log2_approx for 3558 * description of the behavior for each of the edge cases. 3559 */ 3560LLVMValueRef 3561lp_build_log2_safe(struct lp_build_context *bld, 3562 LLVMValueRef x) 3563{ 3564 LLVMValueRef res; 3565 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE); 3566 return res; 3567} 3568 3569 3570/** 3571 * Faster (and less accurate) log2. 3572 * 3573 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) 3574 * 3575 * Piece-wise linear approximation, with exact results when x is a 3576 * power of two. 
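 *
 * E.g. for x = 3: floor(log2(x)) = 1, so this yields 1 - 1 + 3/2 = 1.5,
 * against the exact log2(3) ~= 1.585.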
3577 * 3578 * See http://www.flipcode.com/archives/Fast_log_Function.shtml 3579 */ 3580LLVMValueRef 3581lp_build_fast_log2(struct lp_build_context *bld, 3582 LLVMValueRef x) 3583{ 3584 LLVMBuilderRef builder = bld->gallivm->builder; 3585 LLVMValueRef ipart; 3586 LLVMValueRef fpart; 3587 3588 assert(lp_check_value(bld->type, x)); 3589 3590 assert(bld->type.floating); 3591 3592 /* ipart = floor(log2(x)) - 1 */ 3593 ipart = lp_build_extract_exponent(bld, x, -1); 3594 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 3595 3596 /* fpart = x / 2**ipart */ 3597 fpart = lp_build_extract_mantissa(bld, x); 3598 3599 /* ipart + fpart */ 3600 return LLVMBuildFAdd(builder, ipart, fpart, ""); 3601} 3602 3603 3604/** 3605 * Fast implementation of iround(log2(x)). 3606 * 3607 * Not an approximation -- it should give accurate results all the time. 3608 */ 3609LLVMValueRef 3610lp_build_ilog2(struct lp_build_context *bld, 3611 LLVMValueRef x) 3612{ 3613 LLVMBuilderRef builder = bld->gallivm->builder; 3614 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 3615 LLVMValueRef ipart; 3616 3617 assert(bld->type.floating); 3618 3619 assert(lp_check_value(bld->type, x)); 3620 3621 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 3622 x = LLVMBuildFMul(builder, x, sqrt2, ""); 3623 3624 /* ipart = floor(log2(x) + 0.5) */ 3625 ipart = lp_build_extract_exponent(bld, x, 0); 3626 3627 return ipart; 3628} 3629 3630LLVMValueRef 3631lp_build_mod(struct lp_build_context *bld, 3632 LLVMValueRef x, 3633 LLVMValueRef y) 3634{ 3635 LLVMBuilderRef builder = bld->gallivm->builder; 3636 LLVMValueRef res; 3637 const struct lp_type type = bld->type; 3638 3639 assert(lp_check_value(type, x)); 3640 assert(lp_check_value(type, y)); 3641 3642 if (type.floating) 3643 res = LLVMBuildFRem(builder, x, y, ""); 3644 else if (type.sign) 3645 res = LLVMBuildSRem(builder, x, y, ""); 3646 else 3647 res = LLVMBuildURem(builder, x, y, ""); 3648 return res; 3649} 3650 3651 3652/* 3653 * For floating inputs it creates and returns a mask 3654 * which is all 1's for channels which are NaN. 3655 * Channels inside x which are not NaN will be 0. 
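 * This relies on NaN being the only value for which an ordered x == x
 * comparison fails, i.e. the mask is effectively sext(!(x == x)).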
3656 */ 3657LLVMValueRef 3658lp_build_isnan(struct lp_build_context *bld, 3659 LLVMValueRef x) 3660{ 3661 LLVMValueRef mask; 3662 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type); 3663 3664 assert(bld->type.floating); 3665 assert(lp_check_value(bld->type, x)); 3666 3667 mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x, 3668 "isnotnan"); 3669 mask = LLVMBuildNot(bld->gallivm->builder, mask, ""); 3670 mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan"); 3671 return mask; 3672} 3673 3674/* Returns all 1's for floating point numbers that are 3675 * finite numbers and returns all zeros for -inf, 3676 * inf and nan's */ 3677LLVMValueRef 3678lp_build_isfinite(struct lp_build_context *bld, 3679 LLVMValueRef x) 3680{ 3681 LLVMBuilderRef builder = bld->gallivm->builder; 3682 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type); 3683 struct lp_type int_type = lp_int_type(bld->type); 3684 LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, ""); 3685 LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type, 3686 0x7f800000); 3687 3688 if (!bld->type.floating) { 3689 return lp_build_const_int_vec(bld->gallivm, bld->type, 0); 3690 } 3691 assert(bld->type.floating); 3692 assert(lp_check_value(bld->type, x)); 3693 assert(bld->type.width == 32); 3694 3695 intx = LLVMBuildAnd(builder, intx, infornan32, ""); 3696 return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL, 3697 intx, infornan32); 3698} 3699 3700/* 3701 * Returns true if the number is nan or inf and false otherwise. 3702 * The input has to be a floating point vector. 3703 */ 3704LLVMValueRef 3705lp_build_is_inf_or_nan(struct gallivm_state *gallivm, 3706 const struct lp_type type, 3707 LLVMValueRef x) 3708{ 3709 LLVMBuilderRef builder = gallivm->builder; 3710 struct lp_type int_type = lp_int_type(type); 3711 LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type, 3712 0x7f800000); 3713 LLVMValueRef ret; 3714 3715 assert(type.floating); 3716 3717 ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), ""); 3718 ret = LLVMBuildAnd(builder, ret, const0, ""); 3719 ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL, 3720 ret, const0); 3721 3722 return ret; 3723} 3724 3725 3726LLVMValueRef 3727lp_build_fpstate_get(struct gallivm_state *gallivm) 3728{ 3729 if (util_get_cpu_caps()->has_sse) { 3730 LLVMBuilderRef builder = gallivm->builder; 3731 LLVMValueRef mxcsr_ptr = lp_build_alloca( 3732 gallivm, 3733 LLVMInt32TypeInContext(gallivm->context), 3734 "mxcsr_ptr"); 3735 LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr, 3736 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); 3737 lp_build_intrinsic(builder, 3738 "llvm.x86.sse.stmxcsr", 3739 LLVMVoidTypeInContext(gallivm->context), 3740 &mxcsr_ptr8, 1, 0); 3741 return mxcsr_ptr; 3742 } 3743 return 0; 3744} 3745 3746void 3747lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm, 3748 boolean zero) 3749{ 3750 if (util_get_cpu_caps()->has_sse) { 3751 /* turn on DAZ (64) | FTZ (32768) = 32832 if available */ 3752 int daz_ftz = _MM_FLUSH_ZERO_MASK; 3753 3754 LLVMBuilderRef builder = gallivm->builder; 3755 LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm); 3756 LLVMValueRef mxcsr = 3757 LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr"); 3758 3759 if (util_get_cpu_caps()->has_daz) { 3760 /* Enable denormals are zero mode */ 3761 daz_ftz |= _MM_DENORMALS_ZERO_MASK; 3762 } 3763 if (zero) { 3764 mxcsr = LLVMBuildOr(builder, mxcsr, 
3765 LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), ""); 3766 } else { 3767 mxcsr = LLVMBuildAnd(builder, mxcsr, 3768 LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), ""); 3769 } 3770 3771 LLVMBuildStore(builder, mxcsr, mxcsr_ptr); 3772 lp_build_fpstate_set(gallivm, mxcsr_ptr); 3773 } 3774} 3775 3776void 3777lp_build_fpstate_set(struct gallivm_state *gallivm, 3778 LLVMValueRef mxcsr_ptr) 3779{ 3780 if (util_get_cpu_caps()->has_sse) { 3781 LLVMBuilderRef builder = gallivm->builder; 3782 mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr, 3783 LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), ""); 3784 lp_build_intrinsic(builder, 3785 "llvm.x86.sse.ldmxcsr", 3786 LLVMVoidTypeInContext(gallivm->context), 3787 &mxcsr_ptr, 1, 0); 3788 } 3789} 3790
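
/*
 * Illustrative sketch (not compiled): how the three helpers above are meant
 * to be combined to run a stretch of generated code with denormals flushed
 * to zero and then restore the caller's FP state.  The shader_body() call is
 * a placeholder for whatever IR the caller wants to emit.
 */
#if 0
static void
emit_with_denorms_flushed(struct gallivm_state *gallivm)
{
   /* save the current MXCSR so we can restore it afterwards */
   LLVMValueRef saved_fpstate = lp_build_fpstate_get(gallivm);

   /* turn on FTZ (and DAZ when available) while the body runs */
   lp_build_fpstate_set_denorms_zero(gallivm, TRUE);

   shader_body(gallivm);   /* placeholder: emit the actual code here */

   /* put the original control/status word back */
   lp_build_fpstate_set(gallivm, saved_fpstate);
}
#endif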