/**************************************************************************
 *
 * Copyright 2009-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper arithmetic functions.
 *
 * LLVM IR doesn't support all basic arithmetic operations we care about (most
 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
 * these implementation details from the other modules.
 *
 * We also do simple expression simplification here. The reasons are:
 * - it is very easy given we have all necessary information readily available
 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in [0, 1] range.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */


#include <float.h>

#include "util/u_memory.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/u_cpu_detect.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_logic.h"
#include "lp_bld_pack.h"
#include "lp_bld_debug.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_arit.h"
#include "lp_bld_flow.h"

#if defined(PIPE_ARCH_SSE)
#include <xmmintrin.h>
#endif

#ifndef _MM_DENORMALS_ZERO_MASK
#define _MM_DENORMALS_ZERO_MASK 0x0040
#endif

#ifndef _MM_FLUSH_ZERO_MASK
#define _MM_FLUSH_ZERO_MASK 0x8000
#endif

#define EXP_POLY_DEGREE 5

#define LOG_POLY_DEGREE 4


/**
 * Generate min(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
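 *
 * For reference, a plain scalar equivalent (illustration only, not the
 * generated code) is:
 *
 *   min(a, b) = (a < b) ? a : b
 *
 * Any ordered comparison involving a NaN is false, so this naive form
 * returns b whenever either operand is NaN; the nan_behavior modes exist
 * to give better-defined results in that case.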
88 */ 89static LLVMValueRef 90lp_build_min_simple(struct lp_build_context *bld, 91 LLVMValueRef a, 92 LLVMValueRef b, 93 enum gallivm_nan_behavior nan_behavior) 94{ 95 const struct lp_type type = bld->type; 96 const char *intrinsic = NULL; 97 unsigned intr_size = 0; 98 LLVMValueRef cond; 99 100 assert(lp_check_value(type, a)); 101 assert(lp_check_value(type, b)); 102 103 /* TODO: optimize the constant case */ 104 105 if (type.floating && util_cpu_caps.has_sse) { 106 if (type.width == 32) { 107 if (type.length == 1) { 108 intrinsic = "llvm.x86.sse.min.ss"; 109 intr_size = 128; 110 } 111 else if (type.length <= 4 || !util_cpu_caps.has_avx) { 112 intrinsic = "llvm.x86.sse.min.ps"; 113 intr_size = 128; 114 } 115 else { 116 intrinsic = "llvm.x86.avx.min.ps.256"; 117 intr_size = 256; 118 } 119 } 120 if (type.width == 64 && util_cpu_caps.has_sse2) { 121 if (type.length == 1) { 122 intrinsic = "llvm.x86.sse2.min.sd"; 123 intr_size = 128; 124 } 125 else if (type.length == 2 || !util_cpu_caps.has_avx) { 126 intrinsic = "llvm.x86.sse2.min.pd"; 127 intr_size = 128; 128 } 129 else { 130 intrinsic = "llvm.x86.avx.min.pd.256"; 131 intr_size = 256; 132 } 133 } 134 } 135 else if (type.floating && util_cpu_caps.has_altivec) { 136 if (nan_behavior == GALLIVM_NAN_RETURN_NAN || 137 nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 138 debug_printf("%s: altivec doesn't support nan return nan behavior\n", 139 __FUNCTION__); 140 } 141 if (type.width == 32 && type.length == 4) { 142 intrinsic = "llvm.ppc.altivec.vminfp"; 143 intr_size = 128; 144 } 145 } else if (HAVE_LLVM < 0x0309 && 146 util_cpu_caps.has_avx2 && type.length > 4) { 147 intr_size = 256; 148 switch (type.width) { 149 case 8: 150 intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b"; 151 break; 152 case 16: 153 intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w"; 154 break; 155 case 32: 156 intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d"; 157 break; 158 } 159 } else if (HAVE_LLVM < 0x0309 && 160 util_cpu_caps.has_sse2 && type.length >= 2) { 161 intr_size = 128; 162 if ((type.width == 8 || type.width == 16) && 163 (type.width * type.length <= 64) && 164 (gallivm_debug & GALLIVM_DEBUG_PERF)) { 165 debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 166 __FUNCTION__); 167 } 168 if (type.width == 8 && !type.sign) { 169 intrinsic = "llvm.x86.sse2.pminu.b"; 170 } 171 else if (type.width == 16 && type.sign) { 172 intrinsic = "llvm.x86.sse2.pmins.w"; 173 } 174 if (util_cpu_caps.has_sse4_1) { 175 if (type.width == 8 && type.sign) { 176 intrinsic = "llvm.x86.sse41.pminsb"; 177 } 178 if (type.width == 16 && !type.sign) { 179 intrinsic = "llvm.x86.sse41.pminuw"; 180 } 181 if (type.width == 32 && !type.sign) { 182 intrinsic = "llvm.x86.sse41.pminud"; 183 } 184 if (type.width == 32 && type.sign) { 185 intrinsic = "llvm.x86.sse41.pminsd"; 186 } 187 } 188 } else if (util_cpu_caps.has_altivec) { 189 intr_size = 128; 190 if (type.width == 8) { 191 if (!type.sign) { 192 intrinsic = "llvm.ppc.altivec.vminub"; 193 } else { 194 intrinsic = "llvm.ppc.altivec.vminsb"; 195 } 196 } else if (type.width == 16) { 197 if (!type.sign) { 198 intrinsic = "llvm.ppc.altivec.vminuh"; 199 } else { 200 intrinsic = "llvm.ppc.altivec.vminsh"; 201 } 202 } else if (type.width == 32) { 203 if (!type.sign) { 204 intrinsic = "llvm.ppc.altivec.vminuw"; 205 } else { 206 intrinsic = "llvm.ppc.altivec.vminsw"; 207 } 208 } 209 } 210 211 if (intrinsic) { 212 /* We need to handle nan's for floating point numbers. 
If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those.
       */
      if (util_cpu_caps.has_sse && type.floating &&
          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         LLVMValueRef isnan, min;
         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                   type,
                                                   intr_size, a, b);
         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
            isnan = lp_build_isnan(bld, b);
            return lp_build_select(bld, isnan, a, min);
         } else {
            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
            isnan = lp_build_isnan(bld, a);
            return lp_build_select(bld, isnan, a, min);
         }
      } else {
         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
                                                    type,
                                                    intr_size, a, b);
      }
   }

   if (type.floating) {
      switch (nan_behavior) {
      case GALLIVM_NAN_RETURN_NAN: {
         LLVMValueRef isnan = lp_build_isnan(bld, b);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER: {
         LLVMValueRef isnan = lp_build_isnan(bld, a);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
         return lp_build_select(bld, cond, a, b);
      }
         break;
      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
         return lp_build_select(bld, cond, b, a);
      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
         break;
      default:
         assert(0);
         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
         return lp_build_select(bld, cond, a, b);
      }
   } else {
      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
      return lp_build_select(bld, cond, a, b);
   }
}


LLVMValueRef
lp_build_fmuladd(LLVMBuilderRef builder,
                 LLVMValueRef a,
                 LLVMValueRef b,
                 LLVMValueRef c)
{
   LLVMTypeRef type = LLVMTypeOf(a);
   assert(type == LLVMTypeOf(b));
   assert(type == LLVMTypeOf(c));
   if (HAVE_LLVM < 0x0304) {
      /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA
       * is not supported, and instead it falls back to a C function.
       */
      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
   }
   char intrinsic[32];
   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
   LLVMValueRef args[] = { a, b, c };
   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
}


/**
 * Generate max(a, b)
 * No checks for special case values of a or b = 1 or 0 are done.
 * NaNs are handled according to the behavior specified by the
 * nan_behavior argument.
 */
static LLVMValueRef
lp_build_max_simple(struct lp_build_context *bld,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    enum gallivm_nan_behavior nan_behavior)
{
   const struct lp_type type = bld->type;
   const char *intrinsic = NULL;
   unsigned intr_size = 0;
   LLVMValueRef cond;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* TODO: optimize the constant case */

   if (type.floating && util_cpu_caps.has_sse) {
      if (type.width == 32) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse.max.ss";
            intr_size = 128;
         }
         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse.max.ps";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.ps.256";
            intr_size = 256;
         }
      }
      if (type.width == 64 && util_cpu_caps.has_sse2) {
         if (type.length == 1) {
            intrinsic = "llvm.x86.sse2.max.sd";
            intr_size = 128;
         }
         else if (type.length == 2 || !util_cpu_caps.has_avx) {
            intrinsic = "llvm.x86.sse2.max.pd";
            intr_size = 128;
         }
         else {
            intrinsic = "llvm.x86.avx.max.pd.256";
            intr_size = 256;
         }
      }
   }
   else if (type.floating && util_cpu_caps.has_altivec) {
      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
                      __FUNCTION__);
      }
      if (type.width == 32 && type.length == 4) {
         intrinsic = "llvm.ppc.altivec.vmaxfp";
         intr_size = 128;
      }
   } else if (HAVE_LLVM < 0x0309 &&
              util_cpu_caps.has_avx2 && type.length > 4) {
      intr_size = 256;
      switch (type.width) {
      case 8:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
         break;
      case 16:
         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
         break;
      case 32:
         intrinsic = type.sign ?
"llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d"; 375 break; 376 } 377 } else if (HAVE_LLVM < 0x0309 && 378 util_cpu_caps.has_sse2 && type.length >= 2) { 379 intr_size = 128; 380 if ((type.width == 8 || type.width == 16) && 381 (type.width * type.length <= 64) && 382 (gallivm_debug & GALLIVM_DEBUG_PERF)) { 383 debug_printf("%s: inefficient code, bogus shuffle due to packing\n", 384 __FUNCTION__); 385 } 386 if (type.width == 8 && !type.sign) { 387 intrinsic = "llvm.x86.sse2.pmaxu.b"; 388 intr_size = 128; 389 } 390 else if (type.width == 16 && type.sign) { 391 intrinsic = "llvm.x86.sse2.pmaxs.w"; 392 } 393 if (util_cpu_caps.has_sse4_1) { 394 if (type.width == 8 && type.sign) { 395 intrinsic = "llvm.x86.sse41.pmaxsb"; 396 } 397 if (type.width == 16 && !type.sign) { 398 intrinsic = "llvm.x86.sse41.pmaxuw"; 399 } 400 if (type.width == 32 && !type.sign) { 401 intrinsic = "llvm.x86.sse41.pmaxud"; 402 } 403 if (type.width == 32 && type.sign) { 404 intrinsic = "llvm.x86.sse41.pmaxsd"; 405 } 406 } 407 } else if (util_cpu_caps.has_altivec) { 408 intr_size = 128; 409 if (type.width == 8) { 410 if (!type.sign) { 411 intrinsic = "llvm.ppc.altivec.vmaxub"; 412 } else { 413 intrinsic = "llvm.ppc.altivec.vmaxsb"; 414 } 415 } else if (type.width == 16) { 416 if (!type.sign) { 417 intrinsic = "llvm.ppc.altivec.vmaxuh"; 418 } else { 419 intrinsic = "llvm.ppc.altivec.vmaxsh"; 420 } 421 } else if (type.width == 32) { 422 if (!type.sign) { 423 intrinsic = "llvm.ppc.altivec.vmaxuw"; 424 } else { 425 intrinsic = "llvm.ppc.altivec.vmaxsw"; 426 } 427 } 428 } 429 430 if (intrinsic) { 431 if (util_cpu_caps.has_sse && type.floating && 432 nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED && 433 nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN && 434 nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) { 435 LLVMValueRef isnan, max; 436 max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 437 type, 438 intr_size, a, b); 439 if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) { 440 isnan = lp_build_isnan(bld, b); 441 return lp_build_select(bld, isnan, a, max); 442 } else { 443 assert(nan_behavior == GALLIVM_NAN_RETURN_NAN); 444 isnan = lp_build_isnan(bld, a); 445 return lp_build_select(bld, isnan, a, max); 446 } 447 } else { 448 return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic, 449 type, 450 intr_size, a, b); 451 } 452 } 453 454 if (type.floating) { 455 switch (nan_behavior) { 456 case GALLIVM_NAN_RETURN_NAN: { 457 LLVMValueRef isnan = lp_build_isnan(bld, b); 458 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 459 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 460 return lp_build_select(bld, cond, a, b); 461 } 462 break; 463 case GALLIVM_NAN_RETURN_OTHER: { 464 LLVMValueRef isnan = lp_build_isnan(bld, a); 465 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 466 cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, ""); 467 return lp_build_select(bld, cond, a, b); 468 } 469 break; 470 case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN: 471 cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b); 472 return lp_build_select(bld, cond, a, b); 473 case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN: 474 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a); 475 return lp_build_select(bld, cond, b, a); 476 case GALLIVM_NAN_BEHAVIOR_UNDEFINED: 477 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 478 return lp_build_select(bld, cond, a, b); 479 break; 480 default: 481 assert(0); 482 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 483 return lp_build_select(bld, cond, a, b); 484 } 485 } else { 
486 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 487 return lp_build_select(bld, cond, a, b); 488 } 489} 490 491 492/** 493 * Generate 1 - a, or ~a depending on bld->type. 494 */ 495LLVMValueRef 496lp_build_comp(struct lp_build_context *bld, 497 LLVMValueRef a) 498{ 499 LLVMBuilderRef builder = bld->gallivm->builder; 500 const struct lp_type type = bld->type; 501 502 assert(lp_check_value(type, a)); 503 504 if(a == bld->one) 505 return bld->zero; 506 if(a == bld->zero) 507 return bld->one; 508 509 if(type.norm && !type.floating && !type.fixed && !type.sign) { 510 if(LLVMIsConstant(a)) 511 return LLVMConstNot(a); 512 else 513 return LLVMBuildNot(builder, a, ""); 514 } 515 516 if(LLVMIsConstant(a)) 517 if (type.floating) 518 return LLVMConstFSub(bld->one, a); 519 else 520 return LLVMConstSub(bld->one, a); 521 else 522 if (type.floating) 523 return LLVMBuildFSub(builder, bld->one, a, ""); 524 else 525 return LLVMBuildSub(builder, bld->one, a, ""); 526} 527 528 529/** 530 * Generate a + b 531 */ 532LLVMValueRef 533lp_build_add(struct lp_build_context *bld, 534 LLVMValueRef a, 535 LLVMValueRef b) 536{ 537 LLVMBuilderRef builder = bld->gallivm->builder; 538 const struct lp_type type = bld->type; 539 LLVMValueRef res; 540 541 assert(lp_check_value(type, a)); 542 assert(lp_check_value(type, b)); 543 544 if (a == bld->zero) 545 return b; 546 if (b == bld->zero) 547 return a; 548 if (a == bld->undef || b == bld->undef) 549 return bld->undef; 550 551 if (type.norm) { 552 const char *intrinsic = NULL; 553 554 if (!type.sign && (a == bld->one || b == bld->one)) 555 return bld->one; 556 557 if (!type.floating && !type.fixed) { 558 if (HAVE_LLVM >= 0x0900) { 559 char intrin[32]; 560 intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat"; 561 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); 562 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); 563 } 564 if (type.width * type.length == 128) { 565 if (util_cpu_caps.has_sse2) { 566 if (type.width == 8) 567 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : 568 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL; 569 if (type.width == 16) 570 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : 571 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL; 572 } else if (util_cpu_caps.has_altivec) { 573 if (type.width == 8) 574 intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; 575 if (type.width == 16) 576 intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; 577 } 578 } 579 if (type.width * type.length == 256) { 580 if (util_cpu_caps.has_avx2) { 581 if (type.width == 8) 582 intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : 583 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL; 584 if (type.width == 16) 585 intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : 586 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL; 587 } 588 } 589 } 590 591 if (intrinsic) 592 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 593 } 594 595 if(type.norm && !type.floating && !type.fixed) { 596 if (type.sign) { 597 uint64_t sign = (uint64_t)1 << (type.width - 1); 598 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1); 599 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign); 600 /* a_clamp_max is the maximum a for positive b, 601 a_clamp_min is the minimum a for negative b. 
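            For illustration, with 8-bit signed values (max_val = 127,
            min_val = -128): if b = 100, a is clamped to min(a, 127 - 100) =
            min(a, 27), so a + b cannot exceed 127; if b = -100, a is clamped
            to max(a, -128 + 100) = max(a, -28), so a + b cannot drop below
            -128.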
*/ 602 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 603 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 604 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min); 605 } 606 } 607 608 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 609 if (type.floating) 610 res = LLVMConstFAdd(a, b); 611 else 612 res = LLVMConstAdd(a, b); 613 else 614 if (type.floating) 615 res = LLVMBuildFAdd(builder, a, b, ""); 616 else 617 res = LLVMBuildAdd(builder, a, b, ""); 618 619 /* clamp to ceiling of 1.0 */ 620 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 621 res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 622 623 if (type.norm && !type.floating && !type.fixed) { 624 if (!type.sign) { 625 /* 626 * newer llvm versions no longer support the intrinsics, but recognize 627 * the pattern. Since auto-upgrade of intrinsics doesn't work for jit 628 * code, it is important we match the pattern llvm uses (and pray llvm 629 * doesn't change it - and hope they decide on the same pattern for 630 * all backends supporting it...). 631 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to 632 * interfere with llvm's ability to recognize the pattern but seems 633 * a bit brittle. 634 * NOTE: llvm 9+ always uses (non arch specific) intrinsic. 635 */ 636 LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res); 637 res = lp_build_select(bld, overflowed, 638 LLVMConstAllOnes(bld->int_vec_type), res); 639 } 640 } 641 642 /* XXX clamp to floor of -1 or 0??? */ 643 644 return res; 645} 646 647 648/** Return the scalar sum of the elements of a. 649 * Should avoid this operation whenever possible. 650 */ 651LLVMValueRef 652lp_build_horizontal_add(struct lp_build_context *bld, 653 LLVMValueRef a) 654{ 655 LLVMBuilderRef builder = bld->gallivm->builder; 656 const struct lp_type type = bld->type; 657 LLVMValueRef index, res; 658 unsigned i, length; 659 LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2]; 660 LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2]; 661 LLVMValueRef vecres, elem2; 662 663 assert(lp_check_value(type, a)); 664 665 if (type.length == 1) { 666 return a; 667 } 668 669 assert(!bld->type.norm); 670 671 /* 672 * for byte vectors can do much better with psadbw. 673 * Using repeated shuffle/adds here. Note with multiple vectors 674 * this can be done more efficiently as outlined in the intel 675 * optimization manual. 676 * Note: could cause data rearrangement if used with smaller element 677 * sizes. 
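    *
    * Rough shape of the reduction below for a 4-wide vector (illustration
    * only):
    *
    *   [a0 a1 a2 a3] -> [a0 a1] + [a2 a3] = [a0+a2  a1+a3]
    *   result = (a0+a2) + (a1+a3)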
678 */ 679 680 vecres = a; 681 length = type.length / 2; 682 while (length > 1) { 683 LLVMValueRef vec1, vec2; 684 for (i = 0; i < length; i++) { 685 shuffles1[i] = lp_build_const_int32(bld->gallivm, i); 686 shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length); 687 } 688 vec1 = LLVMBuildShuffleVector(builder, vecres, vecres, 689 LLVMConstVector(shuffles1, length), ""); 690 vec2 = LLVMBuildShuffleVector(builder, vecres, vecres, 691 LLVMConstVector(shuffles2, length), ""); 692 if (type.floating) { 693 vecres = LLVMBuildFAdd(builder, vec1, vec2, ""); 694 } 695 else { 696 vecres = LLVMBuildAdd(builder, vec1, vec2, ""); 697 } 698 length = length >> 1; 699 } 700 701 /* always have vector of size 2 here */ 702 assert(length == 1); 703 704 index = lp_build_const_int32(bld->gallivm, 0); 705 res = LLVMBuildExtractElement(builder, vecres, index, ""); 706 index = lp_build_const_int32(bld->gallivm, 1); 707 elem2 = LLVMBuildExtractElement(builder, vecres, index, ""); 708 709 if (type.floating) 710 res = LLVMBuildFAdd(builder, res, elem2, ""); 711 else 712 res = LLVMBuildAdd(builder, res, elem2, ""); 713 714 return res; 715} 716 717/** 718 * Return the horizontal sums of 4 float vectors as a float4 vector. 719 * This uses the technique as outlined in Intel Optimization Manual. 720 */ 721static LLVMValueRef 722lp_build_horizontal_add4x4f(struct lp_build_context *bld, 723 LLVMValueRef src[4]) 724{ 725 struct gallivm_state *gallivm = bld->gallivm; 726 LLVMBuilderRef builder = gallivm->builder; 727 LLVMValueRef shuffles[4]; 728 LLVMValueRef tmp[4]; 729 LLVMValueRef sumtmp[2], shuftmp[2]; 730 731 /* lower half of regs */ 732 shuffles[0] = lp_build_const_int32(gallivm, 0); 733 shuffles[1] = lp_build_const_int32(gallivm, 1); 734 shuffles[2] = lp_build_const_int32(gallivm, 4); 735 shuffles[3] = lp_build_const_int32(gallivm, 5); 736 tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1], 737 LLVMConstVector(shuffles, 4), ""); 738 tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3], 739 LLVMConstVector(shuffles, 4), ""); 740 741 /* upper half of regs */ 742 shuffles[0] = lp_build_const_int32(gallivm, 2); 743 shuffles[1] = lp_build_const_int32(gallivm, 3); 744 shuffles[2] = lp_build_const_int32(gallivm, 6); 745 shuffles[3] = lp_build_const_int32(gallivm, 7); 746 tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1], 747 LLVMConstVector(shuffles, 4), ""); 748 tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3], 749 LLVMConstVector(shuffles, 4), ""); 750 751 sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], ""); 752 sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], ""); 753 754 shuffles[0] = lp_build_const_int32(gallivm, 0); 755 shuffles[1] = lp_build_const_int32(gallivm, 2); 756 shuffles[2] = lp_build_const_int32(gallivm, 4); 757 shuffles[3] = lp_build_const_int32(gallivm, 6); 758 shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 759 LLVMConstVector(shuffles, 4), ""); 760 761 shuffles[0] = lp_build_const_int32(gallivm, 1); 762 shuffles[1] = lp_build_const_int32(gallivm, 3); 763 shuffles[2] = lp_build_const_int32(gallivm, 5); 764 shuffles[3] = lp_build_const_int32(gallivm, 7); 765 shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1], 766 LLVMConstVector(shuffles, 4), ""); 767 768 return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], ""); 769} 770 771 772/* 773 * partially horizontally add 2-4 float vectors with length nx4, 774 * i.e. 
only four adjacent values in each vector will be added, 775 * assuming values are really grouped in 4 which also determines 776 * output order. 777 * 778 * Return a vector of the same length as the initial vectors, 779 * with the excess elements (if any) being undefined. 780 * The element order is independent of number of input vectors. 781 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7 782 * the output order thus will be 783 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4z7,undef 784 */ 785LLVMValueRef 786lp_build_hadd_partial4(struct lp_build_context *bld, 787 LLVMValueRef vectors[], 788 unsigned num_vecs) 789{ 790 struct gallivm_state *gallivm = bld->gallivm; 791 LLVMBuilderRef builder = gallivm->builder; 792 LLVMValueRef ret_vec; 793 LLVMValueRef tmp[4]; 794 const char *intrinsic = NULL; 795 796 assert(num_vecs >= 2 && num_vecs <= 4); 797 assert(bld->type.floating); 798 799 /* only use this with at least 2 vectors, as it is sort of expensive 800 * (depending on cpu) and we always need two horizontal adds anyway, 801 * so a shuffle/add approach might be better. 802 */ 803 804 tmp[0] = vectors[0]; 805 tmp[1] = vectors[1]; 806 807 tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0]; 808 tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0]; 809 810 if (util_cpu_caps.has_sse3 && bld->type.width == 32 && 811 bld->type.length == 4) { 812 intrinsic = "llvm.x86.sse3.hadd.ps"; 813 } 814 else if (util_cpu_caps.has_avx && bld->type.width == 32 && 815 bld->type.length == 8) { 816 intrinsic = "llvm.x86.avx.hadd.ps.256"; 817 } 818 if (intrinsic) { 819 tmp[0] = lp_build_intrinsic_binary(builder, intrinsic, 820 lp_build_vec_type(gallivm, bld->type), 821 tmp[0], tmp[1]); 822 if (num_vecs > 2) { 823 tmp[1] = lp_build_intrinsic_binary(builder, intrinsic, 824 lp_build_vec_type(gallivm, bld->type), 825 tmp[2], tmp[3]); 826 } 827 else { 828 tmp[1] = tmp[0]; 829 } 830 return lp_build_intrinsic_binary(builder, intrinsic, 831 lp_build_vec_type(gallivm, bld->type), 832 tmp[0], tmp[1]); 833 } 834 835 if (bld->type.length == 4) { 836 ret_vec = lp_build_horizontal_add4x4f(bld, tmp); 837 } 838 else { 839 LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4]; 840 unsigned j; 841 unsigned num_iter = bld->type.length / 4; 842 struct lp_type parttype = bld->type; 843 parttype.length = 4; 844 for (j = 0; j < num_iter; j++) { 845 LLVMValueRef partsrc[4]; 846 unsigned i; 847 for (i = 0; i < 4; i++) { 848 partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4); 849 } 850 partres[j] = lp_build_horizontal_add4x4f(bld, partsrc); 851 } 852 ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter); 853 } 854 return ret_vec; 855} 856 857/** 858 * Generate a - b 859 */ 860LLVMValueRef 861lp_build_sub(struct lp_build_context *bld, 862 LLVMValueRef a, 863 LLVMValueRef b) 864{ 865 LLVMBuilderRef builder = bld->gallivm->builder; 866 const struct lp_type type = bld->type; 867 LLVMValueRef res; 868 869 assert(lp_check_value(type, a)); 870 assert(lp_check_value(type, b)); 871 872 if (b == bld->zero) 873 return a; 874 if (a == bld->undef || b == bld->undef) 875 return bld->undef; 876 if (a == b) 877 return bld->zero; 878 879 if (type.norm) { 880 const char *intrinsic = NULL; 881 882 if (!type.sign && b == bld->one) 883 return bld->zero; 884 885 if (!type.floating && !type.fixed) { 886 if (HAVE_LLVM >= 0x0900) { 887 char intrin[32]; 888 intrinsic = type.sign ? 
"llvm.ssub.sat" : "llvm.usub.sat"; 889 lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type); 890 return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b); 891 } 892 if (type.width * type.length == 128) { 893 if (util_cpu_caps.has_sse2) { 894 if (type.width == 8) 895 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : 896 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL; 897 if (type.width == 16) 898 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : 899 HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL; 900 } else if (util_cpu_caps.has_altivec) { 901 if (type.width == 8) 902 intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; 903 if (type.width == 16) 904 intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; 905 } 906 } 907 if (type.width * type.length == 256) { 908 if (util_cpu_caps.has_avx2) { 909 if (type.width == 8) 910 intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : 911 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL; 912 if (type.width == 16) 913 intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : 914 HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL; 915 } 916 } 917 } 918 919 if (intrinsic) 920 return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b); 921 } 922 923 if(type.norm && !type.floating && !type.fixed) { 924 if (type.sign) { 925 uint64_t sign = (uint64_t)1 << (type.width - 1); 926 LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1); 927 LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign); 928 /* a_clamp_max is the maximum a for negative b, 929 a_clamp_min is the minimum a for positive b. */ 930 LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 931 LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); 932 a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max); 933 } else { 934 /* 935 * This must match llvm pattern for saturated unsigned sub. 936 * (lp_build_max_simple actually does the job with its current 937 * definition but do it explicitly here.) 938 * NOTE: cmp/select does sext/trunc of the mask. Does not seem to 939 * interfere with llvm's ability to recognize the pattern but seems 940 * a bit brittle. 941 * NOTE: llvm 9+ always uses (non arch specific) intrinsic. 942 */ 943 LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); 944 a = lp_build_select(bld, no_ov, a, b); 945 } 946 } 947 948 if(LLVMIsConstant(a) && LLVMIsConstant(b)) 949 if (type.floating) 950 res = LLVMConstFSub(a, b); 951 else 952 res = LLVMConstSub(a, b); 953 else 954 if (type.floating) 955 res = LLVMBuildFSub(builder, a, b, ""); 956 else 957 res = LLVMBuildSub(builder, a, b, ""); 958 959 if(bld->type.norm && (bld->type.floating || bld->type.fixed)) 960 res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 961 962 return res; 963} 964 965 966 967/** 968 * Normalized multiplication. 
 *
 * There are several approaches for (using 8-bit normalized multiplication as
 * an example):
 *
 * - alpha plus one
 *
 *     makes the following approximation to the division (Sree)
 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
 *
 *     which is the fastest method that satisfies the following OpenGL criteria of
 *
 *       0*0 = 0 and 255*255 = 255
 *
 * - geometric series
 *
 *     takes the geometric series approximation to the division
 *
 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 *     in this case just the first two terms to fit in 16bit arithmetic
 *
 *       t/255 ~= (t + (t >> 8)) >> 8
 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for, or
 *     roundoff must be used.
 *
 * - geometric series plus rounding
 *
 *     when using a geometric series division instead of truncating the result
 *     use roundoff in the approximation (Jim Blinn)
 *
 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
 *
 *     achieving exact results.
 *
 *
 *
 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
 * @sa Michael Herf, The "double blend trick", May 2000,
 *     http://www.stereopsis.com/doubleblend.html
 */
LLVMValueRef
lp_build_mul_norm(struct gallivm_state *gallivm,
                  struct lp_type wide_type,
                  LLVMValueRef a, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   unsigned n;
   LLVMValueRef half;
   LLVMValueRef ab;

   assert(!wide_type.floating);
   assert(lp_check_value(wide_type, a));
   assert(lp_check_value(wide_type, b));

   lp_build_context_init(&bld, gallivm, wide_type);

   n = wide_type.width / 2;
   if (wide_type.sign) {
      --n;
   }

   /*
    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
    */

   /*
    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
    */

   ab = LLVMBuildMul(builder, a, b, "");
   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");

   /*
    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
    */

   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
   if (wide_type.sign) {
      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
      half = lp_build_select(&bld, sign, minus_half, half);
   }
   ab = LLVMBuildAdd(builder, ab, half, "");

   /* Final division */
   ab = lp_build_shr_imm(&bld, ab, n);

   return ab;
}

/**
 * Generate a * b
 */
LLVMValueRef
lp_build_mul(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef shift;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == bld->zero)
      return bld->zero;
   if(a == bld->one)
      return b;
   if(b == bld->zero)
      return bld->zero;
   if(b == bld->one)
      return a;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   if (!type.floating && !type.fixed && type.norm) {
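      /*
       * Normalized path: unpack to the wider type and use lp_build_mul_norm.
       * Scalar illustration for 8-bit unsigned (n = 8), not generated code:
       *   ab = 255 * 255        = 65025
       *   ab + (ab >> 8)        = 65025 + 254 = 65279
       *   + half (0x80)         = 65407
       *   >> 8                  = 255, i.e. 255*255/255 = 255 exactly.
       */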
struct lp_type wide_type = lp_wider_type(type); 1094 LLVMValueRef al, ah, bl, bh, abl, abh, ab; 1095 1096 lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah); 1097 lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh); 1098 1099 /* PMULLW, PSRLW, PADDW */ 1100 abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl); 1101 abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh); 1102 1103 ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh); 1104 1105 return ab; 1106 } 1107 1108 if(type.fixed) 1109 shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2); 1110 else 1111 shift = NULL; 1112 1113 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 1114 if (type.floating) 1115 res = LLVMConstFMul(a, b); 1116 else 1117 res = LLVMConstMul(a, b); 1118 if(shift) { 1119 if(type.sign) 1120 res = LLVMConstAShr(res, shift); 1121 else 1122 res = LLVMConstLShr(res, shift); 1123 } 1124 } 1125 else { 1126 if (type.floating) 1127 res = LLVMBuildFMul(builder, a, b, ""); 1128 else 1129 res = LLVMBuildMul(builder, a, b, ""); 1130 if(shift) { 1131 if(type.sign) 1132 res = LLVMBuildAShr(builder, res, shift, ""); 1133 else 1134 res = LLVMBuildLShr(builder, res, shift, ""); 1135 } 1136 } 1137 1138 return res; 1139} 1140 1141/* 1142 * Widening mul, valid for 32x32 bit -> 64bit only. 1143 * Result is low 32bits, high bits returned in res_hi. 1144 * 1145 * Emits code that is meant to be compiled for the host CPU. 1146 */ 1147LLVMValueRef 1148lp_build_mul_32_lohi_cpu(struct lp_build_context *bld, 1149 LLVMValueRef a, 1150 LLVMValueRef b, 1151 LLVMValueRef *res_hi) 1152{ 1153 struct gallivm_state *gallivm = bld->gallivm; 1154 LLVMBuilderRef builder = gallivm->builder; 1155 1156 assert(bld->type.width == 32); 1157 assert(bld->type.floating == 0); 1158 assert(bld->type.fixed == 0); 1159 assert(bld->type.norm == 0); 1160 1161 /* 1162 * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces 1163 * for x86 simd is atrocious (even if the high bits weren't required), 1164 * trying to handle real 64bit inputs (which of course can't happen due 1165 * to using 64bit umul with 32bit numbers zero-extended to 64bit, but 1166 * apparently llvm does not recognize this widening mul). This includes 6 1167 * (instead of 2) pmuludq plus extra adds and shifts 1168 * The same story applies to signed mul, albeit fixing this requires sse41. 1169 * https://llvm.org/bugs/show_bug.cgi?id=30845 1170 * So, whip up our own code, albeit only for length 4 and 8 (which 1171 * should be good enough)... 
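    *
    * Sketch of the scheme used below (illustration only): pmuludq/pmuldq
    * multiply the even 32-bit lanes (0, 2, ...) of each operand into 64-bit
    * products; shuffling the odd lanes down to even positions and multiplying
    * again yields the remaining products; the 64-bit results are then
    * re-interleaved into separate low-half and high-half 32-bit vectors.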
1172 */ 1173 if ((bld->type.length == 4 || bld->type.length == 8) && 1174 ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) || 1175 util_cpu_caps.has_sse4_1)) { 1176 const char *intrinsic = NULL; 1177 LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd; 1178 LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec; 1179 struct lp_type type_wide = lp_wider_type(bld->type); 1180 LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide); 1181 unsigned i; 1182 for (i = 0; i < bld->type.length; i += 2) { 1183 shuf[i] = lp_build_const_int32(gallivm, i+1); 1184 shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); 1185 } 1186 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1187 aeven = a; 1188 beven = b; 1189 aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, ""); 1190 bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, ""); 1191 1192 if (util_cpu_caps.has_avx2 && bld->type.length == 8) { 1193 if (bld->type.sign) { 1194 intrinsic = "llvm.x86.avx2.pmul.dq"; 1195 } else { 1196 intrinsic = "llvm.x86.avx2.pmulu.dq"; 1197 } 1198 muleven = lp_build_intrinsic_binary(builder, intrinsic, 1199 wider_type, aeven, beven); 1200 mulodd = lp_build_intrinsic_binary(builder, intrinsic, 1201 wider_type, aodd, bodd); 1202 } 1203 else { 1204 /* for consistent naming look elsewhere... */ 1205 if (bld->type.sign) { 1206 intrinsic = "llvm.x86.sse41.pmuldq"; 1207 } else { 1208 intrinsic = "llvm.x86.sse2.pmulu.dq"; 1209 } 1210 /* 1211 * XXX If we only have AVX but not AVX2 this is a pain. 1212 * lp_build_intrinsic_binary_anylength() can't handle it 1213 * (due to src and dst type not being identical). 1214 */ 1215 if (bld->type.length == 8) { 1216 LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi; 1217 LLVMValueRef aoddlo, aoddhi, boddlo, boddhi; 1218 LLVMValueRef muleven2[2], mulodd2[2]; 1219 struct lp_type type_wide_half = type_wide; 1220 LLVMTypeRef wtype_half; 1221 type_wide_half.length = 2; 1222 wtype_half = lp_build_vec_type(gallivm, type_wide_half); 1223 aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4); 1224 aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4); 1225 bevenlo = lp_build_extract_range(gallivm, beven, 0, 4); 1226 bevenhi = lp_build_extract_range(gallivm, beven, 4, 4); 1227 aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4); 1228 aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4); 1229 boddlo = lp_build_extract_range(gallivm, bodd, 0, 4); 1230 boddhi = lp_build_extract_range(gallivm, bodd, 4, 4); 1231 muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic, 1232 wtype_half, aevenlo, bevenlo); 1233 mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic, 1234 wtype_half, aoddlo, boddlo); 1235 muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic, 1236 wtype_half, aevenhi, bevenhi); 1237 mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic, 1238 wtype_half, aoddhi, boddhi); 1239 muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2); 1240 mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2); 1241 1242 } 1243 else { 1244 muleven = lp_build_intrinsic_binary(builder, intrinsic, 1245 wider_type, aeven, beven); 1246 mulodd = lp_build_intrinsic_binary(builder, intrinsic, 1247 wider_type, aodd, bodd); 1248 } 1249 } 1250 muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, ""); 1251 mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, ""); 1252 1253 for (i = 0; i < bld->type.length; i += 2) { 1254 shuf[i] = lp_build_const_int32(gallivm, i + 1); 1255 shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + 
bld->type.length); 1256 } 1257 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1258 *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); 1259 1260 for (i = 0; i < bld->type.length; i += 2) { 1261 shuf[i] = lp_build_const_int32(gallivm, i); 1262 shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length); 1263 } 1264 shuf_vec = LLVMConstVector(shuf, bld->type.length); 1265 return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, ""); 1266 } 1267 else { 1268 return lp_build_mul_32_lohi(bld, a, b, res_hi); 1269 } 1270} 1271 1272 1273/* 1274 * Widening mul, valid for 32x32 bit -> 64bit only. 1275 * Result is low 32bits, high bits returned in res_hi. 1276 * 1277 * Emits generic code. 1278 */ 1279LLVMValueRef 1280lp_build_mul_32_lohi(struct lp_build_context *bld, 1281 LLVMValueRef a, 1282 LLVMValueRef b, 1283 LLVMValueRef *res_hi) 1284{ 1285 struct gallivm_state *gallivm = bld->gallivm; 1286 LLVMBuilderRef builder = gallivm->builder; 1287 LLVMValueRef tmp, shift, res_lo; 1288 struct lp_type type_tmp; 1289 LLVMTypeRef wide_type, narrow_type; 1290 1291 type_tmp = bld->type; 1292 narrow_type = lp_build_vec_type(gallivm, type_tmp); 1293 type_tmp.width *= 2; 1294 wide_type = lp_build_vec_type(gallivm, type_tmp); 1295 shift = lp_build_const_vec(gallivm, type_tmp, 32); 1296 1297 if (bld->type.sign) { 1298 a = LLVMBuildSExt(builder, a, wide_type, ""); 1299 b = LLVMBuildSExt(builder, b, wide_type, ""); 1300 } else { 1301 a = LLVMBuildZExt(builder, a, wide_type, ""); 1302 b = LLVMBuildZExt(builder, b, wide_type, ""); 1303 } 1304 tmp = LLVMBuildMul(builder, a, b, ""); 1305 1306 res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1307 1308 /* Since we truncate anyway, LShr and AShr are equivalent. */ 1309 tmp = LLVMBuildLShr(builder, tmp, shift, ""); 1310 *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, ""); 1311 1312 return res_lo; 1313} 1314 1315 1316/* a * b + c */ 1317LLVMValueRef 1318lp_build_mad(struct lp_build_context *bld, 1319 LLVMValueRef a, 1320 LLVMValueRef b, 1321 LLVMValueRef c) 1322{ 1323 const struct lp_type type = bld->type; 1324 if (type.floating) { 1325 return lp_build_fmuladd(bld->gallivm->builder, a, b, c); 1326 } else { 1327 return lp_build_add(bld, lp_build_mul(bld, a, b), c); 1328 } 1329} 1330 1331 1332/** 1333 * Small vector x scale multiplication optimization. 1334 */ 1335LLVMValueRef 1336lp_build_mul_imm(struct lp_build_context *bld, 1337 LLVMValueRef a, 1338 int b) 1339{ 1340 LLVMBuilderRef builder = bld->gallivm->builder; 1341 LLVMValueRef factor; 1342 1343 assert(lp_check_value(bld->type, a)); 1344 1345 if(b == 0) 1346 return bld->zero; 1347 1348 if(b == 1) 1349 return a; 1350 1351 if(b == -1) 1352 return lp_build_negate(bld, a); 1353 1354 if(b == 2 && bld->type.floating) 1355 return lp_build_add(bld, a, a); 1356 1357 if(util_is_power_of_two_or_zero(b)) { 1358 unsigned shift = ffs(b) - 1; 1359 1360 if(bld->type.floating) { 1361#if 0 1362 /* 1363 * Power of two multiplication by directly manipulating the exponent. 1364 * 1365 * XXX: This might not be always faster, it will introduce a small error 1366 * for multiplication by zero, and it will produce wrong results 1367 * for Inf and NaN. 
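          *
          * For illustration (32-bit float, not generated code): multiplying
          * by 2**k adds k to the exponent field, i.e. adds (k << 23) to the
          * bit pattern; e.g. 3.0f = 0x40400000, plus (1 << 23) gives
          * 0x40C00000 = 6.0f.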
1368 */ 1369 unsigned mantissa = lp_mantissa(bld->type); 1370 factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa); 1371 a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), ""); 1372 a = LLVMBuildAdd(builder, a, factor, ""); 1373 a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), ""); 1374 return a; 1375#endif 1376 } 1377 else { 1378 factor = lp_build_const_vec(bld->gallivm, bld->type, shift); 1379 return LLVMBuildShl(builder, a, factor, ""); 1380 } 1381 } 1382 1383 factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b); 1384 return lp_build_mul(bld, a, factor); 1385} 1386 1387 1388/** 1389 * Generate a / b 1390 */ 1391LLVMValueRef 1392lp_build_div(struct lp_build_context *bld, 1393 LLVMValueRef a, 1394 LLVMValueRef b) 1395{ 1396 LLVMBuilderRef builder = bld->gallivm->builder; 1397 const struct lp_type type = bld->type; 1398 1399 assert(lp_check_value(type, a)); 1400 assert(lp_check_value(type, b)); 1401 1402 if(a == bld->zero) 1403 return bld->zero; 1404 if(a == bld->one && type.floating) 1405 return lp_build_rcp(bld, b); 1406 if(b == bld->zero) 1407 return bld->undef; 1408 if(b == bld->one) 1409 return a; 1410 if(a == bld->undef || b == bld->undef) 1411 return bld->undef; 1412 1413 if(LLVMIsConstant(a) && LLVMIsConstant(b)) { 1414 if (type.floating) 1415 return LLVMConstFDiv(a, b); 1416 else if (type.sign) 1417 return LLVMConstSDiv(a, b); 1418 else 1419 return LLVMConstUDiv(a, b); 1420 } 1421 1422 /* fast rcp is disabled (just uses div), so makes no sense to try that */ 1423 if(FALSE && 1424 ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 1425 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) && 1426 type.floating) 1427 return lp_build_mul(bld, a, lp_build_rcp(bld, b)); 1428 1429 if (type.floating) 1430 return LLVMBuildFDiv(builder, a, b, ""); 1431 else if (type.sign) 1432 return LLVMBuildSDiv(builder, a, b, ""); 1433 else 1434 return LLVMBuildUDiv(builder, a, b, ""); 1435} 1436 1437 1438/** 1439 * Linear interpolation helper. 1440 * 1441 * @param normalized whether we are interpolating normalized values, 1442 * encoded in normalized integers, twice as wide. 1443 * 1444 * @sa http://www.stereopsis.com/doubleblend.html 1445 */ 1446static inline LLVMValueRef 1447lp_build_lerp_simple(struct lp_build_context *bld, 1448 LLVMValueRef x, 1449 LLVMValueRef v0, 1450 LLVMValueRef v1, 1451 unsigned flags) 1452{ 1453 unsigned half_width = bld->type.width/2; 1454 LLVMBuilderRef builder = bld->gallivm->builder; 1455 LLVMValueRef delta; 1456 LLVMValueRef res; 1457 1458 assert(lp_check_value(bld->type, x)); 1459 assert(lp_check_value(bld->type, v0)); 1460 assert(lp_check_value(bld->type, v1)); 1461 1462 delta = lp_build_sub(bld, v1, v0); 1463 1464 if (bld->type.floating) { 1465 assert(flags == 0); 1466 return lp_build_mad(bld, x, delta, v0); 1467 } 1468 1469 if (flags & LP_BLD_LERP_WIDE_NORMALIZED) { 1470 if (!bld->type.sign) { 1471 if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) { 1472 /* 1473 * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the 1474 * most-significant-bit to the lowest-significant-bit, so that 1475 * later we can just divide by 2**n instead of 2**n - 1. 
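             *
             * E.g. for 8-bit weights (n = 8), illustration only:
             * x' = x + (x >> 7), so 255 -> 256, 128 -> 129, 0 -> 0, and
             * (x' * delta) >> 8 then replaces the exact division by 255.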
1476 */ 1477 1478 x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1)); 1479 } 1480 1481 /* (x * delta) >> n */ 1482 res = lp_build_mul(bld, x, delta); 1483 res = lp_build_shr_imm(bld, res, half_width); 1484 } else { 1485 /* 1486 * The rescaling trick above doesn't work for signed numbers, so 1487 * use the 2**n - 1 divison approximation in lp_build_mul_norm 1488 * instead. 1489 */ 1490 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1491 res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta); 1492 } 1493 } else { 1494 assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)); 1495 res = lp_build_mul(bld, x, delta); 1496 } 1497 1498 if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) { 1499 /* 1500 * At this point both res and v0 only use the lower half of the bits, 1501 * the rest is zero. Instead of add / mask, do add with half wide type. 1502 */ 1503 struct lp_type narrow_type; 1504 struct lp_build_context narrow_bld; 1505 1506 memset(&narrow_type, 0, sizeof narrow_type); 1507 narrow_type.sign = bld->type.sign; 1508 narrow_type.width = bld->type.width/2; 1509 narrow_type.length = bld->type.length*2; 1510 1511 lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type); 1512 res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, ""); 1513 v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, ""); 1514 res = lp_build_add(&narrow_bld, v0, res); 1515 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 1516 } else { 1517 res = lp_build_add(bld, v0, res); 1518 1519 if (bld->type.fixed) { 1520 /* 1521 * We need to mask out the high order bits when lerping 8bit 1522 * normalized colors stored on 16bits 1523 */ 1524 /* XXX: This step is necessary for lerping 8bit colors stored on 1525 * 16bits, but it will be wrong for true fixed point use cases. 1526 * Basically we need a more powerful lp_type, capable of further 1527 * distinguishing the values interpretation from the value storage. 1528 */ 1529 LLVMValueRef low_bits; 1530 low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1); 1531 res = LLVMBuildAnd(builder, res, low_bits, ""); 1532 } 1533 } 1534 1535 return res; 1536} 1537 1538 1539/** 1540 * Linear interpolation. 1541 */ 1542LLVMValueRef 1543lp_build_lerp(struct lp_build_context *bld, 1544 LLVMValueRef x, 1545 LLVMValueRef v0, 1546 LLVMValueRef v1, 1547 unsigned flags) 1548{ 1549 const struct lp_type type = bld->type; 1550 LLVMValueRef res; 1551 1552 assert(lp_check_value(type, x)); 1553 assert(lp_check_value(type, v0)); 1554 assert(lp_check_value(type, v1)); 1555 1556 assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED)); 1557 1558 if (type.norm) { 1559 struct lp_type wide_type; 1560 struct lp_build_context wide_bld; 1561 LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; 1562 1563 assert(type.length >= 2); 1564 1565 /* 1566 * Create a wider integer type, enough to hold the 1567 * intermediate result of the multiplication. 1568 */ 1569 memset(&wide_type, 0, sizeof wide_type); 1570 wide_type.sign = type.sign; 1571 wide_type.width = type.width*2; 1572 wide_type.length = type.length/2; 1573 1574 lp_build_context_init(&wide_bld, bld->gallivm, wide_type); 1575 1576 lp_build_unpack2_native(bld->gallivm, type, wide_type, x, &xl, &xh); 1577 lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h); 1578 lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h); 1579 1580 /* 1581 * Lerp both halves. 
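       * E.g. for 16 x 8-bit normalized values the wide type is 8 x 16-bit:
       * each half is lerped at the wider precision and the results are
       * packed back together below (illustration of the code that follows).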
1582 */ 1583 1584 flags |= LP_BLD_LERP_WIDE_NORMALIZED; 1585 1586 resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags); 1587 resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags); 1588 1589 res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh); 1590 } else { 1591 res = lp_build_lerp_simple(bld, x, v0, v1, flags); 1592 } 1593 1594 return res; 1595} 1596 1597 1598/** 1599 * Bilinear interpolation. 1600 * 1601 * Values indices are in v_{yx}. 1602 */ 1603LLVMValueRef 1604lp_build_lerp_2d(struct lp_build_context *bld, 1605 LLVMValueRef x, 1606 LLVMValueRef y, 1607 LLVMValueRef v00, 1608 LLVMValueRef v01, 1609 LLVMValueRef v10, 1610 LLVMValueRef v11, 1611 unsigned flags) 1612{ 1613 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags); 1614 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags); 1615 return lp_build_lerp(bld, y, v0, v1, flags); 1616} 1617 1618 1619LLVMValueRef 1620lp_build_lerp_3d(struct lp_build_context *bld, 1621 LLVMValueRef x, 1622 LLVMValueRef y, 1623 LLVMValueRef z, 1624 LLVMValueRef v000, 1625 LLVMValueRef v001, 1626 LLVMValueRef v010, 1627 LLVMValueRef v011, 1628 LLVMValueRef v100, 1629 LLVMValueRef v101, 1630 LLVMValueRef v110, 1631 LLVMValueRef v111, 1632 unsigned flags) 1633{ 1634 LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags); 1635 LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags); 1636 return lp_build_lerp(bld, z, v0, v1, flags); 1637} 1638 1639 1640/** 1641 * Generate min(a, b) 1642 * Do checks for special cases but not for nans. 1643 */ 1644LLVMValueRef 1645lp_build_min(struct lp_build_context *bld, 1646 LLVMValueRef a, 1647 LLVMValueRef b) 1648{ 1649 assert(lp_check_value(bld->type, a)); 1650 assert(lp_check_value(bld->type, b)); 1651 1652 if(a == bld->undef || b == bld->undef) 1653 return bld->undef; 1654 1655 if(a == b) 1656 return a; 1657 1658 if (bld->type.norm) { 1659 if (!bld->type.sign) { 1660 if (a == bld->zero || b == bld->zero) { 1661 return bld->zero; 1662 } 1663 } 1664 if(a == bld->one) 1665 return b; 1666 if(b == bld->one) 1667 return a; 1668 } 1669 1670 return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1671} 1672 1673 1674/** 1675 * Generate min(a, b) 1676 * NaN's are handled according to the behavior specified by the 1677 * nan_behavior argument. 1678 */ 1679LLVMValueRef 1680lp_build_min_ext(struct lp_build_context *bld, 1681 LLVMValueRef a, 1682 LLVMValueRef b, 1683 enum gallivm_nan_behavior nan_behavior) 1684{ 1685 assert(lp_check_value(bld->type, a)); 1686 assert(lp_check_value(bld->type, b)); 1687 1688 if(a == bld->undef || b == bld->undef) 1689 return bld->undef; 1690 1691 if(a == b) 1692 return a; 1693 1694 if (bld->type.norm) { 1695 if (!bld->type.sign) { 1696 if (a == bld->zero || b == bld->zero) { 1697 return bld->zero; 1698 } 1699 } 1700 if(a == bld->one) 1701 return b; 1702 if(b == bld->one) 1703 return a; 1704 } 1705 1706 return lp_build_min_simple(bld, a, b, nan_behavior); 1707} 1708 1709/** 1710 * Generate max(a, b) 1711 * Do checks for special cases, but NaN behavior is undefined. 
1712 */ 1713LLVMValueRef 1714lp_build_max(struct lp_build_context *bld, 1715 LLVMValueRef a, 1716 LLVMValueRef b) 1717{ 1718 assert(lp_check_value(bld->type, a)); 1719 assert(lp_check_value(bld->type, b)); 1720 1721 if(a == bld->undef || b == bld->undef) 1722 return bld->undef; 1723 1724 if(a == b) 1725 return a; 1726 1727 if(bld->type.norm) { 1728 if(a == bld->one || b == bld->one) 1729 return bld->one; 1730 if (!bld->type.sign) { 1731 if (a == bld->zero) { 1732 return b; 1733 } 1734 if (b == bld->zero) { 1735 return a; 1736 } 1737 } 1738 } 1739 1740 return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); 1741} 1742 1743 1744/** 1745 * Generate max(a, b) 1746 * Checks for special cases. 1747 * NaN's are handled according to the behavior specified by the 1748 * nan_behavior argument. 1749 */ 1750LLVMValueRef 1751lp_build_max_ext(struct lp_build_context *bld, 1752 LLVMValueRef a, 1753 LLVMValueRef b, 1754 enum gallivm_nan_behavior nan_behavior) 1755{ 1756 assert(lp_check_value(bld->type, a)); 1757 assert(lp_check_value(bld->type, b)); 1758 1759 if(a == bld->undef || b == bld->undef) 1760 return bld->undef; 1761 1762 if(a == b) 1763 return a; 1764 1765 if(bld->type.norm) { 1766 if(a == bld->one || b == bld->one) 1767 return bld->one; 1768 if (!bld->type.sign) { 1769 if (a == bld->zero) { 1770 return b; 1771 } 1772 if (b == bld->zero) { 1773 return a; 1774 } 1775 } 1776 } 1777 1778 return lp_build_max_simple(bld, a, b, nan_behavior); 1779} 1780 1781/** 1782 * Generate clamp(a, min, max) 1783 * NaN behavior (for any of a, min, max) is undefined. 1784 * Do checks for special cases. 1785 */ 1786LLVMValueRef 1787lp_build_clamp(struct lp_build_context *bld, 1788 LLVMValueRef a, 1789 LLVMValueRef min, 1790 LLVMValueRef max) 1791{ 1792 assert(lp_check_value(bld->type, a)); 1793 assert(lp_check_value(bld->type, min)); 1794 assert(lp_check_value(bld->type, max)); 1795 1796 a = lp_build_min(bld, a, max); 1797 a = lp_build_max(bld, a, min); 1798 return a; 1799} 1800 1801 1802/** 1803 * Generate clamp(a, 0, 1) 1804 * A NaN will get converted to zero. 
1805 */ 1806LLVMValueRef 1807lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld, 1808 LLVMValueRef a) 1809{ 1810 a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 1811 a = lp_build_min(bld, a, bld->one); 1812 return a; 1813} 1814 1815 1816/** 1817 * Generate abs(a) 1818 */ 1819LLVMValueRef 1820lp_build_abs(struct lp_build_context *bld, 1821 LLVMValueRef a) 1822{ 1823 LLVMBuilderRef builder = bld->gallivm->builder; 1824 const struct lp_type type = bld->type; 1825 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1826 1827 assert(lp_check_value(type, a)); 1828 1829 if(!type.sign) 1830 return a; 1831 1832 if(type.floating) { 1833 if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) { 1834 /* Workaround llvm.org/PR27332 */ 1835 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1836 unsigned long long absMask = ~(1ULL << (type.width - 1)); 1837 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); 1838 a = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1839 a = LLVMBuildAnd(builder, a, mask, ""); 1840 a = LLVMBuildBitCast(builder, a, vec_type, ""); 1841 return a; 1842 } else { 1843 char intrinsic[32]; 1844 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); 1845 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 1846 } 1847 } 1848 1849 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) { 1850 switch(type.width) { 1851 case 8: 1852 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a); 1853 case 16: 1854 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a); 1855 case 32: 1856 return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a); 1857 } 1858 } 1859 else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) { 1860 switch(type.width) { 1861 case 8: 1862 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a); 1863 case 16: 1864 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a); 1865 case 32: 1866 return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a); 1867 } 1868 } 1869 1870 return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero), 1871 a, LLVMBuildNeg(builder, a, "")); 1872} 1873 1874 1875LLVMValueRef 1876lp_build_negate(struct lp_build_context *bld, 1877 LLVMValueRef a) 1878{ 1879 LLVMBuilderRef builder = bld->gallivm->builder; 1880 1881 assert(lp_check_value(bld->type, a)); 1882 1883 if (bld->type.floating) 1884 a = LLVMBuildFNeg(builder, a, ""); 1885 else 1886 a = LLVMBuildNeg(builder, a, ""); 1887 1888 return a; 1889} 1890 1891 1892/** Return -1, 0 or +1 depending on the sign of a */ 1893LLVMValueRef 1894lp_build_sgn(struct lp_build_context *bld, 1895 LLVMValueRef a) 1896{ 1897 LLVMBuilderRef builder = bld->gallivm->builder; 1898 const struct lp_type type = bld->type; 1899 LLVMValueRef cond; 1900 LLVMValueRef res; 1901 1902 assert(lp_check_value(type, a)); 1903 1904 /* Handle non-zero case */ 1905 if(!type.sign) { 1906 /* if not zero then sign must be positive */ 1907 res = bld->one; 1908 } 1909 else if(type.floating) { 1910 LLVMTypeRef vec_type; 1911 LLVMTypeRef int_type; 1912 LLVMValueRef mask; 1913 LLVMValueRef sign; 1914 LLVMValueRef one; 1915 unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); 1916 1917 int_type = lp_build_int_vec_type(bld->gallivm, type); 1918 vec_type = 
lp_build_vec_type(bld->gallivm, type); 1919 mask = lp_build_const_int_vec(bld->gallivm, type, maskBit); 1920 1921 /* Take the sign bit and add it to 1 constant */ 1922 sign = LLVMBuildBitCast(builder, a, int_type, ""); 1923 sign = LLVMBuildAnd(builder, sign, mask, ""); 1924 one = LLVMConstBitCast(bld->one, int_type); 1925 res = LLVMBuildOr(builder, sign, one, ""); 1926 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1927 } 1928 else 1929 { 1930 /* signed int/norm/fixed point */ 1931 /* could use psign with sse3 and appropriate vectors here */ 1932 LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0); 1933 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero); 1934 res = lp_build_select(bld, cond, bld->one, minus_one); 1935 } 1936 1937 /* Handle zero */ 1938 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero); 1939 res = lp_build_select(bld, cond, bld->zero, res); 1940 1941 return res; 1942} 1943 1944 1945/** 1946 * Set the sign of float vector 'a' according to 'sign'. 1947 * If sign==0, return abs(a). 1948 * If sign==1, return -abs(a); 1949 * Other values for sign produce undefined results. 1950 */ 1951LLVMValueRef 1952lp_build_set_sign(struct lp_build_context *bld, 1953 LLVMValueRef a, LLVMValueRef sign) 1954{ 1955 LLVMBuilderRef builder = bld->gallivm->builder; 1956 const struct lp_type type = bld->type; 1957 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 1958 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1959 LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1); 1960 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 1961 ~((unsigned long long) 1 << (type.width - 1))); 1962 LLVMValueRef val, res; 1963 1964 assert(type.floating); 1965 assert(lp_check_value(type, a)); 1966 1967 /* val = reinterpret_cast<int>(a) */ 1968 val = LLVMBuildBitCast(builder, a, int_vec_type, ""); 1969 /* val = val & mask */ 1970 val = LLVMBuildAnd(builder, val, mask, ""); 1971 /* sign = sign << shift */ 1972 sign = LLVMBuildShl(builder, sign, shift, ""); 1973 /* res = val | sign */ 1974 res = LLVMBuildOr(builder, val, sign, ""); 1975 /* res = reinterpret_cast<float>(res) */ 1976 res = LLVMBuildBitCast(builder, res, vec_type, ""); 1977 1978 return res; 1979} 1980 1981 1982/** 1983 * Convert vector of (or scalar) int to vector of (or scalar) float. 
1984 */ 1985LLVMValueRef 1986lp_build_int_to_float(struct lp_build_context *bld, 1987 LLVMValueRef a) 1988{ 1989 LLVMBuilderRef builder = bld->gallivm->builder; 1990 const struct lp_type type = bld->type; 1991 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 1992 1993 assert(type.floating); 1994 1995 return LLVMBuildSIToFP(builder, a, vec_type, ""); 1996} 1997 1998static boolean 1999arch_rounding_available(const struct lp_type type) 2000{ 2001 if ((util_cpu_caps.has_sse4_1 && 2002 (type.length == 1 || type.width*type.length == 128)) || 2003 (util_cpu_caps.has_avx && type.width*type.length == 256) || 2004 (util_cpu_caps.has_avx512f && type.width*type.length == 512)) 2005 return TRUE; 2006 else if ((util_cpu_caps.has_altivec && 2007 (type.width == 32 && type.length == 4))) 2008 return TRUE; 2009 else if (util_cpu_caps.has_neon) 2010 return TRUE; 2011 2012 return FALSE; 2013} 2014 2015enum lp_build_round_mode 2016{ 2017 LP_BUILD_ROUND_NEAREST = 0, 2018 LP_BUILD_ROUND_FLOOR = 1, 2019 LP_BUILD_ROUND_CEIL = 2, 2020 LP_BUILD_ROUND_TRUNCATE = 3 2021}; 2022 2023static inline LLVMValueRef 2024lp_build_iround_nearest_sse2(struct lp_build_context *bld, 2025 LLVMValueRef a) 2026{ 2027 LLVMBuilderRef builder = bld->gallivm->builder; 2028 const struct lp_type type = bld->type; 2029 LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); 2030 LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type); 2031 const char *intrinsic; 2032 LLVMValueRef res; 2033 2034 assert(type.floating); 2035 /* using the double precision conversions is a bit more complicated */ 2036 assert(type.width == 32); 2037 2038 assert(lp_check_value(type, a)); 2039 assert(util_cpu_caps.has_sse2); 2040 2041 /* This is relying on MXCSR rounding mode, which should always be nearest. 
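 *
 * In scalar terms this is roughly (sketch only, assuming the default
 * round-to-nearest-even MXCSR mode is in effect):
 *
 *    int iround_nearest(float x) { return (int)nearbyintf(x); }
 *
 * which is what cvtss2si / cvtps2dq compute under that mode.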
*/ 2042 if (type.length == 1) { 2043 LLVMTypeRef vec_type; 2044 LLVMValueRef undef; 2045 LLVMValueRef arg; 2046 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); 2047 2048 vec_type = LLVMVectorType(bld->elem_type, 4); 2049 2050 intrinsic = "llvm.x86.sse.cvtss2si"; 2051 2052 undef = LLVMGetUndef(vec_type); 2053 2054 arg = LLVMBuildInsertElement(builder, undef, a, index0, ""); 2055 2056 res = lp_build_intrinsic_unary(builder, intrinsic, 2057 ret_type, arg); 2058 } 2059 else { 2060 if (type.width* type.length == 128) { 2061 intrinsic = "llvm.x86.sse2.cvtps2dq"; 2062 } 2063 else { 2064 assert(type.width*type.length == 256); 2065 assert(util_cpu_caps.has_avx); 2066 2067 intrinsic = "llvm.x86.avx.cvt.ps2dq.256"; 2068 } 2069 res = lp_build_intrinsic_unary(builder, intrinsic, 2070 ret_type, a); 2071 } 2072 2073 return res; 2074} 2075 2076 2077/* 2078 */ 2079static inline LLVMValueRef 2080lp_build_round_altivec(struct lp_build_context *bld, 2081 LLVMValueRef a, 2082 enum lp_build_round_mode mode) 2083{ 2084 LLVMBuilderRef builder = bld->gallivm->builder; 2085 const struct lp_type type = bld->type; 2086 const char *intrinsic = NULL; 2087 2088 assert(type.floating); 2089 2090 assert(lp_check_value(type, a)); 2091 assert(util_cpu_caps.has_altivec); 2092 2093 (void)type; 2094 2095 switch (mode) { 2096 case LP_BUILD_ROUND_NEAREST: 2097 intrinsic = "llvm.ppc.altivec.vrfin"; 2098 break; 2099 case LP_BUILD_ROUND_FLOOR: 2100 intrinsic = "llvm.ppc.altivec.vrfim"; 2101 break; 2102 case LP_BUILD_ROUND_CEIL: 2103 intrinsic = "llvm.ppc.altivec.vrfip"; 2104 break; 2105 case LP_BUILD_ROUND_TRUNCATE: 2106 intrinsic = "llvm.ppc.altivec.vrfiz"; 2107 break; 2108 } 2109 2110 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2111} 2112 2113static inline LLVMValueRef 2114lp_build_round_arch(struct lp_build_context *bld, 2115 LLVMValueRef a, 2116 enum lp_build_round_mode mode) 2117{ 2118 if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) { 2119 LLVMBuilderRef builder = bld->gallivm->builder; 2120 const struct lp_type type = bld->type; 2121 const char *intrinsic_root; 2122 char intrinsic[32]; 2123 2124 assert(type.floating); 2125 assert(lp_check_value(type, a)); 2126 (void)type; 2127 2128 switch (mode) { 2129 case LP_BUILD_ROUND_NEAREST: 2130 intrinsic_root = "llvm.nearbyint"; 2131 break; 2132 case LP_BUILD_ROUND_FLOOR: 2133 intrinsic_root = "llvm.floor"; 2134 break; 2135 case LP_BUILD_ROUND_CEIL: 2136 intrinsic_root = "llvm.ceil"; 2137 break; 2138 case LP_BUILD_ROUND_TRUNCATE: 2139 intrinsic_root = "llvm.trunc"; 2140 break; 2141 } 2142 2143 lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type); 2144 return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2145 } 2146 else /* (util_cpu_caps.has_altivec) */ 2147 return lp_build_round_altivec(bld, a, mode); 2148} 2149 2150/** 2151 * Return the integer part of a float (vector) value (== round toward zero). 2152 * The returned value is a float (vector). 
2153 * Ex: trunc(-1.5) = -1.0 2154 */ 2155LLVMValueRef 2156lp_build_trunc(struct lp_build_context *bld, 2157 LLVMValueRef a) 2158{ 2159 LLVMBuilderRef builder = bld->gallivm->builder; 2160 const struct lp_type type = bld->type; 2161 2162 assert(type.floating); 2163 assert(lp_check_value(type, a)); 2164 2165 if (arch_rounding_available(type)) { 2166 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE); 2167 } 2168 else { 2169 const struct lp_type type = bld->type; 2170 struct lp_type inttype; 2171 struct lp_build_context intbld; 2172 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2173 LLVMValueRef trunc, res, anosign, mask; 2174 LLVMTypeRef int_vec_type = bld->int_vec_type; 2175 LLVMTypeRef vec_type = bld->vec_type; 2176 2177 assert(type.width == 32); /* might want to handle doubles at some point */ 2178 2179 inttype = type; 2180 inttype.floating = 0; 2181 lp_build_context_init(&intbld, bld->gallivm, inttype); 2182 2183 /* round by truncation */ 2184 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2185 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc"); 2186 2187 /* mask out sign bit */ 2188 anosign = lp_build_abs(bld, a); 2189 /* 2190 * mask out all values if anosign > 2^24 2191 * This should work both for large ints (all rounding is no-op for them 2192 * because such floats are always exact) as well as special cases like 2193 * NaNs, Infs (taking advantage of the fact they use max exponent). 2194 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2195 */ 2196 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2197 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2198 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2199 return lp_build_select(bld, mask, a, res); 2200 } 2201} 2202 2203 2204/** 2205 * Return float (vector) rounded to nearest integer (vector). The returned 2206 * value is a float (vector). 2207 * Ex: round(0.9) = 1.0 2208 * Ex: round(-1.5) = -2.0 2209 */ 2210LLVMValueRef 2211lp_build_round(struct lp_build_context *bld, 2212 LLVMValueRef a) 2213{ 2214 LLVMBuilderRef builder = bld->gallivm->builder; 2215 const struct lp_type type = bld->type; 2216 2217 assert(type.floating); 2218 assert(lp_check_value(type, a)); 2219 2220 if (arch_rounding_available(type)) { 2221 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); 2222 } 2223 else { 2224 const struct lp_type type = bld->type; 2225 struct lp_type inttype; 2226 struct lp_build_context intbld; 2227 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2228 LLVMValueRef res, anosign, mask; 2229 LLVMTypeRef int_vec_type = bld->int_vec_type; 2230 LLVMTypeRef vec_type = bld->vec_type; 2231 2232 assert(type.width == 32); /* might want to handle doubles at some point */ 2233 2234 inttype = type; 2235 inttype.floating = 0; 2236 lp_build_context_init(&intbld, bld->gallivm, inttype); 2237 2238 res = lp_build_iround(bld, a); 2239 res = LLVMBuildSIToFP(builder, res, vec_type, ""); 2240 2241 /* mask out sign bit */ 2242 anosign = lp_build_abs(bld, a); 2243 /* 2244 * mask out all values if anosign > 2^24 2245 * This should work both for large ints (all rounding is no-op for them 2246 * because such floats are always exact) as well as special cases like 2247 * NaNs, Infs (taking advantage of the fact they use max exponent). 2248 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 
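 *
 * In other words, the selection below is (reference sketch; the compare is
 * done on the bit pattern of |a| reinterpreted as an integer, abs_bits
 * below, so that NaNs and Infs, which have the maximal exponent, land on
 * the "keep a" side):
 *
 *    keep_a = abs_bits(a) > 0x4b800000;     0x4b800000 == 2^24 as a float
 *    result = keep_a ? a : rounded;
 *
 * Floats with |a| >= 2^24 have no fractional bits left (24-bit significand
 * including the implicit bit), so keeping them unmodified is exact.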
2249 */ 2250 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2251 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2252 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2253 return lp_build_select(bld, mask, a, res); 2254 } 2255} 2256 2257 2258/** 2259 * Return floor of float (vector), result is a float (vector) 2260 * Ex: floor(1.1) = 1.0 2261 * Ex: floor(-1.1) = -2.0 2262 */ 2263LLVMValueRef 2264lp_build_floor(struct lp_build_context *bld, 2265 LLVMValueRef a) 2266{ 2267 LLVMBuilderRef builder = bld->gallivm->builder; 2268 const struct lp_type type = bld->type; 2269 2270 assert(type.floating); 2271 assert(lp_check_value(type, a)); 2272 2273 if (arch_rounding_available(type)) { 2274 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR); 2275 } 2276 else { 2277 const struct lp_type type = bld->type; 2278 struct lp_type inttype; 2279 struct lp_build_context intbld; 2280 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2281 LLVMValueRef trunc, res, anosign, mask; 2282 LLVMTypeRef int_vec_type = bld->int_vec_type; 2283 LLVMTypeRef vec_type = bld->vec_type; 2284 2285 if (type.width != 32) { 2286 char intrinsic[32]; 2287 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type); 2288 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2289 } 2290 2291 assert(type.width == 32); /* might want to handle doubles at some point */ 2292 2293 inttype = type; 2294 inttype.floating = 0; 2295 lp_build_context_init(&intbld, bld->gallivm, inttype); 2296 2297 /* round by truncation */ 2298 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2299 res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc"); 2300 2301 if (type.sign) { 2302 LLVMValueRef tmp; 2303 2304 /* 2305 * fix values if rounding is wrong (for non-special cases) 2306 * - this is the case if trunc > a 2307 */ 2308 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a); 2309 /* tmp = trunc > a ? 1.0 : 0.0 */ 2310 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, ""); 2311 tmp = lp_build_and(&intbld, mask, tmp); 2312 tmp = LLVMBuildBitCast(builder, tmp, vec_type, ""); 2313 res = lp_build_sub(bld, res, tmp); 2314 } 2315 2316 /* mask out sign bit */ 2317 anosign = lp_build_abs(bld, a); 2318 /* 2319 * mask out all values if anosign > 2^24 2320 * This should work both for large ints (all rounding is no-op for them 2321 * because such floats are always exact) as well as special cases like 2322 * NaNs, Infs (taking advantage of the fact they use max exponent). 2323 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2324 */ 2325 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2326 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2327 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2328 return lp_build_select(bld, mask, a, res); 2329 } 2330} 2331 2332 2333/** 2334 * Return ceiling of float (vector), returning float (vector). 
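 *
 * On targets without a native rounding instruction this (like floor above)
 * is emulated by truncating toward zero and then correcting; a rough
 * scalar sketch:
 *
 *    t = (float)(int)a;
 *    floor: if (t > a) t = t - 1.0f;
 *    ceil:  if (t < a) t = t + 1.0f;
 *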
2335 * Ex: ceil( 1.1) = 2.0 2336 * Ex: ceil(-1.1) = -1.0 2337 */ 2338LLVMValueRef 2339lp_build_ceil(struct lp_build_context *bld, 2340 LLVMValueRef a) 2341{ 2342 LLVMBuilderRef builder = bld->gallivm->builder; 2343 const struct lp_type type = bld->type; 2344 2345 assert(type.floating); 2346 assert(lp_check_value(type, a)); 2347 2348 if (arch_rounding_available(type)) { 2349 return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL); 2350 } 2351 else { 2352 const struct lp_type type = bld->type; 2353 struct lp_type inttype; 2354 struct lp_build_context intbld; 2355 LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24); 2356 LLVMValueRef trunc, res, anosign, mask, tmp; 2357 LLVMTypeRef int_vec_type = bld->int_vec_type; 2358 LLVMTypeRef vec_type = bld->vec_type; 2359 2360 if (type.width != 32) { 2361 char intrinsic[32]; 2362 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type); 2363 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2364 } 2365 2366 assert(type.width == 32); /* might want to handle doubles at some point */ 2367 2368 inttype = type; 2369 inttype.floating = 0; 2370 lp_build_context_init(&intbld, bld->gallivm, inttype); 2371 2372 /* round by truncation */ 2373 trunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2374 trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc"); 2375 2376 /* 2377 * fix values if rounding is wrong (for non-special cases) 2378 * - this is the case if trunc < a 2379 */ 2380 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a); 2381 /* tmp = trunc < a ? 1.0 : 0.0 */ 2382 tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, ""); 2383 tmp = lp_build_and(&intbld, mask, tmp); 2384 tmp = LLVMBuildBitCast(builder, tmp, vec_type, ""); 2385 res = lp_build_add(bld, trunc, tmp); 2386 2387 /* mask out sign bit */ 2388 anosign = lp_build_abs(bld, a); 2389 /* 2390 * mask out all values if anosign > 2^24 2391 * This should work both for large ints (all rounding is no-op for them 2392 * because such floats are always exact) as well as special cases like 2393 * NaNs, Infs (taking advantage of the fact they use max exponent). 2394 * (2^24 is arbitrary anything between 2^24 and 2^31 should work.) 2395 */ 2396 anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, ""); 2397 cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, ""); 2398 mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval); 2399 return lp_build_select(bld, mask, a, res); 2400 } 2401} 2402 2403 2404/** 2405 * Return fractional part of 'a' computed as a - floor(a) 2406 * Typically used in texture coord arithmetic. 2407 */ 2408LLVMValueRef 2409lp_build_fract(struct lp_build_context *bld, 2410 LLVMValueRef a) 2411{ 2412 assert(bld->type.floating); 2413 return lp_build_sub(bld, a, lp_build_floor(bld, a)); 2414} 2415 2416 2417/** 2418 * Prevent returning 1.0 for very small negative values of 'a' by clamping 2419 * against 0.99999(9). (Will also return that value for NaNs.) 2420 */ 2421static inline LLVMValueRef 2422clamp_fract(struct lp_build_context *bld, LLVMValueRef fract) 2423{ 2424 LLVMValueRef max; 2425 2426 /* this is the largest number smaller than 1.0 representable as float */ 2427 max = lp_build_const_vec(bld->gallivm, bld->type, 2428 1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1))); 2429 return lp_build_min_ext(bld, fract, max, 2430 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN); 2431} 2432 2433 2434/** 2435 * Same as lp_build_fract, but guarantees that the result is always smaller 2436 * than one. 
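 *
 * (In scalar terms this amounts to, roughly:
 *
 *    fract = fminf(a - floorf(a), nextafterf(1.0f, 0.0f));
 *
 * i.e. the fractional part is clamped against the largest float below 1.0,
 * and fminf picks that clamp value whenever the subtraction yields a NaN.)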
Will also return the smaller-than-one value for infs, NaNs. 2437 */ 2438LLVMValueRef 2439lp_build_fract_safe(struct lp_build_context *bld, 2440 LLVMValueRef a) 2441{ 2442 return clamp_fract(bld, lp_build_fract(bld, a)); 2443} 2444 2445 2446/** 2447 * Return the integer part of a float (vector) value (== round toward zero). 2448 * The returned value is an integer (vector). 2449 * Ex: itrunc(-1.5) = -1 2450 */ 2451LLVMValueRef 2452lp_build_itrunc(struct lp_build_context *bld, 2453 LLVMValueRef a) 2454{ 2455 LLVMBuilderRef builder = bld->gallivm->builder; 2456 const struct lp_type type = bld->type; 2457 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 2458 2459 assert(type.floating); 2460 assert(lp_check_value(type, a)); 2461 2462 return LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2463} 2464 2465 2466/** 2467 * Return float (vector) rounded to nearest integer (vector). The returned 2468 * value is an integer (vector). 2469 * Ex: iround(0.9) = 1 2470 * Ex: iround(-1.5) = -2 2471 */ 2472LLVMValueRef 2473lp_build_iround(struct lp_build_context *bld, 2474 LLVMValueRef a) 2475{ 2476 LLVMBuilderRef builder = bld->gallivm->builder; 2477 const struct lp_type type = bld->type; 2478 LLVMTypeRef int_vec_type = bld->int_vec_type; 2479 LLVMValueRef res; 2480 2481 assert(type.floating); 2482 2483 assert(lp_check_value(type, a)); 2484 2485 if ((util_cpu_caps.has_sse2 && 2486 ((type.width == 32) && (type.length == 1 || type.length == 4))) || 2487 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) { 2488 return lp_build_iround_nearest_sse2(bld, a); 2489 } 2490 if (arch_rounding_available(type)) { 2491 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST); 2492 } 2493 else { 2494 LLVMValueRef half; 2495 2496 half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0)); 2497 2498 if (type.sign) { 2499 LLVMTypeRef vec_type = bld->vec_type; 2500 LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, 2501 (unsigned long long)1 << (type.width - 1)); 2502 LLVMValueRef sign; 2503 2504 /* get sign bit */ 2505 sign = LLVMBuildBitCast(builder, a, int_vec_type, ""); 2506 sign = LLVMBuildAnd(builder, sign, mask, ""); 2507 2508 /* sign * 0.5 */ 2509 half = LLVMBuildBitCast(builder, half, int_vec_type, ""); 2510 half = LLVMBuildOr(builder, sign, half, ""); 2511 half = LLVMBuildBitCast(builder, half, vec_type, ""); 2512 } 2513 2514 res = LLVMBuildFAdd(builder, a, half, ""); 2515 } 2516 2517 res = LLVMBuildFPToSI(builder, res, int_vec_type, ""); 2518 2519 return res; 2520} 2521 2522 2523/** 2524 * Return floor of float (vector), result is an int (vector) 2525 * Ex: ifloor(1.1) = 1.0 2526 * Ex: ifloor(-1.1) = -2.0 2527 */ 2528LLVMValueRef 2529lp_build_ifloor(struct lp_build_context *bld, 2530 LLVMValueRef a) 2531{ 2532 LLVMBuilderRef builder = bld->gallivm->builder; 2533 const struct lp_type type = bld->type; 2534 LLVMTypeRef int_vec_type = bld->int_vec_type; 2535 LLVMValueRef res; 2536 2537 assert(type.floating); 2538 assert(lp_check_value(type, a)); 2539 2540 res = a; 2541 if (type.sign) { 2542 if (arch_rounding_available(type)) { 2543 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR); 2544 } 2545 else { 2546 struct lp_type inttype; 2547 struct lp_build_context intbld; 2548 LLVMValueRef trunc, itrunc, mask; 2549 2550 assert(type.floating); 2551 assert(lp_check_value(type, a)); 2552 2553 inttype = type; 2554 inttype.floating = 0; 2555 lp_build_context_init(&intbld, bld->gallivm, inttype); 2556 2557 /* round by truncation */ 2558 itrunc = LLVMBuildFPToSI(builder, 
a, int_vec_type, ""); 2559 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc"); 2560 2561 /* 2562 * fix values if rounding is wrong (for non-special cases) 2563 * - this is the case if trunc > a 2564 * The results of doing this with NaNs, very large values etc. 2565 * are undefined but this seems to be the case anyway. 2566 */ 2567 mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a); 2568 /* cheapie minus one with mask since the mask is minus one / zero */ 2569 return lp_build_add(&intbld, itrunc, mask); 2570 } 2571 } 2572 2573 /* round to nearest (toward zero) */ 2574 res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res"); 2575 2576 return res; 2577} 2578 2579 2580/** 2581 * Return ceiling of float (vector), returning int (vector). 2582 * Ex: iceil( 1.1) = 2 2583 * Ex: iceil(-1.1) = -1 2584 */ 2585LLVMValueRef 2586lp_build_iceil(struct lp_build_context *bld, 2587 LLVMValueRef a) 2588{ 2589 LLVMBuilderRef builder = bld->gallivm->builder; 2590 const struct lp_type type = bld->type; 2591 LLVMTypeRef int_vec_type = bld->int_vec_type; 2592 LLVMValueRef res; 2593 2594 assert(type.floating); 2595 assert(lp_check_value(type, a)); 2596 2597 if (arch_rounding_available(type)) { 2598 res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL); 2599 } 2600 else { 2601 struct lp_type inttype; 2602 struct lp_build_context intbld; 2603 LLVMValueRef trunc, itrunc, mask; 2604 2605 assert(type.floating); 2606 assert(lp_check_value(type, a)); 2607 2608 inttype = type; 2609 inttype.floating = 0; 2610 lp_build_context_init(&intbld, bld->gallivm, inttype); 2611 2612 /* round by truncation */ 2613 itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, ""); 2614 trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc"); 2615 2616 /* 2617 * fix values if rounding is wrong (for non-special cases) 2618 * - this is the case if trunc < a 2619 * The results of doing this with NaNs, very large values etc. 2620 * are undefined but this seems to be the case anyway. 2621 */ 2622 mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a); 2623 /* cheapie plus one with mask since the mask is minus one / zero */ 2624 return lp_build_sub(&intbld, itrunc, mask); 2625 } 2626 2627 /* round to nearest (toward zero) */ 2628 res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res"); 2629 2630 return res; 2631} 2632 2633 2634/** 2635 * Combined ifloor() & fract(). 2636 * 2637 * Preferred to calling the functions separately, as it will ensure that the 2638 * strategy (floor() vs ifloor()) that results in less redundant work is used. 2639 */ 2640void 2641lp_build_ifloor_fract(struct lp_build_context *bld, 2642 LLVMValueRef a, 2643 LLVMValueRef *out_ipart, 2644 LLVMValueRef *out_fpart) 2645{ 2646 LLVMBuilderRef builder = bld->gallivm->builder; 2647 const struct lp_type type = bld->type; 2648 LLVMValueRef ipart; 2649 2650 assert(type.floating); 2651 assert(lp_check_value(type, a)); 2652 2653 if (arch_rounding_available(type)) { 2654 /* 2655 * floor() is easier. 2656 */ 2657 2658 ipart = lp_build_floor(bld, a); 2659 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 2660 *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart"); 2661 } 2662 else { 2663 /* 2664 * ifloor() is easier. 
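 *
 * In scalar terms: ipart = ifloor(a); fpart = a - (float)ipart;
 * whereas the branch above computes floor(a) once as a float and derives
 * both the integer and the fractional part from it.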
2665 */ 2666 2667 *out_ipart = lp_build_ifloor(bld, a); 2668 ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart"); 2669 *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart"); 2670 } 2671} 2672 2673 2674/** 2675 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is 2676 * always smaller than one. 2677 */ 2678void 2679lp_build_ifloor_fract_safe(struct lp_build_context *bld, 2680 LLVMValueRef a, 2681 LLVMValueRef *out_ipart, 2682 LLVMValueRef *out_fpart) 2683{ 2684 lp_build_ifloor_fract(bld, a, out_ipart, out_fpart); 2685 *out_fpart = clamp_fract(bld, *out_fpart); 2686} 2687 2688 2689LLVMValueRef 2690lp_build_sqrt(struct lp_build_context *bld, 2691 LLVMValueRef a) 2692{ 2693 LLVMBuilderRef builder = bld->gallivm->builder; 2694 const struct lp_type type = bld->type; 2695 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 2696 char intrinsic[32]; 2697 2698 assert(lp_check_value(type, a)); 2699 2700 assert(type.floating); 2701 lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type); 2702 2703 return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); 2704} 2705 2706 2707/** 2708 * Do one Newton-Raphson step to improve reciprocate precision: 2709 * 2710 * x_{i+1} = x_i * (2 - a * x_i) 2711 * 2712 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or 2713 * +/-Inf, giving NaN instead. Certain applications rely on this behavior, 2714 * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's 2715 * halo. It would be necessary to clamp the argument to prevent this. 2716 * 2717 * See also: 2718 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division 2719 * - http://softwarecommunity.intel.com/articles/eng/1818.htm 2720 */ 2721static inline LLVMValueRef 2722lp_build_rcp_refine(struct lp_build_context *bld, 2723 LLVMValueRef a, 2724 LLVMValueRef rcp_a) 2725{ 2726 LLVMBuilderRef builder = bld->gallivm->builder; 2727 LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0); 2728 LLVMValueRef res; 2729 2730 res = LLVMBuildFMul(builder, a, rcp_a, ""); 2731 res = LLVMBuildFSub(builder, two, res, ""); 2732 res = LLVMBuildFMul(builder, rcp_a, res, ""); 2733 2734 return res; 2735} 2736 2737 2738LLVMValueRef 2739lp_build_rcp(struct lp_build_context *bld, 2740 LLVMValueRef a) 2741{ 2742 LLVMBuilderRef builder = bld->gallivm->builder; 2743 const struct lp_type type = bld->type; 2744 2745 assert(lp_check_value(type, a)); 2746 2747 if(a == bld->zero) 2748 return bld->undef; 2749 if(a == bld->one) 2750 return bld->one; 2751 if(a == bld->undef) 2752 return bld->undef; 2753 2754 assert(type.floating); 2755 2756 if(LLVMIsConstant(a)) 2757 return LLVMConstFDiv(bld->one, a); 2758 2759 /* 2760 * We don't use RCPPS because: 2761 * - it only has 10bits of precision 2762 * - it doesn't even get the reciprocate of 1.0 exactly 2763 * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf 2764 * - for recent processors the benefit over DIVPS is marginal, a case 2765 * dependent 2766 * 2767 * We could still use it on certain processors if benchmarks show that the 2768 * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for 2769 * particular uses that require less workarounds. 
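 *
 * For reference, the refinement in lp_build_rcp_refine() above is the
 * textbook Newton-Raphson step; a hypothetical scalar version of the
 * (disabled) RCPPS path would be:
 *
 *    float x = approx_rcp(a);         low-precision hardware estimate
 *    x = x * (2.0f - a * x);          one step roughly doubles the number
 *                                     of correct bits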
2770 */ 2771 2772 if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) || 2773 (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){ 2774 const unsigned num_iterations = 0; 2775 LLVMValueRef res; 2776 unsigned i; 2777 const char *intrinsic = NULL; 2778 2779 if (type.length == 4) { 2780 intrinsic = "llvm.x86.sse.rcp.ps"; 2781 } 2782 else { 2783 intrinsic = "llvm.x86.avx.rcp.ps.256"; 2784 } 2785 2786 res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a); 2787 2788 for (i = 0; i < num_iterations; ++i) { 2789 res = lp_build_rcp_refine(bld, a, res); 2790 } 2791 2792 return res; 2793 } 2794 2795 return LLVMBuildFDiv(builder, bld->one, a, ""); 2796} 2797 2798 2799/** 2800 * Do one Newton-Raphson step to improve rsqrt precision: 2801 * 2802 * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) 2803 * 2804 * See also Intel 64 and IA-32 Architectures Optimization Manual. 2805 */ 2806static inline LLVMValueRef 2807lp_build_rsqrt_refine(struct lp_build_context *bld, 2808 LLVMValueRef a, 2809 LLVMValueRef rsqrt_a) 2810{ 2811 LLVMBuilderRef builder = bld->gallivm->builder; 2812 LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5); 2813 LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0); 2814 LLVMValueRef res; 2815 2816 res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, ""); 2817 res = LLVMBuildFMul(builder, a, res, ""); 2818 res = LLVMBuildFSub(builder, three, res, ""); 2819 res = LLVMBuildFMul(builder, rsqrt_a, res, ""); 2820 res = LLVMBuildFMul(builder, half, res, ""); 2821 2822 return res; 2823} 2824 2825 2826/** 2827 * Generate 1/sqrt(a). 2828 * Result is undefined for values < 0, infinity for +0. 2829 */ 2830LLVMValueRef 2831lp_build_rsqrt(struct lp_build_context *bld, 2832 LLVMValueRef a) 2833{ 2834 const struct lp_type type = bld->type; 2835 2836 assert(lp_check_value(type, a)); 2837 2838 assert(type.floating); 2839 2840 /* 2841 * This should be faster but all denormals will end up as infinity. 2842 */ 2843 if (0 && lp_build_fast_rsqrt_available(type)) { 2844 const unsigned num_iterations = 1; 2845 LLVMValueRef res; 2846 unsigned i; 2847 2848 /* rsqrt(1.0) != 1.0 here */ 2849 res = lp_build_fast_rsqrt(bld, a); 2850 2851 if (num_iterations) { 2852 /* 2853 * Newton-Raphson will result in NaN instead of infinity for zero, 2854 * and NaN instead of zero for infinity. 2855 * Also, need to ensure rsqrt(1.0) == 1.0. 2856 * All numbers smaller than FLT_MIN will result in +infinity 2857 * (rsqrtps treats all denormals as zero). 2858 */ 2859 LLVMValueRef cmp; 2860 LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN); 2861 LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY); 2862 2863 for (i = 0; i < num_iterations; ++i) { 2864 res = lp_build_rsqrt_refine(bld, a, res); 2865 } 2866 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min); 2867 res = lp_build_select(bld, cmp, inf, res); 2868 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf); 2869 res = lp_build_select(bld, cmp, bld->zero, res); 2870 cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one); 2871 res = lp_build_select(bld, cmp, bld->one, res); 2872 } 2873 2874 return res; 2875 } 2876 2877 return lp_build_rcp(bld, lp_build_sqrt(bld, a)); 2878} 2879 2880/** 2881 * If there's a fast (inaccurate) rsqrt instruction available 2882 * (caller may want to avoid to call rsqrt_fast if it's not available, 2883 * i.e. 
for calculating x^0.5 a caller may do rsqrt_fast(x) * x, but if
 * rsqrt_fast is unavailable that would turn into sqrt/div/mul, so it is
 * much better to just call sqrt and skip both the div and the mul).
 */
boolean
lp_build_fast_rsqrt_available(struct lp_type type)
{
   assert(type.floating);

   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
      return true;
   }
   return false;
}


/**
 * Generate 1/sqrt(a).
 * Result is undefined for values < 0, infinity for +0.
 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt 1.0 may not be 1.0, denorms may be flushed to 0).
 */
LLVMValueRef
lp_build_fast_rsqrt(struct lp_build_context *bld,
                    LLVMValueRef a)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, a));

   if (lp_build_fast_rsqrt_available(type)) {
      const char *intrinsic = NULL;

      if (type.length == 4) {
         intrinsic = "llvm.x86.sse.rsqrt.ps";
      }
      else {
         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
      }
      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
   }
   else {
      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
   }
   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
}


/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos using the same source
 * (i.e. the d3d10 sincos opcode). Obviously doing both at the same time
 * would be way cheaper than calculating (nearly) everything twice...
 * Not sure it's common enough to be worth bothering with, however; the
 * scs opcode could also benefit from calculating both, though.
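 *
 * The code below follows the classic cephes/SSE recipe:
 *   1. work on |a| and track the sign separately
 *   2. scale by 4/pi and derive the octant from the (adjusted) integer part
 *   3. reduce the argument into the octant with the extended-precision
 *      steps DP1..DP3
 *   4. evaluate both minimax polynomials (one for 0 <= x <= Pi/4, one for
 *      Pi/4 < x <= Pi/2) and select per channel
 *   5. restore the sign from the octant (and, for sin, from the argument)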
2940 */ 2941static LLVMValueRef 2942lp_build_sin_or_cos(struct lp_build_context *bld, 2943 LLVMValueRef a, 2944 boolean cos) 2945{ 2946 struct gallivm_state *gallivm = bld->gallivm; 2947 LLVMBuilderRef b = gallivm->builder; 2948 struct lp_type int_type = lp_int_type(bld->type); 2949 2950 /* 2951 * take the absolute value, 2952 * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); 2953 */ 2954 2955 LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000); 2956 LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si"); 2957 2958 LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); 2959 LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs"); 2960 2961 /* 2962 * scale by 4/Pi 2963 * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); 2964 */ 2965 2966 LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516); 2967 LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); 2968 2969 /* 2970 * store the integer part of y in mm0 2971 * emm2 = _mm_cvttps_epi32(y); 2972 */ 2973 2974 LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i"); 2975 2976 /* 2977 * j=(j+1) & (~1) (see the cephes sources) 2978 * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); 2979 */ 2980 2981 LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1); 2982 LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); 2983 /* 2984 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); 2985 */ 2986 LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1); 2987 LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); 2988 2989 /* 2990 * y = _mm_cvtepi32_ps(emm2); 2991 */ 2992 LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2"); 2993 2994 LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2); 2995 LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4); 2996 LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29); 2997 LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000); 2998 2999 /* 3000 * Argument used for poly selection and sign bit determination 3001 * is different for sin vs. cos. 3002 */ 3003 LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") : 3004 emm2_and; 3005 3006 LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4, 3007 LLVMBuildNot(b, emm2_2, ""), ""), 3008 const_29, "sign_bit") : 3009 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si, 3010 LLVMBuildShl(b, emm2_add, 3011 const_29, ""), ""), 3012 sign_mask, "sign_bit"); 3013 3014 /* 3015 * get the polynom selection mask 3016 * there is one polynom for 0 <= x <= Pi/4 3017 * and another one for Pi/4<x<=Pi/2 3018 * Both branches will be computed. 
3019 * 3020 * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); 3021 * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); 3022 */ 3023 3024 LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, const_2, "emm2_3"); 3025 LLVMValueRef poly_mask = lp_build_compare(gallivm, 3026 int_type, PIPE_FUNC_EQUAL, 3027 emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0)); 3028 3029 /* 3030 * _PS_CONST(minus_cephes_DP1, -0.78515625); 3031 * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); 3032 * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); 3033 */ 3034 LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625); 3035 LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4); 3036 LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8); 3037 3038 /* 3039 * The magic pass: "Extended precision modular arithmetic" 3040 * x = ((x - y * DP1) - y * DP2) - y * DP3; 3041 */ 3042 LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs); 3043 LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1); 3044 LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2); 3045 3046 /* 3047 * Evaluate the first polynom (0 <= x <= Pi/4) 3048 * 3049 * z = _mm_mul_ps(x,x); 3050 */ 3051 LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); 3052 3053 /* 3054 * _PS_CONST(coscof_p0, 2.443315711809948E-005); 3055 * _PS_CONST(coscof_p1, -1.388731625493765E-003); 3056 * _PS_CONST(coscof_p2, 4.166664568298827E-002); 3057 */ 3058 LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005); 3059 LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003); 3060 LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002); 3061 3062 /* 3063 * y = *(v4sf*)_ps_coscof_p0; 3064 * y = _mm_mul_ps(y, z); 3065 */ 3066 LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1); 3067 LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2); 3068 LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); 3069 LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); 3070 3071 3072 /* 3073 * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); 3074 * y = _mm_sub_ps(y, tmp); 3075 * y = _mm_add_ps(y, *(v4sf*)_ps_1); 3076 */ 3077 LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5); 3078 LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); 3079 LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); 3080 LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0); 3081 LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); 3082 3083 /* 3084 * _PS_CONST(sincof_p0, -1.9515295891E-4); 3085 * _PS_CONST(sincof_p1, 8.3321608736E-3); 3086 * _PS_CONST(sincof_p2, -1.6666654611E-1); 3087 */ 3088 LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4); 3089 LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3); 3090 LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1); 3091 3092 /* 3093 * Evaluate the second polynom (Pi/4 <= x <= 0) 3094 * 3095 * y2 = *(v4sf*)_ps_sincof_p0; 3096 * y2 = _mm_mul_ps(y2, z); 3097 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); 3098 * y2 = _mm_mul_ps(y2, z); 3099 * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); 3100 * y2 = _mm_mul_ps(y2, z); 3101 * y2 = _mm_mul_ps(y2, x); 3102 * y2 = _mm_add_ps(y2, x); 3103 */ 3104 3105 LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1); 3106 LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2); 3107 LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); 3108 LLVMValueRef y2_9 = 
lp_build_fmuladd(b, y2_7, x_3, x_3); 3109 3110 /* 3111 * select the correct result from the two polynoms 3112 * xmm3 = poly_mask; 3113 * y2 = _mm_and_ps(xmm3, y2); //, xmm3); 3114 * y = _mm_andnot_ps(xmm3, y); 3115 * y = _mm_or_ps(y,y2); 3116 */ 3117 LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i"); 3118 LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i"); 3119 LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); 3120 LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv"); 3121 LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); 3122 LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine"); 3123 3124 /* 3125 * update the sign 3126 * y = _mm_xor_ps(y, sign_bit); 3127 */ 3128 LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign"); 3129 LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result"); 3130 3131 LLVMValueRef isfinite = lp_build_isfinite(bld, a); 3132 3133 /* clamp output to be within [-1, 1] */ 3134 y_result = lp_build_clamp(bld, y_result, 3135 lp_build_const_vec(bld->gallivm, bld->type, -1.f), 3136 lp_build_const_vec(bld->gallivm, bld->type, 1.f)); 3137 /* If a is -inf, inf or NaN then return NaN */ 3138 y_result = lp_build_select(bld, isfinite, y_result, 3139 lp_build_const_vec(bld->gallivm, bld->type, NAN)); 3140 return y_result; 3141} 3142 3143 3144/** 3145 * Generate sin(a) 3146 */ 3147LLVMValueRef 3148lp_build_sin(struct lp_build_context *bld, 3149 LLVMValueRef a) 3150{ 3151 return lp_build_sin_or_cos(bld, a, FALSE); 3152} 3153 3154 3155/** 3156 * Generate cos(a) 3157 */ 3158LLVMValueRef 3159lp_build_cos(struct lp_build_context *bld, 3160 LLVMValueRef a) 3161{ 3162 return lp_build_sin_or_cos(bld, a, TRUE); 3163} 3164 3165 3166/** 3167 * Generate pow(x, y) 3168 */ 3169LLVMValueRef 3170lp_build_pow(struct lp_build_context *bld, 3171 LLVMValueRef x, 3172 LLVMValueRef y) 3173{ 3174 /* TODO: optimize the constant case */ 3175 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3176 LLVMIsConstant(x) && LLVMIsConstant(y)) { 3177 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3178 __FUNCTION__); 3179 } 3180 3181 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); 3182} 3183 3184 3185/** 3186 * Generate exp(x) 3187 */ 3188LLVMValueRef 3189lp_build_exp(struct lp_build_context *bld, 3190 LLVMValueRef x) 3191{ 3192 /* log2(e) = 1/log(2) */ 3193 LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type, 3194 1.4426950408889634); 3195 3196 assert(lp_check_value(bld->type, x)); 3197 3198 return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); 3199} 3200 3201 3202/** 3203 * Generate log(x) 3204 * Behavior is undefined with infs, 0s and nans 3205 */ 3206LLVMValueRef 3207lp_build_log(struct lp_build_context *bld, 3208 LLVMValueRef x) 3209{ 3210 /* log(2) */ 3211 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3212 0.69314718055994529); 3213 3214 assert(lp_check_value(bld->type, x)); 3215 3216 return lp_build_mul(bld, log2, lp_build_log2(bld, x)); 3217} 3218 3219/** 3220 * Generate log(x) that handles edge cases (infs, 0s and nans) 3221 */ 3222LLVMValueRef 3223lp_build_log_safe(struct lp_build_context *bld, 3224 LLVMValueRef x) 3225{ 3226 /* log(2) */ 3227 LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type, 3228 0.69314718055994529); 3229 3230 assert(lp_check_value(bld->type, x)); 3231 3232 return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x)); 3233} 3234 3235 3236/** 3237 * Generate polynomial. 
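 *
 * The coefficients are evaluated as two interleaved Horner chains (even
 * and odd powers of x) to shorten the dependency chain; schematically:
 *
 *    even = c0 + x^2*(c2 + x^2*(c4 + ...))
 *    odd  = c1 + x^2*(c3 + x^2*(c5 + ...))
 *    p(x) = even + x * odd
 *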
3238 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. 3239 */ 3240LLVMValueRef 3241lp_build_polynomial(struct lp_build_context *bld, 3242 LLVMValueRef x, 3243 const double *coeffs, 3244 unsigned num_coeffs) 3245{ 3246 const struct lp_type type = bld->type; 3247 LLVMValueRef even = NULL, odd = NULL; 3248 LLVMValueRef x2; 3249 unsigned i; 3250 3251 assert(lp_check_value(bld->type, x)); 3252 3253 /* TODO: optimize the constant case */ 3254 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3255 LLVMIsConstant(x)) { 3256 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3257 __FUNCTION__); 3258 } 3259 3260 /* 3261 * Calculate odd and even terms seperately to decrease data dependency 3262 * Ex: 3263 * c[0] + x^2 * c[2] + x^4 * c[4] ... 3264 * + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ... 3265 */ 3266 x2 = lp_build_mul(bld, x, x); 3267 3268 for (i = num_coeffs; i--; ) { 3269 LLVMValueRef coeff; 3270 3271 coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]); 3272 3273 if (i % 2 == 0) { 3274 if (even) 3275 even = lp_build_mad(bld, x2, even, coeff); 3276 else 3277 even = coeff; 3278 } else { 3279 if (odd) 3280 odd = lp_build_mad(bld, x2, odd, coeff); 3281 else 3282 odd = coeff; 3283 } 3284 } 3285 3286 if (odd) 3287 return lp_build_mad(bld, odd, x, even); 3288 else if (even) 3289 return even; 3290 else 3291 return bld->undef; 3292} 3293 3294 3295/** 3296 * Minimax polynomial fit of 2**x, in range [0, 1[ 3297 */ 3298const double lp_build_exp2_polynomial[] = { 3299#if EXP_POLY_DEGREE == 5 3300 1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */ 3301 0.693153073200168932794, 3302 0.240153617044375388211, 3303 0.0558263180532956664775, 3304 0.00898934009049466391101, 3305 0.00187757667519147912699 3306#elif EXP_POLY_DEGREE == 4 3307 1.00000259337069434683, 3308 0.693003834469974940458, 3309 0.24144275689150793076, 3310 0.0520114606103070150235, 3311 0.0135341679161270268764 3312#elif EXP_POLY_DEGREE == 3 3313 0.999925218562710312959, 3314 0.695833540494823811697, 3315 0.226067155427249155588, 3316 0.0780245226406372992967 3317#elif EXP_POLY_DEGREE == 2 3318 1.00172476321474503578, 3319 0.657636275736077639316, 3320 0.33718943461968720704 3321#else 3322#error 3323#endif 3324}; 3325 3326 3327LLVMValueRef 3328lp_build_exp2(struct lp_build_context *bld, 3329 LLVMValueRef x) 3330{ 3331 LLVMBuilderRef builder = bld->gallivm->builder; 3332 const struct lp_type type = bld->type; 3333 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3334 LLVMValueRef ipart = NULL; 3335 LLVMValueRef fpart = NULL; 3336 LLVMValueRef expipart = NULL; 3337 LLVMValueRef expfpart = NULL; 3338 LLVMValueRef res = NULL; 3339 3340 assert(lp_check_value(bld->type, x)); 3341 3342 /* TODO: optimize the constant case */ 3343 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3344 LLVMIsConstant(x)) { 3345 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3346 __FUNCTION__); 3347 } 3348 3349 assert(type.floating && type.width == 32); 3350 3351 /* We want to preserve NaN and make sure than for exp2 if x > 128, 3352 * the result is INF and if it's smaller than -126.9 the result is 0 */ 3353 x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type, 128.0), x, 3354 GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3355 x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999), 3356 x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN); 3357 3358 /* ipart = floor(x) */ 3359 /* fpart = x - ipart */ 3360 lp_build_ifloor_fract(bld, x, &ipart, &fpart); 3361 3362 /* expipart = (float) (1 << ipart) 
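 *             == reinterpret((ipart + 127) << 23), i.e. the integer part is
 *                placed directly into the exponent field of an IEEE-754
 *                single-precision float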
*/ 3363 expipart = LLVMBuildAdd(builder, ipart, 3364 lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3365 expipart = LLVMBuildShl(builder, expipart, 3366 lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3367 expipart = LLVMBuildBitCast(builder, expipart, vec_type, ""); 3368 3369 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, 3370 ARRAY_SIZE(lp_build_exp2_polynomial)); 3371 3372 res = LLVMBuildFMul(builder, expipart, expfpart, ""); 3373 3374 return res; 3375} 3376 3377 3378 3379/** 3380 * Extract the exponent of a IEEE-754 floating point value. 3381 * 3382 * Optionally apply an integer bias. 3383 * 3384 * Result is an integer value with 3385 * 3386 * ifloor(log2(x)) + bias 3387 */ 3388LLVMValueRef 3389lp_build_extract_exponent(struct lp_build_context *bld, 3390 LLVMValueRef x, 3391 int bias) 3392{ 3393 LLVMBuilderRef builder = bld->gallivm->builder; 3394 const struct lp_type type = bld->type; 3395 unsigned mantissa = lp_mantissa(type); 3396 LLVMValueRef res; 3397 3398 assert(type.floating); 3399 3400 assert(lp_check_value(bld->type, x)); 3401 3402 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3403 3404 res = LLVMBuildLShr(builder, x, 3405 lp_build_const_int_vec(bld->gallivm, type, mantissa), ""); 3406 res = LLVMBuildAnd(builder, res, 3407 lp_build_const_int_vec(bld->gallivm, type, 255), ""); 3408 res = LLVMBuildSub(builder, res, 3409 lp_build_const_int_vec(bld->gallivm, type, 127 - bias), ""); 3410 3411 return res; 3412} 3413 3414 3415/** 3416 * Extract the mantissa of the a floating. 3417 * 3418 * Result is a floating point value with 3419 * 3420 * x / floor(log2(x)) 3421 */ 3422LLVMValueRef 3423lp_build_extract_mantissa(struct lp_build_context *bld, 3424 LLVMValueRef x) 3425{ 3426 LLVMBuilderRef builder = bld->gallivm->builder; 3427 const struct lp_type type = bld->type; 3428 unsigned mantissa = lp_mantissa(type); 3429 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 3430 (1ULL << mantissa) - 1); 3431 LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type); 3432 LLVMValueRef res; 3433 3434 assert(lp_check_value(bld->type, x)); 3435 3436 assert(type.floating); 3437 3438 x = LLVMBuildBitCast(builder, x, bld->int_vec_type, ""); 3439 3440 /* res = x / 2**ipart */ 3441 res = LLVMBuildAnd(builder, x, mantmask, ""); 3442 res = LLVMBuildOr(builder, res, one, ""); 3443 res = LLVMBuildBitCast(builder, res, bld->vec_type, ""); 3444 3445 return res; 3446} 3447 3448 3449 3450/** 3451 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x) ,for x in range of [0, 1/9[ 3452 * These coefficients can be generate with 3453 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html 3454 */ 3455const double lp_build_log2_polynomial[] = { 3456#if LOG_POLY_DEGREE == 5 3457 2.88539008148777786488L, 3458 0.961796878841293367824L, 3459 0.577058946784739859012L, 3460 0.412914355135828735411L, 3461 0.308591899232910175289L, 3462 0.352376952300281371868L, 3463#elif LOG_POLY_DEGREE == 4 3464 2.88539009343309178325L, 3465 0.961791550404184197881L, 3466 0.577440339438736392009L, 3467 0.403343858251329912514L, 3468 0.406718052498846252698L, 3469#elif LOG_POLY_DEGREE == 3 3470 2.88538959748872753838L, 3471 0.961932915889597772928L, 3472 0.571118517972136195241L, 3473 0.493997535084709500285L, 3474#else 3475#error 3476#endif 3477}; 3478 3479/** 3480 * See http://www.devmaster.net/forums/showthread.php?p=43580 3481 * http://en.wikipedia.org/wiki/Logarithm#Calculation 3482 * 
http://www.nezumi.demon.co.uk/consult/logx.htm 3483 * 3484 * If handle_edge_cases is true the function will perform computations 3485 * to match the required D3D10+ behavior for each of the edge cases. 3486 * That means that if input is: 3487 * - less than zero (to and including -inf) then NaN will be returned 3488 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned 3489 * - +infinity, then +infinity will be returned 3490 * - NaN, then NaN will be returned 3491 * 3492 * Those checks are fairly expensive so if you don't need them make sure 3493 * handle_edge_cases is false. 3494 */ 3495void 3496lp_build_log2_approx(struct lp_build_context *bld, 3497 LLVMValueRef x, 3498 LLVMValueRef *p_exp, 3499 LLVMValueRef *p_floor_log2, 3500 LLVMValueRef *p_log2, 3501 boolean handle_edge_cases) 3502{ 3503 LLVMBuilderRef builder = bld->gallivm->builder; 3504 const struct lp_type type = bld->type; 3505 LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type); 3506 LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); 3507 3508 LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000); 3509 LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff); 3510 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); 3511 3512 LLVMValueRef i = NULL; 3513 LLVMValueRef y = NULL; 3514 LLVMValueRef z = NULL; 3515 LLVMValueRef exp = NULL; 3516 LLVMValueRef mant = NULL; 3517 LLVMValueRef logexp = NULL; 3518 LLVMValueRef p_z = NULL; 3519 LLVMValueRef res = NULL; 3520 3521 assert(lp_check_value(bld->type, x)); 3522 3523 if(p_exp || p_floor_log2 || p_log2) { 3524 /* TODO: optimize the constant case */ 3525 if (gallivm_debug & GALLIVM_DEBUG_PERF && 3526 LLVMIsConstant(x)) { 3527 debug_printf("%s: inefficient/imprecise constant arithmetic\n", 3528 __FUNCTION__); 3529 } 3530 3531 assert(type.floating && type.width == 32); 3532 3533 /* 3534 * We don't explicitly handle denormalized numbers. They will yield a 3535 * result in the neighbourhood of -127, which appears to be adequate 3536 * enough. 
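 *
 * The decomposition used below is, schematically:
 *
 *    x = 2^exp * mant,   with mant in [1, 2)
 *    log2(x) = exp + log2(mant)
 *    log2(mant) ~= y * P(y^2),   with y = (mant - 1) / (mant + 1)
 *
 * (the leading coefficient of P, 2.885..., is 2/ln(2), coming from the
 * series log2(mant) = 2/ln(2) * (y + y^3/3 + y^5/5 + ...)).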
3537 */ 3538 3539 i = LLVMBuildBitCast(builder, x, int_vec_type, ""); 3540 3541 /* exp = (float) exponent(x) */ 3542 exp = LLVMBuildAnd(builder, i, expmask, ""); 3543 } 3544 3545 if(p_floor_log2 || p_log2) { 3546 logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), ""); 3547 logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), ""); 3548 logexp = LLVMBuildSIToFP(builder, logexp, vec_type, ""); 3549 } 3550 3551 if (p_log2) { 3552 /* mant = 1 + (float) mantissa(x) */ 3553 mant = LLVMBuildAnd(builder, i, mantmask, ""); 3554 mant = LLVMBuildOr(builder, mant, one, ""); 3555 mant = LLVMBuildBitCast(builder, mant, vec_type, ""); 3556 3557 /* y = (mant - 1) / (mant + 1) */ 3558 y = lp_build_div(bld, 3559 lp_build_sub(bld, mant, bld->one), 3560 lp_build_add(bld, mant, bld->one) 3561 ); 3562 3563 /* z = y^2 */ 3564 z = lp_build_mul(bld, y, y); 3565 3566 /* compute P(z) */ 3567 p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial, 3568 ARRAY_SIZE(lp_build_log2_polynomial)); 3569 3570 /* y * P(z) + logexp */ 3571 res = lp_build_mad(bld, y, p_z, logexp); 3572 3573 if (type.floating && handle_edge_cases) { 3574 LLVMValueRef negmask, infmask, zmask; 3575 negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x, 3576 lp_build_const_vec(bld->gallivm, type, 0.0f)); 3577 zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, 3578 lp_build_const_vec(bld->gallivm, type, 0.0f)); 3579 infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x, 3580 lp_build_const_vec(bld->gallivm, type, INFINITY)); 3581 3582 /* If x is qual to inf make sure we return inf */ 3583 res = lp_build_select(bld, infmask, 3584 lp_build_const_vec(bld->gallivm, type, INFINITY), 3585 res); 3586 /* If x is qual to 0, return -inf */ 3587 res = lp_build_select(bld, zmask, 3588 lp_build_const_vec(bld->gallivm, type, -INFINITY), 3589 res); 3590 /* If x is nan or less than 0, return nan */ 3591 res = lp_build_select(bld, negmask, 3592 lp_build_const_vec(bld->gallivm, type, NAN), 3593 res); 3594 } 3595 } 3596 3597 if (p_exp) { 3598 exp = LLVMBuildBitCast(builder, exp, vec_type, ""); 3599 *p_exp = exp; 3600 } 3601 3602 if (p_floor_log2) 3603 *p_floor_log2 = logexp; 3604 3605 if (p_log2) 3606 *p_log2 = res; 3607} 3608 3609 3610/* 3611 * log2 implementation which doesn't have special code to 3612 * handle edge cases (-inf, 0, inf, NaN). It's faster but 3613 * the results for those cases are undefined. 3614 */ 3615LLVMValueRef 3616lp_build_log2(struct lp_build_context *bld, 3617 LLVMValueRef x) 3618{ 3619 LLVMValueRef res; 3620 lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE); 3621 return res; 3622} 3623 3624/* 3625 * Version of log2 which handles all edge cases. 3626 * Look at documentation of lp_build_log2_approx for 3627 * description of the behavior for each of the edge cases. 3628 */ 3629LLVMValueRef 3630lp_build_log2_safe(struct lp_build_context *bld, 3631 LLVMValueRef x) 3632{ 3633 LLVMValueRef res; 3634 lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE); 3635 return res; 3636} 3637 3638 3639/** 3640 * Faster (and less accurate) log2. 3641 * 3642 * log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x)) 3643 * 3644 * Piece-wise linear approximation, with exact results when x is a 3645 * power of two. 
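 *
 * A rough scalar equivalent (sketch only):
 *
 *    e = floor(log2(x));        taken from the exponent field
 *    m = x / 2^e;               mantissa scaled into [1, 2)
 *    fast_log2(x) ~= (e - 1) + m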
3646 * 3647 * See http://www.flipcode.com/archives/Fast_log_Function.shtml 3648 */ 3649LLVMValueRef 3650lp_build_fast_log2(struct lp_build_context *bld, 3651 LLVMValueRef x) 3652{ 3653 LLVMBuilderRef builder = bld->gallivm->builder; 3654 LLVMValueRef ipart; 3655 LLVMValueRef fpart; 3656 3657 assert(lp_check_value(bld->type, x)); 3658 3659 assert(bld->type.floating); 3660 3661 /* ipart = floor(log2(x)) - 1 */ 3662 ipart = lp_build_extract_exponent(bld, x, -1); 3663 ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, ""); 3664 3665 /* fpart = x / 2**ipart */ 3666 fpart = lp_build_extract_mantissa(bld, x); 3667 3668 /* ipart + fpart */ 3669 return LLVMBuildFAdd(builder, ipart, fpart, ""); 3670} 3671 3672 3673/** 3674 * Fast implementation of iround(log2(x)). 3675 * 3676 * Not an approximation -- it should give accurate results all the time. 3677 */ 3678LLVMValueRef 3679lp_build_ilog2(struct lp_build_context *bld, 3680 LLVMValueRef x) 3681{ 3682 LLVMBuilderRef builder = bld->gallivm->builder; 3683 LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2); 3684 LLVMValueRef ipart; 3685 3686 assert(bld->type.floating); 3687 3688 assert(lp_check_value(bld->type, x)); 3689 3690 /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ 3691 x = LLVMBuildFMul(builder, x, sqrt2, ""); 3692 3693 /* ipart = floor(log2(x) + 0.5) */ 3694 ipart = lp_build_extract_exponent(bld, x, 0); 3695 3696 return ipart; 3697} 3698 3699LLVMValueRef 3700lp_build_mod(struct lp_build_context *bld, 3701 LLVMValueRef x, 3702 LLVMValueRef y) 3703{ 3704 LLVMBuilderRef builder = bld->gallivm->builder; 3705 LLVMValueRef res; 3706 const struct lp_type type = bld->type; 3707 3708 assert(lp_check_value(type, x)); 3709 assert(lp_check_value(type, y)); 3710 3711 if (type.floating) 3712 res = LLVMBuildFRem(builder, x, y, ""); 3713 else if (type.sign) 3714 res = LLVMBuildSRem(builder, x, y, ""); 3715 else 3716 res = LLVMBuildURem(builder, x, y, ""); 3717 return res; 3718} 3719 3720 3721/* 3722 * For floating inputs it creates and returns a mask 3723 * which is all 1's for channels which are NaN. 3724 * Channels inside x which are not NaN will be 0. 
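 *
 * The test relies on the IEEE-754 rule that a NaN never compares equal to
 * itself, i.e. per channel:
 *
 *    mask = (x == x) ? 0 : ~0;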


/*
 * For floating point inputs, creates and returns a mask which is
 * all 1's for channels that are NaN, and all 0's for channels
 * that are not.
 */
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}

/*
 * Returns all 1's for floating point numbers that are finite,
 * and all 0's for -inf, inf and NaN.
 */
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}

/*
 * Returns true if the number is NaN or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}
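

/*
 * Illustrative usage sketch: the mask helpers above return integer vectors
 * that are all 1's or all 0's per channel, which is the form consumed by
 * lp_build_select().  For instance, NaN lanes can be replaced with zero
 * roughly as below; example_zero_out_nans is a hypothetical helper, shown
 * for illustration only.
 */
#if 0
static LLVMValueRef
example_zero_out_nans(struct lp_build_context *bld, LLVMValueRef x)
{
   /* all 1's in channels of x that are NaN, all 0's elsewhere */
   LLVMValueRef nan_mask = lp_build_isnan(bld, x);
   /* pick bld->zero where the mask is set, keep x elsewhere */
   return lp_build_select(bld, nan_mask, bld->zero, x);
}
#endif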


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_cpu_caps.has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_cpu_caps.has_daz) {
         /* Enable the denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}
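

/*
 * Illustrative usage sketch: the three fpstate helpers above form a
 * save / modify / restore bracket around generated code that should flush
 * denormals to zero; example_ftz_bracket is a hypothetical helper, shown
 * for illustration only.
 */
#if 0
static void
example_ftz_bracket(struct gallivm_state *gallivm)
{
   /* save the current MXCSR word */
   LLVMValueRef saved_state = lp_build_fpstate_get(gallivm);
   /* set FTZ (and DAZ where the CPU supports it) */
   lp_build_fpstate_set_denorms_zero(gallivm, TRUE);
   /* ... emit the FTZ/DAZ-sensitive code here ... */
   /* restore the previously saved MXCSR word */
   lp_build_fpstate_set(gallivm, saved_state);
}
#endif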