1/**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29/**
30 * @file
 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
37 *
 * We also do simple expression simplification here. The reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
 * - we often know value constraints which the optimization passes have no way
 *   of knowing, such as when source arguments are known to be in the [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
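
/*
 * An illustrative sketch (not compiled) of how a caller might use these
 * helpers.  With a normalized integer type, the generic lp_build_add() below
 * turns into a saturating add.  The variables gallivm, a and b are assumed
 * to exist in the surrounding code.
 */
#if 0
   struct lp_type type;
   struct lp_build_context bld;

   memset(&type, 0, sizeof type);
   type.width = 8;      /* 8 bits per channel */
   type.length = 16;    /* 16 channels per vector */
   type.norm = TRUE;    /* normalized: 255 represents 1.0 */

   lp_build_context_init(&bld, gallivm, type);

   /* clamps to 1.0 (i.e. 255) on overflow because type.norm is set */
   LLVMValueRef sum = lp_build_add(&bld, a, b);
#endif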
46
47
48#include <float.h>
49
50#include "util/u_memory.h"
51#include "util/u_debug.h"
52#include "util/u_math.h"
53#include "util/u_cpu_detect.h"
54
55#include "lp_bld_type.h"
56#include "lp_bld_const.h"
57#include "lp_bld_init.h"
58#include "lp_bld_intr.h"
59#include "lp_bld_logic.h"
60#include "lp_bld_pack.h"
61#include "lp_bld_debug.h"
62#include "lp_bld_bitarit.h"
63#include "lp_bld_arit.h"
64#include "lp_bld_flow.h"
65
66#if defined(PIPE_ARCH_SSE)
67#include <xmmintrin.h>
68#endif
69
70#ifndef _MM_DENORMALS_ZERO_MASK
71#define _MM_DENORMALS_ZERO_MASK 0x0040
72#endif
73
74#ifndef _MM_FLUSH_ZERO_MASK
75#define _MM_FLUSH_ZERO_MASK 0x8000
76#endif
77
78#define EXP_POLY_DEGREE 5
79
80#define LOG_POLY_DEGREE 4
81
82
83/**
84 * Generate min(a, b)
 * No checks for the special-case values of a or b (0 or 1) are done.
 * NaNs are handled according to the behavior specified by the
87 * nan_behavior argument.
88 */
89static LLVMValueRef
90lp_build_min_simple(struct lp_build_context *bld,
91                    LLVMValueRef a,
92                    LLVMValueRef b,
93                    enum gallivm_nan_behavior nan_behavior)
94{
95   const struct lp_type type = bld->type;
96   const char *intrinsic = NULL;
97   unsigned intr_size = 0;
98   LLVMValueRef cond;
99
100   assert(lp_check_value(type, a));
101   assert(lp_check_value(type, b));
102
103   /* TODO: optimize the constant case */
104
105   if (type.floating && util_cpu_caps.has_sse) {
106      if (type.width == 32) {
107         if (type.length == 1) {
108            intrinsic = "llvm.x86.sse.min.ss";
109            intr_size = 128;
110         }
111         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
112            intrinsic = "llvm.x86.sse.min.ps";
113            intr_size = 128;
114         }
115         else {
116            intrinsic = "llvm.x86.avx.min.ps.256";
117            intr_size = 256;
118         }
119      }
120      if (type.width == 64 && util_cpu_caps.has_sse2) {
121         if (type.length == 1) {
122            intrinsic = "llvm.x86.sse2.min.sd";
123            intr_size = 128;
124         }
125         else if (type.length == 2 || !util_cpu_caps.has_avx) {
126            intrinsic = "llvm.x86.sse2.min.pd";
127            intr_size = 128;
128         }
129         else {
130            intrinsic = "llvm.x86.avx.min.pd.256";
131            intr_size = 256;
132         }
133      }
134   }
135   else if (type.floating && util_cpu_caps.has_altivec) {
136      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
137          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
138         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
139                      __FUNCTION__);
140      }
141      if (type.width == 32 && type.length == 4) {
142         intrinsic = "llvm.ppc.altivec.vminfp";
143         intr_size = 128;
144      }
145   } else if (HAVE_LLVM < 0x0309 &&
146              util_cpu_caps.has_avx2 && type.length > 4) {
147      intr_size = 256;
148      switch (type.width) {
149      case 8:
150         intrinsic = type.sign ? "llvm.x86.avx2.pmins.b" : "llvm.x86.avx2.pminu.b";
151         break;
152      case 16:
153         intrinsic = type.sign ? "llvm.x86.avx2.pmins.w" : "llvm.x86.avx2.pminu.w";
154         break;
155      case 32:
156         intrinsic = type.sign ? "llvm.x86.avx2.pmins.d" : "llvm.x86.avx2.pminu.d";
157         break;
158      }
159   } else if (HAVE_LLVM < 0x0309 &&
160              util_cpu_caps.has_sse2 && type.length >= 2) {
161      intr_size = 128;
162      if ((type.width == 8 || type.width == 16) &&
163          (type.width * type.length <= 64) &&
164          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
165         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
166                      __FUNCTION__);
167      }
168      if (type.width == 8 && !type.sign) {
169         intrinsic = "llvm.x86.sse2.pminu.b";
170      }
171      else if (type.width == 16 && type.sign) {
172         intrinsic = "llvm.x86.sse2.pmins.w";
173      }
174      if (util_cpu_caps.has_sse4_1) {
175         if (type.width == 8 && type.sign) {
176            intrinsic = "llvm.x86.sse41.pminsb";
177         }
178         if (type.width == 16 && !type.sign) {
179            intrinsic = "llvm.x86.sse41.pminuw";
180         }
181         if (type.width == 32 && !type.sign) {
182            intrinsic = "llvm.x86.sse41.pminud";
183         }
184         if (type.width == 32 && type.sign) {
185            intrinsic = "llvm.x86.sse41.pminsd";
186         }
187      }
188   } else if (util_cpu_caps.has_altivec) {
189      intr_size = 128;
190      if (type.width == 8) {
191         if (!type.sign) {
192            intrinsic = "llvm.ppc.altivec.vminub";
193         } else {
194            intrinsic = "llvm.ppc.altivec.vminsb";
195         }
196      } else if (type.width == 16) {
197         if (!type.sign) {
198            intrinsic = "llvm.ppc.altivec.vminuh";
199         } else {
200            intrinsic = "llvm.ppc.altivec.vminsh";
201         }
202      } else if (type.width == 32) {
203         if (!type.sign) {
204            intrinsic = "llvm.ppc.altivec.vminuw";
205         } else {
206            intrinsic = "llvm.ppc.altivec.vminsw";
207         }
208      }
209   }
210
211   if (intrinsic) {
      /* We need to handle NaNs for floating point numbers. If one of the
       * inputs is NaN the other should be returned (required by both D3D10+
       * and OpenCL).
       * The SSE intrinsics return the second operand in case of NaN by
       * default, so we need special code to handle those cases.
       */
218      if (util_cpu_caps.has_sse && type.floating &&
219          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
220          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
221          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
222         LLVMValueRef isnan, min;
223         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
224                                                   type,
225                                                   intr_size, a, b);
226         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
227            isnan = lp_build_isnan(bld, b);
228            return lp_build_select(bld, isnan, a, min);
229         } else {
230            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
231            isnan = lp_build_isnan(bld, a);
232            return lp_build_select(bld, isnan, a, min);
233         }
234      } else {
235         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
236                                                    type,
237                                                    intr_size, a, b);
238      }
239   }
240
241   if (type.floating) {
242      switch (nan_behavior) {
243      case GALLIVM_NAN_RETURN_NAN: {
244         LLVMValueRef isnan = lp_build_isnan(bld, b);
245         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
246         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
247         return lp_build_select(bld, cond, a, b);
248      }
249         break;
250      case GALLIVM_NAN_RETURN_OTHER: {
251         LLVMValueRef isnan = lp_build_isnan(bld, a);
252         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
253         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
254         return lp_build_select(bld, cond, a, b);
255      }
256         break;
257      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
258         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
259         return lp_build_select(bld, cond, a, b);
260      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
261         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
262         return lp_build_select(bld, cond, b, a);
263      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
264         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
265         return lp_build_select(bld, cond, a, b);
266         break;
267      default:
268         assert(0);
269         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
270         return lp_build_select(bld, cond, a, b);
271      }
272   } else {
273      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
274      return lp_build_select(bld, cond, a, b);
275   }
276}
277
278
279LLVMValueRef
280lp_build_fmuladd(LLVMBuilderRef builder,
281                 LLVMValueRef a,
282                 LLVMValueRef b,
283                 LLVMValueRef c)
284{
285   LLVMTypeRef type = LLVMTypeOf(a);
286   assert(type == LLVMTypeOf(b));
287   assert(type == LLVMTypeOf(c));
288   if (HAVE_LLVM < 0x0304) {
      /* XXX: LLVM 3.3 does not break down llvm.fmuladd into mul+add when FMA
       * is not supported, and instead falls back to a C function.
       */
292      return LLVMBuildFAdd(builder, LLVMBuildFMul(builder, a, b, ""), c, "");
293   }
294   char intrinsic[32];
295   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
296   LLVMValueRef args[] = { a, b, c };
297   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
298}
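
/*
 * Illustrative use of lp_build_fmuladd() (a sketch only, not compiled):
 * accumulating one term of a dot product.  The values x, y and acc are
 * assumed to be float vectors created by the surrounding code.
 */
#if 0
   acc = lp_build_fmuladd(builder, x, y, acc);   /* acc = x * y + acc */
#endif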
299
300
301/**
302 * Generate max(a, b)
 * No checks for the special-case values of a or b (0 or 1) are done.
 * NaNs are handled according to the behavior specified by the
305 * nan_behavior argument.
306 */
307static LLVMValueRef
308lp_build_max_simple(struct lp_build_context *bld,
309                    LLVMValueRef a,
310                    LLVMValueRef b,
311                    enum gallivm_nan_behavior nan_behavior)
312{
313   const struct lp_type type = bld->type;
314   const char *intrinsic = NULL;
315   unsigned intr_size = 0;
316   LLVMValueRef cond;
317
318   assert(lp_check_value(type, a));
319   assert(lp_check_value(type, b));
320
321   /* TODO: optimize the constant case */
322
323   if (type.floating && util_cpu_caps.has_sse) {
324      if (type.width == 32) {
325         if (type.length == 1) {
326            intrinsic = "llvm.x86.sse.max.ss";
327            intr_size = 128;
328         }
329         else if (type.length <= 4 || !util_cpu_caps.has_avx) {
330            intrinsic = "llvm.x86.sse.max.ps";
331            intr_size = 128;
332         }
333         else {
334            intrinsic = "llvm.x86.avx.max.ps.256";
335            intr_size = 256;
336         }
337      }
338      if (type.width == 64 && util_cpu_caps.has_sse2) {
339         if (type.length == 1) {
340            intrinsic = "llvm.x86.sse2.max.sd";
341            intr_size = 128;
342         }
343         else if (type.length == 2 || !util_cpu_caps.has_avx) {
344            intrinsic = "llvm.x86.sse2.max.pd";
345            intr_size = 128;
346         }
347         else {
348            intrinsic = "llvm.x86.avx.max.pd.256";
349            intr_size = 256;
350         }
351      }
352   }
353   else if (type.floating && util_cpu_caps.has_altivec) {
354      if (nan_behavior == GALLIVM_NAN_RETURN_NAN ||
355          nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
356         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
357                      __FUNCTION__);
358      }
      if (type.width == 32 && type.length == 4) {
360         intrinsic = "llvm.ppc.altivec.vmaxfp";
361         intr_size = 128;
362      }
363   } else if (HAVE_LLVM < 0x0309 &&
364              util_cpu_caps.has_avx2 && type.length > 4) {
365      intr_size = 256;
366      switch (type.width) {
367      case 8:
368         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.b" : "llvm.x86.avx2.pmaxu.b";
369         break;
370      case 16:
371         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.w" : "llvm.x86.avx2.pmaxu.w";
372         break;
373      case 32:
374         intrinsic = type.sign ? "llvm.x86.avx2.pmaxs.d" : "llvm.x86.avx2.pmaxu.d";
375         break;
376      }
377   } else if (HAVE_LLVM < 0x0309 &&
378              util_cpu_caps.has_sse2 && type.length >= 2) {
379      intr_size = 128;
380      if ((type.width == 8 || type.width == 16) &&
381          (type.width * type.length <= 64) &&
382          (gallivm_debug & GALLIVM_DEBUG_PERF)) {
383         debug_printf("%s: inefficient code, bogus shuffle due to packing\n",
384                      __FUNCTION__);
      }
386      if (type.width == 8 && !type.sign) {
387         intrinsic = "llvm.x86.sse2.pmaxu.b";
388         intr_size = 128;
389      }
390      else if (type.width == 16 && type.sign) {
391         intrinsic = "llvm.x86.sse2.pmaxs.w";
392      }
393      if (util_cpu_caps.has_sse4_1) {
394         if (type.width == 8 && type.sign) {
395            intrinsic = "llvm.x86.sse41.pmaxsb";
396         }
397         if (type.width == 16 && !type.sign) {
398            intrinsic = "llvm.x86.sse41.pmaxuw";
399         }
400         if (type.width == 32 && !type.sign) {
401            intrinsic = "llvm.x86.sse41.pmaxud";
         }
403         if (type.width == 32 && type.sign) {
404            intrinsic = "llvm.x86.sse41.pmaxsd";
405         }
406      }
407   } else if (util_cpu_caps.has_altivec) {
      intr_size = 128;
      if (type.width == 8) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxub";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsb";
         }
      } else if (type.width == 16) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuh";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsh";
         }
      } else if (type.width == 32) {
         if (!type.sign) {
            intrinsic = "llvm.ppc.altivec.vmaxuw";
         } else {
            intrinsic = "llvm.ppc.altivec.vmaxsw";
         }
      }
428   }
429
430   if (intrinsic) {
431      if (util_cpu_caps.has_sse && type.floating &&
432          nan_behavior != GALLIVM_NAN_BEHAVIOR_UNDEFINED &&
433          nan_behavior != GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN &&
434          nan_behavior != GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
435         LLVMValueRef isnan, max;
436         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
437                                                   type,
438                                                   intr_size, a, b);
439         if (nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
440            isnan = lp_build_isnan(bld, b);
441            return lp_build_select(bld, isnan, a, max);
442         } else {
443            assert(nan_behavior == GALLIVM_NAN_RETURN_NAN);
444            isnan = lp_build_isnan(bld, a);
445            return lp_build_select(bld, isnan, a, max);
446         }
447      } else {
448         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
449                                                    type,
450                                                    intr_size, a, b);
451      }
452   }
453
454   if (type.floating) {
455      switch (nan_behavior) {
456      case GALLIVM_NAN_RETURN_NAN: {
457         LLVMValueRef isnan = lp_build_isnan(bld, b);
458         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
459         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
460         return lp_build_select(bld, cond, a, b);
461      }
462         break;
463      case GALLIVM_NAN_RETURN_OTHER: {
464         LLVMValueRef isnan = lp_build_isnan(bld, a);
465         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
466         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
467         return lp_build_select(bld, cond, a, b);
468      }
469         break;
470      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
471         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
472         return lp_build_select(bld, cond, a, b);
473      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
474         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
475         return lp_build_select(bld, cond, b, a);
476      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
477         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
478         return lp_build_select(bld, cond, a, b);
479         break;
480      default:
481         assert(0);
482         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
483         return lp_build_select(bld, cond, a, b);
484      }
485   } else {
486      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
487      return lp_build_select(bld, cond, a, b);
488   }
489}
490
491
492/**
 * Generate 1 - a, or ~a depending on bld->type.
 * (For unsigned normalized integer types 1.0 is represented as all ones,
 * so the bitwise complement ~a computes 1 - a directly.)
494 */
495LLVMValueRef
496lp_build_comp(struct lp_build_context *bld,
497              LLVMValueRef a)
498{
499   LLVMBuilderRef builder = bld->gallivm->builder;
500   const struct lp_type type = bld->type;
501
502   assert(lp_check_value(type, a));
503
504   if(a == bld->one)
505      return bld->zero;
506   if(a == bld->zero)
507      return bld->one;
508
509   if(type.norm && !type.floating && !type.fixed && !type.sign) {
510      if(LLVMIsConstant(a))
511         return LLVMConstNot(a);
512      else
513         return LLVMBuildNot(builder, a, "");
514   }
515
516   if(LLVMIsConstant(a))
517      if (type.floating)
518          return LLVMConstFSub(bld->one, a);
519      else
520          return LLVMConstSub(bld->one, a);
521   else
522      if (type.floating)
523         return LLVMBuildFSub(builder, bld->one, a, "");
524      else
525         return LLVMBuildSub(builder, bld->one, a, "");
526}
527
528
529/**
530 * Generate a + b
531 */
532LLVMValueRef
533lp_build_add(struct lp_build_context *bld,
534             LLVMValueRef a,
535             LLVMValueRef b)
536{
537   LLVMBuilderRef builder = bld->gallivm->builder;
538   const struct lp_type type = bld->type;
539   LLVMValueRef res;
540
541   assert(lp_check_value(type, a));
542   assert(lp_check_value(type, b));
543
544   if (a == bld->zero)
545      return b;
546   if (b == bld->zero)
547      return a;
548   if (a == bld->undef || b == bld->undef)
549      return bld->undef;
550
551   if (type.norm) {
552      const char *intrinsic = NULL;
553
554      if (!type.sign && (a == bld->one || b == bld->one))
555        return bld->one;
556
557      if (!type.floating && !type.fixed) {
558         if (HAVE_LLVM >= 0x0900) {
559            char intrin[32];
560            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
561            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
562            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
563         }
564         if (type.width * type.length == 128) {
565            if (util_cpu_caps.has_sse2) {
566               if (type.width == 8)
567                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" :
568                                         HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL;
569               if (type.width == 16)
570                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" :
571                                         HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL;
572            } else if (util_cpu_caps.has_altivec) {
573               if (type.width == 8)
574                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
575               if (type.width == 16)
576                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
577            }
578         }
579         if (type.width * type.length == 256) {
580            if (util_cpu_caps.has_avx2) {
581               if (type.width == 8)
582                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" :
583                                          HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL;
584               if (type.width == 16)
585                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" :
586                                          HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL;
587            }
588         }
589      }
590
591      if (intrinsic)
592         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
593   }
594
595   if(type.norm && !type.floating && !type.fixed) {
596      if (type.sign) {
597         uint64_t sign = (uint64_t)1 << (type.width - 1);
598         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
599         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
600         /* a_clamp_max is the maximum a for positive b,
601            a_clamp_min is the minimum a for negative b. */
602         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
603         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
604         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
605      }
606   }
607
608   if(LLVMIsConstant(a) && LLVMIsConstant(b))
609      if (type.floating)
610         res = LLVMConstFAdd(a, b);
611      else
612         res = LLVMConstAdd(a, b);
613   else
614      if (type.floating)
615         res = LLVMBuildFAdd(builder, a, b, "");
616      else
617         res = LLVMBuildAdd(builder, a, b, "");
618
619   /* clamp to ceiling of 1.0 */
620   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
621      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
622
623   if (type.norm && !type.floating && !type.fixed) {
624      if (!type.sign) {
625         /*
626          * newer llvm versions no longer support the intrinsics, but recognize
627          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
628          * code, it is important we match the pattern llvm uses (and pray llvm
629          * doesn't change it - and hope they decide on the same pattern for
630          * all backends supporting it...).
631          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
632          * interfere with llvm's ability to recognize the pattern but seems
633          * a bit brittle.
634          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
635          */
636         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
637         res = lp_build_select(bld, overflowed,
638                               LLVMConstAllOnes(bld->int_vec_type), res);
639      }
640   }
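
   /*
    * Worked example (added for clarity), 8-bit unsigned: a = 200, b = 100
    * gives res = 300 & 0xff = 44; since a > res the addition overflowed and
    * res is replaced with all ones, i.e. 255.
    */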
641
642   /* XXX clamp to floor of -1 or 0??? */
643
644   return res;
645}
646
647
648/** Return the scalar sum of the elements of a.
 * Callers should avoid this operation whenever possible.
650 */
651LLVMValueRef
652lp_build_horizontal_add(struct lp_build_context *bld,
653                        LLVMValueRef a)
654{
655   LLVMBuilderRef builder = bld->gallivm->builder;
656   const struct lp_type type = bld->type;
657   LLVMValueRef index, res;
658   unsigned i, length;
659   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
660   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
661   LLVMValueRef vecres, elem2;
662
663   assert(lp_check_value(type, a));
664
665   if (type.length == 1) {
666      return a;
667   }
668
669   assert(!bld->type.norm);
670
671   /*
    * For byte vectors we could do much better with psadbw.
    * We use repeated shuffle/adds here. Note that with multiple vectors
674    * this can be done more efficiently as outlined in the intel
675    * optimization manual.
676    * Note: could cause data rearrangement if used with smaller element
677    * sizes.
678    */
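
   /*
    * Illustrative trace (added for clarity) for a length-4 vector
    * {a0,a1,a2,a3}:
    *   first iteration:  vec1 = {a0,a1}, vec2 = {a2,a3},
    *                     vecres = {a0+a2, a1+a3}
    *   after the loop:   res = (a0+a2) + (a1+a3)
    */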
679
680   vecres = a;
681   length = type.length / 2;
682   while (length > 1) {
683      LLVMValueRef vec1, vec2;
684      for (i = 0; i < length; i++) {
685         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
686         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
687      }
688      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
689                                    LLVMConstVector(shuffles1, length), "");
690      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
691                                    LLVMConstVector(shuffles2, length), "");
692      if (type.floating) {
693         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
694      }
695      else {
696         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
697      }
698      length = length >> 1;
699   }
700
701   /* always have vector of size 2 here */
702   assert(length == 1);
703
704   index = lp_build_const_int32(bld->gallivm, 0);
705   res = LLVMBuildExtractElement(builder, vecres, index, "");
706   index = lp_build_const_int32(bld->gallivm, 1);
707   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
708
709   if (type.floating)
710      res = LLVMBuildFAdd(builder, res, elem2, "");
   else
712      res = LLVMBuildAdd(builder, res, elem2, "");
713
714   return res;
715}
716
717/**
718 * Return the horizontal sums of 4 float vectors as a float4 vector.
 * This uses the technique outlined in the Intel Optimization Manual.
720 */
721static LLVMValueRef
722lp_build_horizontal_add4x4f(struct lp_build_context *bld,
723                            LLVMValueRef src[4])
724{
725   struct gallivm_state *gallivm = bld->gallivm;
726   LLVMBuilderRef builder = gallivm->builder;
727   LLVMValueRef shuffles[4];
728   LLVMValueRef tmp[4];
729   LLVMValueRef sumtmp[2], shuftmp[2];
730
731   /* lower half of regs */
732   shuffles[0] = lp_build_const_int32(gallivm, 0);
733   shuffles[1] = lp_build_const_int32(gallivm, 1);
734   shuffles[2] = lp_build_const_int32(gallivm, 4);
735   shuffles[3] = lp_build_const_int32(gallivm, 5);
736   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
737                                   LLVMConstVector(shuffles, 4), "");
738   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
739                                   LLVMConstVector(shuffles, 4), "");
740
741   /* upper half of regs */
742   shuffles[0] = lp_build_const_int32(gallivm, 2);
743   shuffles[1] = lp_build_const_int32(gallivm, 3);
744   shuffles[2] = lp_build_const_int32(gallivm, 6);
745   shuffles[3] = lp_build_const_int32(gallivm, 7);
746   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
747                                   LLVMConstVector(shuffles, 4), "");
748   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
749                                   LLVMConstVector(shuffles, 4), "");
750
751   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
752   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
753
754   shuffles[0] = lp_build_const_int32(gallivm, 0);
755   shuffles[1] = lp_build_const_int32(gallivm, 2);
756   shuffles[2] = lp_build_const_int32(gallivm, 4);
757   shuffles[3] = lp_build_const_int32(gallivm, 6);
758   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
759                                       LLVMConstVector(shuffles, 4), "");
760
761   shuffles[0] = lp_build_const_int32(gallivm, 1);
762   shuffles[1] = lp_build_const_int32(gallivm, 3);
763   shuffles[2] = lp_build_const_int32(gallivm, 5);
764   shuffles[3] = lp_build_const_int32(gallivm, 7);
765   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
766                                       LLVMConstVector(shuffles, 4), "");
767
768   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
769}
770
771
772/*
773 * partially horizontally add 2-4 float vectors with length nx4,
774 * i.e. only four adjacent values in each vector will be added,
775 * assuming values are really grouped in 4 which also determines
776 * output order.
777 *
778 * Return a vector of the same length as the initial vectors,
779 * with the excess elements (if any) being undefined.
780 * The element order is independent of number of input vectors.
781 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
782 * the output order thus will be
 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
784 */
785LLVMValueRef
786lp_build_hadd_partial4(struct lp_build_context *bld,
787                       LLVMValueRef vectors[],
788                       unsigned num_vecs)
789{
790   struct gallivm_state *gallivm = bld->gallivm;
791   LLVMBuilderRef builder = gallivm->builder;
792   LLVMValueRef ret_vec;
793   LLVMValueRef tmp[4];
794   const char *intrinsic = NULL;
795
796   assert(num_vecs >= 2 && num_vecs <= 4);
797   assert(bld->type.floating);
798
799   /* only use this with at least 2 vectors, as it is sort of expensive
800    * (depending on cpu) and we always need two horizontal adds anyway,
801    * so a shuffle/add approach might be better.
802    */
803
804   tmp[0] = vectors[0];
805   tmp[1] = vectors[1];
806
807   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
808   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
809
810   if (util_cpu_caps.has_sse3 && bld->type.width == 32 &&
811       bld->type.length == 4) {
812      intrinsic = "llvm.x86.sse3.hadd.ps";
813   }
814   else if (util_cpu_caps.has_avx && bld->type.width == 32 &&
815            bld->type.length == 8) {
816      intrinsic = "llvm.x86.avx.hadd.ps.256";
817   }
818   if (intrinsic) {
819      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
820                                       lp_build_vec_type(gallivm, bld->type),
821                                       tmp[0], tmp[1]);
822      if (num_vecs > 2) {
823         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
824                                          lp_build_vec_type(gallivm, bld->type),
825                                          tmp[2], tmp[3]);
826      }
827      else {
828         tmp[1] = tmp[0];
829      }
830      return lp_build_intrinsic_binary(builder, intrinsic,
831                                       lp_build_vec_type(gallivm, bld->type),
832                                       tmp[0], tmp[1]);
833   }
834
835   if (bld->type.length == 4) {
836      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
837   }
838   else {
839      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
840      unsigned j;
841      unsigned num_iter = bld->type.length / 4;
842      struct lp_type parttype = bld->type;
843      parttype.length = 4;
844      for (j = 0; j < num_iter; j++) {
845         LLVMValueRef partsrc[4];
846         unsigned i;
847         for (i = 0; i < 4; i++) {
848            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
849         }
850         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
851      }
852      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
853   }
854   return ret_vec;
855}
856
857/**
858 * Generate a - b
859 */
860LLVMValueRef
861lp_build_sub(struct lp_build_context *bld,
862             LLVMValueRef a,
863             LLVMValueRef b)
864{
865   LLVMBuilderRef builder = bld->gallivm->builder;
866   const struct lp_type type = bld->type;
867   LLVMValueRef res;
868
869   assert(lp_check_value(type, a));
870   assert(lp_check_value(type, b));
871
872   if (b == bld->zero)
873      return a;
874   if (a == bld->undef || b == bld->undef)
875      return bld->undef;
876   if (a == b)
877      return bld->zero;
878
879   if (type.norm) {
880      const char *intrinsic = NULL;
881
882      if (!type.sign && b == bld->one)
883        return bld->zero;
884
885      if (!type.floating && !type.fixed) {
886         if (HAVE_LLVM >= 0x0900) {
887            char intrin[32];
888            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
889            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
890            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
891         }
892         if (type.width * type.length == 128) {
893            if (util_cpu_caps.has_sse2) {
894               if (type.width == 8)
895                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" :
896                                          HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL;
897               if (type.width == 16)
898                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" :
899                                          HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL;
900            } else if (util_cpu_caps.has_altivec) {
901               if (type.width == 8)
902                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
903               if (type.width == 16)
904                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
905            }
906         }
907         if (type.width * type.length == 256) {
908            if (util_cpu_caps.has_avx2) {
909               if (type.width == 8)
910                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" :
911                                          HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL;
912               if (type.width == 16)
913                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" :
914                                          HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL;
915            }
916         }
917      }
918
919      if (intrinsic)
920         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
921   }
922
923   if(type.norm && !type.floating && !type.fixed) {
924      if (type.sign) {
925         uint64_t sign = (uint64_t)1 << (type.width - 1);
926         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
927         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
928         /* a_clamp_max is the maximum a for negative b,
929            a_clamp_min is the minimum a for positive b. */
930         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
931         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
932         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
933      } else {
934         /*
935          * This must match llvm pattern for saturated unsigned sub.
936          * (lp_build_max_simple actually does the job with its current
937          * definition but do it explicitly here.)
938          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
939          * interfere with llvm's ability to recognize the pattern but seems
940          * a bit brittle.
941          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
942          */
943         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
944         a = lp_build_select(bld, no_ov, a, b);
945      }
946   }
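
   /*
    * Worked example (added for clarity), 8-bit unsigned: a = 100, b = 200
    * gives no_ov false, so a is replaced with b (200) and the subtraction
    * below yields 200 - 200 = 0, i.e. the result saturates at zero.
    */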
947
948   if(LLVMIsConstant(a) && LLVMIsConstant(b))
949      if (type.floating)
950         res = LLVMConstFSub(a, b);
951      else
952         res = LLVMConstSub(a, b);
953   else
954      if (type.floating)
955         res = LLVMBuildFSub(builder, a, b, "");
956      else
957         res = LLVMBuildSub(builder, a, b, "");
958
959   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
960      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
961
962   return res;
963}
964
965
966
967/**
968 * Normalized multiplication.
969 *
 * There are several approaches (using 8-bit normalized multiplication as an
 * example):
972 *
973 * - alpha plus one
974 *
975 *     makes the following approximation to the division (Sree)
976 *
 *       a*b/255 ~= (a*(b + 1)) >> 8
978 *
979 *     which is the fastest method that satisfies the following OpenGL criteria of
980 *
981 *       0*0 = 0 and 255*255 = 255
982 *
983 * - geometric series
984 *
985 *     takes the geometric series approximation to the division
986 *
987 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
988 *
 *     in this case we take just the first two terms to fit in 16-bit arithmetic
990 *
991 *       t/255 ~= (t + (t >> 8)) >> 8
992 *
 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
 *     255*255 = 254, so the special case b = 255 must be accounted for, or
 *     rounding must be used.
996 *
997 * - geometric series plus rounding
998 *
 *     when using the geometric series division, instead of truncating the
 *     result, use rounding in the approximation (Jim Blinn)
1001 *
1002 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
1003 *
 *     which achieves exact results.
1005 *
1006 *
1007 *
1008 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
1009 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
1010 * @sa Michael Herf, The "double blend trick", May 2000,
1011 *     http://www.stereopsis.com/doubleblend.html
1012 */
1013LLVMValueRef
1014lp_build_mul_norm(struct gallivm_state *gallivm,
1015                  struct lp_type wide_type,
1016                  LLVMValueRef a, LLVMValueRef b)
1017{
1018   LLVMBuilderRef builder = gallivm->builder;
1019   struct lp_build_context bld;
1020   unsigned n;
1021   LLVMValueRef half;
1022   LLVMValueRef ab;
1023
1024   assert(!wide_type.floating);
1025   assert(lp_check_value(wide_type, a));
1026   assert(lp_check_value(wide_type, b));
1027
1028   lp_build_context_init(&bld, gallivm, wide_type);
1029
1030   n = wide_type.width / 2;
1031   if (wide_type.sign) {
1032      --n;
1033   }
1034
1035   /*
1036    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
1037    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
1038    */
1039
1040   /*
1041    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
1042    */
1043
1044   ab = LLVMBuildMul(builder, a, b, "");
1045   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
1046
1047   /*
1048    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
1049    */
1050
1051   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
1052   if (wide_type.sign) {
1053      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
1054      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
1055      half = lp_build_select(&bld, sign, minus_half, half);
1056   }
1057   ab = LLVMBuildAdd(builder, ab, half, "");
1058
1059   /* Final division */
1060   ab = lp_build_shr_imm(&bld, ab, n);
1061
1062   return ab;
1063}
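
/*
 * As a sanity check on the "geometric series plus rounding" approximation
 * used above, a scalar reference sketch for the 8-bit unsigned case
 * (illustrative only, not compiled; mul_unorm8_ref is a hypothetical name):
 */
#if 0
static uint8_t
mul_unorm8_ref(uint8_t a, uint8_t b)
{
   uint32_t t = (uint32_t)a * (uint32_t)b;
   return (uint8_t)((t + (t >> 8) + 0x80) >> 8);
}
/* e.g. a = b = 255:  t = 65025, t >> 8 = 254,
 * (65025 + 254 + 128) >> 8 = 65407 >> 8 = 255, matching 1.0 * 1.0 = 1.0.
 */
#endif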
1064
1065/**
1066 * Generate a * b
1067 */
1068LLVMValueRef
1069lp_build_mul(struct lp_build_context *bld,
1070             LLVMValueRef a,
1071             LLVMValueRef b)
1072{
1073   LLVMBuilderRef builder = bld->gallivm->builder;
1074   const struct lp_type type = bld->type;
1075   LLVMValueRef shift;
1076   LLVMValueRef res;
1077
1078   assert(lp_check_value(type, a));
1079   assert(lp_check_value(type, b));
1080
1081   if(a == bld->zero)
1082      return bld->zero;
1083   if(a == bld->one)
1084      return b;
1085   if(b == bld->zero)
1086      return bld->zero;
1087   if(b == bld->one)
1088      return a;
1089   if(a == bld->undef || b == bld->undef)
1090      return bld->undef;
1091
1092   if (!type.floating && !type.fixed && type.norm) {
1093      struct lp_type wide_type = lp_wider_type(type);
1094      LLVMValueRef al, ah, bl, bh, abl, abh, ab;
1095
1096      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
1097      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
1098
1099      /* PMULLW, PSRLW, PADDW */
1100      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
1101      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
1102
1103      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
1104
1105      return ab;
1106   }
1107
1108   if(type.fixed)
1109      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
1110   else
1111      shift = NULL;
1112
1113   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1114      if (type.floating)
1115         res = LLVMConstFMul(a, b);
1116      else
1117         res = LLVMConstMul(a, b);
1118      if(shift) {
1119         if(type.sign)
1120            res = LLVMConstAShr(res, shift);
1121         else
1122            res = LLVMConstLShr(res, shift);
1123      }
1124   }
1125   else {
1126      if (type.floating)
1127         res = LLVMBuildFMul(builder, a, b, "");
1128      else
1129         res = LLVMBuildMul(builder, a, b, "");
1130      if(shift) {
1131         if(type.sign)
1132            res = LLVMBuildAShr(builder, res, shift, "");
1133         else
1134            res = LLVMBuildLShr(builder, res, shift, "");
1135      }
1136   }
1137
1138   return res;
1139}
1140
1141/*
1142 * Widening mul, valid for 32x32 bit -> 64bit only.
1143 * Result is low 32bits, high bits returned in res_hi.
1144 *
1145 * Emits code that is meant to be compiled for the host CPU.
1146 */
1147LLVMValueRef
1148lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1149                         LLVMValueRef a,
1150                         LLVMValueRef b,
1151                         LLVMValueRef *res_hi)
1152{
1153   struct gallivm_state *gallivm = bld->gallivm;
1154   LLVMBuilderRef builder = gallivm->builder;
1155
1156   assert(bld->type.width == 32);
1157   assert(bld->type.floating == 0);
1158   assert(bld->type.fixed == 0);
1159   assert(bld->type.norm == 0);
1160
1161   /*
1162    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1163    * for x86 simd is atrocious (even if the high bits weren't required),
1164    * trying to handle real 64bit inputs (which of course can't happen due
1165    * to using 64bit umul with 32bit numbers zero-extended to 64bit, but
1166    * apparently llvm does not recognize this widening mul). This includes 6
1167    * (instead of 2) pmuludq plus extra adds and shifts
1168    * The same story applies to signed mul, albeit fixing this requires sse41.
1169    * https://llvm.org/bugs/show_bug.cgi?id=30845
1170    * So, whip up our own code, albeit only for length 4 and 8 (which
1171    * should be good enough)...
1172    */
1173   if ((bld->type.length == 4 || bld->type.length == 8) &&
1174       ((util_cpu_caps.has_sse2 && (bld->type.sign == 0)) ||
1175        util_cpu_caps.has_sse4_1)) {
1176      const char *intrinsic = NULL;
1177      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1178      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1179      struct lp_type type_wide = lp_wider_type(bld->type);
1180      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1181      unsigned i;
1182      for (i = 0; i < bld->type.length; i += 2) {
1183         shuf[i] = lp_build_const_int32(gallivm, i+1);
1184         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1185      }
1186      shuf_vec = LLVMConstVector(shuf, bld->type.length);
1187      aeven = a;
1188      beven = b;
1189      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1190      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1191
1192      if (util_cpu_caps.has_avx2 && bld->type.length == 8) {
1193         if (bld->type.sign) {
1194            intrinsic = "llvm.x86.avx2.pmul.dq";
1195         } else {
1196            intrinsic = "llvm.x86.avx2.pmulu.dq";
1197         }
1198         muleven = lp_build_intrinsic_binary(builder, intrinsic,
1199                                             wider_type, aeven, beven);
1200         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1201                                            wider_type, aodd, bodd);
1202      }
1203      else {
1204         /* for consistent naming look elsewhere... */
1205         if (bld->type.sign) {
1206            intrinsic = "llvm.x86.sse41.pmuldq";
1207         } else {
1208            intrinsic = "llvm.x86.sse2.pmulu.dq";
1209         }
1210         /*
1211          * XXX If we only have AVX but not AVX2 this is a pain.
1212          * lp_build_intrinsic_binary_anylength() can't handle it
1213          * (due to src and dst type not being identical).
1214          */
1215         if (bld->type.length == 8) {
1216            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1217            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1218            LLVMValueRef muleven2[2], mulodd2[2];
1219            struct lp_type type_wide_half = type_wide;
1220            LLVMTypeRef wtype_half;
1221            type_wide_half.length = 2;
1222            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1223            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1224            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1225            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1226            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1227            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1228            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1229            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1230            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1231            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1232                                                    wtype_half, aevenlo, bevenlo);
1233            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1234                                                   wtype_half, aoddlo, boddlo);
1235            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1236                                                    wtype_half, aevenhi, bevenhi);
1237            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1238                                                   wtype_half, aoddhi, boddhi);
1239            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1240            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1241
1242         }
1243         else {
1244            muleven = lp_build_intrinsic_binary(builder, intrinsic,
1245                                                wider_type, aeven, beven);
1246            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1247                                               wider_type, aodd, bodd);
1248         }
1249      }
1250      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1251      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1252
1253      for (i = 0; i < bld->type.length; i += 2) {
1254         shuf[i] = lp_build_const_int32(gallivm, i + 1);
1255         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1256      }
1257      shuf_vec = LLVMConstVector(shuf, bld->type.length);
1258      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1259
1260      for (i = 0; i < bld->type.length; i += 2) {
1261         shuf[i] = lp_build_const_int32(gallivm, i);
1262         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1263      }
1264      shuf_vec = LLVMConstVector(shuf, bld->type.length);
1265      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1266   }
1267   else {
1268      return lp_build_mul_32_lohi(bld, a, b, res_hi);
1269   }
1270}
1271
1272
1273/*
1274 * Widening mul, valid for 32x32 bit -> 64bit only.
1275 * Result is low 32bits, high bits returned in res_hi.
1276 *
1277 * Emits generic code.
1278 */
1279LLVMValueRef
1280lp_build_mul_32_lohi(struct lp_build_context *bld,
1281                     LLVMValueRef a,
1282                     LLVMValueRef b,
1283                     LLVMValueRef *res_hi)
1284{
1285   struct gallivm_state *gallivm = bld->gallivm;
1286   LLVMBuilderRef builder = gallivm->builder;
1287   LLVMValueRef tmp, shift, res_lo;
1288   struct lp_type type_tmp;
1289   LLVMTypeRef wide_type, narrow_type;
1290
1291   type_tmp = bld->type;
1292   narrow_type = lp_build_vec_type(gallivm, type_tmp);
1293   type_tmp.width *= 2;
1294   wide_type = lp_build_vec_type(gallivm, type_tmp);
1295   shift = lp_build_const_vec(gallivm, type_tmp, 32);
1296
1297   if (bld->type.sign) {
1298      a = LLVMBuildSExt(builder, a, wide_type, "");
1299      b = LLVMBuildSExt(builder, b, wide_type, "");
1300   } else {
1301      a = LLVMBuildZExt(builder, a, wide_type, "");
1302      b = LLVMBuildZExt(builder, b, wide_type, "");
1303   }
1304   tmp = LLVMBuildMul(builder, a, b, "");
1305
1306   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1307
1308   /* Since we truncate anyway, LShr and AShr are equivalent. */
1309   tmp = LLVMBuildLShr(builder, tmp, shift, "");
1310   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1311
1312   return res_lo;
1313}
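
/*
 * Worked example for the widening multiply above (added for clarity):
 * with unsigned inputs a = 0xffffffff and b = 2 the full product is
 * 0x1fffffffe, so the low result is 0xfffffffe and *res_hi is 0x1.
 */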
1314
1315
1316/* a * b + c */
1317LLVMValueRef
1318lp_build_mad(struct lp_build_context *bld,
1319             LLVMValueRef a,
1320             LLVMValueRef b,
1321             LLVMValueRef c)
1322{
1323   const struct lp_type type = bld->type;
1324   if (type.floating) {
1325      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1326   } else {
1327      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1328   }
1329}
1330
1331
1332/**
 * Optimized multiplication of a vector by a small integer constant.
1334 */
1335LLVMValueRef
1336lp_build_mul_imm(struct lp_build_context *bld,
1337                 LLVMValueRef a,
1338                 int b)
1339{
1340   LLVMBuilderRef builder = bld->gallivm->builder;
1341   LLVMValueRef factor;
1342
1343   assert(lp_check_value(bld->type, a));
1344
1345   if(b == 0)
1346      return bld->zero;
1347
1348   if(b == 1)
1349      return a;
1350
1351   if(b == -1)
1352      return lp_build_negate(bld, a);
1353
1354   if(b == 2 && bld->type.floating)
1355      return lp_build_add(bld, a, a);
1356
1357   if(util_is_power_of_two_or_zero(b)) {
1358      unsigned shift = ffs(b) - 1;
1359
1360      if(bld->type.floating) {
1361#if 0
1362         /*
1363          * Power of two multiplication by directly manipulating the exponent.
1364          *
          * XXX: This might not always be faster; it will introduce a small error
1366          * for multiplication by zero, and it will produce wrong results
1367          * for Inf and NaN.
1368          */
1369         unsigned mantissa = lp_mantissa(bld->type);
1370         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1371         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1372         a = LLVMBuildAdd(builder, a, factor, "");
1373         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1374         return a;
1375#endif
1376      }
1377      else {
1378         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1379         return LLVMBuildShl(builder, a, factor, "");
1380      }
1381   }
1382
1383   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1384   return lp_build_mul(bld, a, factor);
1385}
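
/*
 * Illustrative behaviour of lp_build_mul_imm() (a summary, not compiled):
 * for an integer type, b = 4 becomes a shift left by 2, while b = 5 falls
 * through to a regular multiplication by a constant vector.
 */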
1386
1387
1388/**
1389 * Generate a / b
1390 */
1391LLVMValueRef
1392lp_build_div(struct lp_build_context *bld,
1393             LLVMValueRef a,
1394             LLVMValueRef b)
1395{
1396   LLVMBuilderRef builder = bld->gallivm->builder;
1397   const struct lp_type type = bld->type;
1398
1399   assert(lp_check_value(type, a));
1400   assert(lp_check_value(type, b));
1401
1402   if(a == bld->zero)
1403      return bld->zero;
1404   if(a == bld->one && type.floating)
1405      return lp_build_rcp(bld, b);
1406   if(b == bld->zero)
1407      return bld->undef;
1408   if(b == bld->one)
1409      return a;
1410   if(a == bld->undef || b == bld->undef)
1411      return bld->undef;
1412
1413   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1414      if (type.floating)
1415         return LLVMConstFDiv(a, b);
1416      else if (type.sign)
1417         return LLVMConstSDiv(a, b);
1418      else
1419         return LLVMConstUDiv(a, b);
1420   }
1421
   /* fast rcp is disabled (it just uses div), so it makes no sense to try that */
1423   if(FALSE &&
1424      ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
1425       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) &&
1426      type.floating)
1427      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1428
1429   if (type.floating)
1430      return LLVMBuildFDiv(builder, a, b, "");
1431   else if (type.sign)
1432      return LLVMBuildSDiv(builder, a, b, "");
1433   else
1434      return LLVMBuildUDiv(builder, a, b, "");
1435}
1436
1437
1438/**
1439 * Linear interpolation helper.
1440 *
 * @param flags  LP_BLD_LERP_* flags.  With LP_BLD_LERP_WIDE_NORMALIZED we are
 *        interpolating normalized values encoded in integers twice as wide as
 *        the values they represent.
1443 *
1444 * @sa http://www.stereopsis.com/doubleblend.html
1445 */
1446static inline LLVMValueRef
1447lp_build_lerp_simple(struct lp_build_context *bld,
1448                     LLVMValueRef x,
1449                     LLVMValueRef v0,
1450                     LLVMValueRef v1,
1451                     unsigned flags)
1452{
1453   unsigned half_width = bld->type.width/2;
1454   LLVMBuilderRef builder = bld->gallivm->builder;
1455   LLVMValueRef delta;
1456   LLVMValueRef res;
1457
1458   assert(lp_check_value(bld->type, x));
1459   assert(lp_check_value(bld->type, v0));
1460   assert(lp_check_value(bld->type, v1));
1461
1462   delta = lp_build_sub(bld, v1, v0);
1463
1464   if (bld->type.floating) {
1465      assert(flags == 0);
1466      return lp_build_mad(bld, x, delta, v0);
1467   }
1468
1469   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1470      if (!bld->type.sign) {
1471         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1472            /*
             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the most
             * significant bit to the least significant bit, so that later we
             * can just divide by 2**n instead of 2**n - 1 (e.g. for 8-bit
             * weights, x = 255 becomes 256).
1476             */
1477
1478            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
1479         }
1480
1481         /* (x * delta) >> n */
1482         res = lp_build_mul(bld, x, delta);
1483         res = lp_build_shr_imm(bld, res, half_width);
1484      } else {
1485         /*
1486          * The rescaling trick above doesn't work for signed numbers, so
          * use the 2**n - 1 division approximation in lp_build_mul_norm
1488          * instead.
1489          */
1490         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1491         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1492      }
1493   } else {
1494      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1495      res = lp_build_mul(bld, x, delta);
1496   }
1497
1498   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1499      /*
1500       * At this point both res and v0 only use the lower half of the bits,
1501       * the rest is zero. Instead of add / mask, do add with half wide type.
1502       */
1503      struct lp_type narrow_type;
1504      struct lp_build_context narrow_bld;
1505
1506      memset(&narrow_type, 0, sizeof narrow_type);
1507      narrow_type.sign   = bld->type.sign;
1508      narrow_type.width  = bld->type.width/2;
1509      narrow_type.length = bld->type.length*2;
1510
1511      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1512      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1513      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1514      res = lp_build_add(&narrow_bld, v0, res);
1515      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1516   } else {
1517      res = lp_build_add(bld, v0, res);
1518
1519      if (bld->type.fixed) {
         /*
          * We need to mask out the high order bits when lerping 8-bit
          * normalized colors stored in 16 bits.
          *
          * XXX: This step is necessary for lerping 8-bit colors stored in
          * 16 bits, but it will be wrong for true fixed point use cases.
          * Basically we need a more powerful lp_type, capable of further
          * distinguishing the value's interpretation from its storage.
          */
1529         LLVMValueRef low_bits;
1530         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1531         res = LLVMBuildAnd(builder, res, low_bits, "");
1532      }
1533   }
1534
1535   return res;
1536}
1537
1538
1539/**
1540 * Linear interpolation.
1541 */
1542LLVMValueRef
1543lp_build_lerp(struct lp_build_context *bld,
1544              LLVMValueRef x,
1545              LLVMValueRef v0,
1546              LLVMValueRef v1,
1547              unsigned flags)
1548{
1549   const struct lp_type type = bld->type;
1550   LLVMValueRef res;
1551
1552   assert(lp_check_value(type, x));
1553   assert(lp_check_value(type, v0));
1554   assert(lp_check_value(type, v1));
1555
1556   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1557
1558   if (type.norm) {
1559      struct lp_type wide_type;
1560      struct lp_build_context wide_bld;
1561      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1562
1563      assert(type.length >= 2);
1564
1565      /*
1566       * Create a wider integer type, enough to hold the
1567       * intermediate result of the multiplication.
1568       */
1569      memset(&wide_type, 0, sizeof wide_type);
1570      wide_type.sign   = type.sign;
1571      wide_type.width  = type.width*2;
1572      wide_type.length = type.length/2;
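      /* e.g. 16 x 8-bit norm operands become 8 x 16-bit lanes for the intermediate math */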
1573
1574      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1575
1576      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1577      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1578      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1579
1580      /*
1581       * Lerp both halves.
1582       */
1583
1584      flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1585
1586      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1587      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1588
1589      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1590   } else {
1591      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1592   }
1593
1594   return res;
1595}
1596
1597
/**
 * Bilinear interpolation.
 *
 * The value arguments are indexed as v_{yx}, i.e. v01 is the value at
 * (x, y) == (1, 0).
 */
1603LLVMValueRef
1604lp_build_lerp_2d(struct lp_build_context *bld,
1605                 LLVMValueRef x,
1606                 LLVMValueRef y,
1607                 LLVMValueRef v00,
1608                 LLVMValueRef v01,
1609                 LLVMValueRef v10,
1610                 LLVMValueRef v11,
1611                 unsigned flags)
1612{
1613   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1614   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1615   return lp_build_lerp(bld, y, v0, v1, flags);
1616}
1617
1618
1619LLVMValueRef
1620lp_build_lerp_3d(struct lp_build_context *bld,
1621                 LLVMValueRef x,
1622                 LLVMValueRef y,
1623                 LLVMValueRef z,
1624                 LLVMValueRef v000,
1625                 LLVMValueRef v001,
1626                 LLVMValueRef v010,
1627                 LLVMValueRef v011,
1628                 LLVMValueRef v100,
1629                 LLVMValueRef v101,
1630                 LLVMValueRef v110,
1631                 LLVMValueRef v111,
1632                 unsigned flags)
1633{
1634   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1635   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1636   return lp_build_lerp(bld, z, v0, v1, flags);
1637}
1638
1639
1640/**
1641 * Generate min(a, b)
 * Do checks for special cases; NaN behavior is undefined.
1643 */
1644LLVMValueRef
1645lp_build_min(struct lp_build_context *bld,
1646             LLVMValueRef a,
1647             LLVMValueRef b)
1648{
1649   assert(lp_check_value(bld->type, a));
1650   assert(lp_check_value(bld->type, b));
1651
1652   if(a == bld->undef || b == bld->undef)
1653      return bld->undef;
1654
1655   if(a == b)
1656      return a;
1657
1658   if (bld->type.norm) {
1659      if (!bld->type.sign) {
1660         if (a == bld->zero || b == bld->zero) {
1661            return bld->zero;
1662         }
1663      }
1664      if(a == bld->one)
1665         return b;
1666      if(b == bld->one)
1667         return a;
1668   }
1669
1670   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1671}
1672
1673
1674/**
1675 * Generate min(a, b)
1676 * NaN's are handled according to the behavior specified by the
1677 * nan_behavior argument.
1678 */
1679LLVMValueRef
1680lp_build_min_ext(struct lp_build_context *bld,
1681                 LLVMValueRef a,
1682                 LLVMValueRef b,
1683                 enum gallivm_nan_behavior nan_behavior)
1684{
1685   assert(lp_check_value(bld->type, a));
1686   assert(lp_check_value(bld->type, b));
1687
1688   if(a == bld->undef || b == bld->undef)
1689      return bld->undef;
1690
1691   if(a == b)
1692      return a;
1693
1694   if (bld->type.norm) {
1695      if (!bld->type.sign) {
1696         if (a == bld->zero || b == bld->zero) {
1697            return bld->zero;
1698         }
1699      }
1700      if(a == bld->one)
1701         return b;
1702      if(b == bld->one)
1703         return a;
1704   }
1705
1706   return lp_build_min_simple(bld, a, b, nan_behavior);
1707}
1708
1709/**
1710 * Generate max(a, b)
1711 * Do checks for special cases, but NaN behavior is undefined.
1712 */
1713LLVMValueRef
1714lp_build_max(struct lp_build_context *bld,
1715             LLVMValueRef a,
1716             LLVMValueRef b)
1717{
1718   assert(lp_check_value(bld->type, a));
1719   assert(lp_check_value(bld->type, b));
1720
1721   if(a == bld->undef || b == bld->undef)
1722      return bld->undef;
1723
1724   if(a == b)
1725      return a;
1726
1727   if(bld->type.norm) {
1728      if(a == bld->one || b == bld->one)
1729         return bld->one;
1730      if (!bld->type.sign) {
1731         if (a == bld->zero) {
1732            return b;
1733         }
1734         if (b == bld->zero) {
1735            return a;
1736         }
1737      }
1738   }
1739
1740   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1741}
1742
1743
1744/**
1745 * Generate max(a, b)
1746 * Checks for special cases.
1747 * NaN's are handled according to the behavior specified by the
1748 * nan_behavior argument.
1749 */
1750LLVMValueRef
1751lp_build_max_ext(struct lp_build_context *bld,
1752                  LLVMValueRef a,
1753                  LLVMValueRef b,
1754                  enum gallivm_nan_behavior nan_behavior)
1755{
1756   assert(lp_check_value(bld->type, a));
1757   assert(lp_check_value(bld->type, b));
1758
1759   if(a == bld->undef || b == bld->undef)
1760      return bld->undef;
1761
1762   if(a == b)
1763      return a;
1764
1765   if(bld->type.norm) {
1766      if(a == bld->one || b == bld->one)
1767         return bld->one;
1768      if (!bld->type.sign) {
1769         if (a == bld->zero) {
1770            return b;
1771         }
1772         if (b == bld->zero) {
1773            return a;
1774         }
1775      }
1776   }
1777
1778   return lp_build_max_simple(bld, a, b, nan_behavior);
1779}
1780
1781/**
1782 * Generate clamp(a, min, max)
1783 * NaN behavior (for any of a, min, max) is undefined.
1784 * Do checks for special cases.
1785 */
1786LLVMValueRef
1787lp_build_clamp(struct lp_build_context *bld,
1788               LLVMValueRef a,
1789               LLVMValueRef min,
1790               LLVMValueRef max)
1791{
1792   assert(lp_check_value(bld->type, a));
1793   assert(lp_check_value(bld->type, min));
1794   assert(lp_check_value(bld->type, max));
1795
1796   a = lp_build_min(bld, a, max);
1797   a = lp_build_max(bld, a, min);
1798   return a;
1799}
1800
1801
1802/**
1803 * Generate clamp(a, 0, 1)
1804 * A NaN will get converted to zero.
1805 */
1806LLVMValueRef
1807lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1808                                LLVMValueRef a)
1809{
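   /*
    * The max_ext filters out NaNs: with GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN
    * a NaN 'a' yields the other (second, non-NaN) operand, which here is zero.
    */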
1810   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1811   a = lp_build_min(bld, a, bld->one);
1812   return a;
1813}
1814
1815
1816/**
1817 * Generate abs(a)
1818 */
1819LLVMValueRef
1820lp_build_abs(struct lp_build_context *bld,
1821             LLVMValueRef a)
1822{
1823   LLVMBuilderRef builder = bld->gallivm->builder;
1824   const struct lp_type type = bld->type;
1825   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1826
1827   assert(lp_check_value(type, a));
1828
1829   if(!type.sign)
1830      return a;
1831
1832   if(type.floating) {
1833      if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
1834         /* Workaround llvm.org/PR27332 */
1835         LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1836         unsigned long long absMask = ~(1ULL << (type.width - 1));
1837         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
1838         a = LLVMBuildBitCast(builder, a, int_vec_type, "");
1839         a = LLVMBuildAnd(builder, a, mask, "");
1840         a = LLVMBuildBitCast(builder, a, vec_type, "");
1841         return a;
1842      } else {
1843         char intrinsic[32];
1844         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1845         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1846      }
1847   }
1848
1849   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3 && HAVE_LLVM < 0x0600) {
1850      switch(type.width) {
1851      case 8:
1852         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1853      case 16:
1854         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1855      case 32:
1856         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1857      }
1858   }
1859   else if (type.width*type.length == 256 && util_cpu_caps.has_avx2 && HAVE_LLVM < 0x0600) {
1860      switch(type.width) {
1861      case 8:
1862         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1863      case 16:
1864         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1865      case 32:
1866         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1867      }
1868   }
1869
1870   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1871                          a, LLVMBuildNeg(builder, a, ""));
1872}
1873
1874
1875LLVMValueRef
1876lp_build_negate(struct lp_build_context *bld,
1877                LLVMValueRef a)
1878{
1879   LLVMBuilderRef builder = bld->gallivm->builder;
1880
1881   assert(lp_check_value(bld->type, a));
1882
1883   if (bld->type.floating)
1884      a = LLVMBuildFNeg(builder, a, "");
1885   else
1886      a = LLVMBuildNeg(builder, a, "");
1887
1888   return a;
1889}
1890
1891
1892/** Return -1, 0 or +1 depending on the sign of a */
1893LLVMValueRef
1894lp_build_sgn(struct lp_build_context *bld,
1895             LLVMValueRef a)
1896{
1897   LLVMBuilderRef builder = bld->gallivm->builder;
1898   const struct lp_type type = bld->type;
1899   LLVMValueRef cond;
1900   LLVMValueRef res;
1901
1902   assert(lp_check_value(type, a));
1903
1904   /* Handle non-zero case */
1905   if(!type.sign) {
1906      /* if not zero then sign must be positive */
1907      res = bld->one;
1908   }
1909   else if(type.floating) {
1910      LLVMTypeRef vec_type;
1911      LLVMTypeRef int_type;
1912      LLVMValueRef mask;
1913      LLVMValueRef sign;
1914      LLVMValueRef one;
1915      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1916
1917      int_type = lp_build_int_vec_type(bld->gallivm, type);
1918      vec_type = lp_build_vec_type(bld->gallivm, type);
1919      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1920
      /* OR the sign bit of 'a' into the bit pattern of the constant 1.0 */
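      /* E.g. a == -3.5f: sign == 0x80000000, 1.0f == 0x3f800000, res == 0xbf800000 == -1.0f */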
1922      sign = LLVMBuildBitCast(builder, a, int_type, "");
1923      sign = LLVMBuildAnd(builder, sign, mask, "");
1924      one = LLVMConstBitCast(bld->one, int_type);
1925      res = LLVMBuildOr(builder, sign, one, "");
1926      res = LLVMBuildBitCast(builder, res, vec_type, "");
1927   }
1928   else
1929   {
1930      /* signed int/norm/fixed point */
1931      /* could use psign with sse3 and appropriate vectors here */
1932      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1933      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1934      res = lp_build_select(bld, cond, bld->one, minus_one);
1935   }
1936
1937   /* Handle zero */
1938   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1939   res = lp_build_select(bld, cond, bld->zero, res);
1940
1941   return res;
1942}
1943
1944
1945/**
1946 * Set the sign of float vector 'a' according to 'sign'.
1947 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
1949 * Other values for sign produce undefined results.
1950 */
1951LLVMValueRef
1952lp_build_set_sign(struct lp_build_context *bld,
1953                  LLVMValueRef a, LLVMValueRef sign)
1954{
1955   LLVMBuilderRef builder = bld->gallivm->builder;
1956   const struct lp_type type = bld->type;
1957   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1958   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1959   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1960   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1961                             ~((unsigned long long) 1 << (type.width - 1)));
1962   LLVMValueRef val, res;
1963
1964   assert(type.floating);
1965   assert(lp_check_value(type, a));
1966
1967   /* val = reinterpret_cast<int>(a) */
1968   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1969   /* val = val & mask */
1970   val = LLVMBuildAnd(builder, val, mask, "");
1971   /* sign = sign << shift */
1972   sign = LLVMBuildShl(builder, sign, shift, "");
1973   /* res = val | sign */
1974   res = LLVMBuildOr(builder, val, sign, "");
1975   /* res = reinterpret_cast<float>(res) */
1976   res = LLVMBuildBitCast(builder, res, vec_type, "");
1977
1978   return res;
1979}
1980
1981
1982/**
1983 * Convert vector of (or scalar) int to vector of (or scalar) float.
1984 */
1985LLVMValueRef
1986lp_build_int_to_float(struct lp_build_context *bld,
1987                      LLVMValueRef a)
1988{
1989   LLVMBuilderRef builder = bld->gallivm->builder;
1990   const struct lp_type type = bld->type;
1991   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1992
1993   assert(type.floating);
1994
1995   return LLVMBuildSIToFP(builder, a, vec_type, "");
1996}
1997
1998static boolean
1999arch_rounding_available(const struct lp_type type)
2000{
2001   if ((util_cpu_caps.has_sse4_1 &&
2002       (type.length == 1 || type.width*type.length == 128)) ||
2003       (util_cpu_caps.has_avx && type.width*type.length == 256) ||
2004       (util_cpu_caps.has_avx512f && type.width*type.length == 512))
2005      return TRUE;
2006   else if ((util_cpu_caps.has_altivec &&
2007            (type.width == 32 && type.length == 4)))
2008      return TRUE;
2009   else if (util_cpu_caps.has_neon)
2010      return TRUE;
2011
2012   return FALSE;
2013}
2014
2015enum lp_build_round_mode
2016{
2017   LP_BUILD_ROUND_NEAREST = 0,
2018   LP_BUILD_ROUND_FLOOR = 1,
2019   LP_BUILD_ROUND_CEIL = 2,
2020   LP_BUILD_ROUND_TRUNCATE = 3
2021};
2022
2023static inline LLVMValueRef
2024lp_build_iround_nearest_sse2(struct lp_build_context *bld,
2025                             LLVMValueRef a)
2026{
2027   LLVMBuilderRef builder = bld->gallivm->builder;
2028   const struct lp_type type = bld->type;
2029   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
2030   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
2031   const char *intrinsic;
2032   LLVMValueRef res;
2033
2034   assert(type.floating);
2035   /* using the double precision conversions is a bit more complicated */
2036   assert(type.width == 32);
2037
2038   assert(lp_check_value(type, a));
2039   assert(util_cpu_caps.has_sse2);
2040
2041   /* This is relying on MXCSR rounding mode, which should always be nearest. */
2042   if (type.length == 1) {
2043      LLVMTypeRef vec_type;
2044      LLVMValueRef undef;
2045      LLVMValueRef arg;
2046      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2047
2048      vec_type = LLVMVectorType(bld->elem_type, 4);
2049
2050      intrinsic = "llvm.x86.sse.cvtss2si";
2051
2052      undef = LLVMGetUndef(vec_type);
2053
2054      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
2055
2056      res = lp_build_intrinsic_unary(builder, intrinsic,
2057                                     ret_type, arg);
2058   }
2059   else {
2060      if (type.width* type.length == 128) {
2061         intrinsic = "llvm.x86.sse2.cvtps2dq";
2062      }
2063      else {
2064         assert(type.width*type.length == 256);
2065         assert(util_cpu_caps.has_avx);
2066
2067         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
2068      }
2069      res = lp_build_intrinsic_unary(builder, intrinsic,
2070                                     ret_type, a);
2071   }
2072
2073   return res;
2074}
2075
2076
/**
 * Round to integral float value using AltiVec vrfi* instructions;
 * mode selects nearest / floor / ceil / truncate.
 */
2079static inline LLVMValueRef
2080lp_build_round_altivec(struct lp_build_context *bld,
2081                       LLVMValueRef a,
2082                       enum lp_build_round_mode mode)
2083{
2084   LLVMBuilderRef builder = bld->gallivm->builder;
2085   const struct lp_type type = bld->type;
2086   const char *intrinsic = NULL;
2087
2088   assert(type.floating);
2089
2090   assert(lp_check_value(type, a));
2091   assert(util_cpu_caps.has_altivec);
2092
2093   (void)type;
2094
2095   switch (mode) {
2096   case LP_BUILD_ROUND_NEAREST:
2097      intrinsic = "llvm.ppc.altivec.vrfin";
2098      break;
2099   case LP_BUILD_ROUND_FLOOR:
2100      intrinsic = "llvm.ppc.altivec.vrfim";
2101      break;
2102   case LP_BUILD_ROUND_CEIL:
2103      intrinsic = "llvm.ppc.altivec.vrfip";
2104      break;
2105   case LP_BUILD_ROUND_TRUNCATE:
2106      intrinsic = "llvm.ppc.altivec.vrfiz";
2107      break;
2108   }
2109
2110   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2111}
2112
2113static inline LLVMValueRef
2114lp_build_round_arch(struct lp_build_context *bld,
2115                    LLVMValueRef a,
2116                    enum lp_build_round_mode mode)
2117{
2118   if (util_cpu_caps.has_sse4_1 || util_cpu_caps.has_neon) {
2119      LLVMBuilderRef builder = bld->gallivm->builder;
2120      const struct lp_type type = bld->type;
2121      const char *intrinsic_root;
2122      char intrinsic[32];
2123
2124      assert(type.floating);
2125      assert(lp_check_value(type, a));
2126      (void)type;
2127
2128      switch (mode) {
2129      case LP_BUILD_ROUND_NEAREST:
2130         intrinsic_root = "llvm.nearbyint";
2131         break;
2132      case LP_BUILD_ROUND_FLOOR:
2133         intrinsic_root = "llvm.floor";
2134         break;
2135      case LP_BUILD_ROUND_CEIL:
2136         intrinsic_root = "llvm.ceil";
2137         break;
2138      case LP_BUILD_ROUND_TRUNCATE:
2139         intrinsic_root = "llvm.trunc";
2140         break;
2141      }
2142
2143      lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2144      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2145   }
2146   else /* (util_cpu_caps.has_altivec) */
2147     return lp_build_round_altivec(bld, a, mode);
2148}
2149
2150/**
2151 * Return the integer part of a float (vector) value (== round toward zero).
2152 * The returned value is a float (vector).
2153 * Ex: trunc(-1.5) = -1.0
2154 */
2155LLVMValueRef
2156lp_build_trunc(struct lp_build_context *bld,
2157               LLVMValueRef a)
2158{
2159   LLVMBuilderRef builder = bld->gallivm->builder;
2160   const struct lp_type type = bld->type;
2161
2162   assert(type.floating);
2163   assert(lp_check_value(type, a));
2164
2165   if (arch_rounding_available(type)) {
2166      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2167   }
2168   else {
2169      const struct lp_type type = bld->type;
2170      struct lp_type inttype;
2171      struct lp_build_context intbld;
2172      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2173      LLVMValueRef trunc, res, anosign, mask;
2174      LLVMTypeRef int_vec_type = bld->int_vec_type;
2175      LLVMTypeRef vec_type = bld->vec_type;
2176
2177      assert(type.width == 32); /* might want to handle doubles at some point */
2178
2179      inttype = type;
2180      inttype.floating = 0;
2181      lp_build_context_init(&intbld, bld->gallivm, inttype);
2182
2183      /* round by truncation */
2184      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2185      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2186
2187      /* mask out sign bit */
2188      anosign = lp_build_abs(bld, a);
2189      /*
2190       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use the max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2195       */
2196      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2197      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2198      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2199      return lp_build_select(bld, mask, a, res);
2200   }
2201}
2202
2203
2204/**
2205 * Return float (vector) rounded to nearest integer (vector).  The returned
2206 * value is a float (vector).
2207 * Ex: round(0.9) = 1.0
2208 * Ex: round(-1.5) = -2.0
2209 */
2210LLVMValueRef
2211lp_build_round(struct lp_build_context *bld,
2212               LLVMValueRef a)
2213{
2214   LLVMBuilderRef builder = bld->gallivm->builder;
2215   const struct lp_type type = bld->type;
2216
2217   assert(type.floating);
2218   assert(lp_check_value(type, a));
2219
2220   if (arch_rounding_available(type)) {
2221      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2222   }
2223   else {
2224      const struct lp_type type = bld->type;
2225      struct lp_type inttype;
2226      struct lp_build_context intbld;
2227      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2228      LLVMValueRef res, anosign, mask;
2229      LLVMTypeRef int_vec_type = bld->int_vec_type;
2230      LLVMTypeRef vec_type = bld->vec_type;
2231
2232      assert(type.width == 32); /* might want to handle doubles at some point */
2233
2234      inttype = type;
2235      inttype.floating = 0;
2236      lp_build_context_init(&intbld, bld->gallivm, inttype);
2237
2238      res = lp_build_iround(bld, a);
2239      res = LLVMBuildSIToFP(builder, res, vec_type, "");
2240
2241      /* mask out sign bit */
2242      anosign = lp_build_abs(bld, a);
2243      /*
2244       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use the max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2249       */
2250      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2251      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2252      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2253      return lp_build_select(bld, mask, a, res);
2254   }
2255}
2256
2257
2258/**
2259 * Return floor of float (vector), result is a float (vector)
2260 * Ex: floor(1.1) = 1.0
2261 * Ex: floor(-1.1) = -2.0
2262 */
2263LLVMValueRef
2264lp_build_floor(struct lp_build_context *bld,
2265               LLVMValueRef a)
2266{
2267   LLVMBuilderRef builder = bld->gallivm->builder;
2268   const struct lp_type type = bld->type;
2269
2270   assert(type.floating);
2271   assert(lp_check_value(type, a));
2272
2273   if (arch_rounding_available(type)) {
2274      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2275   }
2276   else {
2277      const struct lp_type type = bld->type;
2278      struct lp_type inttype;
2279      struct lp_build_context intbld;
2280      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2281      LLVMValueRef trunc, res, anosign, mask;
2282      LLVMTypeRef int_vec_type = bld->int_vec_type;
2283      LLVMTypeRef vec_type = bld->vec_type;
2284
2285      if (type.width != 32) {
2286         char intrinsic[32];
2287         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2288         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2289      }
2290
2291      assert(type.width == 32); /* might want to handle doubles at some point */
2292
2293      inttype = type;
2294      inttype.floating = 0;
2295      lp_build_context_init(&intbld, bld->gallivm, inttype);
2296
2297      /* round by truncation */
2298      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2299      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2300
2301      if (type.sign) {
2302         LLVMValueRef tmp;
2303
2304         /*
2305          * fix values if rounding is wrong (for non-special cases)
2306          * - this is the case if trunc > a
2307          */
2308         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2309         /* tmp = trunc > a ? 1.0 : 0.0 */
2310         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2311         tmp = lp_build_and(&intbld, mask, tmp);
2312         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2313         res = lp_build_sub(bld, res, tmp);
2314      }
2315
2316      /* mask out sign bit */
2317      anosign = lp_build_abs(bld, a);
2318      /*
2319       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use the max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2324       */
2325      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2326      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2327      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2328      return lp_build_select(bld, mask, a, res);
2329   }
2330}
2331
2332
2333/**
2334 * Return ceiling of float (vector), returning float (vector).
2335 * Ex: ceil( 1.1) = 2.0
2336 * Ex: ceil(-1.1) = -1.0
2337 */
2338LLVMValueRef
2339lp_build_ceil(struct lp_build_context *bld,
2340              LLVMValueRef a)
2341{
2342   LLVMBuilderRef builder = bld->gallivm->builder;
2343   const struct lp_type type = bld->type;
2344
2345   assert(type.floating);
2346   assert(lp_check_value(type, a));
2347
2348   if (arch_rounding_available(type)) {
2349      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2350   }
2351   else {
2352      const struct lp_type type = bld->type;
2353      struct lp_type inttype;
2354      struct lp_build_context intbld;
2355      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2356      LLVMValueRef trunc, res, anosign, mask, tmp;
2357      LLVMTypeRef int_vec_type = bld->int_vec_type;
2358      LLVMTypeRef vec_type = bld->vec_type;
2359
2360      if (type.width != 32) {
2361         char intrinsic[32];
2362         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2363         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2364      }
2365
2366      assert(type.width == 32); /* might want to handle doubles at some point */
2367
2368      inttype = type;
2369      inttype.floating = 0;
2370      lp_build_context_init(&intbld, bld->gallivm, inttype);
2371
2372      /* round by truncation */
2373      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2374      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2375
2376      /*
2377       * fix values if rounding is wrong (for non-special cases)
2378       * - this is the case if trunc < a
2379       */
2380      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2381      /* tmp = trunc < a ? 1.0 : 0.0 */
2382      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2383      tmp = lp_build_and(&intbld, mask, tmp);
2384      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2385      res = lp_build_add(bld, trunc, tmp);
2386
2387      /* mask out sign bit */
2388      anosign = lp_build_abs(bld, a);
2389      /*
2390       * mask out all values if anosign > 2^24
       * This should work both for large ints (all rounding is a no-op for them
       * because such floats are always exact) as well as special cases like
       * NaNs, Infs (taking advantage of the fact they use the max exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2395       */
2396      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2397      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2398      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2399      return lp_build_select(bld, mask, a, res);
2400   }
2401}
2402
2403
2404/**
2405 * Return fractional part of 'a' computed as a - floor(a)
2406 * Typically used in texture coord arithmetic.
2407 */
2408LLVMValueRef
2409lp_build_fract(struct lp_build_context *bld,
2410               LLVMValueRef a)
2411{
2412   assert(bld->type.floating);
2413   return lp_build_sub(bld, a, lp_build_floor(bld, a));
2414}
2415
2416
2417/**
2418 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2419 * against 0.99999(9). (Will also return that value for NaNs.)
2420 */
2421static inline LLVMValueRef
2422clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2423{
2424   LLVMValueRef max;
2425
2426   /* this is the largest number smaller than 1.0 representable as float */
2427   max = lp_build_const_vec(bld->gallivm, bld->type,
2428                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2429   return lp_build_min_ext(bld, fract, max,
2430                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2431}
2432
2433
2434/**
2435 * Same as lp_build_fract, but guarantees that the result is always smaller
2436 * than one. Will also return the smaller-than-one value for infs, NaNs.
2437 */
2438LLVMValueRef
2439lp_build_fract_safe(struct lp_build_context *bld,
2440                    LLVMValueRef a)
2441{
2442   return clamp_fract(bld, lp_build_fract(bld, a));
2443}
2444
2445
2446/**
2447 * Return the integer part of a float (vector) value (== round toward zero).
2448 * The returned value is an integer (vector).
2449 * Ex: itrunc(-1.5) = -1
2450 */
2451LLVMValueRef
2452lp_build_itrunc(struct lp_build_context *bld,
2453                LLVMValueRef a)
2454{
2455   LLVMBuilderRef builder = bld->gallivm->builder;
2456   const struct lp_type type = bld->type;
2457   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2458
2459   assert(type.floating);
2460   assert(lp_check_value(type, a));
2461
2462   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2463}
2464
2465
2466/**
2467 * Return float (vector) rounded to nearest integer (vector).  The returned
2468 * value is an integer (vector).
2469 * Ex: iround(0.9) = 1
2470 * Ex: iround(-1.5) = -2
2471 */
2472LLVMValueRef
2473lp_build_iround(struct lp_build_context *bld,
2474                LLVMValueRef a)
2475{
2476   LLVMBuilderRef builder = bld->gallivm->builder;
2477   const struct lp_type type = bld->type;
2478   LLVMTypeRef int_vec_type = bld->int_vec_type;
2479   LLVMValueRef res;
2480
2481   assert(type.floating);
2482
2483   assert(lp_check_value(type, a));
2484
2485   if ((util_cpu_caps.has_sse2 &&
2486       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2487       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2488      return lp_build_iround_nearest_sse2(bld, a);
2489   }
2490   if (arch_rounding_available(type)) {
2491      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2492   }
2493   else {
2494      LLVMValueRef half;
2495
2496      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
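      /*
       * Use the largest float below 0.5 rather than 0.5 itself: with a plain
       * 0.5, a value just below 0.5 (e.g. 0.49999997f) would round up to 1.0
       * in the addition below and then truncate to 1 instead of 0.
       */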
2497
2498      if (type.sign) {
2499         LLVMTypeRef vec_type = bld->vec_type;
2500         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2501                                    (unsigned long long)1 << (type.width - 1));
2502         LLVMValueRef sign;
2503
2504         /* get sign bit */
2505         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2506         sign = LLVMBuildAnd(builder, sign, mask, "");
2507
2508         /* sign * 0.5 */
2509         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2510         half = LLVMBuildOr(builder, sign, half, "");
2511         half = LLVMBuildBitCast(builder, half, vec_type, "");
2512      }
2513
2514      res = LLVMBuildFAdd(builder, a, half, "");
2515   }
2516
2517   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2518
2519   return res;
2520}
2521
2522
2523/**
2524 * Return floor of float (vector), result is an int (vector)
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
2527 */
2528LLVMValueRef
2529lp_build_ifloor(struct lp_build_context *bld,
2530                LLVMValueRef a)
2531{
2532   LLVMBuilderRef builder = bld->gallivm->builder;
2533   const struct lp_type type = bld->type;
2534   LLVMTypeRef int_vec_type = bld->int_vec_type;
2535   LLVMValueRef res;
2536
2537   assert(type.floating);
2538   assert(lp_check_value(type, a));
2539
2540   res = a;
2541   if (type.sign) {
2542      if (arch_rounding_available(type)) {
2543         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2544      }
2545      else {
2546         struct lp_type inttype;
2547         struct lp_build_context intbld;
2548         LLVMValueRef trunc, itrunc, mask;
2549
2550         assert(type.floating);
2551         assert(lp_check_value(type, a));
2552
2553         inttype = type;
2554         inttype.floating = 0;
2555         lp_build_context_init(&intbld, bld->gallivm, inttype);
2556
2557         /* round by truncation */
2558         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2559         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2560
2561         /*
2562          * fix values if rounding is wrong (for non-special cases)
2563          * - this is the case if trunc > a
2564          * The results of doing this with NaNs, very large values etc.
          * are undefined, but the function's results for such inputs are
          * undefined anyway.
2566          */
2567         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2568         /* cheapie minus one with mask since the mask is minus one / zero */
2569         return lp_build_add(&intbld, itrunc, mask);
2570      }
2571   }
2572
2573   /* round to nearest (toward zero) */
2574   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2575
2576   return res;
2577}
2578
2579
2580/**
2581 * Return ceiling of float (vector), returning int (vector).
2582 * Ex: iceil( 1.1) = 2
2583 * Ex: iceil(-1.1) = -1
2584 */
2585LLVMValueRef
2586lp_build_iceil(struct lp_build_context *bld,
2587               LLVMValueRef a)
2588{
2589   LLVMBuilderRef builder = bld->gallivm->builder;
2590   const struct lp_type type = bld->type;
2591   LLVMTypeRef int_vec_type = bld->int_vec_type;
2592   LLVMValueRef res;
2593
2594   assert(type.floating);
2595   assert(lp_check_value(type, a));
2596
2597   if (arch_rounding_available(type)) {
2598      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2599   }
2600   else {
2601      struct lp_type inttype;
2602      struct lp_build_context intbld;
2603      LLVMValueRef trunc, itrunc, mask;
2604
2605      assert(type.floating);
2606      assert(lp_check_value(type, a));
2607
2608      inttype = type;
2609      inttype.floating = 0;
2610      lp_build_context_init(&intbld, bld->gallivm, inttype);
2611
2612      /* round by truncation */
2613      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2614      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2615
2616      /*
2617       * fix values if rounding is wrong (for non-special cases)
2618       * - this is the case if trunc < a
2619       * The results of doing this with NaNs, very large values etc.
       * are undefined, but the function's results for such inputs are
       * undefined anyway.
2621       */
2622      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2623      /* cheapie plus one with mask since the mask is minus one / zero */
2624      return lp_build_sub(&intbld, itrunc, mask);
2625   }
2626
2627   /* round to nearest (toward zero) */
2628   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2629
2630   return res;
2631}
2632
2633
2634/**
2635 * Combined ifloor() & fract().
2636 *
2637 * Preferred to calling the functions separately, as it will ensure that the
2638 * strategy (floor() vs ifloor()) that results in less redundant work is used.
2639 */
2640void
2641lp_build_ifloor_fract(struct lp_build_context *bld,
2642                      LLVMValueRef a,
2643                      LLVMValueRef *out_ipart,
2644                      LLVMValueRef *out_fpart)
2645{
2646   LLVMBuilderRef builder = bld->gallivm->builder;
2647   const struct lp_type type = bld->type;
2648   LLVMValueRef ipart;
2649
2650   assert(type.floating);
2651   assert(lp_check_value(type, a));
2652
2653   if (arch_rounding_available(type)) {
2654      /*
2655       * floor() is easier.
2656       */
2657
2658      ipart = lp_build_floor(bld, a);
2659      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2660      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2661   }
2662   else {
2663      /*
2664       * ifloor() is easier.
2665       */
2666
2667      *out_ipart = lp_build_ifloor(bld, a);
2668      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2669      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2670   }
2671}
2672
2673
2674/**
2675 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2676 * always smaller than one.
2677 */
2678void
2679lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2680                           LLVMValueRef a,
2681                           LLVMValueRef *out_ipart,
2682                           LLVMValueRef *out_fpart)
2683{
2684   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2685   *out_fpart = clamp_fract(bld, *out_fpart);
2686}
2687
2688
2689LLVMValueRef
2690lp_build_sqrt(struct lp_build_context *bld,
2691              LLVMValueRef a)
2692{
2693   LLVMBuilderRef builder = bld->gallivm->builder;
2694   const struct lp_type type = bld->type;
2695   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2696   char intrinsic[32];
2697
2698   assert(lp_check_value(type, a));
2699
2700   assert(type.floating);
2701   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2702
2703   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2704}
2705
2706
2707/**
 * Do one Newton-Raphson step to improve reciprocal precision:
2709 *
2710 *   x_{i+1} = x_i * (2 - a * x_i)
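 *
 *   This is one step of Newton's method applied to f(x) = 1/x - a,
 *   i.e. x_{i+1} = x_i - f(x_i)/f'(x_i).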
2711 *
2712 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2713 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2715 * halo. It would be necessary to clamp the argument to prevent this.
2716 *
2717 * See also:
2718 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2719 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2720 */
2721static inline LLVMValueRef
2722lp_build_rcp_refine(struct lp_build_context *bld,
2723                    LLVMValueRef a,
2724                    LLVMValueRef rcp_a)
2725{
2726   LLVMBuilderRef builder = bld->gallivm->builder;
2727   LLVMValueRef two = lp_build_const_vec(bld->gallivm, bld->type, 2.0);
2728   LLVMValueRef res;
2729
2730   res = LLVMBuildFMul(builder, a, rcp_a, "");
2731   res = LLVMBuildFSub(builder, two, res, "");
2732   res = LLVMBuildFMul(builder, rcp_a, res, "");
2733
2734   return res;
2735}
2736
2737
2738LLVMValueRef
2739lp_build_rcp(struct lp_build_context *bld,
2740             LLVMValueRef a)
2741{
2742   LLVMBuilderRef builder = bld->gallivm->builder;
2743   const struct lp_type type = bld->type;
2744
2745   assert(lp_check_value(type, a));
2746
2747   if(a == bld->zero)
2748      return bld->undef;
2749   if(a == bld->one)
2750      return bld->one;
2751   if(a == bld->undef)
2752      return bld->undef;
2753
2754   assert(type.floating);
2755
2756   if(LLVMIsConstant(a))
2757      return LLVMConstFDiv(bld->one, a);
2758
   /*
    * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal, and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that
    * RCPPS plus the necessary workarounds is still preferable to DIVPS; or
    * for particular uses that require fewer workarounds.
    */
2771
2772   if (FALSE && ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2773         (util_cpu_caps.has_avx && type.width == 32 && type.length == 8))){
2774      const unsigned num_iterations = 0;
2775      LLVMValueRef res;
2776      unsigned i;
2777      const char *intrinsic = NULL;
2778
2779      if (type.length == 4) {
2780         intrinsic = "llvm.x86.sse.rcp.ps";
2781      }
2782      else {
2783         intrinsic = "llvm.x86.avx.rcp.ps.256";
2784      }
2785
2786      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2787
2788      for (i = 0; i < num_iterations; ++i) {
2789         res = lp_build_rcp_refine(bld, a, res);
2790      }
2791
2792      return res;
2793   }
2794
2795   return LLVMBuildFDiv(builder, bld->one, a, "");
2796}
2797
2798
2799/**
2800 * Do one Newton-Raphson step to improve rsqrt precision:
2801 *
2802 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
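 *
 *   This is one step of Newton's method applied to f(x) = 1/x^2 - a,
 *   i.e. x_{i+1} = x_i - f(x_i)/f'(x_i).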
2803 *
2804 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2805 */
2806static inline LLVMValueRef
2807lp_build_rsqrt_refine(struct lp_build_context *bld,
2808                      LLVMValueRef a,
2809                      LLVMValueRef rsqrt_a)
2810{
2811   LLVMBuilderRef builder = bld->gallivm->builder;
2812   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2813   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2814   LLVMValueRef res;
2815
2816   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2817   res = LLVMBuildFMul(builder, a, res, "");
2818   res = LLVMBuildFSub(builder, three, res, "");
2819   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2820   res = LLVMBuildFMul(builder, half, res, "");
2821
2822   return res;
2823}
2824
2825
2826/**
2827 * Generate 1/sqrt(a).
2828 * Result is undefined for values < 0, infinity for +0.
2829 */
2830LLVMValueRef
2831lp_build_rsqrt(struct lp_build_context *bld,
2832               LLVMValueRef a)
2833{
2834   const struct lp_type type = bld->type;
2835
2836   assert(lp_check_value(type, a));
2837
2838   assert(type.floating);
2839
2840   /*
2841    * This should be faster but all denormals will end up as infinity.
2842    */
2843   if (0 && lp_build_fast_rsqrt_available(type)) {
2844      const unsigned num_iterations = 1;
2845      LLVMValueRef res;
2846      unsigned i;
2847
2848      /* rsqrt(1.0) != 1.0 here */
2849      res = lp_build_fast_rsqrt(bld, a);
2850
2851      if (num_iterations) {
2852         /*
2853          * Newton-Raphson will result in NaN instead of infinity for zero,
2854          * and NaN instead of zero for infinity.
2855          * Also, need to ensure rsqrt(1.0) == 1.0.
2856          * All numbers smaller than FLT_MIN will result in +infinity
2857          * (rsqrtps treats all denormals as zero).
2858          */
2859         LLVMValueRef cmp;
2860         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2861         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2862
2863         for (i = 0; i < num_iterations; ++i) {
2864            res = lp_build_rsqrt_refine(bld, a, res);
2865         }
2866         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2867         res = lp_build_select(bld, cmp, inf, res);
2868         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2869         res = lp_build_select(bld, cmp, bld->zero, res);
2870         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2871         res = lp_build_select(bld, cmp, bld->one, res);
2872      }
2873
2874      return res;
2875   }
2876
2877   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2878}
2879
/**
 * Report whether a fast (but inaccurate) rsqrt instruction is available.
 * Callers may want to check this before using lp_build_fast_rsqrt():
 * e.g. x^0.5 can be computed as rsqrt_fast(x) * x, but when no fast rsqrt
 * exists that expands to sqrt/div/mul, in which case it is obviously
 * better to just call sqrt, skipping both the div and the mul.
 */
2887boolean
2888lp_build_fast_rsqrt_available(struct lp_type type)
2889{
2890   assert(type.floating);
2891
2892   if ((util_cpu_caps.has_sse && type.width == 32 && type.length == 4) ||
2893       (util_cpu_caps.has_avx && type.width == 32 && type.length == 8)) {
2894      return true;
2895   }
2896   return false;
2897}
2898
2899
2900/**
2901 * Generate 1/sqrt(a).
2902 * Result is undefined for values < 0, infinity for +0.
2903 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt(1.0) may not be 1.0, denorms may be flushed to 0).
2905 */
2906LLVMValueRef
2907lp_build_fast_rsqrt(struct lp_build_context *bld,
2908                    LLVMValueRef a)
2909{
2910   LLVMBuilderRef builder = bld->gallivm->builder;
2911   const struct lp_type type = bld->type;
2912
2913   assert(lp_check_value(type, a));
2914
2915   if (lp_build_fast_rsqrt_available(type)) {
2916      const char *intrinsic = NULL;
2917
2918      if (type.length == 4) {
2919         intrinsic = "llvm.x86.sse.rsqrt.ps";
2920      }
2921      else {
2922         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2923      }
2924      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2925   }
2926   else {
2927      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2928   }
2929   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2930}
2931
2932
/**
 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos of the same source
 * (i.e. the d3d10 sincos opcode). Doing both at the same time would be
 * much cheaper than calculating (nearly) everything twice, but it is
 * unclear whether that is common enough to be worth bothering; the scs
 * opcode could also benefit from calculating both.
 */
2941static LLVMValueRef
2942lp_build_sin_or_cos(struct lp_build_context *bld,
2943                    LLVMValueRef a,
2944                    boolean cos)
2945{
2946   struct gallivm_state *gallivm = bld->gallivm;
2947   LLVMBuilderRef b = gallivm->builder;
2948   struct lp_type int_type = lp_int_type(bld->type);
2949
2950   /*
2951    *  take the absolute value,
2952    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2953    */
2954
2955   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2956   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2957
2958   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2959   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2960
2961   /*
2962    * scale by 4/Pi
2963    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2964    */
2965
2966   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2967   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2968
2969   /*
2970    * store the integer part of y in mm0
2971    * emm2 = _mm_cvttps_epi32(y);
2972    */
2973
2974   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2975
2976   /*
2977    * j=(j+1) & (~1) (see the cephes sources)
2978    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2979    */
2980
2981   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2982   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2983   /*
2984    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2985    */
2986   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2987   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
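   /*
    * Net effect: the octant index is rounded to an even value (odd indices
    * round up), so that y_2 * Pi/4 below is the multiple of Pi/2 nearest
    * to |a|.
    */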
2988
2989   /*
2990    * y = _mm_cvtepi32_ps(emm2);
2991    */
2992   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2993
2994   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2995   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2996   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2997   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2998
2999   /*
3000    * Argument used for poly selection and sign bit determination
3001    * is different for sin vs. cos.
3002    */
3003   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
3004                               emm2_and;
3005
3006   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
3007                                                              LLVMBuildNot(b, emm2_2, ""), ""),
3008                                              const_29, "sign_bit") :
3009                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
3010                                                              LLVMBuildShl(b, emm2_add,
3011                                                                           const_29, ""), ""),
3012                                              sign_mask, "sign_bit");
3013
3014   /*
    * Get the polynomial selection mask:
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2.
3018    * Both branches will be computed.
3019    *
3020    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
3021    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
3022    */
3023
3024   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
3025   LLVMValueRef poly_mask = lp_build_compare(gallivm,
3026                                             int_type, PIPE_FUNC_EQUAL,
3027                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
3028
3029   /*
3030    * _PS_CONST(minus_cephes_DP1, -0.78515625);
3031    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
3032    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
3033    */
3034   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
3035   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
3036   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
3037
3038   /*
3039    * The magic pass: "Extended precision modular arithmetic"
3040    * x = ((x - y * DP1) - y * DP2) - y * DP3;
3041    */
3042   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
3043   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
3044   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
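   /*
    * x_3 is now the reduced argument x_abs - y_2 * Pi/4, computed in three
    * steps (DP1 + DP2 + DP3 ~= -Pi/4) to keep precision for large inputs.
    */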
3045
3046   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
3048    *
3049    * z = _mm_mul_ps(x,x);
3050    */
3051   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
3052
3053   /*
3054    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
3055    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
3056    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
3057    */
3058   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
3059   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
3060   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
3061
3062   /*
3063    * y = *(v4sf*)_ps_coscof_p0;
3064    * y = _mm_mul_ps(y, z);
3065    */
3066   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
3067   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
3068   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
3069   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
3070
3071
3072   /*
3073    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
3074    * y = _mm_sub_ps(y, tmp);
3075    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
3076    */
3077   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
3078   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
3079   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
3080   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
3081   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
3082
3083   /*
3084    * _PS_CONST(sincof_p0, -1.9515295891E-4);
3085    * _PS_CONST(sincof_p1,  8.3321608736E-3);
3086    * _PS_CONST(sincof_p2, -1.6666654611E-1);
3087    */
3088   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
3089   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
3090   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
3091
3092   /*
    * Evaluate the second polynomial  (Pi/4 <= x <= Pi/2)
3094    *
3095    * y2 = *(v4sf*)_ps_sincof_p0;
3096    * y2 = _mm_mul_ps(y2, z);
3097    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
3098    * y2 = _mm_mul_ps(y2, z);
3099    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
3100    * y2 = _mm_mul_ps(y2, z);
3101    * y2 = _mm_mul_ps(y2, x);
3102    * y2 = _mm_add_ps(y2, x);
3103    */
3104
3105   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
3106   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
3107   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
3108   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
3109
3110   /*
    * select the correct result from the two polynomials
3112    * xmm3 = poly_mask;
3113    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3114    * y = _mm_andnot_ps(xmm3, y);
3115    * y = _mm_or_ps(y,y2);
3116    */
   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");

   /*
    * update the sign
    * y = _mm_xor_ps(y, sign_bit);
    */
   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");

   LLVMValueRef isfinite = lp_build_isfinite(bld, a);

   /* clamp output to be within [-1, 1] */
   y_result = lp_build_clamp(bld, y_result,
                             lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
                             lp_build_const_vec(bld->gallivm, bld->type,  1.f));
   /* If a is -inf, inf or NaN then return NaN */
   y_result = lp_build_select(bld, isfinite, y_result,
                              lp_build_const_vec(bld->gallivm, bld->type,  NAN));
   return y_result;
}


/**
 * Generate sin(a)
 */
LLVMValueRef
lp_build_sin(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, FALSE);
}


/**
 * Generate cos(a)
 */
LLVMValueRef
lp_build_cos(struct lp_build_context *bld,
             LLVMValueRef a)
{
   return lp_build_sin_or_cos(bld, a, TRUE);
}


/**
 * Generate pow(x, y)
 */
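/*
 * pow is built on the identity x**y = 2**(y * log2(x)), which only holds for
 * x > 0; other inputs inherit the behavior of lp_build_log2()/lp_build_exp2().
 * Scalar sketch (illustrative only):
 *
 *    powf(x, y) ~= exp2f(y * log2f(x));
 */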
LLVMValueRef
lp_build_pow(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x) && LLVMIsConstant(y)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
}


/**
 * Generate exp(x)
 */
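/*
 * exp is expressed through exp2: e**x = 2**(x * log2(e)). Scalar sketch
 * (illustrative only):
 *
 *    expf(x) ~= exp2f(x * 1.4426950408889634f);
 */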
LLVMValueRef
lp_build_exp(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log2(e) = 1/log(2) */
   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
                                           1.4426950408889634);

   assert(lp_check_value(bld->type, x));

   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
}


/**
 * Generate log(x)
 * Behavior is undefined with infs, 0s and nans
 */
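/*
 * The natural log goes through log2: ln(x) = log2(x) * ln(2). Scalar sketch
 * (illustrative only):
 *
 *    logf(x) ~= log2f(x) * 0.69314718055994529f;
 */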
LLVMValueRef
lp_build_log(struct lp_build_context *bld,
             LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
}

/**
 * Generate log(x) that handles edge cases (infs, 0s and nans)
 */
LLVMValueRef
lp_build_log_safe(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   /* log(2) */
   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
                                          0.69314718055994529);

   assert(lp_check_value(bld->type, x));

   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
}


/**
 * Generate polynomial.
 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
 */
LLVMValueRef
lp_build_polynomial(struct lp_build_context *bld,
                    LLVMValueRef x,
                    const double *coeffs,
                    unsigned num_coeffs)
{
   const struct lp_type type = bld->type;
   LLVMValueRef even = NULL, odd = NULL;
   LLVMValueRef x2;
   unsigned i;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   /*
    * Calculate odd and even terms separately to decrease data dependency
    * Ex:
    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
    */
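   /*
    * Scalar sketch of the loop below for six coefficients (illustrative
    * only): both accumulators are Horner evaluations in x2 = x*x,
    * recombined at the end.
    *
    *    even = c[4];  even = even * x2 + c[2];  even = even * x2 + c[0];
    *    odd  = c[5];  odd  = odd  * x2 + c[3];  odd  = odd  * x2 + c[1];
    *    result = odd * x + even;
    */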
   x2 = lp_build_mul(bld, x, x);

   for (i = num_coeffs; i--; ) {
      LLVMValueRef coeff;

      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);

      if (i % 2 == 0) {
         if (even)
            even = lp_build_mad(bld, x2, even, coeff);
         else
            even = coeff;
      } else {
         if (odd)
            odd = lp_build_mad(bld, x2, odd, coeff);
         else
            odd = coeff;
      }
   }

   if (odd)
      return lp_build_mad(bld, odd, x, even);
   else if (even)
      return even;
   else
      return bld->undef;
}


/**
 * Minimax polynomial fit of 2**x, in range [0, 1[
 */
const double lp_build_exp2_polynomial[] = {
#if EXP_POLY_DEGREE == 5
   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
   0.693153073200168932794,
   0.240153617044375388211,
   0.0558263180532956664775,
   0.00898934009049466391101,
   0.00187757667519147912699
#elif EXP_POLY_DEGREE == 4
   1.00000259337069434683,
   0.693003834469974940458,
   0.24144275689150793076,
   0.0520114606103070150235,
   0.0135341679161270268764
#elif EXP_POLY_DEGREE == 3
   0.999925218562710312959,
   0.695833540494823811697,
   0.226067155427249155588,
   0.0780245226406372992967
#elif EXP_POLY_DEGREE == 2
   1.00172476321474503578,
   0.657636275736077639316,
   0.33718943461968720704
#else
#error
#endif
};


LLVMValueRef
lp_build_exp2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMValueRef ipart = NULL;
   LLVMValueRef fpart = NULL;
   LLVMValueRef expipart = NULL;
   LLVMValueRef expfpart = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   /* TODO: optimize the constant case */
   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
       LLVMIsConstant(x)) {
      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                   __FUNCTION__);
   }

   assert(type.floating && type.width == 32);

   /* We want to preserve NaN and make sure that for exp2 if x > 128,
    * the result is INF and if it's smaller than -126.9 the result is 0 */
   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);

   /* ipart = floor(x) */
   /* fpart = x - ipart */
   lp_build_ifloor_fract(bld, x, &ipart, &fpart);

   /* expipart = (float) (1 << ipart) */
   expipart = LLVMBuildAdd(builder, ipart,
                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
   expipart = LLVMBuildShl(builder, expipart,
                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
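   /*
    * The three statements above build 2**ipart directly from the IEEE-754
    * bit layout: add the exponent bias (127), shift into the exponent field
    * (bit 23) and reinterpret the bits as a float. Scalar sketch, assuming
    * 32-bit floats and -126 <= ipart <= 127 (illustrative only):
    *
    *    union { int32_t i; float f; } u;
    *    u.i = (ipart + 127) << 23;
    *    // u.f == exp2f((float)ipart)
    */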

   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                  ARRAY_SIZE(lp_build_exp2_polynomial));

   res = LLVMBuildFMul(builder, expipart, expfpart, "");

   return res;
}



/**
 * Extract the exponent of an IEEE-754 floating point value.
 *
 * Optionally apply an integer bias.
 *
 * Result is an integer value with
 *
 *   ifloor(log2(x)) + bias
 */
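/*
 * Scalar sketch of the extraction below, assuming a 32-bit single precision
 * float (illustrative only):
 *
 *    int32_t bits;
 *    memcpy(&bits, &x, sizeof bits);
 *    int exponent = ((bits >> 23) & 0xff) - (127 - bias);
 */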
LLVMValueRef
lp_build_extract_exponent(struct lp_build_context *bld,
                          LLVMValueRef x,
                          int bias)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef res;

   assert(type.floating);

   assert(lp_check_value(bld->type, x));

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   res = LLVMBuildLShr(builder, x,
                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
   res = LLVMBuildAnd(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
   res = LLVMBuildSub(builder, res,
                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");

   return res;
}


/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
 */
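/*
 * Scalar sketch, assuming a 32-bit single precision float (illustrative
 * only): keep the mantissa bits and splice in the exponent of 1.0, which
 * yields a value in [1, 2).
 *
 *    int32_t bits;
 *    memcpy(&bits, &x, sizeof bits);
 *    bits = (bits & 0x007fffff) | 0x3f800000;
 *    float mant;
 *    memcpy(&mant, &bits, sizeof mant);
 */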
LLVMValueRef
lp_build_extract_mantissa(struct lp_build_context *bld,
                          LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   unsigned mantissa = lp_mantissa(type);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
                                                  (1ULL << mantissa) - 1);
   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
   LLVMValueRef res;

   assert(lp_check_value(bld->type, x));

   assert(type.floating);

   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");

   /* res = x / 2**ipart */
   res = LLVMBuildAnd(builder, x, mantmask, "");
   res = LLVMBuildOr(builder, res, one, "");
   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");

   return res;
}



/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
 * These coefficients can be generated with
 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
 */
const double lp_build_log2_polynomial[] = {
#if LOG_POLY_DEGREE == 5
   2.88539008148777786488L,
   0.961796878841293367824L,
   0.577058946784739859012L,
   0.412914355135828735411L,
   0.308591899232910175289L,
   0.352376952300281371868L,
#elif LOG_POLY_DEGREE == 4
   2.88539009343309178325L,
   0.961791550404184197881L,
   0.577440339438736392009L,
   0.403343858251329912514L,
   0.406718052498846252698L,
#elif LOG_POLY_DEGREE == 3
   2.88538959748872753838L,
   0.961932915889597772928L,
   0.571118517972136195241L,
   0.493997535084709500285L,
#else
#error
#endif
};

/**
 * See http://www.devmaster.net/forums/showthread.php?p=43580
 * http://en.wikipedia.org/wiki/Logarithm#Calculation
 * http://www.nezumi.demon.co.uk/consult/logx.htm
 *
 * If handle_edge_cases is true the function will perform computations
 * to match the required D3D10+ behavior for each of the edge cases.
 * That means that if input is:
 * - less than zero (down to and including -inf), then NaN will be returned
 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
 * - +infinity, then +infinity will be returned
 * - NaN, then NaN will be returned
 *
 * Those checks are fairly expensive so if you don't need them make sure
 * handle_edge_cases is false.
 */
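/*
 * Outline of the computation below, for normalized x = 2**exp * mant with
 * mant in [1, 2):
 *
 *    log2(x) = exp + log2(mant)
 *
 * With y = (mant - 1) / (mant + 1) and z = y*y, log2(mant) is approximated
 * by y * P(z), where P is the minimax polynomial above (the leading
 * coefficient ~2.885 is 2/ln(2), as in the atanh series for log2).
 * Scalar sketch under those assumptions (illustrative only):
 *
 *    float y = (mant - 1.0f) / (mant + 1.0f);
 *    float z = y * y;
 *    float log2_mant = y * (c0 + z * (c1 + z * (c2 + ...)));
 *    float log2_x = (float)exp + log2_mant;
 */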
void
lp_build_log2_approx(struct lp_build_context *bld,
                     LLVMValueRef x,
                     LLVMValueRef *p_exp,
                     LLVMValueRef *p_floor_log2,
                     LLVMValueRef *p_log2,
                     boolean handle_edge_cases)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);

   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);

   LLVMValueRef i = NULL;
   LLVMValueRef y = NULL;
   LLVMValueRef z = NULL;
   LLVMValueRef exp = NULL;
   LLVMValueRef mant = NULL;
   LLVMValueRef logexp = NULL;
   LLVMValueRef p_z = NULL;
   LLVMValueRef res = NULL;

   assert(lp_check_value(bld->type, x));

   if(p_exp || p_floor_log2 || p_log2) {
      /* TODO: optimize the constant case */
      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
          LLVMIsConstant(x)) {
         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                      __FUNCTION__);
      }

      assert(type.floating && type.width == 32);

      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
       */

      i = LLVMBuildBitCast(builder, x, int_vec_type, "");

      /* exp = (float) exponent(x) */
      exp = LLVMBuildAnd(builder, i, expmask, "");
   }

   if(p_floor_log2 || p_log2) {
      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
   }

   if (p_log2) {
      /* mant = 1 + (float) mantissa(x) */
      mant = LLVMBuildAnd(builder, i, mantmask, "");
      mant = LLVMBuildOr(builder, mant, one, "");
      mant = LLVMBuildBitCast(builder, mant, vec_type, "");

      /* y = (mant - 1) / (mant + 1) */
      y = lp_build_div(bld,
         lp_build_sub(bld, mant, bld->one),
         lp_build_add(bld, mant, bld->one)
      );

      /* z = y^2 */
      z = lp_build_mul(bld, y, y);

      /* compute P(z) */
      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
                                ARRAY_SIZE(lp_build_log2_polynomial));

      /* y * P(z) + logexp */
      res = lp_build_mad(bld, y, p_z, logexp);

      if (type.floating && handle_edge_cases) {
         LLVMValueRef negmask, infmask,  zmask;
         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
                                lp_build_const_vec(bld->gallivm, type,  0.0f));
         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
                              lp_build_const_vec(bld->gallivm, type,  0.0f));
         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
                                lp_build_const_vec(bld->gallivm, type,  INFINITY));

         /* If x is equal to inf make sure we return inf */
         res = lp_build_select(bld, infmask,
                               lp_build_const_vec(bld->gallivm, type,  INFINITY),
                               res);
         /* If x is equal to 0, return -inf */
         res = lp_build_select(bld, zmask,
                               lp_build_const_vec(bld->gallivm, type,  -INFINITY),
                               res);
         /* If x is nan or less than 0, return nan */
         res = lp_build_select(bld, negmask,
                               lp_build_const_vec(bld->gallivm, type,  NAN),
                               res);
      }
   }

   if (p_exp) {
      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
      *p_exp = exp;
   }

   if (p_floor_log2)
      *p_floor_log2 = logexp;

   if (p_log2)
      *p_log2 = res;
}


/*
 * log2 implementation which doesn't have special code to
 * handle edge cases (-inf, 0, inf, NaN). It's faster but
 * the results for those cases are undefined.
 */
LLVMValueRef
lp_build_log2(struct lp_build_context *bld,
              LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
   return res;
}

/*
 * Version of log2 which handles all edge cases.
 * Look at documentation of lp_build_log2_approx for
 * description of the behavior for each of the edge cases.
 */
LLVMValueRef
lp_build_log2_safe(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMValueRef res;
   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
   return res;
}


/**
 * Faster (and less accurate) log2.
 *
 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
 *
 * Piece-wise linear approximation, with exact results when x is a
 * power of two.
 *
 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
 */
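/*
 * Worked example (illustrative only): for x = 3.0, floor(log2(3)) = 1, so
 * the approximation gives (1 - 1) + 3.0/2.0 = 1.5, versus the exact
 * log2(3) = 1.585; for x = 4.0 it gives (2 - 1) + 4.0/4.0 = 2.0 exactly.
 */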
LLVMValueRef
lp_build_fast_log2(struct lp_build_context *bld,
                   LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef ipart;
   LLVMValueRef fpart;

   assert(lp_check_value(bld->type, x));

   assert(bld->type.floating);

   /* ipart = floor(log2(x)) - 1 */
   ipart = lp_build_extract_exponent(bld, x, -1);
   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");

   /* fpart = x / 2**ipart */
   fpart = lp_build_extract_mantissa(bld, x);

   /* ipart + fpart */
   return LLVMBuildFAdd(builder, ipart, fpart, "");
}


/**
 * Fast implementation of iround(log2(x)).
 *
 * Not an approximation -- it should give accurate results all the time.
 */
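/*
 * The trick used below: multiplying x by sqrt(2) shifts every rounding
 * boundary, so extracting the (floored) exponent of x*sqrt(2) yields
 * round-to-nearest of log2(x). For x in [2**(n-0.5), 2**(n+0.5)) we have
 * x*sqrt(2) in [2**n, 2**(n+1)), whose exponent is exactly n.
 */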
LLVMValueRef
lp_build_ilog2(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
   LLVMValueRef ipart;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, x));

   /* x * 2^(0.5)   i.e., add 0.5 to log2(x) */
   x = LLVMBuildFMul(builder, x, sqrt2, "");

   /* ipart = floor(log2(x) + 0.5)  */
   ipart = lp_build_extract_exponent(bld, x, 0);

   return ipart;
}

LLVMValueRef
lp_build_mod(struct lp_build_context *bld,
             LLVMValueRef x,
             LLVMValueRef y)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;
   const struct lp_type type = bld->type;

   assert(lp_check_value(type, x));
   assert(lp_check_value(type, y));

   if (type.floating)
      res = LLVMBuildFRem(builder, x, y, "");
   else if (type.sign)
      res = LLVMBuildSRem(builder, x, y, "");
   else
      res = LLVMBuildURem(builder, x, y, "");
   return res;
}


/*
 * For floating inputs it creates and returns a mask
 * which is all 1's for channels which are NaN.
 * Channels inside x which are not NaN will be 0.
 */
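/*
 * This relies on NaN being the only value that compares unequal to itself:
 * the ordered compare x == x is false exactly for NaN lanes, and the result
 * is inverted and sign-extended into a full-width mask. Scalar sketch
 * (illustrative only):
 *
 *    isnan = !(x == x);
 */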
LLVMValueRef
lp_build_isnan(struct lp_build_context *bld,
               LLVMValueRef x)
{
   LLVMValueRef mask;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));

   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
                        "isnotnan");
   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
   return mask;
}

/* Returns all 1's for floating point numbers that are
 * finite, and returns all zeros for -inf,
 * inf and NaNs */
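/*
 * Finiteness is tested on the exponent bits: they are all ones (0x7f800000
 * for 32-bit floats) if and only if the value is +/-inf or NaN. Scalar
 * sketch, assuming 32-bit floats (illustrative only):
 *
 *    int32_t bits;
 *    memcpy(&bits, &x, sizeof bits);
 *    bool finite = (bits & 0x7f800000) != 0x7f800000;
 */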
LLVMValueRef
lp_build_isfinite(struct lp_build_context *bld,
                  LLVMValueRef x)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
   struct lp_type int_type = lp_int_type(bld->type);
   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                    0x7f800000);

   if (!bld->type.floating) {
      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
   }
   assert(bld->type.floating);
   assert(lp_check_value(bld->type, x));
   assert(bld->type.width == 32);

   intx = LLVMBuildAnd(builder, intx, infornan32, "");
   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
                           intx, infornan32);
}

/*
 * Returns true if the number is nan or inf and false otherwise.
 * The input has to be a floating point vector.
 */
LLVMValueRef
lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
                       const struct lp_type type,
                       LLVMValueRef x)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
                                                0x7f800000);
   LLVMValueRef ret;

   assert(type.floating);

   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
   ret = LLVMBuildAnd(builder, ret, const0, "");
   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
                          ret, const0);

   return ret;
}


LLVMValueRef
lp_build_fpstate_get(struct gallivm_state *gallivm)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_alloca(
         gallivm,
         LLVMInt32TypeInContext(gallivm->context),
         "mxcsr_ptr");
      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.stmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr8, 1, 0);
      return mxcsr_ptr;
   }
   return 0;
}

void
lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
                                  boolean zero)
{
   if (util_cpu_caps.has_sse) {
      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
      int daz_ftz = _MM_FLUSH_ZERO_MASK;
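      /*
       * The code emitted below is the IR equivalent of what the following
       * would do on the host (illustrative sketch only, assuming
       * <xmmintrin.h> and a DAZ-capable CPU):
       *
       *    unsigned csr = _mm_getcsr();
       *    csr = zero ? (csr | daz_ftz) : (csr & ~daz_ftz);
       *    _mm_setcsr(csr);
       */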

      LLVMBuilderRef builder = gallivm->builder;
      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
      LLVMValueRef mxcsr =
         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");

      if (util_cpu_caps.has_daz) {
         /* Enable denormals-are-zero mode */
         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
      }
      if (zero) {
         mxcsr = LLVMBuildOr(builder, mxcsr,
                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
      } else {
         mxcsr = LLVMBuildAnd(builder, mxcsr,
                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
      }

      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
      lp_build_fpstate_set(gallivm, mxcsr_ptr);
   }
}

void
lp_build_fpstate_set(struct gallivm_state *gallivm,
                     LLVMValueRef mxcsr_ptr)
{
   if (util_cpu_caps.has_sse) {
      LLVMBuilderRef builder = gallivm->builder;
      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
      lp_build_intrinsic(builder,
                         "llvm.x86.sse.ldmxcsr",
                         LLVMVoidTypeInContext(gallivm->context),
                         &mxcsr_ptr, 1, 0);
   }
}