1/**************************************************************************
2 *
3 * Copyright 2009-2010 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28
29/**
30 * @file
31 * Helper arithmetic functions.
32 *
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort directly to machine-specific intrinsics. The functions here hide all
36 * these implementation details from the other modules.
37 *
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 *   of knowing, such as when source arguments are known to be in [0, 1] range.
43 *
44 * @author Jose Fonseca <jfonseca@vmware.com>
45 */
46
47
48#include <float.h>
49
50#include <llvm/Config/llvm-config.h>
51
52#include "util/u_memory.h"
53#include "util/u_debug.h"
54#include "util/u_math.h"
55#include "util/u_cpu_detect.h"
56
57#include "lp_bld_type.h"
58#include "lp_bld_const.h"
59#include "lp_bld_init.h"
60#include "lp_bld_intr.h"
61#include "lp_bld_logic.h"
62#include "lp_bld_pack.h"
63#include "lp_bld_debug.h"
64#include "lp_bld_bitarit.h"
65#include "lp_bld_arit.h"
66#include "lp_bld_flow.h"
67
68#if defined(PIPE_ARCH_SSE)
69#include <xmmintrin.h>
70#endif
71
72#ifndef _MM_DENORMALS_ZERO_MASK
73#define _MM_DENORMALS_ZERO_MASK 0x0040
74#endif
75
76#ifndef _MM_FLUSH_ZERO_MASK
77#define _MM_FLUSH_ZERO_MASK 0x8000
78#endif
79
80#define EXP_POLY_DEGREE 5
81
82#define LOG_POLY_DEGREE 4
83
84
85/**
86 * Generate min(a, b)
87 * No checks for special-case values of a or b (such as 1 or 0) are done.
88 * NaNs are handled according to the behavior specified by the
89 * nan_behavior argument.
90 */
91static LLVMValueRef
92lp_build_min_simple(struct lp_build_context *bld,
93                    LLVMValueRef a,
94                    LLVMValueRef b,
95                    enum gallivm_nan_behavior nan_behavior)
96{
97   const struct lp_type type = bld->type;
98   const char *intrinsic = NULL;
99   unsigned intr_size = 0;
100   LLVMValueRef cond;
101
102   assert(lp_check_value(type, a));
103   assert(lp_check_value(type, b));
104
105   /* TODO: optimize the constant case */
106
107   if (type.floating && util_get_cpu_caps()->has_sse) {
108      if (type.width == 32) {
109         if (type.length == 1) {
110            intrinsic = "llvm.x86.sse.min.ss";
111            intr_size = 128;
112         }
113         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
114            intrinsic = "llvm.x86.sse.min.ps";
115            intr_size = 128;
116         }
117         else {
118            intrinsic = "llvm.x86.avx.min.ps.256";
119            intr_size = 256;
120         }
121      }
122      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
123         if (type.length == 1) {
124            intrinsic = "llvm.x86.sse2.min.sd";
125            intr_size = 128;
126         }
127         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
128            intrinsic = "llvm.x86.sse2.min.pd";
129            intr_size = 128;
130         }
131         else {
132            intrinsic = "llvm.x86.avx.min.pd.256";
133            intr_size = 256;
134         }
135      }
136   }
137   else if (type.floating && util_get_cpu_caps()->has_altivec) {
138      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
139         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
140                      __FUNCTION__);
141      }
142      if (type.width == 32 && type.length == 4) {
143         intrinsic = "llvm.ppc.altivec.vminfp";
144         intr_size = 128;
145      }
146   } else if (util_get_cpu_caps()->has_altivec) {
147      intr_size = 128;
148      if (type.width == 8) {
149         if (!type.sign) {
150            intrinsic = "llvm.ppc.altivec.vminub";
151         } else {
152            intrinsic = "llvm.ppc.altivec.vminsb";
153         }
154      } else if (type.width == 16) {
155         if (!type.sign) {
156            intrinsic = "llvm.ppc.altivec.vminuh";
157         } else {
158            intrinsic = "llvm.ppc.altivec.vminsh";
159         }
160      } else if (type.width == 32) {
161         if (!type.sign) {
162            intrinsic = "llvm.ppc.altivec.vminuw";
163         } else {
164            intrinsic = "llvm.ppc.altivec.vminsw";
165         }
166      }
167   }
168
169   if (intrinsic) {
170      /* We need to handle NaNs for floating point numbers. If one of the
171       * inputs is NaN the other should be returned (required by both D3D10+
172       * and OpenCL).
173       * The SSE intrinsics return the second operand in case of NaN by
174       * default, so we need special code to handle those cases.
175       */
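      /*
       * Scalar sketch of the fixup done below (names illustrative only):
       *   min = minps(a, b);        -- SSE returns b when either input is NaN
       *   res = isnan(b) ? a : min;
       * so a NaN in b yields a, while a NaN in a falls through to minps, which
       * already returns b - i.e. the non-NaN operand is returned in both cases.
       */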
176      if (util_get_cpu_caps()->has_sse && type.floating &&
177          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
178         LLVMValueRef isnan, min;
179         min = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
180                                                   type,
181                                                   intr_size, a, b);
182         isnan = lp_build_isnan(bld, b);
183         return lp_build_select(bld, isnan, a, min);
184      } else {
185         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
186                                                    type,
187                                                    intr_size, a, b);
188      }
189   }
190
191   if (type.floating) {
192      switch (nan_behavior) {
193      case GALLIVM_NAN_RETURN_OTHER: {
194         LLVMValueRef isnan = lp_build_isnan(bld, a);
195         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
196         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
197         return lp_build_select(bld, cond, a, b);
198      }
199         break;
200      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
201         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_LESS, a, b);
202         return lp_build_select(bld, cond, a, b);
203      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
204         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, b, a);
205         return lp_build_select(bld, cond, b, a);
206      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
207         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
208         return lp_build_select(bld, cond, a, b);
209         break;
210      default:
211         assert(0);
212         cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
213         return lp_build_select(bld, cond, a, b);
214      }
215   } else {
216      cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
217      return lp_build_select(bld, cond, a, b);
218   }
219}
220
221
222LLVMValueRef
223lp_build_fmuladd(LLVMBuilderRef builder,
224                 LLVMValueRef a,
225                 LLVMValueRef b,
226                 LLVMValueRef c)
227{
228   LLVMTypeRef type = LLVMTypeOf(a);
229   assert(type == LLVMTypeOf(b));
230   assert(type == LLVMTypeOf(c));
231
232   char intrinsic[32];
233   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fmuladd", type);
234   LLVMValueRef args[] = { a, b, c };
235   return lp_build_intrinsic(builder, intrinsic, type, args, 3, 0);
236}
237
238
239/**
240 * Generate max(a, b)
241 * No checks for special-case values of a or b (such as 1 or 0) are done.
242 * NaNs are handled according to the behavior specified by the
243 * nan_behavior argument.
244 */
245static LLVMValueRef
246lp_build_max_simple(struct lp_build_context *bld,
247                    LLVMValueRef a,
248                    LLVMValueRef b,
249                    enum gallivm_nan_behavior nan_behavior)
250{
251   const struct lp_type type = bld->type;
252   const char *intrinsic = NULL;
253   unsigned intr_size = 0;
254   LLVMValueRef cond;
255
256   assert(lp_check_value(type, a));
257   assert(lp_check_value(type, b));
258
259   /* TODO: optimize the constant case */
260
261   if (type.floating && util_get_cpu_caps()->has_sse) {
262      if (type.width == 32) {
263         if (type.length == 1) {
264            intrinsic = "llvm.x86.sse.max.ss";
265            intr_size = 128;
266         }
267         else if (type.length <= 4 || !util_get_cpu_caps()->has_avx) {
268            intrinsic = "llvm.x86.sse.max.ps";
269            intr_size = 128;
270         }
271         else {
272            intrinsic = "llvm.x86.avx.max.ps.256";
273            intr_size = 256;
274         }
275      }
276      if (type.width == 64 && util_get_cpu_caps()->has_sse2) {
277         if (type.length == 1) {
278            intrinsic = "llvm.x86.sse2.max.sd";
279            intr_size = 128;
280         }
281         else if (type.length == 2 || !util_get_cpu_caps()->has_avx) {
282            intrinsic = "llvm.x86.sse2.max.pd";
283            intr_size = 128;
284         }
285         else {
286            intrinsic = "llvm.x86.avx.max.pd.256";
287            intr_size = 256;
288         }
289      }
290   }
291   else if (type.floating && util_get_cpu_caps()->has_altivec) {
292      if (nan_behavior == GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN) {
293         debug_printf("%s: altivec doesn't support nan return nan behavior\n",
294                      __FUNCTION__);
295      }
296      if (type.width == 32 && type.length == 4) {
297         intrinsic = "llvm.ppc.altivec.vmaxfp";
298         intr_size = 128;
299      }
300   } else if (util_get_cpu_caps()->has_altivec) {
301      intr_size = 128;
302      if (type.width == 8) {
303         if (!type.sign) {
304            intrinsic = "llvm.ppc.altivec.vmaxub";
305         } else {
306            intrinsic = "llvm.ppc.altivec.vmaxsb";
307         }
308      } else if (type.width == 16) {
309         if (!type.sign) {
310            intrinsic = "llvm.ppc.altivec.vmaxuh";
311         } else {
312            intrinsic = "llvm.ppc.altivec.vmaxsh";
313         }
314      } else if (type.width == 32) {
315         if (!type.sign) {
316            intrinsic = "llvm.ppc.altivec.vmaxuw";
317         } else {
318            intrinsic = "llvm.ppc.altivec.vmaxsw";
319         }
320      }
321   }
322
323   if (intrinsic) {
324      if (util_get_cpu_caps()->has_sse && type.floating &&
325          nan_behavior == GALLIVM_NAN_RETURN_OTHER) {
326         LLVMValueRef isnan, max;
327         max = lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
328                                                   type,
329                                                   intr_size, a, b);
330         isnan = lp_build_isnan(bld, b);
331         return lp_build_select(bld, isnan, a, max);
332      } else {
333         return lp_build_intrinsic_binary_anylength(bld->gallivm, intrinsic,
334                                                    type,
335                                                    intr_size, a, b);
336      }
337   }
338
339   if (type.floating) {
340      switch (nan_behavior) {
341      case GALLIVM_NAN_RETURN_OTHER: {
342         LLVMValueRef isnan = lp_build_isnan(bld, a);
343         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
344         cond = LLVMBuildXor(bld->gallivm->builder, cond, isnan, "");
345         return lp_build_select(bld, cond, a, b);
346      }
347         break;
348      case GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN:
349         cond = lp_build_cmp_ordered(bld, PIPE_FUNC_GREATER, a, b);
350         return lp_build_select(bld, cond, a, b);
351      case GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN:
352         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, b, a);
353         return lp_build_select(bld, cond, b, a);
354      case GALLIVM_NAN_BEHAVIOR_UNDEFINED:
355         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
356         return lp_build_select(bld, cond, a, b);
357         break;
358      default:
359         assert(0);
360         cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
361         return lp_build_select(bld, cond, a, b);
362      }
363   } else {
364      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
365      return lp_build_select(bld, cond, a, b);
366   }
367}
368
369
370/**
371 * Generate 1 - a, or ~a depending on bld->type.
372 */
373LLVMValueRef
374lp_build_comp(struct lp_build_context *bld,
375              LLVMValueRef a)
376{
377   LLVMBuilderRef builder = bld->gallivm->builder;
378   const struct lp_type type = bld->type;
379
380   assert(lp_check_value(type, a));
381
382   if(a == bld->one)
383      return bld->zero;
384   if(a == bld->zero)
385      return bld->one;
386
387   if(type.norm && !type.floating && !type.fixed && !type.sign) {
388      if(LLVMIsConstant(a))
389         return LLVMConstNot(a);
390      else
391         return LLVMBuildNot(builder, a, "");
392   }
393
394   if(LLVMIsConstant(a))
395      if (type.floating)
396          return LLVMConstFSub(bld->one, a);
397      else
398          return LLVMConstSub(bld->one, a);
399   else
400      if (type.floating)
401         return LLVMBuildFSub(builder, bld->one, a, "");
402      else
403         return LLVMBuildSub(builder, bld->one, a, "");
404}
405
406
407/**
408 * Generate a + b
409 */
410LLVMValueRef
411lp_build_add(struct lp_build_context *bld,
412             LLVMValueRef a,
413             LLVMValueRef b)
414{
415   LLVMBuilderRef builder = bld->gallivm->builder;
416   const struct lp_type type = bld->type;
417   LLVMValueRef res;
418
419   assert(lp_check_value(type, a));
420   assert(lp_check_value(type, b));
421
422   if (a == bld->zero)
423      return b;
424   if (b == bld->zero)
425      return a;
426   if (a == bld->undef || b == bld->undef)
427      return bld->undef;
428
429   if (type.norm) {
430      const char *intrinsic = NULL;
431
432      if (!type.sign && (a == bld->one || b == bld->one))
433        return bld->one;
434
435      if (!type.floating && !type.fixed) {
436         if (LLVM_VERSION_MAJOR >= 8) {
437            char intrin[32];
438            intrinsic = type.sign ? "llvm.sadd.sat" : "llvm.uadd.sat";
439            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
440            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
441         }
442         if (type.width * type.length == 128) {
443            if (util_get_cpu_caps()->has_sse2) {
444               if (type.width == 8)
445                 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
446               if (type.width == 16)
447                 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
448            } else if (util_get_cpu_caps()->has_altivec) {
449               if (type.width == 8)
450                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs";
451               if (type.width == 16)
452                  intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs";
453            }
454         }
455         if (type.width * type.length == 256) {
456            if (util_get_cpu_caps()->has_avx2) {
457               if (type.width == 8)
458                  intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b";
459               if (type.width == 16)
460                  intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w";
461            }
462         }
463      }
464
465      if (intrinsic)
466         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
467   }
468
469   if(type.norm && !type.floating && !type.fixed) {
470      if (type.sign) {
471         uint64_t sign = (uint64_t)1 << (type.width - 1);
472         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
473         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
474         /* a_clamp_max is the maximum a for positive b,
475            a_clamp_min is the minimum a for negative b. */
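         /*
          * Worked 8-bit signed example (illustrative values only):
          *   max_val = 127, min_val = -128
          *   b =  100  ->  a_clamp_max = min(a,  27), so a + b <= 127
          *   b = -100  ->  a_clamp_min = max(a, -28), so a + b >= -128
          * i.e. a is pre-clamped so the plain add below cannot overflow.
          */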
476         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
477         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
478         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min);
479      }
480   }
481
482   if(LLVMIsConstant(a) && LLVMIsConstant(b))
483      if (type.floating)
484         res = LLVMConstFAdd(a, b);
485      else
486         res = LLVMConstAdd(a, b);
487   else
488      if (type.floating)
489         res = LLVMBuildFAdd(builder, a, b, "");
490      else
491         res = LLVMBuildAdd(builder, a, b, "");
492
493   /* clamp to ceiling of 1.0 */
494   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
495      res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
496
497   if (type.norm && !type.floating && !type.fixed) {
498      if (!type.sign) {
499         /*
500          * Newer llvm versions no longer support the intrinsics, but recognize
501          * the pattern. Since auto-upgrade of intrinsics doesn't work for jit
502          * code, it is important that we match the pattern llvm uses (and hope
503          * llvm doesn't change it - and that it decides on the same pattern for
504          * all backends supporting it...).
505          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
506          * interfere with llvm's ability to recognize the pattern but seems
507          * a bit brittle.
508          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
509          */
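         /*
          * Example of the wrap-detect pattern, 8-bit unsigned (illustrative):
          *   a = 200, b = 100  ->  res = 44 (wrapped),  a > res   ->  select all-ones (255)
          *   a = 100, b = 100  ->  res = 200 (no wrap), a <= res  ->  keep res
          */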
510         LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res);
511         res = lp_build_select(bld, overflowed,
512                               LLVMConstAllOnes(bld->int_vec_type), res);
513      }
514   }
515
516   /* XXX clamp to floor of -1 or 0??? */
517
518   return res;
519}
520
521
522/** Return the scalar sum of the elements of a.
523 * This operation should be avoided whenever possible.
524 */
525LLVMValueRef
526lp_build_horizontal_add(struct lp_build_context *bld,
527                        LLVMValueRef a)
528{
529   LLVMBuilderRef builder = bld->gallivm->builder;
530   const struct lp_type type = bld->type;
531   LLVMValueRef index, res;
532   unsigned i, length;
533   LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 2];
534   LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 2];
535   LLVMValueRef vecres, elem2;
536
537   assert(lp_check_value(type, a));
538
539   if (type.length == 1) {
540      return a;
541   }
542
543   assert(!bld->type.norm);
544
545   /*
546    * For byte vectors we could do much better with psadbw.
547    * Using repeated shuffle/adds here. Note that with multiple vectors
548    * this can be done more efficiently as outlined in the intel
549    * optimization manual.
550    * Note: could cause data rearrangement if used with smaller element
551    * sizes.
552    */
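   /*
    * Reduction sketch for a length-8 vector with elements v0..v7:
    *   pass 1: [v0+v4, v1+v5, v2+v6, v3+v7]
    *   pass 2: [v0+v4+v2+v6, v1+v5+v3+v7]
    *   final : extract the two remaining elements and add them.
    */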
553
554   vecres = a;
555   length = type.length / 2;
556   while (length > 1) {
557      LLVMValueRef vec1, vec2;
558      for (i = 0; i < length; i++) {
559         shuffles1[i] = lp_build_const_int32(bld->gallivm, i);
560         shuffles2[i] = lp_build_const_int32(bld->gallivm, i + length);
561      }
562      vec1 = LLVMBuildShuffleVector(builder, vecres, vecres,
563                                    LLVMConstVector(shuffles1, length), "");
564      vec2 = LLVMBuildShuffleVector(builder, vecres, vecres,
565                                    LLVMConstVector(shuffles2, length), "");
566      if (type.floating) {
567         vecres = LLVMBuildFAdd(builder, vec1, vec2, "");
568      }
569      else {
570         vecres = LLVMBuildAdd(builder, vec1, vec2, "");
571      }
572      length = length >> 1;
573   }
574
575   /* always have vector of size 2 here */
576   assert(length == 1);
577
578   index = lp_build_const_int32(bld->gallivm, 0);
579   res = LLVMBuildExtractElement(builder, vecres, index, "");
580   index = lp_build_const_int32(bld->gallivm, 1);
581   elem2 = LLVMBuildExtractElement(builder, vecres, index, "");
582
583   if (type.floating)
584      res = LLVMBuildFAdd(builder, res, elem2, "");
585   else
586      res = LLVMBuildAdd(builder, res, elem2, "");
587
588   return res;
589}
590
591/**
592 * Return the horizontal sums of 4 float vectors as a float4 vector.
593 * This uses the technique outlined in the Intel Optimization Manual.
594 */
595static LLVMValueRef
596lp_build_horizontal_add4x4f(struct lp_build_context *bld,
597                            LLVMValueRef src[4])
598{
599   struct gallivm_state *gallivm = bld->gallivm;
600   LLVMBuilderRef builder = gallivm->builder;
601   LLVMValueRef shuffles[4];
602   LLVMValueRef tmp[4];
603   LLVMValueRef sumtmp[2], shuftmp[2];
604
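   /*
    * Data flow sketch (src[0..3] hold x, y, z, w; e.g. x = {x0,x1,x2,x3}):
    *   tmp[0] = {x0,x1,y0,y1}   tmp[1] = {x2,x3,y2,y3}
    *   tmp[2] = {z0,z1,w0,w1}   tmp[3] = {z2,z3,w2,w3}
    *   sumtmp[0] = tmp[0] + tmp[1]   sumtmp[1] = tmp[2] + tmp[3]
    * and the final shuffles pair up even/odd lanes so the last add yields
    *   {sum(x), sum(y), sum(z), sum(w)}.
    */
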
605   /* lower half of regs */
606   shuffles[0] = lp_build_const_int32(gallivm, 0);
607   shuffles[1] = lp_build_const_int32(gallivm, 1);
608   shuffles[2] = lp_build_const_int32(gallivm, 4);
609   shuffles[3] = lp_build_const_int32(gallivm, 5);
610   tmp[0] = LLVMBuildShuffleVector(builder, src[0], src[1],
611                                   LLVMConstVector(shuffles, 4), "");
612   tmp[2] = LLVMBuildShuffleVector(builder, src[2], src[3],
613                                   LLVMConstVector(shuffles, 4), "");
614
615   /* upper half of regs */
616   shuffles[0] = lp_build_const_int32(gallivm, 2);
617   shuffles[1] = lp_build_const_int32(gallivm, 3);
618   shuffles[2] = lp_build_const_int32(gallivm, 6);
619   shuffles[3] = lp_build_const_int32(gallivm, 7);
620   tmp[1] = LLVMBuildShuffleVector(builder, src[0], src[1],
621                                   LLVMConstVector(shuffles, 4), "");
622   tmp[3] = LLVMBuildShuffleVector(builder, src[2], src[3],
623                                   LLVMConstVector(shuffles, 4), "");
624
625   sumtmp[0] = LLVMBuildFAdd(builder, tmp[0], tmp[1], "");
626   sumtmp[1] = LLVMBuildFAdd(builder, tmp[2], tmp[3], "");
627
628   shuffles[0] = lp_build_const_int32(gallivm, 0);
629   shuffles[1] = lp_build_const_int32(gallivm, 2);
630   shuffles[2] = lp_build_const_int32(gallivm, 4);
631   shuffles[3] = lp_build_const_int32(gallivm, 6);
632   shuftmp[0] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
633                                       LLVMConstVector(shuffles, 4), "");
634
635   shuffles[0] = lp_build_const_int32(gallivm, 1);
636   shuffles[1] = lp_build_const_int32(gallivm, 3);
637   shuffles[2] = lp_build_const_int32(gallivm, 5);
638   shuffles[3] = lp_build_const_int32(gallivm, 7);
639   shuftmp[1] = LLVMBuildShuffleVector(builder, sumtmp[0], sumtmp[1],
640                                       LLVMConstVector(shuffles, 4), "");
641
642   return LLVMBuildFAdd(builder, shuftmp[0], shuftmp[1], "");
643}
644
645
646/*
647 * Partially horizontally add 2-4 float vectors of length n*4,
648 * i.e. only four adjacent values in each vector will be added,
649 * assuming the values are really grouped in 4 (which also determines
650 * the output order).
651 *
652 * Return a vector of the same length as the initial vectors,
653 * with the excess elements (if any) being undefined.
654 * The element order is independent of the number of input vectors.
655 * For 3 vectors x0x1x2x3x4x5x6x7, y0y1y2y3y4y5y6y7, z0z1z2z3z4z5z6z7
656 * the output order thus will be
657 * sumx0-x3,sumy0-y3,sumz0-z3,undef,sumx4-x7,sumy4-y7,sumz4-z7,undef
658 */
659LLVMValueRef
660lp_build_hadd_partial4(struct lp_build_context *bld,
661                       LLVMValueRef vectors[],
662                       unsigned num_vecs)
663{
664   struct gallivm_state *gallivm = bld->gallivm;
665   LLVMBuilderRef builder = gallivm->builder;
666   LLVMValueRef ret_vec;
667   LLVMValueRef tmp[4];
668   const char *intrinsic = NULL;
669
670   assert(num_vecs >= 2 && num_vecs <= 4);
671   assert(bld->type.floating);
672
673   /* only use this with at least 2 vectors, as it is sort of expensive
674    * (depending on cpu) and we always need two horizontal adds anyway,
675    * so a shuffle/add approach might be better.
676    */
677
678   tmp[0] = vectors[0];
679   tmp[1] = vectors[1];
680
681   tmp[2] = num_vecs > 2 ? vectors[2] : vectors[0];
682   tmp[3] = num_vecs > 3 ? vectors[3] : vectors[0];
683
684   if (util_get_cpu_caps()->has_sse3 && bld->type.width == 32 &&
685       bld->type.length == 4) {
686      intrinsic = "llvm.x86.sse3.hadd.ps";
687   }
688   else if (util_get_cpu_caps()->has_avx && bld->type.width == 32 &&
689            bld->type.length == 8) {
690      intrinsic = "llvm.x86.avx.hadd.ps.256";
691   }
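   /*
    * With haddps-style semantics, hadd(p, q) = {p0+p1, p2+p3, q0+q1, q2+q3},
    * so for four 4-wide inputs (sketch): hadd(hadd(x, y), hadd(z, w)) =
    * {sum(x), sum(y), sum(z), sum(w)}, matching the order documented above.
    */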
692   if (intrinsic) {
693      tmp[0] = lp_build_intrinsic_binary(builder, intrinsic,
694                                       lp_build_vec_type(gallivm, bld->type),
695                                       tmp[0], tmp[1]);
696      if (num_vecs > 2) {
697         tmp[1] = lp_build_intrinsic_binary(builder, intrinsic,
698                                          lp_build_vec_type(gallivm, bld->type),
699                                          tmp[2], tmp[3]);
700      }
701      else {
702         tmp[1] = tmp[0];
703      }
704      return lp_build_intrinsic_binary(builder, intrinsic,
705                                       lp_build_vec_type(gallivm, bld->type),
706                                       tmp[0], tmp[1]);
707   }
708
709   if (bld->type.length == 4) {
710      ret_vec = lp_build_horizontal_add4x4f(bld, tmp);
711   }
712   else {
713      LLVMValueRef partres[LP_MAX_VECTOR_LENGTH/4];
714      unsigned j;
715      unsigned num_iter = bld->type.length / 4;
716      struct lp_type parttype = bld->type;
717      parttype.length = 4;
718      for (j = 0; j < num_iter; j++) {
719         LLVMValueRef partsrc[4];
720         unsigned i;
721         for (i = 0; i < 4; i++) {
722            partsrc[i] = lp_build_extract_range(gallivm, tmp[i], j*4, 4);
723         }
724         partres[j] = lp_build_horizontal_add4x4f(bld, partsrc);
725      }
726      ret_vec = lp_build_concat(gallivm, partres, parttype, num_iter);
727   }
728   return ret_vec;
729}
730
731/**
732 * Generate a - b
733 */
734LLVMValueRef
735lp_build_sub(struct lp_build_context *bld,
736             LLVMValueRef a,
737             LLVMValueRef b)
738{
739   LLVMBuilderRef builder = bld->gallivm->builder;
740   const struct lp_type type = bld->type;
741   LLVMValueRef res;
742
743   assert(lp_check_value(type, a));
744   assert(lp_check_value(type, b));
745
746   if (b == bld->zero)
747      return a;
748   if (a == bld->undef || b == bld->undef)
749      return bld->undef;
750   if (a == b)
751      return bld->zero;
752
753   if (type.norm) {
754      const char *intrinsic = NULL;
755
756      if (!type.sign && b == bld->one)
757        return bld->zero;
758
759      if (!type.floating && !type.fixed) {
760         if (LLVM_VERSION_MAJOR >= 8) {
761            char intrin[32];
762            intrinsic = type.sign ? "llvm.ssub.sat" : "llvm.usub.sat";
763            lp_format_intrinsic(intrin, sizeof intrin, intrinsic, bld->vec_type);
764            return lp_build_intrinsic_binary(builder, intrin, bld->vec_type, a, b);
765         }
766         if (type.width * type.length == 128) {
767            if (util_get_cpu_caps()->has_sse2) {
768               if (type.width == 8)
769                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
770               if (type.width == 16)
771                  intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
772            } else if (util_get_cpu_caps()->has_altivec) {
773               if (type.width == 8)
774                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs";
775               if (type.width == 16)
776                  intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs";
777            }
778         }
779         if (type.width * type.length == 256) {
780            if (util_get_cpu_caps()->has_avx2) {
781               if (type.width == 8)
782                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b";
783               if (type.width == 16)
784                  intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w";
785            }
786         }
787      }
788
789      if (intrinsic)
790         return lp_build_intrinsic_binary(builder, intrinsic, lp_build_vec_type(bld->gallivm, bld->type), a, b);
791   }
792
793   if(type.norm && !type.floating && !type.fixed) {
794      if (type.sign) {
795         uint64_t sign = (uint64_t)1 << (type.width - 1);
796         LLVMValueRef max_val = lp_build_const_int_vec(bld->gallivm, type, sign - 1);
797         LLVMValueRef min_val = lp_build_const_int_vec(bld->gallivm, type, sign);
798         /* a_clamp_max is the maximum a for negative b,
799            a_clamp_min is the minimum a for positive b. */
800         LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildAdd(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
801         LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED);
802         a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max);
803      } else {
804         /*
805          * This must match llvm pattern for saturated unsigned sub.
806          * (lp_build_max_simple actually does the job with its current
807          * definition but do it explicitly here.)
808          * NOTE: cmp/select does sext/trunc of the mask. Does not seem to
809          * interfere with llvm's ability to recognize the pattern but seems
810          * a bit brittle.
811          * NOTE: llvm 9+ always uses (non arch specific) intrinsic.
812          */
813         LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
814         a = lp_build_select(bld, no_ov, a, b);
815      }
816   }
817
818   if(LLVMIsConstant(a) && LLVMIsConstant(b))
819      if (type.floating)
820         res = LLVMConstFSub(a, b);
821      else
822         res = LLVMConstSub(a, b);
823   else
824      if (type.floating)
825         res = LLVMBuildFSub(builder, a, b, "");
826      else
827         res = LLVMBuildSub(builder, a, b, "");
828
829   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
830      res = lp_build_max_simple(bld, res, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
831
832   return res;
833}
834
835
836
837/**
838 * Normalized multiplication.
839 *
840 * There are several approaches for (using 8-bit normalized multiplication as
841 * an example):
842 *
843 * - alpha plus one
844 *
845 *     makes the following approximation to the division (Sree)
846 *
847 *       a*b/255 ~= (a*(b + 1)) >> 8
848 *
849 *     which is the fastest method that satisfies the following OpenGL criteria of
850 *
851 *       0*0 = 0 and 255*255 = 255
852 *
853 * - geometric series
854 *
855 *     takes the geometric series approximation to the division
856 *
857 *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
858 *
859 *     in this case just the first two terms to fit in 16bit arithmetic
860 *
861 *       t/255 ~= (t + (t >> 8)) >> 8
862 *
863 *     note that just by itself it doesn't satisfy the OpenGL criteria, as
864 *     255*255 = 254, so the special case b = 255 must be accounted for, or
865 *     rounding must be used.
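 *     (For example t = 255*255 = 65025: (65025 + (65025 >> 8)) >> 8 = 254,
 *     one too low.)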
866 *
867 * - geometric series plus rounding
868 *
869 *     when using the geometric series division, instead of truncating the
870 *     result use rounding in the approximation (Jim Blinn)
871 *
872 *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
873 *
874 *     achieving exact results.
875 *
876 *
877 *
878 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
879 *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
880 * @sa Michael Herf, The "double blend trick", May 2000,
881 *     http://www.stereopsis.com/doubleblend.html
882 */
883LLVMValueRef
884lp_build_mul_norm(struct gallivm_state *gallivm,
885                  struct lp_type wide_type,
886                  LLVMValueRef a, LLVMValueRef b)
887{
888   LLVMBuilderRef builder = gallivm->builder;
889   struct lp_build_context bld;
890   unsigned n;
891   LLVMValueRef half;
892   LLVMValueRef ab;
893
894   assert(!wide_type.floating);
895   assert(lp_check_value(wide_type, a));
896   assert(lp_check_value(wide_type, b));
897
898   lp_build_context_init(&bld, gallivm, wide_type);
899
900   n = wide_type.width / 2;
901   if (wide_type.sign) {
902      --n;
903   }
904
905   /*
906    * TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW
907    * http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/
908    */
909
910   /*
911    * a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n
912    */
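   /*
    * Sanity check for the unsigned 8-bit case (n = 8, half = 0x80), as a
    * sketch: a = b = 255 gives ab = 65025, 65025 + (65025 >> 8) + 128 = 65407,
    * and 65407 >> 8 = 255, so the 255*255 = 255 endpoint holds exactly.
    */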
913
914   ab = LLVMBuildMul(builder, a, b, "");
915   ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), "");
916
917   /*
918    * half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1))
919    */
920
921   half = lp_build_const_int_vec(gallivm, wide_type, 1LL << (n - 1));
922   if (wide_type.sign) {
923      LLVMValueRef minus_half = LLVMBuildNeg(builder, half, "");
924      LLVMValueRef sign = lp_build_shr_imm(&bld, ab, wide_type.width - 1);
925      half = lp_build_select(&bld, sign, minus_half, half);
926   }
927   ab = LLVMBuildAdd(builder, ab, half, "");
928
929   /* Final division */
930   ab = lp_build_shr_imm(&bld, ab, n);
931
932   return ab;
933}
934
935/**
936 * Generate a * b
937 */
938LLVMValueRef
939lp_build_mul(struct lp_build_context *bld,
940             LLVMValueRef a,
941             LLVMValueRef b)
942{
943   LLVMBuilderRef builder = bld->gallivm->builder;
944   const struct lp_type type = bld->type;
945   LLVMValueRef shift;
946   LLVMValueRef res;
947
948   assert(lp_check_value(type, a));
949   assert(lp_check_value(type, b));
950
951   if(a == bld->zero)
952      return bld->zero;
953   if(a == bld->one)
954      return b;
955   if(b == bld->zero)
956      return bld->zero;
957   if(b == bld->one)
958      return a;
959   if(a == bld->undef || b == bld->undef)
960      return bld->undef;
961
962   if (!type.floating && !type.fixed && type.norm) {
963      struct lp_type wide_type = lp_wider_type(type);
964      LLVMValueRef al, ah, bl, bh, abl, abh, ab;
965
966      lp_build_unpack2_native(bld->gallivm, type, wide_type, a, &al, &ah);
967      lp_build_unpack2_native(bld->gallivm, type, wide_type, b, &bl, &bh);
968
969      /* PMULLW, PSRLW, PADDW */
970      abl = lp_build_mul_norm(bld->gallivm, wide_type, al, bl);
971      abh = lp_build_mul_norm(bld->gallivm, wide_type, ah, bh);
972
973      ab = lp_build_pack2_native(bld->gallivm, wide_type, type, abl, abh);
974
975      return ab;
976   }
977
978   if(type.fixed)
979      shift = lp_build_const_int_vec(bld->gallivm, type, type.width/2);
980   else
981      shift = NULL;
982
983   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
984      if (type.floating)
985         res = LLVMConstFMul(a, b);
986      else
987         res = LLVMConstMul(a, b);
988      if(shift) {
989         if(type.sign)
990            res = LLVMConstAShr(res, shift);
991         else
992            res = LLVMConstLShr(res, shift);
993      }
994   }
995   else {
996      if (type.floating)
997         res = LLVMBuildFMul(builder, a, b, "");
998      else
999         res = LLVMBuildMul(builder, a, b, "");
1000      if(shift) {
1001         if(type.sign)
1002            res = LLVMBuildAShr(builder, res, shift, "");
1003         else
1004            res = LLVMBuildLShr(builder, res, shift, "");
1005      }
1006   }
1007
1008   return res;
1009}
1010
1011/*
1012 * Widening mul, valid for 32x32 bit -> 64bit only.
1013 * Result is low 32bits, high bits returned in res_hi.
1014 *
1015 * Emits code that is meant to be compiled for the host CPU.
1016 */
1017LLVMValueRef
1018lp_build_mul_32_lohi_cpu(struct lp_build_context *bld,
1019                         LLVMValueRef a,
1020                         LLVMValueRef b,
1021                         LLVMValueRef *res_hi)
1022{
1023   struct gallivm_state *gallivm = bld->gallivm;
1024   LLVMBuilderRef builder = gallivm->builder;
1025
1026   assert(bld->type.width == 32);
1027   assert(bld->type.floating == 0);
1028   assert(bld->type.fixed == 0);
1029   assert(bld->type.norm == 0);
1030
1031   /*
1032    * XXX: for some reason, with zext/zext/mul/trunc the code llvm produces
1033    * for x86 simd is atrocious (even if the high bits weren't required):
1034    * it tries to handle real 64bit inputs (which of course can't happen,
1035    * since the 32bit numbers are zero-extended to 64bit, but apparently
1036    * llvm does not recognize this as a widening mul). This results in 6
1037    * (instead of 2) pmuludq instructions plus extra adds and shifts.
1038    * The same story applies to signed mul, albeit fixing this requires sse41.
1039    * https://llvm.org/bugs/show_bug.cgi?id=30845
1040    * So, whip up our own code, albeit only for length 4 and 8 (which
1041    * should be good enough)...
1042    * FIXME: For llvm >= 7.0 we should match the autoupgrade pattern
1043    * (bitcast/and/mul/shuffle for unsigned, bitcast/shl/ashr/mul/shuffle
1044    * for signed), which the fallback code does not, without this llvm
1045    * will likely still produce atrocious code.
1046    */
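   /*
    * Note on the intrinsics used below: pmuludq/pmuldq multiply only the even
    * 32-bit lanes of each input, producing 64-bit results; the odd lanes are
    * handled by first shuffling them into even positions (aodd/bodd).
    */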
1047   if (LLVM_VERSION_MAJOR < 7 &&
1048       (bld->type.length == 4 || bld->type.length == 8) &&
1049       ((util_get_cpu_caps()->has_sse2 && (bld->type.sign == 0)) ||
1050        util_get_cpu_caps()->has_sse4_1)) {
1051      const char *intrinsic = NULL;
1052      LLVMValueRef aeven, aodd, beven, bodd, muleven, mulodd;
1053      LLVMValueRef shuf[LP_MAX_VECTOR_WIDTH / 32], shuf_vec;
1054      struct lp_type type_wide = lp_wider_type(bld->type);
1055      LLVMTypeRef wider_type = lp_build_vec_type(gallivm, type_wide);
1056      unsigned i;
1057      for (i = 0; i < bld->type.length; i += 2) {
1058         shuf[i] = lp_build_const_int32(gallivm, i+1);
1059         shuf[i+1] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
1060      }
1061      shuf_vec = LLVMConstVector(shuf, bld->type.length);
1062      aeven = a;
1063      beven = b;
1064      aodd = LLVMBuildShuffleVector(builder, aeven, bld->undef, shuf_vec, "");
1065      bodd = LLVMBuildShuffleVector(builder, beven, bld->undef, shuf_vec, "");
1066
1067      if (util_get_cpu_caps()->has_avx2 && bld->type.length == 8) {
1068         if (bld->type.sign) {
1069            intrinsic = "llvm.x86.avx2.pmul.dq";
1070         } else {
1071            intrinsic = "llvm.x86.avx2.pmulu.dq";
1072         }
1073         muleven = lp_build_intrinsic_binary(builder, intrinsic,
1074                                             wider_type, aeven, beven);
1075         mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1076                                            wider_type, aodd, bodd);
1077      }
1078      else {
1079         /* for consistent naming look elsewhere... */
1080         if (bld->type.sign) {
1081            intrinsic = "llvm.x86.sse41.pmuldq";
1082         } else {
1083            intrinsic = "llvm.x86.sse2.pmulu.dq";
1084         }
1085         /*
1086          * XXX If we only have AVX but not AVX2 this is a pain.
1087          * lp_build_intrinsic_binary_anylength() can't handle it
1088          * (due to src and dst type not being identical).
1089          */
1090         if (bld->type.length == 8) {
1091            LLVMValueRef aevenlo, aevenhi, bevenlo, bevenhi;
1092            LLVMValueRef aoddlo, aoddhi, boddlo, boddhi;
1093            LLVMValueRef muleven2[2], mulodd2[2];
1094            struct lp_type type_wide_half = type_wide;
1095            LLVMTypeRef wtype_half;
1096            type_wide_half.length = 2;
1097            wtype_half = lp_build_vec_type(gallivm, type_wide_half);
1098            aevenlo = lp_build_extract_range(gallivm, aeven, 0, 4);
1099            aevenhi = lp_build_extract_range(gallivm, aeven, 4, 4);
1100            bevenlo = lp_build_extract_range(gallivm, beven, 0, 4);
1101            bevenhi = lp_build_extract_range(gallivm, beven, 4, 4);
1102            aoddlo = lp_build_extract_range(gallivm, aodd, 0, 4);
1103            aoddhi = lp_build_extract_range(gallivm, aodd, 4, 4);
1104            boddlo = lp_build_extract_range(gallivm, bodd, 0, 4);
1105            boddhi = lp_build_extract_range(gallivm, bodd, 4, 4);
1106            muleven2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1107                                                    wtype_half, aevenlo, bevenlo);
1108            mulodd2[0] = lp_build_intrinsic_binary(builder, intrinsic,
1109                                                   wtype_half, aoddlo, boddlo);
1110            muleven2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1111                                                    wtype_half, aevenhi, bevenhi);
1112            mulodd2[1] = lp_build_intrinsic_binary(builder, intrinsic,
1113                                                   wtype_half, aoddhi, boddhi);
1114            muleven = lp_build_concat(gallivm, muleven2, type_wide_half, 2);
1115            mulodd = lp_build_concat(gallivm, mulodd2, type_wide_half, 2);
1116
1117         }
1118         else {
1119            muleven = lp_build_intrinsic_binary(builder, intrinsic,
1120                                                wider_type, aeven, beven);
1121            mulodd = lp_build_intrinsic_binary(builder, intrinsic,
1122                                               wider_type, aodd, bodd);
1123         }
1124      }
1125      muleven = LLVMBuildBitCast(builder, muleven, bld->vec_type, "");
1126      mulodd = LLVMBuildBitCast(builder, mulodd, bld->vec_type, "");
1127
1128      for (i = 0; i < bld->type.length; i += 2) {
1129         shuf[i] = lp_build_const_int32(gallivm, i + 1);
1130         shuf[i+1] = lp_build_const_int32(gallivm, i + 1 + bld->type.length);
1131      }
1132      shuf_vec = LLVMConstVector(shuf, bld->type.length);
1133      *res_hi = LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1134
1135      for (i = 0; i < bld->type.length; i += 2) {
1136         shuf[i] = lp_build_const_int32(gallivm, i);
1137         shuf[i+1] = lp_build_const_int32(gallivm, i + bld->type.length);
1138      }
1139      shuf_vec = LLVMConstVector(shuf, bld->type.length);
1140      return LLVMBuildShuffleVector(builder, muleven, mulodd, shuf_vec, "");
1141   }
1142   else {
1143      return lp_build_mul_32_lohi(bld, a, b, res_hi);
1144   }
1145}
1146
1147
1148/*
1149 * Widening mul, valid for 8, 16 and 32 bit types (widened internally to
1150 * 32 or 64 bits). Result is the low N bits, high bits returned in res_hi.
1151 *
1152 * Emits generic code.
1153 */
1154LLVMValueRef
1155lp_build_mul_32_lohi(struct lp_build_context *bld,
1156                     LLVMValueRef a,
1157                     LLVMValueRef b,
1158                     LLVMValueRef *res_hi)
1159{
1160   struct gallivm_state *gallivm = bld->gallivm;
1161   LLVMBuilderRef builder = gallivm->builder;
1162   LLVMValueRef tmp, shift, res_lo;
1163   struct lp_type type_tmp;
1164   LLVMTypeRef wide_type, narrow_type;
1165
1166   type_tmp = bld->type;
1167   narrow_type = lp_build_vec_type(gallivm, type_tmp);
1168   if (bld->type.width < 32)
1169      type_tmp.width = 32;
1170   else
1171      type_tmp.width *= 2;
1172   wide_type = lp_build_vec_type(gallivm, type_tmp);
1173   shift = lp_build_const_vec(gallivm, type_tmp, bld->type.width);
1174
1175   if (bld->type.sign) {
1176      a = LLVMBuildSExt(builder, a, wide_type, "");
1177      b = LLVMBuildSExt(builder, b, wide_type, "");
1178   } else {
1179      a = LLVMBuildZExt(builder, a, wide_type, "");
1180      b = LLVMBuildZExt(builder, b, wide_type, "");
1181   }
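   /*
    * Example (unsigned 32-bit, illustrative): a = 0x80000000, b = 4 gives a
    * full 64-bit product of 0x200000000, so res_lo = 0 and *res_hi = 2.
    */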
1182   tmp = LLVMBuildMul(builder, a, b, "");
1183
1184   res_lo = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1185
1186   /* Since we truncate anyway, LShr and AShr are equivalent. */
1187   tmp = LLVMBuildLShr(builder, tmp, shift, "");
1188   *res_hi = LLVMBuildTrunc(builder, tmp, narrow_type, "");
1189
1190   return res_lo;
1191}
1192
1193
1194/* a * b + c */
1195LLVMValueRef
1196lp_build_mad(struct lp_build_context *bld,
1197             LLVMValueRef a,
1198             LLVMValueRef b,
1199             LLVMValueRef c)
1200{
1201   const struct lp_type type = bld->type;
1202   if (type.floating) {
1203      return lp_build_fmuladd(bld->gallivm->builder, a, b, c);
1204   } else {
1205      return lp_build_add(bld, lp_build_mul(bld, a, b), c);
1206   }
1207}
1208
1209
1210/**
1211 * Optimized multiplication of a vector by a small integer scale factor.
1212 */
1213LLVMValueRef
1214lp_build_mul_imm(struct lp_build_context *bld,
1215                 LLVMValueRef a,
1216                 int b)
1217{
1218   LLVMBuilderRef builder = bld->gallivm->builder;
1219   LLVMValueRef factor;
1220
1221   assert(lp_check_value(bld->type, a));
1222
1223   if(b == 0)
1224      return bld->zero;
1225
1226   if(b == 1)
1227      return a;
1228
1229   if(b == -1)
1230      return lp_build_negate(bld, a);
1231
1232   if(b == 2 && bld->type.floating)
1233      return lp_build_add(bld, a, a);
1234
1235   if(util_is_power_of_two_or_zero(b)) {
1236      unsigned shift = ffs(b) - 1;
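      /* e.g. b = 8 -> ffs(8) = 4 -> shift = 3, i.e. a multiply by 8 becomes a
       * left shift by 3 for integer types (illustrative).
       */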
1237
1238      if(bld->type.floating) {
1239#if 0
1240         /*
1241          * Power of two multiplication by directly manipulating the exponent.
1242          *
1243          * XXX: This might not be always faster, it will introduce a small error
1244          * for multiplication by zero, and it will produce wrong results
1245          * for Inf and NaN.
1246          */
1247         unsigned mantissa = lp_mantissa(bld->type);
1248         factor = lp_build_const_int_vec(bld->gallivm, bld->type, (unsigned long long)shift << mantissa);
1249         a = LLVMBuildBitCast(builder, a, lp_build_int_vec_type(bld->type), "");
1250         a = LLVMBuildAdd(builder, a, factor, "");
1251         a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, bld->type), "");
1252         return a;
1253#endif
1254      }
1255      else {
1256         factor = lp_build_const_vec(bld->gallivm, bld->type, shift);
1257         return LLVMBuildShl(builder, a, factor, "");
1258      }
1259   }
1260
1261   factor = lp_build_const_vec(bld->gallivm, bld->type, (double)b);
1262   return lp_build_mul(bld, a, factor);
1263}
1264
1265
1266/**
1267 * Generate a / b
1268 */
1269LLVMValueRef
1270lp_build_div(struct lp_build_context *bld,
1271             LLVMValueRef a,
1272             LLVMValueRef b)
1273{
1274   LLVMBuilderRef builder = bld->gallivm->builder;
1275   const struct lp_type type = bld->type;
1276
1277   assert(lp_check_value(type, a));
1278   assert(lp_check_value(type, b));
1279
1280   if(a == bld->zero)
1281      return bld->zero;
1282   if(a == bld->one && type.floating)
1283      return lp_build_rcp(bld, b);
1284   if(b == bld->zero)
1285      return bld->undef;
1286   if(b == bld->one)
1287      return a;
1288   if(a == bld->undef || b == bld->undef)
1289      return bld->undef;
1290
1291   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
1292      if (type.floating)
1293         return LLVMConstFDiv(a, b);
1294      else if (type.sign)
1295         return LLVMConstSDiv(a, b);
1296      else
1297         return LLVMConstUDiv(a, b);
1298   }
1299
1300   /* fast rcp is disabled (just uses div), so makes no sense to try that */
1301   if(FALSE &&
1302      ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
1303       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) &&
1304      type.floating)
1305      return lp_build_mul(bld, a, lp_build_rcp(bld, b));
1306
1307   if (type.floating)
1308      return LLVMBuildFDiv(builder, a, b, "");
1309   else if (type.sign)
1310      return LLVMBuildSDiv(builder, a, b, "");
1311   else
1312      return LLVMBuildUDiv(builder, a, b, "");
1313}
1314
1315
1316/**
1317 * Linear interpolation helper.
1318 *
1319 * @param flags LP_BLD_LERP_WIDE_NORMALIZED indicates we are interpolating
1320 *        normalized values encoded in integers twice their original width.
1321 *
1322 * @sa http://www.stereopsis.com/doubleblend.html
1323 */
1324static inline LLVMValueRef
1325lp_build_lerp_simple(struct lp_build_context *bld,
1326                     LLVMValueRef x,
1327                     LLVMValueRef v0,
1328                     LLVMValueRef v1,
1329                     unsigned flags)
1330{
1331   unsigned half_width = bld->type.width/2;
1332   LLVMBuilderRef builder = bld->gallivm->builder;
1333   LLVMValueRef delta;
1334   LLVMValueRef res;
1335
1336   assert(lp_check_value(bld->type, x));
1337   assert(lp_check_value(bld->type, v0));
1338   assert(lp_check_value(bld->type, v1));
1339
1340   delta = lp_build_sub(bld, v1, v0);
1341
1342   if (bld->type.floating) {
1343      assert(flags == 0);
1344      return lp_build_mad(bld, x, delta, v0);
1345   }
1346
1347   if (flags & LP_BLD_LERP_WIDE_NORMALIZED) {
1348      if (!bld->type.sign) {
1349         if (!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS)) {
1350            /*
1351             * Scale x from [0, 2**n - 1] to [0, 2**n] by adding the
1352             * most-significant bit to the least-significant bit, so that
1353             * later we can just divide by 2**n instead of 2**n - 1.
1354             */
1355
1356            x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1));
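            /* e.g. n = 8: x = 255 becomes 255 + (255 >> 7) = 256 = 2**8, while
             * x = 0 stays 0 (illustrative values).
             */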
1357         }
1358
1359         /* (x * delta) >> n */
1360         /*
1361          * For this multiply, higher internal precision is required to pass CTS;
1362          * the most efficient path to that is pmulhrsw on SSSE3 and above.
1363          * This could be open-coded on other arches if conformance was required.
1364          */
1365         if (bld->type.width == 16 && bld->type.length == 8 && util_get_cpu_caps()->has_ssse3) {
1366            res = lp_build_intrinsic_binary(builder, "llvm.x86.ssse3.pmul.hr.sw.128", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1367            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1368         } else if (bld->type.width == 16 && bld->type.length == 16 && util_get_cpu_caps()->has_avx2) {
1369            res = lp_build_intrinsic_binary(builder, "llvm.x86.avx2.pmul.hr.sw", bld->vec_type, x, lp_build_shl_imm(bld, delta, 7));
1370            res = lp_build_and(bld, res, lp_build_const_int_vec(bld->gallivm, bld->type, 0xff));
1371         } else {
1372            res = lp_build_mul(bld, x, delta);
1373            res = lp_build_shr_imm(bld, res, half_width);
1374         }
1375      } else {
1376         /*
1377          * The rescaling trick above doesn't work for signed numbers, so
1378          * use the 2**n - 1 division approximation in lp_build_mul_norm
1379          * instead.
1380          */
1381         assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1382         res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta);
1383      }
1384   } else {
1385      assert(!(flags & LP_BLD_LERP_PRESCALED_WEIGHTS));
1386      res = lp_build_mul(bld, x, delta);
1387   }
1388
1389   if ((flags & LP_BLD_LERP_WIDE_NORMALIZED) && !bld->type.sign) {
1390      /*
1391       * At this point both res and v0 only use the lower half of the bits,
1392       * the rest is zero. Instead of add / mask, do add with half wide type.
1393       */
1394      struct lp_type narrow_type;
1395      struct lp_build_context narrow_bld;
1396
1397      memset(&narrow_type, 0, sizeof narrow_type);
1398      narrow_type.sign   = bld->type.sign;
1399      narrow_type.width  = bld->type.width/2;
1400      narrow_type.length = bld->type.length*2;
1401
1402      lp_build_context_init(&narrow_bld, bld->gallivm, narrow_type);
1403      res = LLVMBuildBitCast(builder, res, narrow_bld.vec_type, "");
1404      v0 = LLVMBuildBitCast(builder, v0, narrow_bld.vec_type, "");
1405      res = lp_build_add(&narrow_bld, v0, res);
1406      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
1407   } else {
1408      res = lp_build_add(bld, v0, res);
1409
1410      if (bld->type.fixed) {
1411         /*
1412          * We need to mask out the high order bits when lerping 8bit
1413          * normalized colors stored in 16 bits.
1414          */
1415         /* XXX: This step is necessary for lerping 8bit colors stored on
1416          * 16bits, but it will be wrong for true fixed point use cases.
1417          * Basically we need a more powerful lp_type, capable of further
1418          * distinguishing the values interpretation from the value storage.
1419          */
1420         LLVMValueRef low_bits;
1421         low_bits = lp_build_const_int_vec(bld->gallivm, bld->type, (1 << half_width) - 1);
1422         res = LLVMBuildAnd(builder, res, low_bits, "");
1423      }
1424   }
1425
1426   return res;
1427}
1428
1429
1430/**
1431 * Linear interpolation.
1432 */
1433LLVMValueRef
1434lp_build_lerp(struct lp_build_context *bld,
1435              LLVMValueRef x,
1436              LLVMValueRef v0,
1437              LLVMValueRef v1,
1438              unsigned flags)
1439{
1440   const struct lp_type type = bld->type;
1441   LLVMValueRef res;
1442
1443   assert(lp_check_value(type, x));
1444   assert(lp_check_value(type, v0));
1445   assert(lp_check_value(type, v1));
1446
1447   assert(!(flags & LP_BLD_LERP_WIDE_NORMALIZED));
1448
1449   if (type.norm) {
1450      struct lp_type wide_type;
1451      struct lp_build_context wide_bld;
1452      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
1453
1454      assert(type.length >= 2);
1455
1456      /*
1457       * Create a wider integer type, enough to hold the
1458       * intermediate result of the multiplication.
1459       */
1460      memset(&wide_type, 0, sizeof wide_type);
1461      wide_type.sign   = type.sign;
1462      wide_type.width  = type.width*2;
1463      wide_type.length = type.length/2;
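      /* e.g. a 16 x uint8 normalized type becomes 8 x uint16 for the
       * intermediate math (illustrative).
       */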
1464
1465      lp_build_context_init(&wide_bld, bld->gallivm, wide_type);
1466
1467      lp_build_unpack2_native(bld->gallivm, type, wide_type, x,  &xl,  &xh);
1468      lp_build_unpack2_native(bld->gallivm, type, wide_type, v0, &v0l, &v0h);
1469      lp_build_unpack2_native(bld->gallivm, type, wide_type, v1, &v1l, &v1h);
1470
1471      /*
1472       * Lerp both halves.
1473       */
1474
1475      flags |= LP_BLD_LERP_WIDE_NORMALIZED;
1476
1477      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l, flags);
1478      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h, flags);
1479
1480      res = lp_build_pack2_native(bld->gallivm, wide_type, type, resl, resh);
1481   } else {
1482      res = lp_build_lerp_simple(bld, x, v0, v1, flags);
1483   }
1484
1485   return res;
1486}
1487
1488
1489/**
1490 * Bilinear interpolation.
1491 *
1492 * Value indices are in v_{yx}.
1493 */
1494LLVMValueRef
1495lp_build_lerp_2d(struct lp_build_context *bld,
1496                 LLVMValueRef x,
1497                 LLVMValueRef y,
1498                 LLVMValueRef v00,
1499                 LLVMValueRef v01,
1500                 LLVMValueRef v10,
1501                 LLVMValueRef v11,
1502                 unsigned flags)
1503{
1504   LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01, flags);
1505   LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11, flags);
1506   return lp_build_lerp(bld, y, v0, v1, flags);
1507}
1508
1509
1510LLVMValueRef
1511lp_build_lerp_3d(struct lp_build_context *bld,
1512                 LLVMValueRef x,
1513                 LLVMValueRef y,
1514                 LLVMValueRef z,
1515                 LLVMValueRef v000,
1516                 LLVMValueRef v001,
1517                 LLVMValueRef v010,
1518                 LLVMValueRef v011,
1519                 LLVMValueRef v100,
1520                 LLVMValueRef v101,
1521                 LLVMValueRef v110,
1522                 LLVMValueRef v111,
1523                 unsigned flags)
1524{
1525   LLVMValueRef v0 = lp_build_lerp_2d(bld, x, y, v000, v001, v010, v011, flags);
1526   LLVMValueRef v1 = lp_build_lerp_2d(bld, x, y, v100, v101, v110, v111, flags);
1527   return lp_build_lerp(bld, z, v0, v1, flags);
1528}
1529
1530
1531/**
1532 * Generate min(a, b)
 * Do checks for special cases but not for NaNs.
1534 */
1535LLVMValueRef
1536lp_build_min(struct lp_build_context *bld,
1537             LLVMValueRef a,
1538             LLVMValueRef b)
1539{
1540   assert(lp_check_value(bld->type, a));
1541   assert(lp_check_value(bld->type, b));
1542
1543   if(a == bld->undef || b == bld->undef)
1544      return bld->undef;
1545
1546   if(a == b)
1547      return a;
1548
1549   if (bld->type.norm) {
1550      if (!bld->type.sign) {
1551         if (a == bld->zero || b == bld->zero) {
1552            return bld->zero;
1553         }
1554      }
1555      if(a == bld->one)
1556         return b;
1557      if(b == bld->one)
1558         return a;
1559   }
1560
1561   return lp_build_min_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1562}
1563
1564/**
1565 * Generate min(a, b)
1566 * NaN's are handled according to the behavior specified by the
1567 * nan_behavior argument.
1568 */
1569LLVMValueRef
1570lp_build_min_ext(struct lp_build_context *bld,
1571                 LLVMValueRef a,
1572                 LLVMValueRef b,
1573                 enum gallivm_nan_behavior nan_behavior)
1574{
1575   assert(lp_check_value(bld->type, a));
1576   assert(lp_check_value(bld->type, b));
1577
1578   if(a == bld->undef || b == bld->undef)
1579      return bld->undef;
1580
1581   if(a == b)
1582      return a;
1583
1584   if (bld->type.norm) {
1585      if (!bld->type.sign) {
1586         if (a == bld->zero || b == bld->zero) {
1587            return bld->zero;
1588         }
1589      }
1590      if(a == bld->one)
1591         return b;
1592      if(b == bld->one)
1593         return a;
1594   }
1595
1596   return lp_build_min_simple(bld, a, b, nan_behavior);
1597}
1598
1599/**
1600 * Generate max(a, b)
1601 * Do checks for special cases, but NaN behavior is undefined.
1602 */
1603LLVMValueRef
1604lp_build_max(struct lp_build_context *bld,
1605             LLVMValueRef a,
1606             LLVMValueRef b)
1607{
1608   assert(lp_check_value(bld->type, a));
1609   assert(lp_check_value(bld->type, b));
1610
1611   if(a == bld->undef || b == bld->undef)
1612      return bld->undef;
1613
1614   if(a == b)
1615      return a;
1616
1617   if(bld->type.norm) {
1618      if(a == bld->one || b == bld->one)
1619         return bld->one;
1620      if (!bld->type.sign) {
1621         if (a == bld->zero) {
1622            return b;
1623         }
1624         if (b == bld->zero) {
1625            return a;
1626         }
1627      }
1628   }
1629
1630   return lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED);
1631}
1632
1633
1634/**
1635 * Generate max(a, b)
1636 * Checks for special cases.
1637 * NaN's are handled according to the behavior specified by the
1638 * nan_behavior argument.
1639 */
1640LLVMValueRef
1641lp_build_max_ext(struct lp_build_context *bld,
1642                  LLVMValueRef a,
1643                  LLVMValueRef b,
1644                  enum gallivm_nan_behavior nan_behavior)
1645{
1646   assert(lp_check_value(bld->type, a));
1647   assert(lp_check_value(bld->type, b));
1648
1649   if(a == bld->undef || b == bld->undef)
1650      return bld->undef;
1651
1652   if(a == b)
1653      return a;
1654
1655   if(bld->type.norm) {
1656      if(a == bld->one || b == bld->one)
1657         return bld->one;
1658      if (!bld->type.sign) {
1659         if (a == bld->zero) {
1660            return b;
1661         }
1662         if (b == bld->zero) {
1663            return a;
1664         }
1665      }
1666   }
1667
1668   return lp_build_max_simple(bld, a, b, nan_behavior);
1669}
1670
1671/**
1672 * Generate clamp(a, min, max)
1673 * NaN behavior (for any of a, min, max) is undefined.
1674 * Do checks for special cases.
1675 */
1676LLVMValueRef
1677lp_build_clamp(struct lp_build_context *bld,
1678               LLVMValueRef a,
1679               LLVMValueRef min,
1680               LLVMValueRef max)
1681{
1682   assert(lp_check_value(bld->type, a));
1683   assert(lp_check_value(bld->type, min));
1684   assert(lp_check_value(bld->type, max));
1685
1686   a = lp_build_min(bld, a, max);
1687   a = lp_build_max(bld, a, min);
1688   return a;
1689}
1690
1691
1692/**
1693 * Generate clamp(a, 0, 1)
1694 * A NaN will get converted to zero.
1695 */
1696LLVMValueRef
1697lp_build_clamp_zero_one_nanzero(struct lp_build_context *bld,
1698                                LLVMValueRef a)
1699{
1700   a = lp_build_max_ext(bld, a, bld->zero, GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1701   a = lp_build_min(bld, a, bld->one);
1702   return a;
1703}
1704
1705
1706/**
1707 * Generate abs(a)
1708 */
1709LLVMValueRef
1710lp_build_abs(struct lp_build_context *bld,
1711             LLVMValueRef a)
1712{
1713   LLVMBuilderRef builder = bld->gallivm->builder;
1714   const struct lp_type type = bld->type;
1715   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1716
1717   assert(lp_check_value(type, a));
1718
1719   if(!type.sign)
1720      return a;
1721
1722   if(type.floating) {
1723      char intrinsic[32];
1724      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
1725      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
1726   }
1727
1728   if(type.width*type.length == 128 && util_get_cpu_caps()->has_ssse3 && LLVM_VERSION_MAJOR < 6) {
1729      switch(type.width) {
1730      case 8:
1731         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
1732      case 16:
1733         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
1734      case 32:
1735         return lp_build_intrinsic_unary(builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
1736      }
1737   }
1738   else if (type.width*type.length == 256 && util_get_cpu_caps()->has_avx2 && LLVM_VERSION_MAJOR < 6) {
1739      switch(type.width) {
1740      case 8:
1741         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.b", vec_type, a);
1742      case 16:
1743         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.w", vec_type, a);
1744      case 32:
1745         return lp_build_intrinsic_unary(builder, "llvm.x86.avx2.pabs.d", vec_type, a);
1746      }
1747   }
1748
1749   return lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero),
1750                          a, LLVMBuildNeg(builder, a, ""));
1751}
1752
1753
1754LLVMValueRef
1755lp_build_negate(struct lp_build_context *bld,
1756                LLVMValueRef a)
1757{
1758   LLVMBuilderRef builder = bld->gallivm->builder;
1759
1760   assert(lp_check_value(bld->type, a));
1761
1762   if (bld->type.floating)
1763      a = LLVMBuildFNeg(builder, a, "");
1764   else
1765      a = LLVMBuildNeg(builder, a, "");
1766
1767   return a;
1768}
1769
1770
1771/** Return -1, 0 or +1 depending on the sign of a */
1772LLVMValueRef
1773lp_build_sgn(struct lp_build_context *bld,
1774             LLVMValueRef a)
1775{
1776   LLVMBuilderRef builder = bld->gallivm->builder;
1777   const struct lp_type type = bld->type;
1778   LLVMValueRef cond;
1779   LLVMValueRef res;
1780
1781   assert(lp_check_value(type, a));
1782
1783   /* Handle non-zero case */
1784   if(!type.sign) {
1785      /* if not zero then sign must be positive */
1786      res = bld->one;
1787   }
1788   else if(type.floating) {
1789      LLVMTypeRef vec_type;
1790      LLVMTypeRef int_type;
1791      LLVMValueRef mask;
1792      LLVMValueRef sign;
1793      LLVMValueRef one;
1794      unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
1795
1796      int_type = lp_build_int_vec_type(bld->gallivm, type);
1797      vec_type = lp_build_vec_type(bld->gallivm, type);
1798      mask = lp_build_const_int_vec(bld->gallivm, type, maskBit);
1799
      /* Take the sign bit of 'a' and OR it into the constant 1.0 */
1801      sign = LLVMBuildBitCast(builder, a, int_type, "");
1802      sign = LLVMBuildAnd(builder, sign, mask, "");
1803      one = LLVMConstBitCast(bld->one, int_type);
1804      res = LLVMBuildOr(builder, sign, one, "");
1805      res = LLVMBuildBitCast(builder, res, vec_type, "");
1806   }
1807   else
1808   {
1809      /* signed int/norm/fixed point */
1810      /* could use psign with sse3 and appropriate vectors here */
1811      LLVMValueRef minus_one = lp_build_const_vec(bld->gallivm, type, -1.0);
1812      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
1813      res = lp_build_select(bld, cond, bld->one, minus_one);
1814   }
1815
1816   /* Handle zero */
1817   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
1818   res = lp_build_select(bld, cond, bld->zero, res);
1819
1820   return res;
1821}
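
/*
 * Worked example for the floating-point branch above: for a 32-bit float
 * a = -3.5 (bits 0xC0600000) the sign mask is 0x80000000, so
 *
 *    sign = 0xC0600000 & 0x80000000 = 0x80000000
 *    res  = 0x80000000 | bits(1.0f)             (bits(1.0f) == 0x3F800000)
 *         = 0xBF800000                          (== bits(-1.0f))
 *
 * i.e. the sign bit of 'a' is simply transplanted onto the constant 1.0,
 * and the zero case is patched up by the final select.
 */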
1822
1823
1824/**
1825 * Set the sign of float vector 'a' according to 'sign'.
1826 * If sign==0, return abs(a).
 * If sign==1, return -abs(a).
1828 * Other values for sign produce undefined results.
1829 */
1830LLVMValueRef
1831lp_build_set_sign(struct lp_build_context *bld,
1832                  LLVMValueRef a, LLVMValueRef sign)
1833{
1834   LLVMBuilderRef builder = bld->gallivm->builder;
1835   const struct lp_type type = bld->type;
1836   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
1837   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1838   LLVMValueRef shift = lp_build_const_int_vec(bld->gallivm, type, type.width - 1);
1839   LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
1840                             ~((unsigned long long) 1 << (type.width - 1)));
1841   LLVMValueRef val, res;
1842
1843   assert(type.floating);
1844   assert(lp_check_value(type, a));
1845
1846   /* val = reinterpret_cast<int>(a) */
1847   val = LLVMBuildBitCast(builder, a, int_vec_type, "");
1848   /* val = val & mask */
1849   val = LLVMBuildAnd(builder, val, mask, "");
1850   /* sign = sign << shift */
1851   sign = LLVMBuildShl(builder, sign, shift, "");
1852   /* res = val | sign */
1853   res = LLVMBuildOr(builder, val, sign, "");
1854   /* res = reinterpret_cast<float>(res) */
1855   res = LLVMBuildBitCast(builder, res, vec_type, "");
1856
1857   return res;
1858}
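
/*
 * Worked example for lp_build_set_sign() with 32-bit floats: for
 * a = -2.0 (bits 0xC0000000) and sign = 1,
 *
 *    val  = 0xC0000000 & 0x7FFFFFFF = 0x40000000   (== bits(2.0), i.e. abs)
 *    sign = 1 << 31                 = 0x80000000
 *    res  = 0x40000000 | 0x80000000 = 0xC0000000   (== bits(-2.0))
 *
 * so sign==0 yields abs(a) and sign==1 yields -abs(a).
 */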
1859
1860
1861/**
1862 * Convert vector of (or scalar) int to vector of (or scalar) float.
1863 */
1864LLVMValueRef
1865lp_build_int_to_float(struct lp_build_context *bld,
1866                      LLVMValueRef a)
1867{
1868   LLVMBuilderRef builder = bld->gallivm->builder;
1869   const struct lp_type type = bld->type;
1870   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
1871
1872   assert(type.floating);
1873
1874   return LLVMBuildSIToFP(builder, a, vec_type, "");
1875}
1876
1877static boolean
1878arch_rounding_available(const struct lp_type type)
1879{
1880   if ((util_get_cpu_caps()->has_sse4_1 &&
1881       (type.length == 1 || type.width*type.length == 128)) ||
1882       (util_get_cpu_caps()->has_avx && type.width*type.length == 256) ||
1883       (util_get_cpu_caps()->has_avx512f && type.width*type.length == 512))
1884      return TRUE;
1885   else if ((util_get_cpu_caps()->has_altivec &&
1886            (type.width == 32 && type.length == 4)))
1887      return TRUE;
1888   else if (util_get_cpu_caps()->has_neon)
1889      return TRUE;
1890
1891   return FALSE;
1892}
1893
1894enum lp_build_round_mode
1895{
1896   LP_BUILD_ROUND_NEAREST = 0,
1897   LP_BUILD_ROUND_FLOOR = 1,
1898   LP_BUILD_ROUND_CEIL = 2,
1899   LP_BUILD_ROUND_TRUNCATE = 3
1900};
1901
1902static inline LLVMValueRef
1903lp_build_iround_nearest_sse2(struct lp_build_context *bld,
1904                             LLVMValueRef a)
1905{
1906   LLVMBuilderRef builder = bld->gallivm->builder;
1907   const struct lp_type type = bld->type;
1908   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
1909   LLVMTypeRef ret_type = lp_build_int_vec_type(bld->gallivm, type);
1910   const char *intrinsic;
1911   LLVMValueRef res;
1912
1913   assert(type.floating);
1914   /* using the double precision conversions is a bit more complicated */
1915   assert(type.width == 32);
1916
1917   assert(lp_check_value(type, a));
1918   assert(util_get_cpu_caps()->has_sse2);
1919
1920   /* This is relying on MXCSR rounding mode, which should always be nearest. */
1921   if (type.length == 1) {
1922      LLVMTypeRef vec_type;
1923      LLVMValueRef undef;
1924      LLVMValueRef arg;
1925      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
1926
1927      vec_type = LLVMVectorType(bld->elem_type, 4);
1928
1929      intrinsic = "llvm.x86.sse.cvtss2si";
1930
1931      undef = LLVMGetUndef(vec_type);
1932
1933      arg = LLVMBuildInsertElement(builder, undef, a, index0, "");
1934
1935      res = lp_build_intrinsic_unary(builder, intrinsic,
1936                                     ret_type, arg);
1937   }
1938   else {
      if (type.width * type.length == 128) {
1940         intrinsic = "llvm.x86.sse2.cvtps2dq";
1941      }
1942      else {
1943         assert(type.width*type.length == 256);
1944         assert(util_get_cpu_caps()->has_avx);
1945
1946         intrinsic = "llvm.x86.avx.cvt.ps2dq.256";
1947      }
1948      res = lp_build_intrinsic_unary(builder, intrinsic,
1949                                     ret_type, a);
1950   }
1951
1952   return res;
1953}
1954
1955
/*
 * Round to integral values with the AltiVec vrfi* instructions, according to
 * the requested rounding mode.
 */
1958static inline LLVMValueRef
1959lp_build_round_altivec(struct lp_build_context *bld,
1960                       LLVMValueRef a,
1961                       enum lp_build_round_mode mode)
1962{
1963   LLVMBuilderRef builder = bld->gallivm->builder;
1964   const struct lp_type type = bld->type;
1965   const char *intrinsic = NULL;
1966
1967   assert(type.floating);
1968
1969   assert(lp_check_value(type, a));
1970   assert(util_get_cpu_caps()->has_altivec);
1971
1972   (void)type;
1973
1974   switch (mode) {
1975   case LP_BUILD_ROUND_NEAREST:
1976      intrinsic = "llvm.ppc.altivec.vrfin";
1977      break;
1978   case LP_BUILD_ROUND_FLOOR:
1979      intrinsic = "llvm.ppc.altivec.vrfim";
1980      break;
1981   case LP_BUILD_ROUND_CEIL:
1982      intrinsic = "llvm.ppc.altivec.vrfip";
1983      break;
1984   case LP_BUILD_ROUND_TRUNCATE:
1985      intrinsic = "llvm.ppc.altivec.vrfiz";
1986      break;
1987   }
1988
1989   return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
1990}
1991
1992static inline LLVMValueRef
1993lp_build_round_arch(struct lp_build_context *bld,
1994                    LLVMValueRef a,
1995                    enum lp_build_round_mode mode)
1996{
1997   if (util_get_cpu_caps()->has_sse4_1 || util_get_cpu_caps()->has_neon) {
1998      LLVMBuilderRef builder = bld->gallivm->builder;
1999      const struct lp_type type = bld->type;
2000      const char *intrinsic_root;
2001      char intrinsic[32];
2002
2003      assert(type.floating);
2004      assert(lp_check_value(type, a));
2005      (void)type;
2006
2007      switch (mode) {
2008      case LP_BUILD_ROUND_NEAREST:
2009         intrinsic_root = "llvm.nearbyint";
2010         break;
2011      case LP_BUILD_ROUND_FLOOR:
2012         intrinsic_root = "llvm.floor";
2013         break;
2014      case LP_BUILD_ROUND_CEIL:
2015         intrinsic_root = "llvm.ceil";
2016         break;
2017      case LP_BUILD_ROUND_TRUNCATE:
2018         intrinsic_root = "llvm.trunc";
2019         break;
2020      default:
2021         unreachable("unhandled lp_build_round_mode");
2022      }
2023
2024      lp_format_intrinsic(intrinsic, sizeof intrinsic, intrinsic_root, bld->vec_type);
2025      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2026   }
2027   else /* (util_get_cpu_caps()->has_altivec) */
2028     return lp_build_round_altivec(bld, a, mode);
2029}
2030
2031/**
2032 * Return the integer part of a float (vector) value (== round toward zero).
2033 * The returned value is a float (vector).
2034 * Ex: trunc(-1.5) = -1.0
2035 */
2036LLVMValueRef
2037lp_build_trunc(struct lp_build_context *bld,
2038               LLVMValueRef a)
2039{
2040   LLVMBuilderRef builder = bld->gallivm->builder;
2041   const struct lp_type type = bld->type;
2042
2043   assert(type.floating);
2044   assert(lp_check_value(type, a));
2045
2046   if (type.width == 16) {
2047      char intrinsic[64];
2048      lp_format_intrinsic(intrinsic, 64, "llvm.trunc", bld->vec_type);
2049      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2050   }
2051
2052   if (arch_rounding_available(type)) {
2053      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_TRUNCATE);
2054   }
2055   else {
2056      const struct lp_type type = bld->type;
2057      struct lp_type inttype;
2058      struct lp_build_context intbld;
2059      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2060      LLVMValueRef trunc, res, anosign, mask;
2061      LLVMTypeRef int_vec_type = bld->int_vec_type;
2062      LLVMTypeRef vec_type = bld->vec_type;
2063
2064      inttype = type;
2065      inttype.floating = 0;
2066      lp_build_context_init(&intbld, bld->gallivm, inttype);
2067
2068      /* round by truncation */
2069      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2070      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2071
2072      /* mask out sign bit */
2073      anosign = lp_build_abs(bld, a);
2074      /*
       * Keep the original value where anosign > 2^24 (i.e. the select below
       * returns 'a' there instead of the rounded result).
       * This works both for large ints (rounding is a no-op for them
       * because such floats are always exact) and for special cases like
       * NaNs and Infs (taking advantage of the fact they use the maximum
       * exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2080       */
2081      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2082      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2083      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2084      return lp_build_select(bld, mask, a, res);
2085   }
2086}
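
/*
 * Note on the 2^24 threshold used by the emulated rounding paths above and
 * below: a 32-bit float has a 24-bit significand, so every |value| >= 2^24
 * is already an integer and rounding must leave it unchanged, e.g.
 *
 *    16777216.0 (2^24)  ->  the next representable float is 16777218.0
 *
 * hence such values (and NaNs/Infs, which compare greater here because they
 * use the maximum exponent) are passed through unchanged via the final
 * select.
 */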
2087
2088
2089/**
2090 * Return float (vector) rounded to nearest integer (vector).  The returned
2091 * value is a float (vector).
2092 * Ex: round(0.9) = 1.0
2093 * Ex: round(-1.5) = -2.0
2094 */
2095LLVMValueRef
2096lp_build_round(struct lp_build_context *bld,
2097               LLVMValueRef a)
2098{
2099   LLVMBuilderRef builder = bld->gallivm->builder;
2100   const struct lp_type type = bld->type;
2101
2102   assert(type.floating);
2103   assert(lp_check_value(type, a));
2104
2105   if (type.width == 16) {
2106      char intrinsic[64];
2107      lp_format_intrinsic(intrinsic, 64, "llvm.round", bld->vec_type);
2108      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2109   }
2110
2111   if (arch_rounding_available(type)) {
2112      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2113   }
2114   else {
2115      const struct lp_type type = bld->type;
2116      struct lp_type inttype;
2117      struct lp_build_context intbld;
2118      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2119      LLVMValueRef res, anosign, mask;
2120      LLVMTypeRef int_vec_type = bld->int_vec_type;
2121      LLVMTypeRef vec_type = bld->vec_type;
2122
2123      inttype = type;
2124      inttype.floating = 0;
2125      lp_build_context_init(&intbld, bld->gallivm, inttype);
2126
2127      res = lp_build_iround(bld, a);
2128      res = LLVMBuildSIToFP(builder, res, vec_type, "");
2129
2130      /* mask out sign bit */
2131      anosign = lp_build_abs(bld, a);
2132      /*
       * Keep the original value where anosign > 2^24 (i.e. the select below
       * returns 'a' there instead of the rounded result).
       * This works both for large ints (rounding is a no-op for them
       * because such floats are always exact) and for special cases like
       * NaNs and Infs (taking advantage of the fact they use the maximum
       * exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2138       */
2139      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2140      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2141      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2142      return lp_build_select(bld, mask, a, res);
2143   }
2144}
2145
2146
2147/**
2148 * Return floor of float (vector), result is a float (vector)
2149 * Ex: floor(1.1) = 1.0
2150 * Ex: floor(-1.1) = -2.0
2151 */
2152LLVMValueRef
2153lp_build_floor(struct lp_build_context *bld,
2154               LLVMValueRef a)
2155{
2156   LLVMBuilderRef builder = bld->gallivm->builder;
2157   const struct lp_type type = bld->type;
2158
2159   assert(type.floating);
2160   assert(lp_check_value(type, a));
2161
2162   if (arch_rounding_available(type)) {
2163      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2164   }
2165   else {
2166      const struct lp_type type = bld->type;
2167      struct lp_type inttype;
2168      struct lp_build_context intbld;
2169      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2170      LLVMValueRef trunc, res, anosign, mask;
2171      LLVMTypeRef int_vec_type = bld->int_vec_type;
2172      LLVMTypeRef vec_type = bld->vec_type;
2173
2174      if (type.width != 32) {
2175         char intrinsic[32];
2176         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.floor", vec_type);
2177         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2178      }
2179
2180      assert(type.width == 32); /* might want to handle doubles at some point */
2181
2182      inttype = type;
2183      inttype.floating = 0;
2184      lp_build_context_init(&intbld, bld->gallivm, inttype);
2185
2186      /* round by truncation */
2187      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2188      res = LLVMBuildSIToFP(builder, trunc, vec_type, "floor.trunc");
2189
2190      if (type.sign) {
2191         LLVMValueRef tmp;
2192
2193         /*
2194          * fix values if rounding is wrong (for non-special cases)
2195          * - this is the case if trunc > a
2196          */
2197         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, res, a);
2198         /* tmp = trunc > a ? 1.0 : 0.0 */
2199         tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2200         tmp = lp_build_and(&intbld, mask, tmp);
2201         tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2202         res = lp_build_sub(bld, res, tmp);
2203      }
2204
2205      /* mask out sign bit */
2206      anosign = lp_build_abs(bld, a);
2207      /*
       * Keep the original value where anosign > 2^24 (i.e. the select below
       * returns 'a' there instead of the rounded result).
       * This works both for large ints (rounding is a no-op for them
       * because such floats are always exact) and for special cases like
       * NaNs and Infs (taking advantage of the fact they use the maximum
       * exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2213       */
2214      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2215      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2216      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2217      return lp_build_select(bld, mask, a, res);
2218   }
2219}
2220
2221
2222/**
2223 * Return ceiling of float (vector), returning float (vector).
2224 * Ex: ceil( 1.1) = 2.0
2225 * Ex: ceil(-1.1) = -1.0
2226 */
2227LLVMValueRef
2228lp_build_ceil(struct lp_build_context *bld,
2229              LLVMValueRef a)
2230{
2231   LLVMBuilderRef builder = bld->gallivm->builder;
2232   const struct lp_type type = bld->type;
2233
2234   assert(type.floating);
2235   assert(lp_check_value(type, a));
2236
2237   if (arch_rounding_available(type)) {
2238      return lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2239   }
2240   else {
2241      const struct lp_type type = bld->type;
2242      struct lp_type inttype;
2243      struct lp_build_context intbld;
2244      LLVMValueRef cmpval = lp_build_const_vec(bld->gallivm, type, 1<<24);
2245      LLVMValueRef trunc, res, anosign, mask, tmp;
2246      LLVMTypeRef int_vec_type = bld->int_vec_type;
2247      LLVMTypeRef vec_type = bld->vec_type;
2248
2249      if (type.width != 32) {
2250         char intrinsic[32];
2251         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.ceil", vec_type);
2252         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2253      }
2254
2255      assert(type.width == 32); /* might want to handle doubles at some point */
2256
2257      inttype = type;
2258      inttype.floating = 0;
2259      lp_build_context_init(&intbld, bld->gallivm, inttype);
2260
2261      /* round by truncation */
2262      trunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2263      trunc = LLVMBuildSIToFP(builder, trunc, vec_type, "ceil.trunc");
2264
2265      /*
2266       * fix values if rounding is wrong (for non-special cases)
2267       * - this is the case if trunc < a
2268       */
2269      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2270      /* tmp = trunc < a ? 1.0 : 0.0 */
2271      tmp = LLVMBuildBitCast(builder, bld->one, int_vec_type, "");
2272      tmp = lp_build_and(&intbld, mask, tmp);
2273      tmp = LLVMBuildBitCast(builder, tmp, vec_type, "");
2274      res = lp_build_add(bld, trunc, tmp);
2275
2276      /* mask out sign bit */
2277      anosign = lp_build_abs(bld, a);
2278      /*
       * Keep the original value where anosign > 2^24 (i.e. the select below
       * returns 'a' there instead of the rounded result).
       * This works both for large ints (rounding is a no-op for them
       * because such floats are always exact) and for special cases like
       * NaNs and Infs (taking advantage of the fact they use the maximum
       * exponent).
       * (2^24 is arbitrary; anything between 2^24 and 2^31 should work.)
2284       */
2285      anosign = LLVMBuildBitCast(builder, anosign, int_vec_type, "");
2286      cmpval = LLVMBuildBitCast(builder, cmpval, int_vec_type, "");
2287      mask = lp_build_cmp(&intbld, PIPE_FUNC_GREATER, anosign, cmpval);
2288      return lp_build_select(bld, mask, a, res);
2289   }
2290}
2291
2292
2293/**
2294 * Return fractional part of 'a' computed as a - floor(a)
2295 * Typically used in texture coord arithmetic.
2296 */
2297LLVMValueRef
2298lp_build_fract(struct lp_build_context *bld,
2299               LLVMValueRef a)
2300{
2301   assert(bld->type.floating);
2302   return lp_build_sub(bld, a, lp_build_floor(bld, a));
2303}
2304
2305
2306/**
2307 * Prevent returning 1.0 for very small negative values of 'a' by clamping
2308 * against 0.99999(9). (Will also return that value for NaNs.)
2309 */
2310static inline LLVMValueRef
2311clamp_fract(struct lp_build_context *bld, LLVMValueRef fract)
2312{
2313   LLVMValueRef max;
2314
2315   /* this is the largest number smaller than 1.0 representable as float */
2316   max = lp_build_const_vec(bld->gallivm, bld->type,
2317                            1.0 - 1.0/(1LL << (lp_mantissa(bld->type) + 1)));
2318   return lp_build_min_ext(bld, fract, max,
2319                           GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
2320}
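
/*
 * For 32-bit floats the clamp value above works out to
 *
 *    1.0 - 1.0/2^24 = 0.99999994...   (bits 0x3F7FFFFF)
 *
 * which is the largest float strictly less than 1.0.
 */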
2321
2322
2323/**
2324 * Same as lp_build_fract, but guarantees that the result is always smaller
2325 * than one. Will also return the smaller-than-one value for infs, NaNs.
2326 */
2327LLVMValueRef
2328lp_build_fract_safe(struct lp_build_context *bld,
2329                    LLVMValueRef a)
2330{
2331   return clamp_fract(bld, lp_build_fract(bld, a));
2332}
2333
2334
2335/**
2336 * Return the integer part of a float (vector) value (== round toward zero).
2337 * The returned value is an integer (vector).
2338 * Ex: itrunc(-1.5) = -1
2339 */
2340LLVMValueRef
2341lp_build_itrunc(struct lp_build_context *bld,
2342                LLVMValueRef a)
2343{
2344   LLVMBuilderRef builder = bld->gallivm->builder;
2345   const struct lp_type type = bld->type;
2346   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
2347
2348   assert(type.floating);
2349   assert(lp_check_value(type, a));
2350
2351   return LLVMBuildFPToSI(builder, a, int_vec_type, "");
2352}
2353
2354
2355/**
2356 * Return float (vector) rounded to nearest integer (vector).  The returned
2357 * value is an integer (vector).
2358 * Ex: iround(0.9) = 1
2359 * Ex: iround(-1.5) = -2
2360 */
2361LLVMValueRef
2362lp_build_iround(struct lp_build_context *bld,
2363                LLVMValueRef a)
2364{
2365   LLVMBuilderRef builder = bld->gallivm->builder;
2366   const struct lp_type type = bld->type;
2367   LLVMTypeRef int_vec_type = bld->int_vec_type;
2368   LLVMValueRef res;
2369
2370   assert(type.floating);
2371
2372   assert(lp_check_value(type, a));
2373
2374   if ((util_get_cpu_caps()->has_sse2 &&
2375       ((type.width == 32) && (type.length == 1 || type.length == 4))) ||
2376       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2377      return lp_build_iround_nearest_sse2(bld, a);
2378   }
2379   if (arch_rounding_available(type)) {
2380      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_NEAREST);
2381   }
2382   else {
2383      LLVMValueRef half;
2384
2385      half = lp_build_const_vec(bld->gallivm, type, nextafterf(0.5, 0.0));
2386
2387      if (type.sign) {
2388         LLVMTypeRef vec_type = bld->vec_type;
2389         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type,
2390                                    (unsigned long long)1 << (type.width - 1));
2391         LLVMValueRef sign;
2392
2393         /* get sign bit */
2394         sign = LLVMBuildBitCast(builder, a, int_vec_type, "");
2395         sign = LLVMBuildAnd(builder, sign, mask, "");
2396
2397         /* sign * 0.5 */
2398         half = LLVMBuildBitCast(builder, half, int_vec_type, "");
2399         half = LLVMBuildOr(builder, sign, half, "");
2400         half = LLVMBuildBitCast(builder, half, vec_type, "");
2401      }
2402
2403      res = LLVMBuildFAdd(builder, a, half, "");
2404   }
2405
2406   res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
2407
2408   return res;
2409}
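
/*
 * A plausible reason the emulated path above biases by
 * nextafterf(0.5, 0.0) = 0.5 - 2^-25 rather than exactly 0.5 (a worked
 * example, assuming round-to-nearest-even float addition): take the largest
 * float below one half,
 *
 *    a = 0.5 - 2^-25 = 0.49999997
 *    a + 0.5             rounds up to exactly 1.0  ->  truncates to 1 (wrong)
 *    a + (0.5 - 2^-25) == 0.99999994 exactly       ->  truncates to 0 (right)
 *
 * so the just-under-half bias keeps such values from being pushed across the
 * integer boundary.
 */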
2410
2411
2412/**
 * Return floor of float (vector), result is an int (vector).
 * Ex: ifloor(1.1) = 1
 * Ex: ifloor(-1.1) = -2
2416 */
2417LLVMValueRef
2418lp_build_ifloor(struct lp_build_context *bld,
2419                LLVMValueRef a)
2420{
2421   LLVMBuilderRef builder = bld->gallivm->builder;
2422   const struct lp_type type = bld->type;
2423   LLVMTypeRef int_vec_type = bld->int_vec_type;
2424   LLVMValueRef res;
2425
2426   assert(type.floating);
2427   assert(lp_check_value(type, a));
2428
2429   res = a;
2430   if (type.sign) {
2431      if (arch_rounding_available(type)) {
2432         res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_FLOOR);
2433      }
2434      else {
2435         struct lp_type inttype;
2436         struct lp_build_context intbld;
2437         LLVMValueRef trunc, itrunc, mask;
2438
2439         assert(type.floating);
2440         assert(lp_check_value(type, a));
2441
2442         inttype = type;
2443         inttype.floating = 0;
2444         lp_build_context_init(&intbld, bld->gallivm, inttype);
2445
2446         /* round by truncation */
2447         itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2448         trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "ifloor.trunc");
2449
2450         /*
2451          * fix values if rounding is wrong (for non-special cases)
2452          * - this is the case if trunc > a
2453          * The results of doing this with NaNs, very large values etc.
2454          * are undefined but this seems to be the case anyway.
2455          */
2456         mask = lp_build_cmp(bld, PIPE_FUNC_GREATER, trunc, a);
2457         /* cheapie minus one with mask since the mask is minus one / zero */
2458         return lp_build_add(&intbld, itrunc, mask);
2459      }
2460   }
2461
   /* convert to int by truncation (value is already floored, or non-negative) */
2463   res = LLVMBuildFPToSI(builder, res, int_vec_type, "ifloor.res");
2464
2465   return res;
2466}
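
/*
 * Worked example for the mask trick above: masks from lp_build_cmp() are
 * all-ones (i.e. -1 as a signed integer) where true and 0 where false, so
 * for a = -1.25:
 *
 *    itrunc = -1, trunc = -1.0, trunc > a  ->  mask = -1
 *    itrunc + mask = -1 + (-1) = -2 = floor(-1.25)
 *
 * while for a = 1.25 the mask is 0 and itrunc is returned unchanged.
 */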
2467
2468
2469/**
2470 * Return ceiling of float (vector), returning int (vector).
2471 * Ex: iceil( 1.1) = 2
2472 * Ex: iceil(-1.1) = -1
2473 */
2474LLVMValueRef
2475lp_build_iceil(struct lp_build_context *bld,
2476               LLVMValueRef a)
2477{
2478   LLVMBuilderRef builder = bld->gallivm->builder;
2479   const struct lp_type type = bld->type;
2480   LLVMTypeRef int_vec_type = bld->int_vec_type;
2481   LLVMValueRef res;
2482
2483   assert(type.floating);
2484   assert(lp_check_value(type, a));
2485
2486   if (arch_rounding_available(type)) {
2487      res = lp_build_round_arch(bld, a, LP_BUILD_ROUND_CEIL);
2488   }
2489   else {
2490      struct lp_type inttype;
2491      struct lp_build_context intbld;
2492      LLVMValueRef trunc, itrunc, mask;
2493
2494      assert(type.floating);
2495      assert(lp_check_value(type, a));
2496
2497      inttype = type;
2498      inttype.floating = 0;
2499      lp_build_context_init(&intbld, bld->gallivm, inttype);
2500
2501      /* round by truncation */
2502      itrunc = LLVMBuildFPToSI(builder, a, int_vec_type, "");
2503      trunc = LLVMBuildSIToFP(builder, itrunc, bld->vec_type, "iceil.trunc");
2504
2505      /*
2506       * fix values if rounding is wrong (for non-special cases)
2507       * - this is the case if trunc < a
2508       * The results of doing this with NaNs, very large values etc.
2509       * are undefined but this seems to be the case anyway.
2510       */
2511      mask = lp_build_cmp(bld, PIPE_FUNC_LESS, trunc, a);
2512      /* cheapie plus one with mask since the mask is minus one / zero */
2513      return lp_build_sub(&intbld, itrunc, mask);
2514   }
2515
   /* convert to int by truncation (value is already integral after the ceil) */
2517   res = LLVMBuildFPToSI(builder, res, int_vec_type, "iceil.res");
2518
2519   return res;
2520}
2521
2522
2523/**
2524 * Combined ifloor() & fract().
2525 *
 * Preferable to calling the functions separately, as it ensures that the
 * strategy (floor() vs. ifloor()) resulting in the least redundant work is used.
2528 */
2529void
2530lp_build_ifloor_fract(struct lp_build_context *bld,
2531                      LLVMValueRef a,
2532                      LLVMValueRef *out_ipart,
2533                      LLVMValueRef *out_fpart)
2534{
2535   LLVMBuilderRef builder = bld->gallivm->builder;
2536   const struct lp_type type = bld->type;
2537   LLVMValueRef ipart;
2538
2539   assert(type.floating);
2540   assert(lp_check_value(type, a));
2541
2542   if (arch_rounding_available(type)) {
2543      /*
2544       * floor() is easier.
2545       */
2546
2547      ipart = lp_build_floor(bld, a);
2548      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2549      *out_ipart = LLVMBuildFPToSI(builder, ipart, bld->int_vec_type, "ipart");
2550   }
2551   else {
2552      /*
2553       * ifloor() is easier.
2554       */
2555
2556      *out_ipart = lp_build_ifloor(bld, a);
2557      ipart = LLVMBuildSIToFP(builder, *out_ipart, bld->vec_type, "ipart");
2558      *out_fpart = LLVMBuildFSub(builder, a, ipart, "fpart");
2559   }
2560}
2561
2562
2563/**
2564 * Same as lp_build_ifloor_fract, but guarantees that the fractional part is
2565 * always smaller than one.
2566 */
2567void
2568lp_build_ifloor_fract_safe(struct lp_build_context *bld,
2569                           LLVMValueRef a,
2570                           LLVMValueRef *out_ipart,
2571                           LLVMValueRef *out_fpart)
2572{
2573   lp_build_ifloor_fract(bld, a, out_ipart, out_fpart);
2574   *out_fpart = clamp_fract(bld, *out_fpart);
2575}
2576
2577
2578LLVMValueRef
2579lp_build_sqrt(struct lp_build_context *bld,
2580              LLVMValueRef a)
2581{
2582   LLVMBuilderRef builder = bld->gallivm->builder;
2583   const struct lp_type type = bld->type;
2584   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
2585   char intrinsic[32];
2586
2587   assert(lp_check_value(type, a));
2588
2589   assert(type.floating);
2590   lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sqrt", vec_type);
2591
2592   return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
2593}
2594
2595
2596/**
 * Do one Newton-Raphson step to improve reciprocal precision:
2598 *
2599 *   x_{i+1} = x_i + x_i * (1 - a * x_i)
2600 *
2601 * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
2602 * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
2603 * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
2604 * halo. It would be necessary to clamp the argument to prevent this.
2605 *
2606 * See also:
2607 * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
2608 * - http://softwarecommunity.intel.com/articles/eng/1818.htm
2609 */
2610static inline LLVMValueRef
2611lp_build_rcp_refine(struct lp_build_context *bld,
2612                    LLVMValueRef a,
2613                    LLVMValueRef rcp_a)
2614{
2615   LLVMBuilderRef builder = bld->gallivm->builder;
2616   LLVMValueRef neg_a;
2617   LLVMValueRef res;
2618
2619   neg_a = LLVMBuildFNeg(builder, a, "");
2620   res = lp_build_fmuladd(builder, neg_a, rcp_a, bld->one);
2621   res = lp_build_fmuladd(builder, res, rcp_a, rcp_a);
2622
2623   return res;
2624}
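
/*
 * Worked iteration of the refinement step above, for a = 3 and an initial
 * estimate rcp_a = 0.3 (true value 1/3 = 0.3333...):
 *
 *    x1 = 0.3 + 0.3 * (1 - 3 * 0.3) = 0.3 + 0.3 * 0.1 = 0.33
 *    x2 = 0.33 + 0.33 * (1 - 0.99)  = 0.33 + 0.0033   = 0.3333
 *
 * i.e. the number of correct digits roughly doubles per step, which is why
 * only one or two steps after RCPPS would be needed.
 */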
2625
2626
2627LLVMValueRef
2628lp_build_rcp(struct lp_build_context *bld,
2629             LLVMValueRef a)
2630{
2631   LLVMBuilderRef builder = bld->gallivm->builder;
2632   const struct lp_type type = bld->type;
2633
2634   assert(lp_check_value(type, a));
2635
2636   if(a == bld->zero)
2637      return bld->undef;
2638   if(a == bld->one)
2639      return bld->one;
2640   if(a == bld->undef)
2641      return bld->undef;
2642
2643   assert(type.floating);
2644
2645   if(LLVMIsConstant(a))
2646      return LLVMConstFDiv(bld->one, a);
2647
2648   /*
2649    * We don't use RCPPS because:
    * - it only has 10 bits of precision
    * - it doesn't even get the reciprocal of 1.0 exactly
    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
    * - for recent processors the benefit over DIVPS is marginal and case
    *   dependent
    *
    * We could still use it on certain processors if benchmarks show that
    * RCPPS plus the necessary workarounds is still preferable to DIVPS; or for
    * particular uses that require fewer workarounds.
2659    */
2660
2661   if (FALSE && ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2662         (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8))){
2663      const unsigned num_iterations = 0;
2664      LLVMValueRef res;
2665      unsigned i;
2666      const char *intrinsic = NULL;
2667
2668      if (type.length == 4) {
2669         intrinsic = "llvm.x86.sse.rcp.ps";
2670      }
2671      else {
2672         intrinsic = "llvm.x86.avx.rcp.ps.256";
2673      }
2674
2675      res = lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2676
2677      for (i = 0; i < num_iterations; ++i) {
2678         res = lp_build_rcp_refine(bld, a, res);
2679      }
2680
2681      return res;
2682   }
2683
2684   return LLVMBuildFDiv(builder, bld->one, a, "");
2685}
2686
2687
2688/**
2689 * Do one Newton-Raphson step to improve rsqrt precision:
2690 *
2691 *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
2692 *
2693 * See also Intel 64 and IA-32 Architectures Optimization Manual.
2694 */
2695static inline LLVMValueRef
2696lp_build_rsqrt_refine(struct lp_build_context *bld,
2697                      LLVMValueRef a,
2698                      LLVMValueRef rsqrt_a)
2699{
2700   LLVMBuilderRef builder = bld->gallivm->builder;
2701   LLVMValueRef half = lp_build_const_vec(bld->gallivm, bld->type, 0.5);
2702   LLVMValueRef three = lp_build_const_vec(bld->gallivm, bld->type, 3.0);
2703   LLVMValueRef res;
2704
2705   res = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");
2706   res = LLVMBuildFMul(builder, a, res, "");
2707   res = LLVMBuildFSub(builder, three, res, "");
2708   res = LLVMBuildFMul(builder, rsqrt_a, res, "");
2709   res = LLVMBuildFMul(builder, half, res, "");
2710
2711   return res;
2712}
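
/*
 * Worked iteration of the rsqrt refinement above, for a = 4 and an initial
 * estimate rsqrt_a = 0.6 (true value 1/sqrt(4) = 0.5):
 *
 *    x1 = 0.5 * 0.6   * (3 - 4 * 0.6^2)   = 0.5 * 0.6   * 1.56   = 0.468
 *    x2 = 0.5 * 0.468 * (3 - 4 * 0.468^2) = 0.5 * 0.468 * 2.1239 ~= 0.497
 *
 * converging quadratically towards 0.5.
 */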
2713
2714
2715/**
2716 * Generate 1/sqrt(a).
2717 * Result is undefined for values < 0, infinity for +0.
2718 */
2719LLVMValueRef
2720lp_build_rsqrt(struct lp_build_context *bld,
2721               LLVMValueRef a)
2722{
2723   const struct lp_type type = bld->type;
2724
2725   assert(lp_check_value(type, a));
2726
2727   assert(type.floating);
2728
2729   /*
2730    * This should be faster but all denormals will end up as infinity.
2731    */
2732   if (0 && lp_build_fast_rsqrt_available(type)) {
2733      const unsigned num_iterations = 1;
2734      LLVMValueRef res;
2735      unsigned i;
2736
2737      /* rsqrt(1.0) != 1.0 here */
2738      res = lp_build_fast_rsqrt(bld, a);
2739
2740      if (num_iterations) {
2741         /*
2742          * Newton-Raphson will result in NaN instead of infinity for zero,
2743          * and NaN instead of zero for infinity.
2744          * Also, need to ensure rsqrt(1.0) == 1.0.
2745          * All numbers smaller than FLT_MIN will result in +infinity
2746          * (rsqrtps treats all denormals as zero).
2747          */
2748         LLVMValueRef cmp;
2749         LLVMValueRef flt_min = lp_build_const_vec(bld->gallivm, type, FLT_MIN);
2750         LLVMValueRef inf = lp_build_const_vec(bld->gallivm, type, INFINITY);
2751
2752         for (i = 0; i < num_iterations; ++i) {
2753            res = lp_build_rsqrt_refine(bld, a, res);
2754         }
2755         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_LESS, a, flt_min);
2756         res = lp_build_select(bld, cmp, inf, res);
2757         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, inf);
2758         res = lp_build_select(bld, cmp, bld->zero, res);
2759         cmp = lp_build_compare(bld->gallivm, type, PIPE_FUNC_EQUAL, a, bld->one);
2760         res = lp_build_select(bld, cmp, bld->one, res);
2761      }
2762
2763      return res;
2764   }
2765
2766   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2767}
2768
/**
 * Report whether a fast (inaccurate) rsqrt instruction is available.
 * The caller may want to avoid calling rsqrt_fast if it's not available:
 * e.g. for calculating x^0.5 it may do rsqrt_fast(x) * x, but if the fast
 * path is unavailable that would expand to sqrt/div/mul, in which case it is
 * obviously better to just call sqrt directly, skipping both the div and mul.
 */
2776boolean
2777lp_build_fast_rsqrt_available(struct lp_type type)
2778{
2779   assert(type.floating);
2780
2781   if ((util_get_cpu_caps()->has_sse && type.width == 32 && type.length == 4) ||
2782       (util_get_cpu_caps()->has_avx && type.width == 32 && type.length == 8)) {
2783      return true;
2784   }
2785   return false;
2786}
2787
2788
2789/**
2790 * Generate 1/sqrt(a).
2791 * Result is undefined for values < 0, infinity for +0.
2792 * Precision is limited, only ~10 bits guaranteed
 * (rsqrt(1.0) may not be exactly 1.0, and denormals may be flushed to 0).
2794 */
2795LLVMValueRef
2796lp_build_fast_rsqrt(struct lp_build_context *bld,
2797                    LLVMValueRef a)
2798{
2799   LLVMBuilderRef builder = bld->gallivm->builder;
2800   const struct lp_type type = bld->type;
2801
2802   assert(lp_check_value(type, a));
2803
2804   if (lp_build_fast_rsqrt_available(type)) {
2805      const char *intrinsic = NULL;
2806
2807      if (type.length == 4) {
2808         intrinsic = "llvm.x86.sse.rsqrt.ps";
2809      }
2810      else {
2811         intrinsic = "llvm.x86.avx.rsqrt.ps.256";
2812      }
2813      return lp_build_intrinsic_unary(builder, intrinsic, bld->vec_type, a);
2814   }
2815   else {
2816      debug_printf("%s: emulating fast rsqrt with rcp/sqrt\n", __FUNCTION__);
2817   }
2818   return lp_build_rcp(bld, lp_build_sqrt(bld, a));
2819}
2820
2821
2822/**
2823 * Generate sin(a) or cos(a) using polynomial approximation.
 * TODO: it might be worth recognizing sin and cos of the same source
 * (i.e. the d3d10 sincos opcode). Obviously computing both at the same time
 * would be way cheaper than calculating (nearly) everything twice.
 * Not sure it's common enough to be worth bothering with, however; the scs
 * opcode could also benefit from calculating both.
2829 */
2830static LLVMValueRef
2831lp_build_sin_or_cos(struct lp_build_context *bld,
2832                    LLVMValueRef a,
2833                    boolean cos)
2834{
2835   struct gallivm_state *gallivm = bld->gallivm;
2836   LLVMBuilderRef b = gallivm->builder;
2837   struct lp_type int_type = lp_int_type(bld->type);
2838
2839   /*
2840    *  take the absolute value,
2841    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
2842    */
2843
2844   LLVMValueRef inv_sig_mask = lp_build_const_int_vec(gallivm, bld->type, ~0x80000000);
2845   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, bld->int_vec_type, "a_v4si");
2846
2847   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
2848   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, bld->vec_type, "x_abs");
2849
2850   /*
2851    * scale by 4/Pi
2852    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
2853    */
2854
2855   LLVMValueRef FOPi = lp_build_const_vec(gallivm, bld->type, 1.27323954473516);
2856   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
2857
2858   /*
2859    * store the integer part of y in mm0
2860    * emm2 = _mm_cvttps_epi32(y);
2861    */
2862
2863   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, bld->int_vec_type, "emm2_i");
2864
2865   /*
2866    * j=(j+1) & (~1) (see the cephes sources)
2867    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
2868    */
2869
2870   LLVMValueRef all_one = lp_build_const_int_vec(gallivm, bld->type, 1);
2871   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
2872   /*
2873    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
2874    */
2875   LLVMValueRef inv_one = lp_build_const_int_vec(gallivm, bld->type, ~1);
2876   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
2877
2878   /*
2879    * y = _mm_cvtepi32_ps(emm2);
2880    */
2881   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, bld->vec_type, "y_2");
2882
2883   LLVMValueRef const_2 = lp_build_const_int_vec(gallivm, bld->type, 2);
2884   LLVMValueRef const_4 = lp_build_const_int_vec(gallivm, bld->type, 4);
2885   LLVMValueRef const_29 = lp_build_const_int_vec(gallivm, bld->type, 29);
2886   LLVMValueRef sign_mask = lp_build_const_int_vec(gallivm, bld->type, 0x80000000);
2887
2888   /*
2889    * Argument used for poly selection and sign bit determination
2890    * is different for sin vs. cos.
2891    */
2892   LLVMValueRef emm2_2 = cos ? LLVMBuildSub(b, emm2_and, const_2, "emm2_2") :
2893                               emm2_and;
2894
2895   LLVMValueRef sign_bit = cos ? LLVMBuildShl(b, LLVMBuildAnd(b, const_4,
2896                                                              LLVMBuildNot(b, emm2_2, ""), ""),
2897                                              const_29, "sign_bit") :
2898                                 LLVMBuildAnd(b, LLVMBuildXor(b, a_v4si,
2899                                                              LLVMBuildShl(b, emm2_add,
2900                                                                           const_29, ""), ""),
2901                                              sign_mask, "sign_bit");
2902
2903   /*
    * get the polynomial selection mask
    * there is one polynomial for 0 <= x <= Pi/4
    * and another one for Pi/4 < x <= Pi/2
2907    * Both branches will be computed.
2908    *
2909    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
2910    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
2911    */
2912
2913   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, const_2, "emm2_3");
2914   LLVMValueRef poly_mask = lp_build_compare(gallivm,
2915                                             int_type, PIPE_FUNC_EQUAL,
2916                                             emm2_3, lp_build_const_int_vec(gallivm, bld->type, 0));
2917
2918   /*
2919    * _PS_CONST(minus_cephes_DP1, -0.78515625);
2920    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
2921    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
2922    */
2923   LLVMValueRef DP1 = lp_build_const_vec(gallivm, bld->type, -0.78515625);
2924   LLVMValueRef DP2 = lp_build_const_vec(gallivm, bld->type, -2.4187564849853515625e-4);
2925   LLVMValueRef DP3 = lp_build_const_vec(gallivm, bld->type, -3.77489497744594108e-8);
2926
2927   /*
2928    * The magic pass: "Extended precision modular arithmetic"
2929    * x = ((x - y * DP1) - y * DP2) - y * DP3;
2930    */
2931   LLVMValueRef x_1 = lp_build_fmuladd(b, y_2, DP1, x_abs);
2932   LLVMValueRef x_2 = lp_build_fmuladd(b, y_2, DP2, x_1);
2933   LLVMValueRef x_3 = lp_build_fmuladd(b, y_2, DP3, x_2);
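
   /*
    * Worked example of the reduction so far, for sin(1.0):
    *   scale_y  = 1.0 * 4/Pi ~= 1.273  ->  emm2_i = 1
    *   emm2_add = 2, emm2_and = 2      ->  y_2 = 2.0
    *   x_3     ~= 1.0 - 2*0.78515625 - ... ~= -0.5708  (= 1.0 - Pi/2)
    * The quadrant value 2 selects the cosine polynomial and a positive sign
    * below, giving cos(-0.5708) ~= 0.8415 == sin(1.0).
    */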
2934
2935   /*
    * Evaluate the first polynomial  (0 <= x <= Pi/4)
2937    *
2938    * z = _mm_mul_ps(x,x);
2939    */
2940   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
2941
2942   /*
2943    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
2944    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
2945    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
2946    */
2947   LLVMValueRef coscof_p0 = lp_build_const_vec(gallivm, bld->type, 2.443315711809948E-005);
2948   LLVMValueRef coscof_p1 = lp_build_const_vec(gallivm, bld->type, -1.388731625493765E-003);
2949   LLVMValueRef coscof_p2 = lp_build_const_vec(gallivm, bld->type, 4.166664568298827E-002);
2950
2951   /*
2952    * y = *(v4sf*)_ps_coscof_p0;
2953    * y = _mm_mul_ps(y, z);
2954    */
2955   LLVMValueRef y_4 = lp_build_fmuladd(b, z, coscof_p0, coscof_p1);
2956   LLVMValueRef y_6 = lp_build_fmuladd(b, y_4, z, coscof_p2);
2957   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
2958   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
2959
2960
2961   /*
2962    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
2963    * y = _mm_sub_ps(y, tmp);
2964    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
2965    */
2966   LLVMValueRef half = lp_build_const_vec(gallivm, bld->type, 0.5);
2967   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
2968   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
2969   LLVMValueRef one = lp_build_const_vec(gallivm, bld->type, 1.0);
2970   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
2971
2972   /*
2973    * _PS_CONST(sincof_p0, -1.9515295891E-4);
2974    * _PS_CONST(sincof_p1,  8.3321608736E-3);
2975    * _PS_CONST(sincof_p2, -1.6666654611E-1);
2976    */
2977   LLVMValueRef sincof_p0 = lp_build_const_vec(gallivm, bld->type, -1.9515295891E-4);
2978   LLVMValueRef sincof_p1 = lp_build_const_vec(gallivm, bld->type, 8.3321608736E-3);
2979   LLVMValueRef sincof_p2 = lp_build_const_vec(gallivm, bld->type, -1.6666654611E-1);
2980
2981   /*
    * Evaluate the second polynomial  (Pi/4 <= x <= 0)
2983    *
2984    * y2 = *(v4sf*)_ps_sincof_p0;
2985    * y2 = _mm_mul_ps(y2, z);
2986    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
2987    * y2 = _mm_mul_ps(y2, z);
2988    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
2989    * y2 = _mm_mul_ps(y2, z);
2990    * y2 = _mm_mul_ps(y2, x);
2991    * y2 = _mm_add_ps(y2, x);
2992    */
2993
2994   LLVMValueRef y2_4 = lp_build_fmuladd(b, z, sincof_p0, sincof_p1);
2995   LLVMValueRef y2_6 = lp_build_fmuladd(b, y2_4, z, sincof_p2);
2996   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
2997   LLVMValueRef y2_9 = lp_build_fmuladd(b, y2_7, x_3, x_3);
2998
2999   /*
    * select the correct result from the two polynomials
3001    * xmm3 = poly_mask;
3002    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
3003    * y = _mm_andnot_ps(xmm3, y);
3004    * y = _mm_or_ps(y,y2);
3005    */
3006   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, bld->int_vec_type, "y2_i");
3007   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, bld->int_vec_type, "y_i");
3008   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
3009   LLVMValueRef poly_mask_inv = LLVMBuildNot(b, poly_mask, "poly_mask_inv");
3010   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
3011   LLVMValueRef y_combine = LLVMBuildOr(b, y_and, y2_and, "y_combine");
3012
3013   /*
3014    * update the sign
3015    * y = _mm_xor_ps(y, sign_bit);
3016    */
3017   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sign");
3018   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, bld->vec_type, "y_result");
3019
3020   LLVMValueRef isfinite = lp_build_isfinite(bld, a);
3021
3022   /* clamp output to be within [-1, 1] */
3023   y_result = lp_build_clamp(bld, y_result,
3024                             lp_build_const_vec(bld->gallivm, bld->type,  -1.f),
3025                             lp_build_const_vec(bld->gallivm, bld->type,  1.f));
3026   /* If a is -inf, inf or NaN then return NaN */
3027   y_result = lp_build_select(bld, isfinite, y_result,
3028                              lp_build_const_vec(bld->gallivm, bld->type,  NAN));
3029   return y_result;
3030}
3031
3032
3033/**
3034 * Generate sin(a)
3035 */
3036LLVMValueRef
3037lp_build_sin(struct lp_build_context *bld,
3038             LLVMValueRef a)
3039{
3040   const struct lp_type type = bld->type;
3041
3042   if (type.width == 16) {
3043      LLVMBuilderRef builder = bld->gallivm->builder;
3044      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3045      char intrinsic[32];
3046      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.sin", vec_type);
3047      LLVMValueRef args[] = { a };
3048      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3049   }
3050
3051   return lp_build_sin_or_cos(bld, a, FALSE);
3052}
3053
3054
3055/**
3056 * Generate cos(a)
3057 */
3058LLVMValueRef
3059lp_build_cos(struct lp_build_context *bld,
3060             LLVMValueRef a)
3061{
3062   const struct lp_type type = bld->type;
3063
3064   if (type.width == 16) {
3065      LLVMBuilderRef builder = bld->gallivm->builder;
3066      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3067      char intrinsic[32];
3068      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.cos", vec_type);
3069      LLVMValueRef args[] = { a };
3070      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3071   }
3072
3073   return lp_build_sin_or_cos(bld, a, TRUE);
3074}
3075
3076
3077/**
3078 * Generate pow(x, y)
3079 */
3080LLVMValueRef
3081lp_build_pow(struct lp_build_context *bld,
3082             LLVMValueRef x,
3083             LLVMValueRef y)
3084{
3085   /* TODO: optimize the constant case */
3086   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3087       LLVMIsConstant(x) && LLVMIsConstant(y)) {
3088      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3089                   __FUNCTION__);
3090   }
3091
3092   LLVMValueRef cmp = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x, lp_build_const_vec(bld->gallivm, bld->type, 0.0f));
3093   LLVMValueRef res = lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2_safe(bld, x), y));
3094
3095   res = lp_build_select(bld, cmp, lp_build_const_vec(bld->gallivm, bld->type, 0.0f), res);
3096   return res;
3097}
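
/*
 * The identity used above, with a quick sanity check:
 *
 *    pow(x, y)  = exp2(y * log2(x))
 *    pow(2, 10) = exp2(10 * log2(2)) = exp2(10) = 1024
 *
 * The x == 0 compare exists because log2(0) is not finite, so a zero base is
 * special-cased by the select to return 0.
 */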
3098
3099
3100/**
3101 * Generate exp(x)
3102 */
3103LLVMValueRef
3104lp_build_exp(struct lp_build_context *bld,
3105             LLVMValueRef x)
3106{
3107   /* log2(e) = 1/log(2) */
3108   LLVMValueRef log2e = lp_build_const_vec(bld->gallivm, bld->type,
3109                                           1.4426950408889634);
3110
3111   assert(lp_check_value(bld->type, x));
3112
3113   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
3114}
3115
3116
3117/**
3118 * Generate log(x)
 * Behavior is undefined for infs, 0s and NaNs.
3120 */
3121LLVMValueRef
3122lp_build_log(struct lp_build_context *bld,
3123             LLVMValueRef x)
3124{
3125   /* log(2) */
3126   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3127                                          0.69314718055994529);
3128
3129   assert(lp_check_value(bld->type, x));
3130
3131   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
3132}
3133
3134/**
3135 * Generate log(x) that handles edge cases (infs, 0s and nans)
3136 */
3137LLVMValueRef
3138lp_build_log_safe(struct lp_build_context *bld,
3139                  LLVMValueRef x)
3140{
3141   /* log(2) */
3142   LLVMValueRef log2 = lp_build_const_vec(bld->gallivm, bld->type,
3143                                          0.69314718055994529);
3144
3145   assert(lp_check_value(bld->type, x));
3146
3147   return lp_build_mul(bld, log2, lp_build_log2_safe(bld, x));
3148}
3149
3150
3151/**
3152 * Generate polynomial.
3153 * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
3154 */
3155LLVMValueRef
3156lp_build_polynomial(struct lp_build_context *bld,
3157                    LLVMValueRef x,
3158                    const double *coeffs,
3159                    unsigned num_coeffs)
3160{
3161   const struct lp_type type = bld->type;
3162   LLVMValueRef even = NULL, odd = NULL;
3163   LLVMValueRef x2;
3164   unsigned i;
3165
3166   assert(lp_check_value(bld->type, x));
3167
3168   /* TODO: optimize the constant case */
3169   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3170       LLVMIsConstant(x)) {
3171      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3172                   __FUNCTION__);
3173   }
3174
3175   /*
    * Calculate odd and even terms separately to decrease the data dependency chain.
3177    * Ex:
3178    *     c[0] + x^2 * c[2] + x^4 * c[4] ...
3179    *     + x * (c[1] + x^2 * c[3] + x^4 * c[5]) ...
3180    */
3181   x2 = lp_build_mul(bld, x, x);
3182
3183   for (i = num_coeffs; i--; ) {
3184      LLVMValueRef coeff;
3185
3186      coeff = lp_build_const_vec(bld->gallivm, type, coeffs[i]);
3187
3188      if (i % 2 == 0) {
3189         if (even)
3190            even = lp_build_mad(bld, x2, even, coeff);
3191         else
3192            even = coeff;
3193      } else {
3194         if (odd)
3195            odd = lp_build_mad(bld, x2, odd, coeff);
3196         else
3197            odd = coeff;
3198      }
3199   }
3200
3201   if (odd)
3202      return lp_build_mad(bld, odd, x, even);
3203   else if (even)
3204      return even;
3205   else
3206      return bld->undef;
3207}
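
/*
 * Illustrative sketch only (comment, not compiled): a scalar version of the
 * even/odd Horner split used above, under the hypothetical name poly_ref.
 * Accumulating the even and odd coefficients against x^2 and combining them
 * at the end roughly halves the dependency chain compared with a single
 * Horner recurrence.
 *
 *    double poly_ref(double x, const double *c, unsigned n)
 *    {
 *       double x2 = x * x, even = 0.0, odd = 0.0;
 *       for (unsigned i = n; i--; ) {
 *          if (i % 2 == 0)
 *             even = even * x2 + c[i];
 *          else
 *             odd = odd * x2 + c[i];
 *       }
 *       return odd * x + even;
 *    }
 */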
3208
3209
3210/**
3211 * Minimax polynomial fit of 2**x, in range [0, 1[
3212 */
3213const double lp_build_exp2_polynomial[] = {
3214#if EXP_POLY_DEGREE == 5
3215   1.000000000000000000000, /*XXX: was 0.999999925063526176901, recompute others */
3216   0.693153073200168932794,
3217   0.240153617044375388211,
3218   0.0558263180532956664775,
3219   0.00898934009049466391101,
3220   0.00187757667519147912699
3221#elif EXP_POLY_DEGREE == 4
3222   1.00000259337069434683,
3223   0.693003834469974940458,
3224   0.24144275689150793076,
3225   0.0520114606103070150235,
3226   0.0135341679161270268764
3227#elif EXP_POLY_DEGREE == 3
3228   0.999925218562710312959,
3229   0.695833540494823811697,
3230   0.226067155427249155588,
3231   0.0780245226406372992967
3232#elif EXP_POLY_DEGREE == 2
3233   1.00172476321474503578,
3234   0.657636275736077639316,
3235   0.33718943461968720704
3236#else
3237#error
3238#endif
3239};
3240
3241
3242LLVMValueRef
3243lp_build_exp2(struct lp_build_context *bld,
3244              LLVMValueRef x)
3245{
3246   LLVMBuilderRef builder = bld->gallivm->builder;
3247   const struct lp_type type = bld->type;
3248   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3249   LLVMValueRef ipart = NULL;
3250   LLVMValueRef fpart = NULL;
3251   LLVMValueRef expipart = NULL;
3252   LLVMValueRef expfpart = NULL;
3253   LLVMValueRef res = NULL;
3254
3255   if (type.floating && type.width == 16) {
3256      char intrinsic[32];
3257      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.exp2", vec_type);
3258      LLVMValueRef args[] = { x };
3259      return lp_build_intrinsic(builder, intrinsic, vec_type, args, 1, 0);
3260   }
3261
3262   assert(lp_check_value(bld->type, x));
3263
3264   /* TODO: optimize the constant case */
3265   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3266       LLVMIsConstant(x)) {
3267      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3268                   __FUNCTION__);
3269   }
3270
3271   assert(type.floating && type.width == 32);
3272
   /* We want to preserve NaN and make sure that for exp2, if x > 128
    * the result is INF, and if it's smaller than -126.9 the result is 0. */
3275   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,  128.0), x,
3276                        GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3277   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type, -126.99999),
3278                        x, GALLIVM_NAN_RETURN_NAN_FIRST_NONNAN);
3279
3280   /* ipart = floor(x) */
3281   /* fpart = x - ipart */
3282   lp_build_ifloor_fract(bld, x, &ipart, &fpart);
3283
3284   /* expipart = (float) (1 << ipart) */
3285   expipart = LLVMBuildAdd(builder, ipart,
3286                           lp_build_const_int_vec(bld->gallivm, type, 127), "");
3287   expipart = LLVMBuildShl(builder, expipart,
3288                           lp_build_const_int_vec(bld->gallivm, type, 23), "");
3289   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
3290
3291   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
3292                                  ARRAY_SIZE(lp_build_exp2_polynomial));
3293
3294   res = LLVMBuildFMul(builder, expipart, expfpart, "");
3295
3296   return res;
3297}
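
/*
 * Illustrative sketch only (comment, not compiled): a scalar analogue of the
 * sequence above for binary32, assuming <math.h>, <string.h> and <stdint.h>,
 * with hypothetical names; NaN handling and the vector select machinery are
 * omitted.  2^ipart is formed directly in the exponent field, while 2^fpart
 * comes from the minimax polynomial over [0, 1).
 *
 *    float exp2_ref(float x)
 *    {
 *       x = fminf(fmaxf(x, -126.99999f), 128.0f);
 *       int ipart = (int)floorf(x);
 *       float fpart = x - (float)ipart;
 *       uint32_t bits = (uint32_t)(ipart + 127) << 23;  // (float)(1 << ipart)
 *       float expipart;
 *       memcpy(&expipart, &bits, sizeof expipart);
 *       return expipart * (float)poly_ref(fpart, lp_build_exp2_polynomial,
 *                                         ARRAY_SIZE(lp_build_exp2_polynomial));
 *    }
 */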
3298
3299
3300
3301/**
 * Extract the exponent of an IEEE-754 floating point value.
3303 *
3304 * Optionally apply an integer bias.
3305 *
3306 * Result is an integer value with
3307 *
3308 *   ifloor(log2(x)) + bias
3309 */
3310LLVMValueRef
3311lp_build_extract_exponent(struct lp_build_context *bld,
3312                          LLVMValueRef x,
3313                          int bias)
3314{
3315   LLVMBuilderRef builder = bld->gallivm->builder;
3316   const struct lp_type type = bld->type;
3317   unsigned mantissa = lp_mantissa(type);
3318   LLVMValueRef res;
3319
3320   assert(type.floating);
3321
3322   assert(lp_check_value(bld->type, x));
3323
3324   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3325
3326   res = LLVMBuildLShr(builder, x,
3327                       lp_build_const_int_vec(bld->gallivm, type, mantissa), "");
3328   res = LLVMBuildAnd(builder, res,
3329                      lp_build_const_int_vec(bld->gallivm, type, 255), "");
3330   res = LLVMBuildSub(builder, res,
3331                      lp_build_const_int_vec(bld->gallivm, type, 127 - bias), "");
3332
3333   return res;
3334}
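
/*
 * Illustrative sketch only (comment, not compiled): a scalar binary32
 * equivalent of the shift/mask/subtract above, assuming <stdint.h> and
 * <string.h>; extract_exponent_ref is a hypothetical name.
 *
 *    int extract_exponent_ref(float x, int bias)
 *    {
 *       uint32_t bits;
 *       memcpy(&bits, &x, sizeof bits);
 *       return (int)((bits >> 23) & 0xff) - (127 - bias);
 *    }
 */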
3335
3336
3337/**
 * Extract the mantissa of a floating point value.
 *
 * Result is a floating point value with
 *
 *   x / 2**floor(log2(x))
3343 */
3344LLVMValueRef
3345lp_build_extract_mantissa(struct lp_build_context *bld,
3346                          LLVMValueRef x)
3347{
3348   LLVMBuilderRef builder = bld->gallivm->builder;
3349   const struct lp_type type = bld->type;
3350   unsigned mantissa = lp_mantissa(type);
3351   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type,
3352                                                  (1ULL << mantissa) - 1);
3353   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
3354   LLVMValueRef res;
3355
3356   assert(lp_check_value(bld->type, x));
3357
3358   assert(type.floating);
3359
3360   x = LLVMBuildBitCast(builder, x, bld->int_vec_type, "");
3361
3362   /* res = x / 2**ipart */
3363   res = LLVMBuildAnd(builder, x, mantmask, "");
3364   res = LLVMBuildOr(builder, res, one, "");
3365   res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
3366
3367   return res;
3368}
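
/*
 * Illustrative sketch only (comment, not compiled): the same bit trick in
 * scalar C for binary32; keeping the mantissa bits and forcing the exponent
 * of 1.0 leaves a value in [1, 2).  extract_mantissa_ref is a hypothetical
 * name.
 *
 *    float extract_mantissa_ref(float x)
 *    {
 *       uint32_t bits;
 *       memcpy(&bits, &x, sizeof bits);
 *       bits = (bits & 0x007fffff) | 0x3f800000;   // mantissa | 1.0f
 *       memcpy(&x, &bits, sizeof x);
 *       return x;
 *    }
 */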
3369
3370
3371
3372/**
 * Minimax polynomial fit of log2((1.0 + sqrt(x))/(1.0 - sqrt(x)))/sqrt(x), for x in the range [0, 1/9[
 * These coefficients can be generated with
3375 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
3376 */
3377const double lp_build_log2_polynomial[] = {
3378#if LOG_POLY_DEGREE == 5
3379   2.88539008148777786488L,
3380   0.961796878841293367824L,
3381   0.577058946784739859012L,
3382   0.412914355135828735411L,
3383   0.308591899232910175289L,
3384   0.352376952300281371868L,
3385#elif LOG_POLY_DEGREE == 4
3386   2.88539009343309178325L,
3387   0.961791550404184197881L,
3388   0.577440339438736392009L,
3389   0.403343858251329912514L,
3390   0.406718052498846252698L,
3391#elif LOG_POLY_DEGREE == 3
3392   2.88538959748872753838L,
3393   0.961932915889597772928L,
3394   0.571118517972136195241L,
3395   0.493997535084709500285L,
3396#else
3397#error
3398#endif
3399};
3400
3401/**
3402 * See http://www.devmaster.net/forums/showthread.php?p=43580
3403 * http://en.wikipedia.org/wiki/Logarithm#Calculation
3404 * http://www.nezumi.demon.co.uk/consult/logx.htm
3405 *
3406 * If handle_edge_cases is true the function will perform computations
3407 * to match the required D3D10+ behavior for each of the edge cases.
3408 * That means that if input is:
 * - less than zero (down to and including -inf), then NaN will be returned
3410 * - equal to zero (-denorm, -0, +0 or +denorm), then -inf will be returned
3411 * - +infinity, then +infinity will be returned
3412 * - NaN, then NaN will be returned
3413 *
3414 * Those checks are fairly expensive so if you don't need them make sure
3415 * handle_edge_cases is false.
3416 */
3417void
3418lp_build_log2_approx(struct lp_build_context *bld,
3419                     LLVMValueRef x,
3420                     LLVMValueRef *p_exp,
3421                     LLVMValueRef *p_floor_log2,
3422                     LLVMValueRef *p_log2,
3423                     boolean handle_edge_cases)
3424{
3425   LLVMBuilderRef builder = bld->gallivm->builder;
3426   const struct lp_type type = bld->type;
3427   LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
3428   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
3429
3430   LLVMValueRef expmask = lp_build_const_int_vec(bld->gallivm, type, 0x7f800000);
3431   LLVMValueRef mantmask = lp_build_const_int_vec(bld->gallivm, type, 0x007fffff);
3432   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
3433
3434   LLVMValueRef i = NULL;
3435   LLVMValueRef y = NULL;
3436   LLVMValueRef z = NULL;
3437   LLVMValueRef exp = NULL;
3438   LLVMValueRef mant = NULL;
3439   LLVMValueRef logexp = NULL;
3440   LLVMValueRef p_z = NULL;
3441   LLVMValueRef res = NULL;
3442
3443   if (bld->type.width == 16) {
3444      char intrinsic[32];
3445      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.log2", bld->vec_type);
3446      LLVMValueRef args[] = { x };
3447      if (p_log2)
3448         *p_log2 = lp_build_intrinsic(builder, intrinsic, bld->vec_type, args, 1, 0);
3449      return;
3450   }
3451
3452   assert(lp_check_value(bld->type, x));
3453
3454   if(p_exp || p_floor_log2 || p_log2) {
3455      /* TODO: optimize the constant case */
3456      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
3457          LLVMIsConstant(x)) {
3458         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
3459                      __FUNCTION__);
3460      }
3461
3462      assert(type.floating && type.width == 32);
3463
3464      /*
       * We don't explicitly handle denormalized numbers. They will yield a
       * result in the neighbourhood of -127, which appears to be adequate.
3468       */
3469
3470      i = LLVMBuildBitCast(builder, x, int_vec_type, "");
3471
3472      /* exp = (float) exponent(x) */
3473      exp = LLVMBuildAnd(builder, i, expmask, "");
3474   }
3475
3476   if(p_floor_log2 || p_log2) {
3477      logexp = LLVMBuildLShr(builder, exp, lp_build_const_int_vec(bld->gallivm, type, 23), "");
3478      logexp = LLVMBuildSub(builder, logexp, lp_build_const_int_vec(bld->gallivm, type, 127), "");
3479      logexp = LLVMBuildSIToFP(builder, logexp, vec_type, "");
3480   }
3481
3482   if (p_log2) {
3483      /* mant = 1 + (float) mantissa(x) */
3484      mant = LLVMBuildAnd(builder, i, mantmask, "");
3485      mant = LLVMBuildOr(builder, mant, one, "");
3486      mant = LLVMBuildBitCast(builder, mant, vec_type, "");
3487
3488      /* y = (mant - 1) / (mant + 1) */
3489      y = lp_build_div(bld,
3490         lp_build_sub(bld, mant, bld->one),
3491         lp_build_add(bld, mant, bld->one)
3492      );
3493
3494      /* z = y^2 */
3495      z = lp_build_mul(bld, y, y);
3496
3497      /* compute P(z) */
3498      p_z = lp_build_polynomial(bld, z, lp_build_log2_polynomial,
3499                                ARRAY_SIZE(lp_build_log2_polynomial));
3500
3501      /* y * P(z) + logexp */
3502      res = lp_build_mad(bld, y, p_z, logexp);
3503
3504      if (type.floating && handle_edge_cases) {
3505         LLVMValueRef negmask, infmask,  zmask;
3506         negmask = lp_build_cmp(bld, PIPE_FUNC_LESS, x,
3507                                lp_build_const_vec(bld->gallivm, type,  0.0f));
3508         zmask = lp_build_cmp(bld, PIPE_FUNC_EQUAL, x,
3509                              lp_build_const_vec(bld->gallivm, type,  0.0f));
3510         infmask = lp_build_cmp(bld, PIPE_FUNC_GEQUAL, x,
3511                                lp_build_const_vec(bld->gallivm, type,  INFINITY));
3512
         /* If x is equal to inf, make sure we return inf */
3514         res = lp_build_select(bld, infmask,
3515                               lp_build_const_vec(bld->gallivm, type,  INFINITY),
3516                               res);
         /* If x is equal to 0, return -inf */
3518         res = lp_build_select(bld, zmask,
3519                               lp_build_const_vec(bld->gallivm, type,  -INFINITY),
3520                               res);
         /* If x is NaN or less than 0, return NaN */
3522         res = lp_build_select(bld, negmask,
3523                               lp_build_const_vec(bld->gallivm, type,  NAN),
3524                               res);
3525      }
3526   }
3527
3528   if (p_exp) {
3529      exp = LLVMBuildBitCast(builder, exp, vec_type, "");
3530      *p_exp = exp;
3531   }
3532
3533   if (p_floor_log2)
3534      *p_floor_log2 = logexp;
3535
3536   if (p_log2)
3537      *p_log2 = res;
3538}
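
/*
 * Illustrative sketch only (comment, not compiled): the scalar shape of the
 * main p_log2 path above, with the edge-case selects omitted and the
 * hypothetical helper names reused from the sketches earlier in this file.
 *
 *    float log2_ref(float x)
 *    {
 *       float exp  = (float)extract_exponent_ref(x, 0);  // floor(log2(x))
 *       float mant = extract_mantissa_ref(x);            // in [1, 2)
 *       float y    = (mant - 1.0f) / (mant + 1.0f);      // in [0, 1/3)
 *       float z    = y * y;                              // in [0, 1/9)
 *       return y * (float)poly_ref(z, lp_build_log2_polynomial,
 *                                  ARRAY_SIZE(lp_build_log2_polynomial))
 *              + exp;
 *    }
 */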
3539
3540
3541/*
3542 * log2 implementation which doesn't have special code to
3543 * handle edge cases (-inf, 0, inf, NaN). It's faster but
3544 * the results for those cases are undefined.
3545 */
3546LLVMValueRef
3547lp_build_log2(struct lp_build_context *bld,
3548              LLVMValueRef x)
3549{
3550   LLVMValueRef res;
3551   lp_build_log2_approx(bld, x, NULL, NULL, &res, FALSE);
3552   return res;
3553}
3554
3555/*
3556 * Version of log2 which handles all edge cases.
3557 * Look at documentation of lp_build_log2_approx for
3558 * description of the behavior for each of the edge cases.
3559 */
3560LLVMValueRef
3561lp_build_log2_safe(struct lp_build_context *bld,
3562                   LLVMValueRef x)
3563{
3564   LLVMValueRef res;
3565   lp_build_log2_approx(bld, x, NULL, NULL, &res, TRUE);
3566   return res;
3567}
3568
3569
3570/**
3571 * Faster (and less accurate) log2.
3572 *
3573 *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
3574 *
3575 * Piece-wise linear approximation, with exact results when x is a
3576 * power of two.
3577 *
3578 * See http://www.flipcode.com/archives/Fast_log_Function.shtml
3579 */
3580LLVMValueRef
3581lp_build_fast_log2(struct lp_build_context *bld,
3582                   LLVMValueRef x)
3583{
3584   LLVMBuilderRef builder = bld->gallivm->builder;
3585   LLVMValueRef ipart;
3586   LLVMValueRef fpart;
3587
3588   assert(lp_check_value(bld->type, x));
3589
3590   assert(bld->type.floating);
3591
3592   /* ipart = floor(log2(x)) - 1 */
3593   ipart = lp_build_extract_exponent(bld, x, -1);
3594   ipart = LLVMBuildSIToFP(builder, ipart, bld->vec_type, "");
3595
3596   /* fpart = x / 2**ipart */
3597   fpart = lp_build_extract_mantissa(bld, x);
3598
3599   /* ipart + fpart */
3600   return LLVMBuildFAdd(builder, ipart, fpart, "");
3601}
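
/*
 * Illustrative sketch only (comment, not compiled): a scalar rendering of
 * the piece-wise linear approximation above, reusing the hypothetical
 * helpers sketched earlier.
 *
 *    float fast_log2_ref(float x)
 *    {
 *       float ipart = (float)extract_exponent_ref(x, -1); // floor(log2(x)) - 1
 *       float fpart = extract_mantissa_ref(x);            // x / 2**floor(log2(x))
 *       return ipart + fpart;
 *    }
 */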
3602
3603
3604/**
3605 * Fast implementation of iround(log2(x)).
3606 *
3607 * Not an approximation -- it should give accurate results all the time.
3608 */
3609LLVMValueRef
3610lp_build_ilog2(struct lp_build_context *bld,
3611               LLVMValueRef x)
3612{
3613   LLVMBuilderRef builder = bld->gallivm->builder;
3614   LLVMValueRef sqrt2 = lp_build_const_vec(bld->gallivm, bld->type, M_SQRT2);
3615   LLVMValueRef ipart;
3616
3617   assert(bld->type.floating);
3618
3619   assert(lp_check_value(bld->type, x));
3620
3621   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
3622   x = LLVMBuildFMul(builder, x, sqrt2, "");
3623
3624   /* ipart = floor(log2(x) + 0.5)  */
3625   ipart = lp_build_extract_exponent(bld, x, 0);
3626
3627   return ipart;
3628}
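
/*
 * Illustrative sketch only (comment, not compiled): scaling by sqrt(2) before
 * the exponent extraction turns its implicit floor() into a rounding of
 * log2(x) to the nearest integer.
 *
 *    int ilog2_ref(float x)
 *    {
 *       return extract_exponent_ref(x * (float)M_SQRT2, 0); // floor(log2(x) + 0.5)
 *    }
 */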
3629
3630LLVMValueRef
3631lp_build_mod(struct lp_build_context *bld,
3632             LLVMValueRef x,
3633             LLVMValueRef y)
3634{
3635   LLVMBuilderRef builder = bld->gallivm->builder;
3636   LLVMValueRef res;
3637   const struct lp_type type = bld->type;
3638
3639   assert(lp_check_value(type, x));
3640   assert(lp_check_value(type, y));
3641
3642   if (type.floating)
3643      res = LLVMBuildFRem(builder, x, y, "");
3644   else if (type.sign)
3645      res = LLVMBuildSRem(builder, x, y, "");
3646   else
3647      res = LLVMBuildURem(builder, x, y, "");
3648   return res;
3649}
3650
3651
3652/*
 * For floating point inputs, creates and returns a mask
 * which is all 1's for channels of x that are NaN,
 * and all 0's for channels that are not.
3656 */
3657LLVMValueRef
3658lp_build_isnan(struct lp_build_context *bld,
3659               LLVMValueRef x)
3660{
3661   LLVMValueRef mask;
3662   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3663
3664   assert(bld->type.floating);
3665   assert(lp_check_value(bld->type, x));
3666
3667   mask = LLVMBuildFCmp(bld->gallivm->builder, LLVMRealOEQ, x, x,
3668                        "isnotnan");
3669   mask = LLVMBuildNot(bld->gallivm->builder, mask, "");
3670   mask = LLVMBuildSExt(bld->gallivm->builder, mask, int_vec_type, "isnan");
3671   return mask;
3672}
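
/*
 * Illustrative sketch only (comment, not compiled): the same trick per
 * channel in scalar C; NaN is the only value that does not compare
 * ordered-equal to itself, and the sign extension turns the one-bit result
 * into an all-ones integer mask.
 *
 *    int32_t isnan_mask_ref(float x)
 *    {
 *       return (x == x) ? 0 : ~(int32_t)0;
 *    }
 */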
3673
/* Returns all 1's for floating point values that are
 * finite, and all 0's for -inf, +inf and NaNs. */
3677LLVMValueRef
3678lp_build_isfinite(struct lp_build_context *bld,
3679                  LLVMValueRef x)
3680{
3681   LLVMBuilderRef builder = bld->gallivm->builder;
3682   LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, bld->type);
3683   struct lp_type int_type = lp_int_type(bld->type);
3684   LLVMValueRef intx = LLVMBuildBitCast(builder, x, int_vec_type, "");
3685   LLVMValueRef infornan32 = lp_build_const_int_vec(bld->gallivm, bld->type,
3686                                                    0x7f800000);
3687
3688   if (!bld->type.floating) {
3689      return lp_build_const_int_vec(bld->gallivm, bld->type, 0);
3690   }
3691   assert(bld->type.floating);
3692   assert(lp_check_value(bld->type, x));
3693   assert(bld->type.width == 32);
3694
3695   intx = LLVMBuildAnd(builder, intx, infornan32, "");
3696   return lp_build_compare(bld->gallivm, int_type, PIPE_FUNC_NOTEQUAL,
3697                           intx, infornan32);
3698}
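
/*
 * Illustrative sketch only (comment, not compiled): the exponent-field test
 * above in scalar C for binary32.  A float is non-finite exactly when its
 * biased exponent bits are all ones; lp_build_is_inf_or_nan below tests the
 * complementary condition.
 *
 *    int32_t isfinite_mask_ref(float x)
 *    {
 *       uint32_t bits;
 *       memcpy(&bits, &x, sizeof bits);
 *       return ((bits & 0x7f800000) != 0x7f800000) ? ~(int32_t)0 : 0;
 *    }
 */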
3699
3700/*
3701 * Returns true if the number is nan or inf and false otherwise.
3702 * The input has to be a floating point vector.
3703 */
3704LLVMValueRef
3705lp_build_is_inf_or_nan(struct gallivm_state *gallivm,
3706                       const struct lp_type type,
3707                       LLVMValueRef x)
3708{
3709   LLVMBuilderRef builder = gallivm->builder;
3710   struct lp_type int_type = lp_int_type(type);
3711   LLVMValueRef const0 = lp_build_const_int_vec(gallivm, int_type,
3712                                                0x7f800000);
3713   LLVMValueRef ret;
3714
3715   assert(type.floating);
3716
3717   ret = LLVMBuildBitCast(builder, x, lp_build_vec_type(gallivm, int_type), "");
3718   ret = LLVMBuildAnd(builder, ret, const0, "");
3719   ret = lp_build_compare(gallivm, int_type, PIPE_FUNC_EQUAL,
3720                          ret, const0);
3721
3722   return ret;
3723}
3724
3725
3726LLVMValueRef
3727lp_build_fpstate_get(struct gallivm_state *gallivm)
3728{
3729   if (util_get_cpu_caps()->has_sse) {
3730      LLVMBuilderRef builder = gallivm->builder;
3731      LLVMValueRef mxcsr_ptr = lp_build_alloca(
3732         gallivm,
3733         LLVMInt32TypeInContext(gallivm->context),
3734         "mxcsr_ptr");
3735      LLVMValueRef mxcsr_ptr8 = LLVMBuildPointerCast(builder, mxcsr_ptr,
3736          LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3737      lp_build_intrinsic(builder,
3738                         "llvm.x86.sse.stmxcsr",
3739                         LLVMVoidTypeInContext(gallivm->context),
3740                         &mxcsr_ptr8, 1, 0);
3741      return mxcsr_ptr;
3742   }
3743   return 0;
3744}
3745
3746void
3747lp_build_fpstate_set_denorms_zero(struct gallivm_state *gallivm,
3748                                  boolean zero)
3749{
3750   if (util_get_cpu_caps()->has_sse) {
3751      /* turn on DAZ (64) | FTZ (32768) = 32832 if available */
3752      int daz_ftz = _MM_FLUSH_ZERO_MASK;
3753
3754      LLVMBuilderRef builder = gallivm->builder;
3755      LLVMValueRef mxcsr_ptr = lp_build_fpstate_get(gallivm);
3756      LLVMValueRef mxcsr =
3757         LLVMBuildLoad(builder, mxcsr_ptr, "mxcsr");
3758
3759      if (util_get_cpu_caps()->has_daz) {
         /* Enable denormals-are-zero mode */
3761         daz_ftz |= _MM_DENORMALS_ZERO_MASK;
3762      }
3763      if (zero) {
3764         mxcsr = LLVMBuildOr(builder, mxcsr,
3765                             LLVMConstInt(LLVMTypeOf(mxcsr), daz_ftz, 0), "");
3766      } else {
3767         mxcsr = LLVMBuildAnd(builder, mxcsr,
3768                              LLVMConstInt(LLVMTypeOf(mxcsr), ~daz_ftz, 0), "");
3769      }
3770
3771      LLVMBuildStore(builder, mxcsr, mxcsr_ptr);
3772      lp_build_fpstate_set(gallivm, mxcsr_ptr);
3773   }
3774}
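
/*
 * Illustrative sketch only (comment, not compiled): the host-side equivalent
 * of the MXCSR update emitted above, using the <xmmintrin.h> intrinsics and
 * assuming DAZ is supported; set_denorms_zero_ref is a hypothetical name.
 *
 *    void set_denorms_zero_ref(boolean zero)
 *    {
 *       unsigned daz_ftz = _MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK;
 *       unsigned mxcsr = _mm_getcsr();
 *       _mm_setcsr(zero ? (mxcsr | daz_ftz) : (mxcsr & ~daz_ftz));
 *    }
 */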
3775
3776void
3777lp_build_fpstate_set(struct gallivm_state *gallivm,
3778                     LLVMValueRef mxcsr_ptr)
3779{
3780   if (util_get_cpu_caps()->has_sse) {
3781      LLVMBuilderRef builder = gallivm->builder;
3782      mxcsr_ptr = LLVMBuildPointerCast(builder, mxcsr_ptr,
3783                     LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
3784      lp_build_intrinsic(builder,
3785                         "llvm.x86.sse.ldmxcsr",
3786                         LLVMVoidTypeInContext(gallivm->context),
3787                         &mxcsr_ptr, 1, 0);
3788   }
3789}
3790