1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * @file
30 * Helper functions for logical operations.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 */
34
35
36#include "util/u_cpu_detect.h"
37#include "util/u_memory.h"
38#include "util/u_debug.h"
39
40#include "lp_bld_type.h"
41#include "lp_bld_const.h"
42#include "lp_bld_swizzle.h"
43#include "lp_bld_init.h"
44#include "lp_bld_intr.h"
45#include "lp_bld_debug.h"
46#include "lp_bld_logic.h"
47
48
49/*
50 * XXX
51 *
52 * Selection with vector conditional like
53 *
54 *    select <4 x i1> %C, %A, %B
55 *
56 * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
57 * supported on some backends (x86) starting with llvm 3.1.
58 *
59 * Expanding the boolean vector to full SIMD register width, as in
60 *
61 *    sext <4 x i1> %C to <4 x i32>
62 *
63 * is valid and supported (e.g., llvm/test/CodeGen/X86/vec_compare.ll), but
64 * it causes assertion failures in LLVM 2.6. It appears to work correctly on
65 * LLVM 2.7.
66 */
67
68
69/**
70 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
71 * \param func  one of PIPE_FUNC_x
72 * If the ordered argument is true the function will use LLVM's ordered
73 * comparisons, otherwise unordered comparisons will be used.
74 * The result values will be 0 for false or ~0 for true.
75 */
76static LLVMValueRef
77lp_build_compare_ext(struct gallivm_state *gallivm,
78                     const struct lp_type type,
79                     unsigned func,
80                     LLVMValueRef a,
81                     LLVMValueRef b,
82                     boolean ordered)
83{
84   LLVMBuilderRef builder = gallivm->builder;
85   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
86   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
87   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
88   LLVMValueRef cond;
89   LLVMValueRef res;
90
91   assert(lp_check_value(type, a));
92   assert(lp_check_value(type, b));
93
94   if(func == PIPE_FUNC_NEVER)
95      return zeros;
96   if(func == PIPE_FUNC_ALWAYS)
97      return ones;
98
99   assert(func > PIPE_FUNC_NEVER);
100   assert(func < PIPE_FUNC_ALWAYS);
101
102   if(type.floating) {
103      LLVMRealPredicate op;
104      switch(func) {
105      case PIPE_FUNC_EQUAL:
106         op = ordered ? LLVMRealOEQ : LLVMRealUEQ;
107         break;
108      case PIPE_FUNC_NOTEQUAL:
109         op = ordered ? LLVMRealONE : LLVMRealUNE;
110         break;
111      case PIPE_FUNC_LESS:
112         op = ordered ? LLVMRealOLT : LLVMRealULT;
113         break;
114      case PIPE_FUNC_LEQUAL:
115         op = ordered ? LLVMRealOLE : LLVMRealULE;
116         break;
117      case PIPE_FUNC_GREATER:
118         op = ordered ? LLVMRealOGT : LLVMRealUGT;
119         break;
120      case PIPE_FUNC_GEQUAL:
121         op = ordered ? LLVMRealOGE : LLVMRealUGE;
122         break;
123      default:
124         assert(0);
125         return lp_build_undef(gallivm, type);
126      }
127
128      cond = LLVMBuildFCmp(builder, op, a, b, "");
129      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
130   }
131   else {
132      LLVMIntPredicate op;
133      switch(func) {
134      case PIPE_FUNC_EQUAL:
135         op = LLVMIntEQ;
136         break;
137      case PIPE_FUNC_NOTEQUAL:
138         op = LLVMIntNE;
139         break;
140      case PIPE_FUNC_LESS:
141         op = type.sign ? LLVMIntSLT : LLVMIntULT;
142         break;
143      case PIPE_FUNC_LEQUAL:
144         op = type.sign ? LLVMIntSLE : LLVMIntULE;
145         break;
146      case PIPE_FUNC_GREATER:
147         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
148         break;
149      case PIPE_FUNC_GEQUAL:
150         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
151         break;
152      default:
153         assert(0);
154         return lp_build_undef(gallivm, type);
155      }
156
157      cond = LLVMBuildICmp(builder, op, a, b, "");
158      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
159   }
160
161   return res;
162}
163
164/**
165 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
166 * \param func  one of PIPE_FUNC_x
167 * The result values will be 0 for false or ~0 for true.
168 */
169LLVMValueRef
170lp_build_compare(struct gallivm_state *gallivm,
171                 const struct lp_type type,
172                 unsigned func,
173                 LLVMValueRef a,
174                 LLVMValueRef b)
175{
176   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
177   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
178   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
179
180   assert(lp_check_value(type, a));
181   assert(lp_check_value(type, b));
182
183   if(func == PIPE_FUNC_NEVER)
184      return zeros;
185   if(func == PIPE_FUNC_ALWAYS)
186      return ones;
187
188   assert(func > PIPE_FUNC_NEVER);
189   assert(func < PIPE_FUNC_ALWAYS);
190
191#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
192   /*
193    * There are no unsigned integer comparison instructions in SSE.
194    */
195
196   if (!type.floating && !type.sign &&
197       type.width * type.length == 128 &&
198       util_cpu_caps.has_sse2 &&
199       (func == PIPE_FUNC_LESS ||
200        func == PIPE_FUNC_LEQUAL ||
201        func == PIPE_FUNC_GREATER ||
202        func == PIPE_FUNC_GEQUAL) &&
203       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
204         debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
205                      __FUNCTION__, type.length, type.width);
206   }
207#endif
208
209   return lp_build_compare_ext(gallivm, type, func, a, b, FALSE);
210}
211
212/**
213 * Build code to compare two values 'a' and 'b' using the given func.
214 * \param func  one of PIPE_FUNC_x
215 * If the operands are floating point numbers, the function will use
216 * ordered comparison which means that it will return true if both
217 * operands are not a NaN and the specified condition evaluates to true.
218 * The result values will be 0 for false or ~0 for true.
219 */
220LLVMValueRef
221lp_build_cmp_ordered(struct lp_build_context *bld,
222                     unsigned func,
223                     LLVMValueRef a,
224                     LLVMValueRef b)
225{
226   return lp_build_compare_ext(bld->gallivm, bld->type, func, a, b, TRUE);
227}
228
229/**
230 * Build code to compare two values 'a' and 'b' using the given func.
231 * \param func  one of PIPE_FUNC_x
232 * If the operands are floating point numbers, the function will use
233 * unordered comparison which means that it will return true if either
234 * operand is a NaN or the specified condition evaluates to true.
235 * The result values will be 0 for false or ~0 for true.
236 */
237LLVMValueRef
238lp_build_cmp(struct lp_build_context *bld,
239             unsigned func,
240             LLVMValueRef a,
241             LLVMValueRef b)
242{
243   return lp_build_compare(bld->gallivm, bld->type, func, a, b);
244}
245
246
247/**
248 * Return (mask & a) | (~mask & b);
249 */
250LLVMValueRef
251lp_build_select_bitwise(struct lp_build_context *bld,
252                        LLVMValueRef mask,
253                        LLVMValueRef a,
254                        LLVMValueRef b)
255{
256   LLVMBuilderRef builder = bld->gallivm->builder;
257   struct lp_type type = bld->type;
258   LLVMValueRef res;
259
260   assert(lp_check_value(type, a));
261   assert(lp_check_value(type, b));
262
263   if (a == b) {
264      return a;
265   }
266
267   if(type.floating) {
268      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
269      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
270      b = LLVMBuildBitCast(builder, b, int_vec_type, "");
271   }
272
273   a = LLVMBuildAnd(builder, a, mask, "");
274
275   /* This often gets translated to PANDN, but sometimes the NOT is
276    * pre-computed and stored in another constant. The best strategy depends
277    * on available registers, so it is not a big deal -- hopefully LLVM does
278    * the right decision attending the rest of the program.
279    */
280   b = LLVMBuildAnd(builder, b, LLVMBuildNot(builder, mask, ""), "");
281
282   res = LLVMBuildOr(builder, a, b, "");
283
284   if(type.floating) {
285      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
286      res = LLVMBuildBitCast(builder, res, vec_type, "");
287   }
288
289   return res;
290}
291
292
293/**
294 * Return mask ? a : b;
295 *
296 * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other value
297 * will yield unpredictable results.
298 */
299LLVMValueRef
300lp_build_select(struct lp_build_context *bld,
301                LLVMValueRef mask,
302                LLVMValueRef a,
303                LLVMValueRef b)
304{
305   LLVMBuilderRef builder = bld->gallivm->builder;
306   LLVMContextRef lc = bld->gallivm->context;
307   struct lp_type type = bld->type;
308   LLVMValueRef res;
309
310   assert(lp_check_value(type, a));
311   assert(lp_check_value(type, b));
312
313   if(a == b)
314      return a;
315
316   if (type.length == 1) {
317      mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
318      res = LLVMBuildSelect(builder, mask, a, b, "");
319   }
320   else if (!(HAVE_LLVM == 0x0307) &&
321            (LLVMIsConstant(mask) ||
322             LLVMGetInstructionOpcode(mask) == LLVMSExt)) {
323      /* Generate a vector select.
324       *
325       * Using vector selects should avoid emitting intrinsics hence avoid
326       * hindering optimization passes, but vector selects weren't properly
327       * supported yet for a long time, and LLVM will generate poor code when
328       * the mask is not the result of a comparison.
329       * Also, llvm 3.7 may miscompile them (bug 94972).
330       * XXX: Even if the instruction was an SExt, this may still produce
331       * terrible code. Try piglit stencil-twoside.
332       */
333
334      /* Convert the mask to a vector of booleans.
335       *
336       * XXX: In x86 the mask is controlled by the MSB, so if we shifted the
337       * mask by `type.width - 1`, LLVM should realize the mask is ready.  Alas
338       * what really happens is that LLVM will emit two shifts back to back.
339       */
340      if (0) {
341         LLVMValueRef shift = LLVMConstInt(bld->int_elem_type, bld->type.width - 1, 0);
342         shift = lp_build_broadcast(bld->gallivm, bld->int_vec_type, shift);
343         mask = LLVMBuildLShr(builder, mask, shift, "");
344      }
345      LLVMTypeRef bool_vec_type = LLVMVectorType(LLVMInt1TypeInContext(lc), type.length);
346      mask = LLVMBuildTrunc(builder, mask, bool_vec_type, "");
347
348      res = LLVMBuildSelect(builder, mask, a, b, "");
349   }
350   else if (((util_cpu_caps.has_sse4_1 &&
351              type.width * type.length == 128) ||
352             (util_cpu_caps.has_avx &&
353              type.width * type.length == 256 && type.width >= 32) ||
354             (util_cpu_caps.has_avx2 &&
355              type.width * type.length == 256)) &&
356            !LLVMIsConstant(a) &&
357            !LLVMIsConstant(b) &&
358            !LLVMIsConstant(mask)) {
359      const char *intrinsic;
360      LLVMTypeRef arg_type;
361      LLVMValueRef args[3];
362
363      /*
364       *  There's only float blend in AVX but can just cast i32/i64
365       *  to float.
366       */
367      if (type.width * type.length == 256) {
368         if (type.width == 64) {
369           intrinsic = "llvm.x86.avx.blendv.pd.256";
370           arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
371         }
372         else if (type.width == 32) {
373            intrinsic = "llvm.x86.avx.blendv.ps.256";
374            arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
375         } else {
376            assert(util_cpu_caps.has_avx2);
377            intrinsic = "llvm.x86.avx2.pblendvb";
378            arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
379         }
380      }
381      else if (type.floating &&
382               type.width == 64) {
383         intrinsic = "llvm.x86.sse41.blendvpd";
384         arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
385      } else if (type.floating &&
386                 type.width == 32) {
387         intrinsic = "llvm.x86.sse41.blendvps";
388         arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 4);
389      } else {
390         intrinsic = "llvm.x86.sse41.pblendvb";
391         arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 16);
392      }
393
394      if (arg_type != bld->int_vec_type) {
395         mask = LLVMBuildBitCast(builder, mask, arg_type, "");
396      }
397
398      if (arg_type != bld->vec_type) {
399         a = LLVMBuildBitCast(builder, a, arg_type, "");
400         b = LLVMBuildBitCast(builder, b, arg_type, "");
401      }
402
403      args[0] = b;
404      args[1] = a;
405      args[2] = mask;
406
407      res = lp_build_intrinsic(builder, intrinsic,
408                               arg_type, args, ARRAY_SIZE(args), 0);
409
410      if (arg_type != bld->vec_type) {
411         res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
412      }
413   }
414   else {
415      res = lp_build_select_bitwise(bld, mask, a, b);
416   }
417
418   return res;
419}
420
421
422/**
423 * Return mask ? a : b;
424 *
425 * mask is a TGSI_WRITEMASK_xxx.
426 */
427LLVMValueRef
428lp_build_select_aos(struct lp_build_context *bld,
429                    unsigned mask,
430                    LLVMValueRef a,
431                    LLVMValueRef b,
432                    unsigned num_channels)
433{
434   LLVMBuilderRef builder = bld->gallivm->builder;
435   const struct lp_type type = bld->type;
436   const unsigned n = type.length;
437   unsigned i, j;
438
439   assert((mask & ~0xf) == 0);
440   assert(lp_check_value(type, a));
441   assert(lp_check_value(type, b));
442
443   if(a == b)
444      return a;
445   if((mask & 0xf) == 0xf)
446      return a;
447   if((mask & 0xf) == 0x0)
448      return b;
449   if(a == bld->undef || b == bld->undef)
450      return bld->undef;
451
452   /*
453    * There are two major ways of accomplishing this:
454    * - with a shuffle
455    * - with a select
456    *
457    * The flip between these is empirical and might need to be adjusted.
458    */
459   if (n <= 4) {
460      /*
461       * Shuffle.
462       */
463      LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
464      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
465
466      for(j = 0; j < n; j += num_channels)
467         for(i = 0; i < num_channels; ++i)
468            shuffles[j + i] = LLVMConstInt(elem_type,
469                                           (mask & (1 << i) ? 0 : n) + j + i,
470                                           0);
471
472      return LLVMBuildShuffleVector(builder, a, b, LLVMConstVector(shuffles, n), "");
473   }
474   else {
475      LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type, mask, num_channels);
476      return lp_build_select(bld, mask_vec, a, b);
477   }
478}
479
480
481/**
482 * Return (scalar-cast)val ? true : false;
483 */
484LLVMValueRef
485lp_build_any_true_range(struct lp_build_context *bld,
486                        unsigned real_length,
487                        LLVMValueRef val)
488{
489   LLVMBuilderRef builder = bld->gallivm->builder;
490   LLVMTypeRef scalar_type;
491   LLVMTypeRef true_type;
492
493   assert(real_length <= bld->type.length);
494
495   true_type = LLVMIntTypeInContext(bld->gallivm->context,
496                                    bld->type.width * real_length);
497   scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
498                                      bld->type.width * bld->type.length);
499   val = LLVMBuildBitCast(builder, val, scalar_type, "");
500   /*
501    * We're using always native types so we can use intrinsics.
502    * However, if we don't do per-element calculations, we must ensure
503    * the excess elements aren't used since they may contain garbage.
504    */
505   if (real_length < bld->type.length) {
506      val = LLVMBuildTrunc(builder, val, true_type, "");
507   }
508   return LLVMBuildICmp(builder, LLVMIntNE,
509                        val, LLVMConstNull(true_type), "");
510}
511