1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * @file
30 * Helper functions for swizzling/shuffling.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 */
34
35#include <inttypes.h>  /* for PRIx64 macro */
36#include "util/compiler.h"
37#include "util/u_debug.h"
38
39#include "lp_bld_type.h"
40#include "lp_bld_const.h"
41#include "lp_bld_init.h"
42#include "lp_bld_logic.h"
43#include "lp_bld_swizzle.h"
44#include "lp_bld_pack.h"
45
46
47LLVMValueRef
48lp_build_broadcast(struct gallivm_state *gallivm,
49                   LLVMTypeRef vec_type,
50                   LLVMValueRef scalar)
51{
52   LLVMValueRef res;
53
54   if (LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind) {
55      /* scalar */
56      assert(vec_type == LLVMTypeOf(scalar));
57      res = scalar;
58   } else {
59      LLVMBuilderRef builder = gallivm->builder;
60      const unsigned length = LLVMGetVectorSize(vec_type);
61      LLVMValueRef undef = LLVMGetUndef(vec_type);
62      /* The shuffle vector is always made of int32 elements */
63      LLVMTypeRef i32_type = LLVMInt32TypeInContext(gallivm->context);
64      LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length);
65
66      assert(LLVMGetElementType(vec_type) == LLVMTypeOf(scalar));
67
68      res = LLVMBuildInsertElement(builder, undef, scalar, LLVMConstNull(i32_type), "");
69      res = LLVMBuildShuffleVector(builder, res, undef, LLVMConstNull(i32_vec_type), "");
70   }
71
72   return res;
73}
74
75
76/**
77 * Broadcast
78 */
79LLVMValueRef
80lp_build_broadcast_scalar(struct lp_build_context *bld,
81                          LLVMValueRef scalar)
82{
83   assert(lp_check_elem_type(bld->type, LLVMTypeOf(scalar)));
84
85   return lp_build_broadcast(bld->gallivm, bld->vec_type, scalar);
86}
87
88
89/**
90 * Combined extract and broadcast (mere shuffle in most cases)
91 */
92LLVMValueRef
93lp_build_extract_broadcast(struct gallivm_state *gallivm,
94                           struct lp_type src_type,
95                           struct lp_type dst_type,
96                           LLVMValueRef vector,
97                           LLVMValueRef index)
98{
99   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
100   LLVMValueRef res;
101
102   assert(src_type.floating == dst_type.floating);
103   assert(src_type.width    == dst_type.width);
104
105   assert(lp_check_value(src_type, vector));
106   assert(LLVMTypeOf(index) == i32t);
107
108   if (src_type.length == 1) {
109      if (dst_type.length == 1) {
110         /*
111          * Trivial scalar -> scalar.
112          */
113
114         res = vector;
115      }
116      else {
117         /*
118          * Broadcast scalar -> vector.
119          */
120
121         res = lp_build_broadcast(gallivm,
122                                  lp_build_vec_type(gallivm, dst_type),
123                                  vector);
124      }
125   }
126   else {
127      if (dst_type.length > 1) {
128         /*
129          * shuffle - result can be of different length.
130          */
131
132         LLVMValueRef shuffle;
133         shuffle = lp_build_broadcast(gallivm,
134                                      LLVMVectorType(i32t, dst_type.length),
135                                      index);
136         res = LLVMBuildShuffleVector(gallivm->builder, vector,
137                                      LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
138                                      shuffle, "");
139      }
140      else {
141         /*
142          * Trivial extract scalar from vector.
143          */
144          res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
145      }
146   }
147
148   return res;
149}
150
151
152/**
153 * Swizzle one channel into other channels.
154 */
155LLVMValueRef
156lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
157                            LLVMValueRef a,
158                            unsigned channel,
159                            unsigned num_channels)
160{
161   LLVMBuilderRef builder = bld->gallivm->builder;
162   const struct lp_type type = bld->type;
163   const unsigned n = type.length;
164   unsigned i, j;
165
166   if(a == bld->undef || a == bld->zero || a == bld->one || num_channels == 1)
167      return a;
168
169   assert(num_channels == 2 || num_channels == 4);
170
171   /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
172    * using shuffles here actually causes worst results. More investigation is
173    * needed. */
174   if (LLVMIsConstant(a) ||
175       type.width >= 16) {
176      /*
177       * Shuffle.
178       */
179      LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
180      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
181
182      for(j = 0; j < n; j += num_channels)
183         for(i = 0; i < num_channels; ++i)
184            shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0);
185
186      return LLVMBuildShuffleVector(builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
187   }
188   else if (num_channels == 2) {
189      /*
190       * Bit mask and shifts
191       *
192       *   XY XY .... XY  <= input
193       *   0Y 0Y .... 0Y
194       *   YY YY .... YY
195       *   YY YY .... YY  <= output
196       */
197      struct lp_type type2;
198      LLVMValueRef tmp = NULL;
199      int shift;
200
201      a = LLVMBuildAnd(builder, a,
202                       lp_build_const_mask_aos(bld->gallivm,
203                                               type, 1 << channel, num_channels), "");
204
205      type2 = type;
206      type2.floating = FALSE;
207      type2.width *= 2;
208      type2.length /= 2;
209
210      a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type2), "");
211
212      /*
213       * Vector element 0 is always channel X.
214       *
215       *                        76 54 32 10 (array numbering)
216       * Little endian reg in:  YX YX YX YX
217       * Little endian reg out: YY YY YY YY if shift right (shift == -1)
218       *                        XX XX XX XX if shift left (shift == 1)
219       *
220       *                        01 23 45 67 (array numbering)
221       * Big endian reg in:     XY XY XY XY
222       * Big endian reg out:    YY YY YY YY if shift left (shift == 1)
223       *                        XX XX XX XX if shift right (shift == -1)
224       *
225       */
226#if UTIL_ARCH_LITTLE_ENDIAN
227      shift = channel == 0 ? 1 : -1;
228#else
229      shift = channel == 0 ? -1 : 1;
230#endif
231
232      if (shift > 0) {
233         tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type2, shift * type.width), "");
234      } else if (shift < 0) {
235         tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type2, -shift * type.width), "");
236      }
237
238      assert(tmp);
239      if (tmp) {
240         a = LLVMBuildOr(builder, a, tmp, "");
241      }
242
243      return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), "");
244   }
245   else {
246      /*
247       * Bit mask and recursive shifts
248       *
249       * Little-endian registers:
250       *
251       *   7654 3210
252       *   WZYX WZYX .... WZYX  <= input
253       *   00Y0 00Y0 .... 00Y0  <= mask
254       *   00YY 00YY .... 00YY  <= shift right 1 (shift amount -1)
255       *   YYYY YYYY .... YYYY  <= shift left 2 (shift amount 2)
256       *
257       * Big-endian registers:
258       *
259       *   0123 4567
260       *   XYZW XYZW .... XYZW  <= input
261       *   0Y00 0Y00 .... 0Y00  <= mask
262       *   YY00 YY00 .... YY00  <= shift left 1 (shift amount 1)
263       *   YYYY YYYY .... YYYY  <= shift right 2 (shift amount -2)
264       *
265       * shifts[] gives little-endian shift amounts; we need to negate for big-endian.
266       */
267      struct lp_type type4;
268      const int shifts[4][2] = {
269         { 1,  2},
270         {-1,  2},
271         { 1, -2},
272         {-1, -2}
273      };
274      unsigned i;
275
276      a = LLVMBuildAnd(builder, a,
277                       lp_build_const_mask_aos(bld->gallivm,
278                                               type, 1 << channel, 4), "");
279
280      /*
281       * Build a type where each element is an integer that cover the four
282       * channels.
283       */
284
285      type4 = type;
286      type4.floating = FALSE;
287      type4.width *= 4;
288      type4.length /= 4;
289
290      a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type4), "");
291
292      for(i = 0; i < 2; ++i) {
293         LLVMValueRef tmp = NULL;
294         int shift = shifts[channel][i];
295
296         /* See endianness diagram above */
297#if UTIL_ARCH_BIG_ENDIAN
298         shift = -shift;
299#endif
300
301         if(shift > 0)
302            tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
303         if(shift < 0)
304            tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
305
306         assert(tmp);
307         if(tmp)
308            a = LLVMBuildOr(builder, a, tmp, "");
309      }
310
311      return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), "");
312   }
313}
314
315
316/**
317 * Swizzle a vector consisting of an array of XYZW structs.
318 *
319 * This fills a vector of dst_len length with the swizzled channels from src.
320 *
321 * e.g. with swizzles = { 2, 1, 0 } and swizzle_count = 6 results in
322 *      RGBA RGBA = BGR BGR BG
323 *
324 * @param swizzles        the swizzle array
325 * @param num_swizzles    the number of elements in swizzles
326 * @param dst_len         the length of the result
327 */
328LLVMValueRef
329lp_build_swizzle_aos_n(struct gallivm_state* gallivm,
330                       LLVMValueRef src,
331                       const unsigned char* swizzles,
332                       unsigned num_swizzles,
333                       unsigned dst_len)
334{
335   LLVMBuilderRef builder = gallivm->builder;
336   LLVMValueRef shuffles[LP_MAX_VECTOR_WIDTH];
337   unsigned i;
338
339   assert(dst_len < LP_MAX_VECTOR_WIDTH);
340
341   for (i = 0; i < dst_len; ++i) {
342      int swizzle = swizzles[i % num_swizzles];
343
344      if (swizzle == LP_BLD_SWIZZLE_DONTCARE) {
345         shuffles[i] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
346      } else {
347         shuffles[i] = lp_build_const_int32(gallivm, swizzle);
348      }
349   }
350
351   return LLVMBuildShuffleVector(builder, src, LLVMGetUndef(LLVMTypeOf(src)), LLVMConstVector(shuffles, dst_len), "");
352}
353
354
355LLVMValueRef
356lp_build_swizzle_aos(struct lp_build_context *bld,
357                     LLVMValueRef a,
358                     const unsigned char swizzles[4])
359{
360   LLVMBuilderRef builder = bld->gallivm->builder;
361   const struct lp_type type = bld->type;
362   const unsigned n = type.length;
363   unsigned i, j;
364
365   if (swizzles[0] == PIPE_SWIZZLE_X &&
366       swizzles[1] == PIPE_SWIZZLE_Y &&
367       swizzles[2] == PIPE_SWIZZLE_Z &&
368       swizzles[3] == PIPE_SWIZZLE_W) {
369      return a;
370   }
371
372   if (swizzles[0] == swizzles[1] &&
373       swizzles[1] == swizzles[2] &&
374       swizzles[2] == swizzles[3]) {
375      switch (swizzles[0]) {
376      case PIPE_SWIZZLE_X:
377      case PIPE_SWIZZLE_Y:
378      case PIPE_SWIZZLE_Z:
379      case PIPE_SWIZZLE_W:
380         return lp_build_swizzle_scalar_aos(bld, a, swizzles[0], 4);
381      case PIPE_SWIZZLE_0:
382         return bld->zero;
383      case PIPE_SWIZZLE_1:
384         return bld->one;
385      case LP_BLD_SWIZZLE_DONTCARE:
386         return bld->undef;
387      default:
388         assert(0);
389         return bld->undef;
390      }
391   }
392
393   if (LLVMIsConstant(a) ||
394       type.width >= 16) {
395      /*
396       * Shuffle.
397       */
398      LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(bld->gallivm, type));
399      LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
400      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
401      LLVMValueRef aux[LP_MAX_VECTOR_LENGTH];
402
403      memset(aux, 0, sizeof aux);
404
405      for(j = 0; j < n; j += 4) {
406         for(i = 0; i < 4; ++i) {
407            unsigned shuffle;
408            switch (swizzles[i]) {
409            default:
410               assert(0);
411#if defined(NDEBUG) || defined(DEBUG)
412               FALLTHROUGH;
413#endif
414            case PIPE_SWIZZLE_X:
415            case PIPE_SWIZZLE_Y:
416            case PIPE_SWIZZLE_Z:
417            case PIPE_SWIZZLE_W:
418               shuffle = j + swizzles[i];
419               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
420               break;
421            case PIPE_SWIZZLE_0:
422               shuffle = type.length + 0;
423               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
424               if (!aux[0]) {
425                  aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
426               }
427               break;
428            case PIPE_SWIZZLE_1:
429               shuffle = type.length + 1;
430               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
431               if (!aux[1]) {
432                  aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
433               }
434               break;
435            case LP_BLD_SWIZZLE_DONTCARE:
436               shuffles[j + i] = LLVMGetUndef(i32t);
437               break;
438            }
439         }
440      }
441
442      for (i = 0; i < n; ++i) {
443         if (!aux[i]) {
444            aux[i] = undef;
445         }
446      }
447
448      return LLVMBuildShuffleVector(builder, a,
449                                    LLVMConstVector(aux, n),
450                                    LLVMConstVector(shuffles, n), "");
451   } else {
452      /*
453       * Bit mask and shifts.
454       *
455       * For example, this will convert BGRA to RGBA by doing
456       *
457       * Little endian:
458       *   rgba = (bgra & 0x00ff0000) >> 16
459       *        | (bgra & 0xff00ff00)
460       *        | (bgra & 0x000000ff) << 16
461       *
462       * Big endian:A
463       *   rgba = (bgra & 0x0000ff00) << 16
464       *        | (bgra & 0x00ff00ff)
465       *        | (bgra & 0xff000000) >> 16
466       *
467       * This is necessary not only for faster cause, but because X86 backend
468       * will refuse shuffles of <4 x i8> vectors
469       */
470      LLVMValueRef res;
471      struct lp_type type4;
472      unsigned cond = 0;
473      int chan;
474      int shift;
475
476      /*
477       * Start with a mixture of 1 and 0.
478       */
479      for (chan = 0; chan < 4; ++chan) {
480         if (swizzles[chan] == PIPE_SWIZZLE_1) {
481            cond |= 1 << chan;
482         }
483      }
484      res = lp_build_select_aos(bld, cond, bld->one, bld->zero, 4);
485
486      /*
487       * Build a type where each element is an integer that cover the four
488       * channels.
489       */
490      type4 = type;
491      type4.floating = FALSE;
492      type4.width *= 4;
493      type4.length /= 4;
494
495      a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type4), "");
496      res = LLVMBuildBitCast(builder, res, lp_build_vec_type(bld->gallivm, type4), "");
497
498      /*
499       * Mask and shift the channels, trying to group as many channels in the
500       * same shift as possible.  The shift amount is positive for shifts left
501       * and negative for shifts right.
502       */
503      for (shift = -3; shift <= 3; ++shift) {
504         uint64_t mask = 0;
505
506         assert(type4.width <= sizeof(mask)*8);
507
508         /*
509          * Vector element numbers follow the XYZW order, so 0 is always X, etc.
510          * After widening 4 times we have:
511          *
512          *                                3210
513          * Little-endian register layout: WZYX
514          *
515          *                                0123
516          * Big-endian register layout:    XYZW
517          *
518          * For little-endian, higher-numbered channels are obtained by a shift right
519          * (negative shift amount) and lower-numbered channels by a shift left
520          * (positive shift amount).  The opposite is true for big-endian.
521          */
522         for (chan = 0; chan < 4; ++chan) {
523            if (swizzles[chan] < 4) {
524               /* We need to move channel swizzles[chan] into channel chan */
525#if UTIL_ARCH_LITTLE_ENDIAN
526               if (swizzles[chan] - chan == -shift) {
527                  mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
528               }
529#else
530               if (swizzles[chan] - chan == shift) {
531                  mask |= ((1ULL << type.width) - 1) << (type4.width - type.width) >> (swizzles[chan] * type.width);
532               }
533#endif
534            }
535         }
536
537         if (mask) {
538            LLVMValueRef masked;
539            LLVMValueRef shifted;
540            if (0)
541               debug_printf("shift = %i, mask = %" PRIx64 "\n", shift, mask);
542
543            masked = LLVMBuildAnd(builder, a,
544                                  lp_build_const_int_vec(bld->gallivm, type4, mask), "");
545            if (shift > 0) {
546               shifted = LLVMBuildShl(builder, masked,
547                                      lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), "");
548            } else if (shift < 0) {
549               shifted = LLVMBuildLShr(builder, masked,
550                                       lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), "");
551            } else {
552               shifted = masked;
553            }
554
555            res = LLVMBuildOr(builder, res, shifted, "");
556         }
557      }
558
559      return LLVMBuildBitCast(builder, res,
560                              lp_build_vec_type(bld->gallivm, type), "");
561   }
562}
563
564
565/**
566 * Extended swizzle of a single channel of a SoA vector.
567 *
568 * @param bld         building context
569 * @param unswizzled  array with the 4 unswizzled values
570 * @param swizzle     one of the PIPE_SWIZZLE_*
571 *
572 * @return  the swizzled value.
573 */
574LLVMValueRef
575lp_build_swizzle_soa_channel(struct lp_build_context *bld,
576                             const LLVMValueRef *unswizzled,
577                             unsigned swizzle)
578{
579   switch (swizzle) {
580   case PIPE_SWIZZLE_X:
581   case PIPE_SWIZZLE_Y:
582   case PIPE_SWIZZLE_Z:
583   case PIPE_SWIZZLE_W:
584      return unswizzled[swizzle];
585   case PIPE_SWIZZLE_0:
586      return bld->zero;
587   case PIPE_SWIZZLE_1:
588      return bld->one;
589   default:
590      assert(0);
591      return bld->undef;
592   }
593}
594
595
596/**
597 * Extended swizzle of a SoA vector.
598 *
599 * @param bld         building context
600 * @param unswizzled  array with the 4 unswizzled values
601 * @param swizzles    array of PIPE_SWIZZLE_*
602 * @param swizzled    output swizzled values
603 */
604void
605lp_build_swizzle_soa(struct lp_build_context *bld,
606                     const LLVMValueRef *unswizzled,
607                     const unsigned char swizzles[4],
608                     LLVMValueRef *swizzled)
609{
610   unsigned chan;
611
612   for (chan = 0; chan < 4; ++chan) {
613      swizzled[chan] = lp_build_swizzle_soa_channel(bld, unswizzled,
614                                                    swizzles[chan]);
615   }
616}
617
618
619/**
620 * Do an extended swizzle of a SoA vector inplace.
621 *
622 * @param bld         building context
623 * @param values      intput/output array with the 4 values
624 * @param swizzles    array of PIPE_SWIZZLE_*
625 */
626void
627lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
628                             LLVMValueRef *values,
629                             const unsigned char swizzles[4])
630{
631   LLVMValueRef unswizzled[4];
632   unsigned chan;
633
634   for (chan = 0; chan < 4; ++chan) {
635      unswizzled[chan] = values[chan];
636   }
637
638   lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
639}
640
641
642/**
643 * Transpose from AOS <-> SOA
644 *
645 * @param single_type_lp   type of pixels
646 * @param src              the 4 * n pixel input
647 * @param dst              the 4 * n pixel output
648 */
649void
650lp_build_transpose_aos(struct gallivm_state *gallivm,
651                       struct lp_type single_type_lp,
652                       const LLVMValueRef src[4],
653                       LLVMValueRef dst[4])
654{
655   struct lp_type double_type_lp = single_type_lp;
656   LLVMTypeRef single_type;
657   LLVMTypeRef double_type;
658   LLVMValueRef t0 = NULL, t1 = NULL, t2 = NULL, t3 = NULL;
659
660   double_type_lp.length >>= 1;
661   double_type_lp.width  <<= 1;
662
663   double_type = lp_build_vec_type(gallivm, double_type_lp);
664   single_type = lp_build_vec_type(gallivm, single_type_lp);
665
666   LLVMValueRef double_type_zero = LLVMConstNull(double_type);
667   /* Interleave x, y, z, w -> xy and zw */
668   if (src[0] || src[1]) {
669      LLVMValueRef src0 = src[0];
670      LLVMValueRef src1 = src[1];
671      if (!src0)
672         src0 = LLVMConstNull(single_type);
673      if (!src1)
674         src1 = LLVMConstNull(single_type);
675      t0 = lp_build_interleave2_half(gallivm, single_type_lp, src0, src1, 0);
676      t2 = lp_build_interleave2_half(gallivm, single_type_lp, src0, src1, 1);
677
678      /* Cast to double width type for second interleave */
679      t0 = LLVMBuildBitCast(gallivm->builder, t0, double_type, "t0");
680      t2 = LLVMBuildBitCast(gallivm->builder, t2, double_type, "t2");
681   }
682   if (src[2] || src[3]) {
683      LLVMValueRef src2 = src[2];
684      LLVMValueRef src3 = src[3];
685      if (!src2)
686         src2 = LLVMConstNull(single_type);
687      if (!src3)
688         src3 = LLVMConstNull(single_type);
689      t1 = lp_build_interleave2_half(gallivm, single_type_lp, src2, src3, 0);
690      t3 = lp_build_interleave2_half(gallivm, single_type_lp, src2, src3, 1);
691
692      /* Cast to double width type for second interleave */
693      t1 = LLVMBuildBitCast(gallivm->builder, t1, double_type, "t1");
694      t3 = LLVMBuildBitCast(gallivm->builder, t3, double_type, "t3");
695   }
696
697   if (!t0)
698      t0 = double_type_zero;
699   if (!t1)
700      t1 = double_type_zero;
701   if (!t2)
702      t2 = double_type_zero;
703   if (!t3)
704      t3 = double_type_zero;
705
706   /* Interleave xy, zw -> xyzw */
707   dst[0] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 0);
708   dst[1] = lp_build_interleave2_half(gallivm, double_type_lp, t0, t1, 1);
709   dst[2] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 0);
710   dst[3] = lp_build_interleave2_half(gallivm, double_type_lp, t2, t3, 1);
711
712   /* Cast back to original single width type */
713   dst[0] = LLVMBuildBitCast(gallivm->builder, dst[0], single_type, "dst0");
714   dst[1] = LLVMBuildBitCast(gallivm->builder, dst[1], single_type, "dst1");
715   dst[2] = LLVMBuildBitCast(gallivm->builder, dst[2], single_type, "dst2");
716   dst[3] = LLVMBuildBitCast(gallivm->builder, dst[3], single_type, "dst3");
717}
718
719
720/**
721 * Transpose from AOS <-> SOA for num_srcs
722 */
723void
724lp_build_transpose_aos_n(struct gallivm_state *gallivm,
725                         struct lp_type type,
726                         const LLVMValueRef* src,
727                         unsigned num_srcs,
728                         LLVMValueRef* dst)
729{
730   switch (num_srcs) {
731      case 1:
732         dst[0] = src[0];
733         break;
734
735      case 2:
736      {
737         /* Note: we must use a temporary incase src == dst */
738         LLVMValueRef lo, hi;
739
740         lo = lp_build_interleave2_half(gallivm, type, src[0], src[1], 0);
741         hi = lp_build_interleave2_half(gallivm, type, src[0], src[1], 1);
742
743         dst[0] = lo;
744         dst[1] = hi;
745         break;
746      }
747
748      case 4:
749         lp_build_transpose_aos(gallivm, type, src, dst);
750         break;
751
752      default:
753         assert(0);
754   }
755}
756
757
758/**
759 * Pack n-th element of aos values,
760 * pad out to destination size.
761 * i.e. x1 y1 _ _ x2 y2 _ _ will become x1 x2 _ _
762 */
763LLVMValueRef
764lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
765                          struct lp_type src_type,
766                          struct lp_type dst_type,
767                          const LLVMValueRef src,
768                          unsigned channel)
769{
770   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
771   LLVMValueRef undef = LLVMGetUndef(i32t);
772   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
773   unsigned num_src = src_type.length / 4;
774   unsigned num_dst = dst_type.length;
775   unsigned i;
776
777   assert(num_src <= num_dst);
778
779   for (i = 0; i < num_src; i++) {
780      shuffles[i] = LLVMConstInt(i32t, i * 4 + channel, 0);
781   }
782   for (i = num_src; i < num_dst; i++) {
783      shuffles[i] = undef;
784   }
785
786   if (num_dst == 1) {
787      return LLVMBuildExtractElement(gallivm->builder, src, shuffles[0], "");
788   }
789   else {
790      return LLVMBuildShuffleVector(gallivm->builder, src, src,
791                                    LLVMConstVector(shuffles, num_dst), "");
792   }
793}
794
795
796/**
797 * Unpack and broadcast packed aos values consisting of only the
798 * first value, i.e. x1 x2 _ _ will become x1 x1 x1 x1 x2 x2 x2 x2
799 */
800LLVMValueRef
801lp_build_unpack_broadcast_aos_scalars(struct gallivm_state *gallivm,
802                                      struct lp_type src_type,
803                                      struct lp_type dst_type,
804                                      const LLVMValueRef src)
805{
806   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
807   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
808   unsigned num_dst = dst_type.length;
809   unsigned num_src = dst_type.length / 4;
810   unsigned i;
811
812   assert(num_dst / 4 <= src_type.length);
813
814   for (i = 0; i < num_src; i++) {
815      shuffles[i*4] = LLVMConstInt(i32t, i, 0);
816      shuffles[i*4+1] = LLVMConstInt(i32t, i, 0);
817      shuffles[i*4+2] = LLVMConstInt(i32t, i, 0);
818      shuffles[i*4+3] = LLVMConstInt(i32t, i, 0);
819   }
820
821   if (num_src == 1) {
822      return lp_build_extract_broadcast(gallivm, src_type, dst_type,
823                                        src, shuffles[0]);
824   }
825   else {
826      return LLVMBuildShuffleVector(gallivm->builder, src, src,
827                                    LLVMConstVector(shuffles, num_dst), "");
828   }
829}
830
831