1/****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file builder_misc.cpp
24 *
25 * @brief Implementation for miscellaneous builder functions
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30#include "jit_pch.hpp"
31#include "builder.h"
32
33#include <cstdarg>
34
35namespace SwrJit
36{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Validate a pointer handed to a direct memory operation.
    ///        Raw GFX virtual addresses are carried around as i64 values and
    ///        must be translated through BuilderGfxMem before use here.
    /// @param ptr   - pointer value about to be loaded/stored/gathered
    /// @param usage - memory client tag (currently unused by this check)
    void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access.  Requires translation through BuilderGfxMem.");
    }
43
    //////////////////////////////////////////////////////////////////////////
    /// @brief Single-index GEP wrapper (legacy untyped form).
    ///        NOTE(review): Ty is accepted for interface symmetry with the
    ///        typed overload below but is not forwarded to CreateGEP here.
    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }
48
    //////////////////////////////////////////////////////////////////////////
    /// @brief Single-index GEP wrapper with explicit pointee type (required
    ///        once opaque pointers are in use).
    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }
53
54    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
55    {
56        std::vector<Value*> indices;
57        for (auto i : indexList)
58            indices.push_back(i);
59        return GEPA(ptr, indices);
60    }
61
62    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
63    {
64        std::vector<Value*> indices;
65        for (auto i : indexList)
66            indices.push_back(C(i));
67        return GEPA(ptr, indices);
68    }
69
    //////////////////////////////////////////////////////////////////////////
    /// @brief Multi-index GEP wrapper (legacy untyped form).
    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }
74
    //////////////////////////////////////////////////////////////////////////
    /// @brief Multi-index GEP wrapper with explicit pointee type.
    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }
79
80    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
81    {
82        std::vector<Value*> indices;
83        for (auto i : indexList)
84            indices.push_back(i);
85        return IN_BOUNDS_GEP(ptr, indices);
86    }
87
88    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
89    {
90        std::vector<Value*> indices;
91        for (auto i : indexList)
92            indices.push_back(C(i));
93        return IN_BOUNDS_GEP(ptr, indices);
94    }
95
    //////////////////////////////////////////////////////////////////////////
    /// @brief Load wrapper (C-string name). Asserts the pointer is not a raw
    ///        GFX address before emitting the load.
    ///        NOTE(review): Ty is not forwarded in this legacy overload.
    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }
101
    //////////////////////////////////////////////////////////////////////////
    /// @brief Load wrapper (Twine name). Asserts the pointer is not a raw
    ///        GFX address before emitting the load.
    ///        NOTE(review): Ty is not forwarded in this legacy overload.
    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }
107
    //////////////////////////////////////////////////////////////////////////
    /// @brief Load wrapper with explicit result type (opaque-pointer safe).
    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }
113
    //////////////////////////////////////////////////////////////////////////
    /// @brief Load wrapper with volatile flag.
    ///        NOTE(review): Ty is not forwarded in this legacy overload.
    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }
120
121    LoadInst* Builder::LOAD(Value*                                 basePtr,
122                            const std::initializer_list<uint32_t>& indices,
123                            const llvm::Twine&                     name,
124                            Type*                                  Ty,
125                            JIT_MEM_CLIENT                         usage)
126    {
127        std::vector<Value*> valIndices;
128        for (auto i : indices)
129            valIndices.push_back(C(i));
130        return Builder::LOAD(GEPA(basePtr, valIndices), name);
131    }
132
133    LoadInst* Builder::LOADV(Value*                               basePtr,
134                             const std::initializer_list<Value*>& indices,
135                             const llvm::Twine&                   name)
136    {
137        std::vector<Value*> valIndices;
138        for (auto i : indices)
139            valIndices.push_back(i);
140        return LOAD(GEPA(basePtr, valIndices), name);
141    }
142
143    StoreInst*
144    Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices, Type* Ty, JIT_MEM_CLIENT usage)
145    {
146        std::vector<Value*> valIndices;
147        for (auto i : indices)
148            valIndices.push_back(C(i));
149        return STORE(val, GEPA(basePtr, valIndices));
150    }
151
152    StoreInst*
153    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
154    {
155        std::vector<Value*> valIndices;
156        for (auto i : indices)
157            valIndices.push_back(i);
158        return STORE(val, GEPA(basePtr, valIndices));
159    }
160
    //////////////////////////////////////////////////////////////////////////
    /// @brief Advance a base pointer by a constant byte/element offset to
    ///        reach the next component to gather (plain GEP on the base).
    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }
165
166    Value* Builder::MEM_ADD(Value*                                 i32Incr,
167                            Value*                                 basePtr,
168                            const std::initializer_list<uint32_t>& indices,
169                            const llvm::Twine&                     name)
170    {
171        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
172        Value* i32Result = ADD(i32Value, i32Incr);
173        return STORE(i32Result, GEP(basePtr, indices));
174    }
175
176    //////////////////////////////////////////////////////////////////////////
177    /// @brief Generate a masked gather operation in LLVM IR.  If not
178    /// supported on the underlying platform, emulate it with loads
179    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
180    /// @param pBase - Int8* base VB address pointer value
181    /// @param vIndices - SIMD wide value of VB byte offsets
182    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
183    /// @param scale - value to scale indices by
184    Value* Builder::GATHERPS(Value*         vSrc,
185                             Value*         pBase,
186                             Value*         vIndices,
187                             Value*         vMask,
188                             uint8_t        scale,
189                             JIT_MEM_CLIENT usage)
190    {
191        AssertMemoryUsageParams(pBase, usage);
192
193        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
194    }
195
196    //////////////////////////////////////////////////////////////////////////
197    /// @brief Generate a masked gather operation in LLVM IR.  If not
198    /// supported on the underlying platform, emulate it with loads
199    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
200    /// @param pBase - Int8* base VB address pointer value
201    /// @param vIndices - SIMD wide value of VB byte offsets
202    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
203    /// @param scale - value to scale indices by
204    Value* Builder::GATHERDD(Value*         vSrc,
205                             Value*         pBase,
206                             Value*         vIndices,
207                             Value*         vMask,
208                             uint8_t        scale,
209                             JIT_MEM_CLIENT usage)
210    {
211        AssertMemoryUsageParams(pBase, usage);
212
213        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
214    }
215
216    //////////////////////////////////////////////////////////////////////////
217    /// @brief Generate a masked gather operation in LLVM IR.  If not
218    /// supported on the underlying platform, emulate it with loads
219    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
220    /// @param pBase - Int8* base VB address pointer value
221    /// @param vIndices - SIMD wide value of VB byte offsets
222    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
223    /// @param scale - value to scale indices by
224    Value*
225    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
226    {
227        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
228    }
229
230    //////////////////////////////////////////////////////////////////////////
231    /// @brief Alternative masked gather where source is a vector of pointers
232    /// @param pVecSrcPtr   - SIMD wide vector of pointers
233    /// @param pVecMask     - SIMD active lanes
234    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
235    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
236    {
237        return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
238    }
239
240    void Builder::Gather4(const SWR_FORMAT format,
241                          Value*           pSrcBase,
242                          Value*           byteOffsets,
243                          Value*           mask,
244                          Value*           vGatherComponents[],
245                          bool             bPackedOutput,
246                          JIT_MEM_CLIENT   usage)
247    {
248        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
249        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
250        {
251            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
252        }
253        else
254        {
255            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
256        }
257    }
258
    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather up to 4 float components per lane for 16bpc or 32bpc
    ///        float formats.  16bpc components are fetched as paired 32-bit
    ///        gathers and then shuffled into per-component rows.
    /// @param info - format description (bpp, component count, swizzle, defaults)
    /// @param pSrcBase - base address of the element data
    /// @param byteOffsets - SIMD vector of per-lane byte offsets
    /// @param vMask - SIMD lane mask controlling memory access
    /// @param vGatherComponents - output, one SIMD register per component
    /// @param bPackedOutput - keep components packed in 128-bit lanes
    /// @param usage - memory client tag, forwarded to the gathers
    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            JIT_MEM_CLIENT         usage)
    {
        // Dispatch on bits-per-component.
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal - masked-off lanes currently get 0.0f
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                // no z/w components; second half is just the masked default
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults so components beyond numComps are well-defined
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            // one 32-bit gather per present component, swizzled into place
            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }
332
    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather up to 4 integer components per lane for 8bpc, 16bpc or
    ///        32bpc formats.  Sub-32-bit components are fetched as 32-bit
    ///        gathers and then shuffled into per-component rows.
    /// @param info - format description (bpp, component count, swizzle, defaults)
    /// @param pSrcBase - base address of the element data
    /// @param byteOffsets - SIMD vector of per-lane byte offsets
    /// @param vMask - SIMD lane mask controlling memory access
    /// @param vGatherComponents - output, one SIMD register per component
    /// @param bPackedOutput - keep components packed in 128-bit lanes
    /// @param usage - memory client tag, forwarded to the gathers
    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            JIT_MEM_CLIENT         usage)
    {
        // Dispatch on bits-per-component.
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            // masked-off lanes get 0; one gather grabs all 4 packed bytes
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal - masked-off lanes currently get 0
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components(zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                // no z/w components; second half is just the masked default
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults so components beyond numComps are well-defined
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            // one 32-bit gather per present component, swizzled into place
            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }
418
    //////////////////////////////////////////////////////////////////////////
    /// @brief Rearrange two 32-bit gather results holding interleaved 16bpc
    ///        components (xy pairs in [0], zw pairs in [1]) into one SIMD
    ///        register per component, honoring the format swizzle/defaults.
    /// @param info - format description (numComps, swizzle, defaults)
    /// @param vGatherInput  - [0]=xyxy... gather, [1]=zwzw... gather (or default)
    /// @param vGatherOutput - out: 4 registers, one per (swizzled) component
    /// @param bPackedOutput - true: components packed per 128-bit lane;
    ///                        false: one zero-extended component per 32-bit lane
    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value*                 vGatherInput[2],
                                      Value*                 vGatherOutput[4],
                                      bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                              mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask: gathers the low then high 16-bit halves of each
            // dword within each 128-bit lane
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fixed for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask: low 16 bits of each dword, zero-fill the rest
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask: high 16 bits of each dword, zero-fill the rest
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // x/y come from the first gather, z/w from the second
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }
523
    //////////////////////////////////////////////////////////////////////////
    /// @brief Rearrange one 32-bit gather result holding packed 8bpc xyzw
    ///        components into one SIMD register per component, honoring the
    ///        format swizzle/defaults.
    /// @param info - format description (numComps, swizzle, defaults)
    /// @param vGatherInput  - gather result: one xyzw byte quad per 32-bit lane
    /// @param vGatherOutput - out: 4 registers, one per (swizzled) component
    /// @param bPackedOutput - true: components packed per 128-bit lane;
    ///                        false: one zero-extended component per 32-bit lane
    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value*                 vGatherInput,
                                     Value*                 vGatherOutput[],
                                     bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                           mVWidth / 4); // vwidth is units of 32 bits
                                                         // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // sign extend all enabled components. If we have a fill vVertexElements, output to
            // current simdvertex
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // sign extend
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component: pick byte i of each dword,
                // zero-fill (-1) the remaining three bytes
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    // unreachable: loop bound is numComps <= 4
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }
638
639    //////////////////////////////////////////////////////////////////////////
640    /// @brief emulates a scatter operation.
641    /// @param pDst - pointer to destination
642    /// @param vSrc - vector of src data to scatter
643    /// @param vOffsets - vector of byte offsets from pDst
644    /// @param vMask - mask of valid lanes
645    void Builder::SCATTERPS(
646        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage)
647    {
648        AssertMemoryUsageParams(pDst, usage);
649
650        /* Scatter algorithm
651
652        while(Index = BitScanForward(mask))
653        srcElem = srcVector[Index]
654        offsetElem = offsetVector[Index]
655        *(pDst + offsetElem) = srcElem
656        Update mask (&= ~(1<<Index)
657
658        */
659
660        BasicBlock* pCurBB = IRB()->GetInsertBlock();
661        Function*   pFunc  = pCurBB->getParent();
662        Type*       pSrcTy = vSrc->getType()->getVectorElementType();
663
664        // Store vectors on stack
665        if (pScatterStackSrc == nullptr)
666        {
667            // Save off stack allocations and reuse per scatter. Significantly reduces stack
668            // requirements for shaders with a lot of scatters.
669            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
670            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
671        }
672
673        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
674        Value* pOffsetsArrayPtr = pScatterStackOffsets;
675        STORE(vSrc, pSrcArrayPtr);
676        STORE(vOffsets, pOffsetsArrayPtr);
677
678        // Cast to pointers for random access
679        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
680        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
681
682        Value* pMask = VMOVMSK(vMask);
683
684        // Setup loop basic block
685        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
686
687        // compute first set bit
688        Value* pIndex = CTTZ(pMask, C(false));
689
690        Value* pIsUndef = ICMP_EQ(pIndex, C(32));
691
692        // Split current block or create new one if building inline
693        BasicBlock* pPostLoop;
694        if (pCurBB->getTerminator())
695        {
696            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
697
698            // Remove unconditional jump created by splitBasicBlock
699            pCurBB->getTerminator()->eraseFromParent();
700
701            // Add terminator to end of original block
702            IRB()->SetInsertPoint(pCurBB);
703
704            // Add conditional branch
705            COND_BR(pIsUndef, pPostLoop, pLoop);
706        }
707        else
708        {
709            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);
710
711            // Add conditional branch
712            COND_BR(pIsUndef, pPostLoop, pLoop);
713        }
714
715        // Add loop basic block contents
716        IRB()->SetInsertPoint(pLoop);
717        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
718        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);
719
720        pIndexPhi->addIncoming(pIndex, pCurBB);
721        pMaskPhi->addIncoming(pMask, pCurBB);
722
723        // Extract elements for this index
724        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
725        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});
726
727        // GEP to this offset in dst
728        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
729        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
730        STORE(pSrcElem, pCurDst);
731
732        // Update the mask
733        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
734
735        // Terminator
736        Value* pNewIndex = CTTZ(pNewMask, C(false));
737
738        pIsUndef = ICMP_EQ(pNewIndex, C(32));
739        COND_BR(pIsUndef, pPostLoop, pLoop);
740
741        // Update phi edges
742        pIndexPhi->addIncoming(pNewIndex, pLoop);
743        pMaskPhi->addIncoming(pNewMask, pLoop);
744
745        // Move builder to beginning of post loop
746        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
747    }
748} // namespace SwrJit
749