/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_misc.cpp
 *
 * @brief Implementation for miscellaneous builder functions
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
    void Builder::AssertMemoryUsageParams(Value* ptr, MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access.  Requires translation through BuilderGfxMem.");
    }
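
    // Note: several wrappers below take parameters (Ty, isReadOnly, usage) that
    // this base class never reads; they appear to exist so the BuilderGfxMem
    // subclass referenced in the assert above can override the same signatures
    // and translate GFX (64-bit virtual) addresses.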

    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, bool isReadOnly, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }
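
    // Context note: the overloads taking an explicit Ty anticipate LLVM's
    // opaque-pointer transition, where a load's result type can no longer be
    // derived from the pointer operand; the untyped forms remain for older
    // LLVM versions.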

    LoadInst* Builder::LOAD(Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name,
                            Type*                                  Ty,
                            MEM_CLIENT                             usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value*                               basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine&                   name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst* Builder::STORE(Value*                                 val,
                              Value*                                 basePtr,
                              const std::initializer_list<uint32_t>& indices,
                              Type*                                  Ty,
                              MEM_CLIENT                             usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }
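
    // Illustrative note: with the i8* bases used by the gathers below, this is
    // a plain byte-offset GEP, e.g.
    //   pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
    // advances the base pointer by 4 bytes, i.e. one 32-bit component.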

    Value* Builder::MEM_ADD(Value*                                 i32Incr,
                            Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }
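
    // Usage sketch (pStatsPtr and the indices are hypothetical): increment a
    // 32-bit counter nested in a struct. Note this emits a plain
    // load/add/store sequence, so the update is not atomic.
    //   MEM_ADD(C(1), pStatsPtr, {0, 1}, "counter");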

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value*     vSrc,
                             Value*     pBase,
                             Value*     vIndices,
                             Value*     vMask,
                             uint8_t    scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }
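
    // Per-lane semantics of the masked gather above, as an illustrative scalar
    // sketch (the actual lowering is whatever VGATHERPS emits):
    //   for (lane = 0; lane < simdWidth; ++lane)
    //       result[lane] = mask[lane] ? *(float*)(pBase + vIndices[lane] * scale)
    //                                 : vSrc[lane];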

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value*     vSrc,
                             Value*     pBase,
                             Value*     vIndices,
                             Value*     vMask,
                             uint8_t    scale,
                             MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR.  If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where source is a vector of pointers
    /// @param pVecSrcPtr   - SIMD wide vector of pointers
    /// @param pVecMask     - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, AlignType(4), pVecMask, pVecPassthru);
    }

    void Builder::SCATTER_PTR(Value* pVecDstPtr, Value* pVecSrc, Value* pVecMask)
    {
        MASKED_SCATTER(pVecSrc, pVecDstPtr, AlignType(4), pVecMask);
    }
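
    // Scalar sketch of the pointer-vector forms above (illustrative only):
    //   GATHER_PTR:  result[lane] = mask[lane] ? *pVecSrcPtr[lane] : pVecPassthru[lane];
    //   SCATTER_PTR: if (mask[lane]) { *pVecDstPtr[lane] = pVecSrc[lane]; }
    // Both assume 4-byte alignment of each lane's pointer (the AlignType(4) above).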

    void Builder::Gather4(const SWR_FORMAT format,
                          Value*           pSrcBase,
                          Value*           byteOffsets,
                          Value*           mask,
                          Value*           vGatherComponents[],
                          bool             bPackedOutput,
                          MEM_CLIENT       usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }
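
    // Dispatch note: only 32-bit float formats take the GATHER4PS path; all
    // other formats (unorm/snorm/uint/sint, and 16-bit floats, whose bits are
    // fetched as raw integers) go through GATHER4DD.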

    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            MEM_CLIENT             usage)
    {
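        // info.bpp / info.numComps gives bits per component (the formats handled
        // here have uniformly sized components); e.g. a four-component 64bpp
        // format has 16-bit components.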
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }

    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            MEM_CLIENT             usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }

    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value*                 vGatherInput[2],
                                      Value*                 vGatherOutput[4],
                                      bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                            mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
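            // Note: a pshufb control byte with the sign bit set (the -1 entries
            // above) zeroes the destination byte, which is what provides the
            // free zero-extension of each 16-bit component to 32 bits.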

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use the first gather result (xy), else the second (zw)
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }

    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value*                 vGatherInput,
                                     Value*                 vGatherOutput[],
                                     bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
                                         mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // extract each enabled component into its output slot; components the
            // format does not supply fall back to their defaults below
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
                // extract the packed component's 128-bit lane
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                assert(vConstMask && "Invalid info.numComps value");
                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Emits a masked scatter operation, lowered via VSCATTERPS. A
    /// scalar emulation loop is kept below (commented out) for reference.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);
#if LLVM_VERSION_MAJOR >= 11
        SWR_ASSERT(cast<VectorType>(vSrc->getType())->getElementType()->isFloatTy());
#else
        SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy());
#endif
        VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1));
        return;

        /* Scatter algorithm

        while (Index = BitScanForward(mask))
            srcElem = srcVector[Index]
            offsetElem = offsetVector[Index]
            *(pDst + offsetElem) = srcElem
            mask &= ~(1 << Index)

        */

        /*

        // Scalar reference implementation, kept for documentation

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function*   pFunc  = pCurBB->getParent();
        Type*       pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());

        */
    }
} // namespace SwrJit