/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file fetch_jit.cpp
 *
 * @brief Implementation of the fetch jitter
 *
 * Notes:
 *
 ******************************************************************************/
30#include "jit_pch.hpp"
31#include "builder_gfx_mem.h"
32#include "jit_api.h"
33#include "fetch_jit.h"
34#include "gen_state_llvm.h"
35#include "functionpasses/passes.h"
36
37//#define FETCH_DUMP_VERTEX 1
38using namespace llvm;
39using namespace SwrJit;
40
41bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
/// Conversion selector for fetched components. A value of this type is one of
/// the elements of the Shuffle8bpcArgs / Shuffle16bpcArgs tuples declared on
/// FetchJit.
enum ConversionType
{
    CONVERT_NONE       = 0,
    CONVERT_NORMALIZED = 1,
    CONVERT_USCALED    = 2,
    CONVERT_SSCALED    = 3,
    CONVERT_SFIXED     = 4,
};
51
52//////////////////////////////////////////////////////////////////////////
53/// Interface to Jitting a fetch shader
54//////////////////////////////////////////////////////////////////////////
55struct FetchJit : public BuilderGfxMem
56{
57    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr), mpFetchInfo(NULL) {}
58
59    Function* Create(const FETCH_COMPILE_STATE& fetchState);
60
61    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
62    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
63    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
64    template <typename T>
65    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
66
67    // package up Shuffle*bpcGatherd args into a tuple for convenience
68    typedef std::tuple<Value*&,
69                       Value*,
70                       const Instruction::CastOps,
71                       const ConversionType,
72                       uint32_t&,
73                       uint32_t&,
74                       const ComponentEnable,
75                       const ComponentControl (&)[4],
76                       Value* (&)[4],
77                       const uint32_t (&)[4]>
78        Shuffle8bpcArgs;
79
80    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
81    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
82
83    typedef std::tuple<Value* (&)[2],
84                       Value*,
85                       const Instruction::CastOps,
86                       const ConversionType,
87                       uint32_t&,
88                       uint32_t&,
89                       const ComponentEnable,
90                       const ComponentControl (&)[4],
91                       Value* (&)[4]>
92        Shuffle16bpcArgs;
93
94    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
95    void Shuffle16bpcGather(Shuffle16bpcArgs& args);
96
97    void StoreVertexElements(Value*         pVtxOut,
98                             const uint32_t outputElt,
99                             const uint32_t numEltsToStore,
100                             Value* (&vVertexElements)[4]);
101
102    Value* GenerateCompCtrlVector(const ComponentControl ctrl);
103
104    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
105                           Value*                     streams,
106                           Value*                     vIndices,
107                           Value*                     pVtxOut);
108
109    bool IsOddFormat(SWR_FORMAT format);
110    bool IsUniformFormat(SWR_FORMAT format);
111    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
112    void CreateGatherOddFormats(
113        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
114    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
115
116    Value* mpFetchInfo;
117};
118
119Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
120{
121    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
122    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
123
124    Function* fetch = Function::Create(
125        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
126    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
127
128    fetch->getParent()->setModuleIdentifier(fetch->getName());
129
130    IRB()->SetInsertPoint(entry);
131
132    auto argitr = fetch->arg_begin();
133
134    // Fetch shader arguments
135    Value* privateContext = &*argitr;
136    ++argitr;
137    privateContext->setName("privateContext");
138    SetPrivateContext(privateContext);
139
140    mpWorkerData = &*argitr;
141    ++argitr;
142    mpWorkerData->setName("pWorkerData");
143
144    mpFetchInfo = &*argitr;
145    ++argitr;
146    mpFetchInfo->setName("fetchInfo");
147    Value* pVtxOut = &*argitr;
148    pVtxOut->setName("vtxOutput");
149
150    uint32_t baseWidth = mVWidth;
151
152    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);
153
154    // Override builder target width to force 16-wide SIMD
155#if USE_SIMD16_SHADERS
156    SetTargetWidth(16);
157#endif
158
159    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
160
161    // SWR_FETCH_CONTEXT::pStreams
162    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
163    streams->setName("pStreams");
164
165    // SWR_FETCH_CONTEXT::pIndices
166    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
167    indices->setName("pIndices");
168
169    // SWR_FETCH_CONTEXT::pLastIndex
170    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
171    pLastIndex->setName("pLastIndex");
172
173    Value* vIndices;
174    switch (fetchState.indexType)
175    {
176    case R8_UINT:
177        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
178        if (fetchState.bDisableIndexOOBCheck)
179        {
180            vIndices = LOAD(
181                BITCAST(indices, PointerType::get(getVectorType(mInt8Ty, mpJitMgr->mVWidth), 0)),
182                {(uint32_t)0});
183            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
184        }
185        else
186        {
187            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
188        }
189        break;
190    case R16_UINT:
191        if (fetchState.bDisableIndexOOBCheck)
192        {
193            vIndices = LOAD(
194                BITCAST(indices, PointerType::get(getVectorType(mInt16Ty, mpJitMgr->mVWidth), 0)),
195                {(uint32_t)0});
196            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
197        }
198        else
199        {
200            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
201        }
202        break;
203    case R32_UINT:
204        (fetchState.bDisableIndexOOBCheck)
205            ? vIndices = LOAD(indices,
206                              "",
207                              PointerType::get(mSimdInt32Ty, 0),
208                              MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
209            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
210        break; // incoming type is already 32bit int
211    default:
212        vIndices = nullptr;
213        assert(false && "Unsupported index type");
214        break;
215    }
216
217    if (fetchState.bForceSequentialAccessEnable)
218    {
219        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
220                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
221
222        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
223        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
224        vIndices = ADD(vIndices, pOffsets);
225    }
226
227    Value* vVertexId = vIndices;
228    if (fetchState.bVertexIDOffsetEnable)
229    {
230        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
231        // correct
232        Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
233        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
234        vVertexId           = ADD(vIndices, vBaseVertex);
235        vVertexId           = ADD(vVertexId, vStartVertex);
236    }
237
238    // store out vertex IDs
239    if (mVWidth == 16)
240    {
241        // store out in simd8 halves until core supports 16-wide natively
242        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
243        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
244        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
245        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
246    }
247    else if (mVWidth == 8)
248    {
249        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
250    }
251
252    // store out cut mask if enabled
253    if (fetchState.bEnableCutIndex)
254    {
255        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
256        Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));
257
258        if (mVWidth == 16)
259        {
260            auto cutMaskLo = EXTRACT_16(cutMask, 0);
261            auto cutMaskHi = EXTRACT_16(cutMask, 1);
262            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
263            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
264        }
265        else if (mVWidth == 8)
266        {
267            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
268        }
269    }
270
271    // Fetch attributes from memory and output to a simdvertex struct
272    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
273
274    RET_VOID();
275
276    JitManager::DumpToFile(fetch, "src");
277
278#if defined(_DEBUG)
279    verifyFunction(*fetch);
280#endif
281
282    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);
283
284    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
285    setupPasses.add(createBreakCriticalEdgesPass());
286    setupPasses.add(createCFGSimplificationPass());
287    setupPasses.add(createEarlyCSEPass());
288    setupPasses.add(createPromoteMemoryToRegisterPass());
289
290    setupPasses.run(*fetch);
291
292    JitManager::DumpToFile(fetch, "se");
293
294    ::FunctionPassManager optPasses(JM()->mpCurrentModule);
295
296    ///@todo Haven't touched these either. Need to remove some of these and add others.
297    optPasses.add(createCFGSimplificationPass());
298    optPasses.add(createEarlyCSEPass());
299    optPasses.add(createInstructionCombiningPass());
300#if LLVM_VERSION_MAJOR <= 11
301    optPasses.add(createConstantPropagationPass());
302#endif
303    optPasses.add(createSCCPPass());
304    optPasses.add(createAggressiveDCEPass());
305
306    optPasses.run(*fetch);
307
308    optPasses.add(createLowerX86Pass(this));
309    optPasses.run(*fetch);
310
311    JitManager::DumpToFile(fetch, "opt");
312
313
314    // Revert 16-wide override
315#if USE_SIMD16_SHADERS
316    SetTargetWidth(baseWidth);
317#endif
318
319    return fetch;
320}
321
322// returns true for odd formats that require special state.gather handling
323bool FetchJit::IsOddFormat(SWR_FORMAT format)
324{
325    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
326    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
327    {
328        return true;
329    }
330    return false;
331}
332
333// format is uniform if all components are the same size and type
334bool FetchJit::IsUniformFormat(SWR_FORMAT format)
335{
336    const SWR_FORMAT_INFO& info  = GetFormatInfo(format);
337    uint32_t               bpc0  = info.bpc[0];
338    uint32_t               type0 = info.type[0];
339
340    for (uint32_t c = 1; c < info.numComps; ++c)
341    {
342        if (bpc0 != info.bpc[c] || type0 != info.type[c])
343        {
344            return false;
345        }
346    }
347    return true;
348}
349
350// unpacks components based on format
351// foreach component in the pixel
352//   mask off everything but this component
353//   shift component to LSB
354void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
355{
356    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
357
358    uint32_t bitOffset = 0;
359    for (uint32_t c = 0; c < info.numComps; ++c)
360    {
361        uint32_t swizzledIndex = info.swizzle[c];
362        uint32_t compBits      = info.bpc[c];
363        uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
364        Value*   comp          = AND(vInput, bitmask);
365        comp                   = LSHR(comp, bitOffset);
366
367        result[swizzledIndex] = comp;
368        bitOffset += compBits;
369    }
370}
371
372// gather for odd component size formats
373// gather SIMD full pixels per lane then shift/mask to move each component to their
374// own vector
375void FetchJit::CreateGatherOddFormats(
376    SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
377{
378    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
379
380    // only works if pixel size is <= 32bits
381    SWR_ASSERT(info.bpp <= 32);
382
383    Value* pGather;
384    if (info.bpp == 32)
385    {
386        pGather =
387            GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
388    }
389    else
390    {
391        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
392        Value* pMem = ALLOCA(mSimdInt32Ty);
393        STORE(VIMMED1(0u), pMem);
394
395        Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);
396
397        for (uint32_t lane = 0; lane < mVWidth; ++lane)
398        {
399            // Get index
400            Value* index = VEXTRACT(pOffsets, C(lane));
401            Value* mask  = VEXTRACT(pMask, C(lane));
402
403            // use branch around load based on mask
404            // Needed to avoid page-faults on unmasked lanes
405            BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
406            BasicBlock* pMaskedLoadBlock =
407                BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
408            BasicBlock* pEndLoadBB =
409                BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());
410
411            COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);
412
413            JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);
414
415            switch (info.bpp)
416            {
417            case 8:
418            {
419                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
420                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
421                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
422                break;
423            }
424
425            case 16:
426            {
427                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
428                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
429                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
430                break;
431            }
432            break;
433
434            case 24:
435            {
436                // First 16-bits of data
437                Value* pDst  = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
438                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
439                STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
440
441                // Last 8-bits of data
442                pDst  = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
443                xpSrc = ADD(xpSrc, C((int64_t)2));
444                STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
445                break;
446            }
447
448            default:
449                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
450                break;
451            }
452
453            BR(pEndLoadBB);
454            JM()->mBuilder.SetInsertPoint(pEndLoadBB);
455        }
456
457        pGather = LOAD(pMem);
458    }
459
460    for (uint32_t comp = 0; comp < 4; ++comp)
461    {
462        pResult[comp] = VIMMED1((int)info.defaults[comp]);
463    }
464
465    UnpackComponents(format, pGather, pResult);
466
467    // cast to fp32
468    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
469    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
470    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
471    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
472}
473
474void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
475{
476    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
477
478    for (uint32_t c = 0; c < info.numComps; ++c)
479    {
480        uint32_t compIndex = info.swizzle[c];
481
482        // skip any conversion on UNUSED components
483        if (info.type[c] == SWR_TYPE_UNUSED)
484        {
485            continue;
486        }
487
488        if (info.isNormalized[c])
489        {
490            if (info.type[c] == SWR_TYPE_SNORM)
491            {
492                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
493                /// -1.0f.
494
495                /// result = c * (1.0f / (2^(n-1) - 1);
496                uint32_t n        = info.bpc[c];
497                uint32_t pow2     = 1 << (n - 1);
498                float    scale    = 1.0f / (float)(pow2 - 1);
499                Value*   vScale   = VIMMED1(scale);
500                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
501                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
502                texels[compIndex] = FMUL(texels[compIndex], vScale);
503            }
504            else
505            {
506                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
507
508                /// result = c * (1.0f / (2^n - 1))
509                uint32_t n    = info.bpc[c];
510                uint32_t pow2 = 1 << n;
511                // special case 24bit unorm format, which requires a full divide to meet ULP
512                // requirement
513                if (n == 24)
514                {
515                    float  scale      = (float)(pow2 - 1);
516                    Value* vScale     = VIMMED1(scale);
517                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
518                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
519                    texels[compIndex] = FDIV(texels[compIndex], vScale);
520                }
521                else
522                {
523                    float  scale      = 1.0f / (float)(pow2 - 1);
524                    Value* vScale     = VIMMED1(scale);
525                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
526                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
527                    texels[compIndex] = FMUL(texels[compIndex], vScale);
528                }
529            }
530            continue;
531        }
532    }
533}
534
535//////////////////////////////////////////////////////////////////////////
536/// @brief Loads attributes from memory using AVX2 GATHER(s)
537/// @param fetchState - info about attributes to be fetched from memory
538/// @param streams - value pointer to the current vertex stream
539/// @param vIndices - vector value of indices to gather
540/// @param pVtxOut - value pointer to output simdvertex struct
541void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
542                                 Value*                     streams,
543                                 Value*                     vIndices,
544                                 Value*                     pVtxOut)
545{
546    uint32_t currentVertexElement = 0;
547    uint32_t outputElt            = 0;
548    Value*   vVertexElements[4];
549
550    Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
551    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
552    Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
553    Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
554    curInstance->setName("curInstance");
555
556    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
557    {
558        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
559
560        // skip element if all components are disabled
561        if (ied.ComponentPacking == ComponentEnable::NONE)
562        {
563            continue;
564        }
565
566        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
567        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
568        uint32_t bpc =
569            info.bpp /
570            info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
571
572        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
573
574        Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
575        Value* vStride = VBROADCAST(stride);
576
577        // max vertex index that is fully in bounds
578        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
579        maxVertex        = LOAD(maxVertex);
580
581        Value* minVertex = NULL;
582        if (fetchState.bPartialVertexBuffer)
583        {
584            // min vertex index for low bounds OOB checking
585            minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
586            minVertex = LOAD(minVertex);
587        }
588
589        if (fetchState.bInstanceIDOffsetEnable)
590        {
591            // the InstanceID (curInstance) value is offset by StartInstanceLocation
592            curInstance = ADD(curInstance, startInstance);
593        }
594
595        Value* vCurIndices;
596        Value* startOffset;
597        Value* vInstanceStride = VIMMED1(0);
598
599        if (ied.InstanceEnable)
600        {
601            Value* stepRate = C(ied.InstanceAdvancementState);
602
603            // prevent a div by 0 for 0 step rate
604            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
605            stepRate             = SELECT(isNonZeroStep, stepRate, C(1));
606
607            // calc the current offset into instanced data buffer
608            Value* calcInstance = UDIV(curInstance, stepRate);
609
610            // if step rate is 0, every instance gets instance 0
611            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
612
613            vCurIndices = VBROADCAST(calcInstance);
614            startOffset = startInstance;
615        }
616        else if (ied.InstanceStrideEnable)
617        {
618            // grab the instance advancement state, determines stride in bytes from one instance to
619            // the next
620            Value* stepRate = C(ied.InstanceAdvancementState);
621            vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
622
623            // offset indices by baseVertex
624            vCurIndices = ADD(vIndices, vBaseVertex);
625
626            startOffset = startVertex;
627            SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
628        }
629        else
630        {
631            // offset indices by baseVertex
632            vCurIndices = ADD(vIndices, vBaseVertex);
633            startOffset = startVertex;
634        }
635
636        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
637        // do 64bit address offset calculations.
638
639        // calculate byte offset to the start of the VB
640        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
641
642        // VGATHER* takes an *i8 src pointer so that's what stream is
643        Value* pStreamBaseGFX = ADD(stream, baseOffset);
644
645        // if we have a start offset, subtract from max vertex. Used for OOB check
646        maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
647        Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
648        // if we have a negative value, we're already OOB. clamp at 0.
649        maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
650
651        if (fetchState.bPartialVertexBuffer)
652        {
653            // similary for min vertex
654            minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
655            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
656            minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
657        }
658
659        // Load the in bounds size of a partially valid vertex
660        Value* partialInboundsSize =
661            GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
662        partialInboundsSize       = LOAD(partialInboundsSize);
663        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
664        Value* vBpp               = VBROADCAST(C(info.Bpp));
665        Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));
666
667        // is the element is <= the partially valid size
668        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
669
670        // override cur indices with 0 if pitch is 0
671        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
672        vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
673
674        // are vertices partially OOB?
675        Value* vMaxVertex      = VBROADCAST(maxVertex);
676        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
677
678        // are vertices fully in bounds?
679        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
680
681        Value* vGatherMask;
682        if (fetchState.bPartialVertexBuffer)
683        {
684            // are vertices below minVertex limit?
685            Value* vMinVertex     = VBROADCAST(minVertex);
686            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
687
688            // only fetch lanes that pass both tests
689            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
690        }
691        else
692        {
693            vGatherMask = vMaxGatherMask;
694        }
695
696        // blend in any partially OOB indices that have valid elements
697        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
698
699        // calculate the actual offsets into the VB
700        Value* vOffsets = MUL(vCurIndices, vStride);
701        vOffsets        = ADD(vOffsets, vAlignmentOffsets);
702
703        // if instance stride enable is:
704        //  true  - add product of the instanceID and advancement state to the offset into the VB
705        //  false - value of vInstanceStride has been initialized to zero
706        vOffsets = ADD(vOffsets, vInstanceStride);
707
708        // Packing and component control
709        ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
710        const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
711                                           (ComponentControl)ied.ComponentControl1,
712                                           (ComponentControl)ied.ComponentControl2,
713                                           (ComponentControl)ied.ComponentControl3};
714
715        // Special gather/conversion for formats without equal component sizes
716        if (IsOddFormat((SWR_FORMAT)ied.Format))
717        {
718            Value* pResults[4];
719            CreateGatherOddFormats(
720                (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
721            ConvertFormat((SWR_FORMAT)ied.Format, pResults);
722
723            for (uint32_t c = 0; c < 4; c += 1)
724            {
725                if (isComponentEnabled(compMask, c))
726                {
727                    vVertexElements[currentVertexElement++] = pResults[c];
728                    if (currentVertexElement > 3)
729                    {
730                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
731                        // reset to the next vVertexElement to output
732                        currentVertexElement = 0;
733                    }
734                }
735            }
736        }
737        else if (info.type[0] == SWR_TYPE_FLOAT)
738        {
739            ///@todo: support 64 bit vb accesses
740            Value* gatherSrc = VIMMED1(0.0f);
741
742            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
743                       "Unsupported format for standard gather fetch.");
744
745            // Gather components from memory to store in a simdvertex structure
746            switch (bpc)
747            {
748            case 16:
749            {
750                Value* vGatherResult[2];
751
752                // if we have at least one component out of x or y to fetch
753                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
754                {
755                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
756                    // e.g. result of first 8x32bit integer gather for 16bit components
757                    // 256i - 0    1    2    3    4    5    6    7
758                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
759                    //
760                }
761
762                // if we have at least one component out of z or w to fetch
763                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
764                {
765                    // offset base to the next components(zw) in the vertex to gather
766                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
767
768                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
769                    // e.g. result of second 8x32bit integer gather for 16bit components
770                    // 256i - 0    1    2    3    4    5    6    7
771                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
772                    //
773                }
774
775                // if we have at least one component to shuffle into place
776                if (compMask)
777                {
778                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
779                                                                  pVtxOut,
780                                                                  Instruction::CastOps::FPExt,
781                                                                  CONVERT_NONE,
782                                                                  currentVertexElement,
783                                                                  outputElt,
784                                                                  compMask,
785                                                                  compCtrl,
786                                                                  vVertexElements);
787
788                    // Shuffle gathered components into place in simdvertex struct
789                    mVWidth == 16 ? Shuffle16bpcGather16(args)
790                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
791                }
792            }
793            break;
794            case 32:
795            {
796                for (uint32_t i = 0; i < 4; i += 1)
797                {
798                    if (isComponentEnabled(compMask, i))
799                    {
800                        // if we need to gather the component
801                        if (compCtrl[i] == StoreSrc)
802                        {
803                            // Gather a SIMD of vertices
804                            // APIs allow a 4GB range for offsets
805                            // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
806                            // Add 2GB to the base pointer and 2GB to the offsets.  This makes
807                            // "negative" (large) offsets into positive offsets and small offsets
808                            // into negative offsets.
809                            Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
810                            vVertexElements[currentVertexElement++] =
811                                GATHERPS(gatherSrc,
812                                         ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
813                                         vNewOffsets,
814                                         vGatherMask,
815                                         1,
816                                         MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
817                        }
818                        else
819                        {
820                            vVertexElements[currentVertexElement++] =
821                                GenerateCompCtrlVector(compCtrl[i]);
822                        }
823
824                        if (currentVertexElement > 3)
825                        {
826                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
827                            // reset to the next vVertexElement to output
828                            currentVertexElement = 0;
829                        }
830                    }
831
832                    // offset base to the next component in the vertex to gather
833                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
834                }
835            }
836            break;
837            case 64:
838            {
839                for (uint32_t i = 0; i < 4; i += 1)
840                {
841                    if (isComponentEnabled(compMask, i))
842                    {
843                        // if we need to gather the component
844                        if (compCtrl[i] == StoreSrc)
845                        {
846                            Value* vShufLo;
847                            Value* vShufHi;
848                            Value* vShufAll;
849
850                            if (mVWidth == 8)
851                            {
852                                vShufLo  = C({0, 1, 2, 3});
853                                vShufHi  = C({4, 5, 6, 7});
854                                vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
855                            }
856                            else
857                            {
858                                SWR_ASSERT(mVWidth == 16);
859                                vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
860                                vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
861                                vShufAll =
862                                    C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
863                            }
864
865                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
866                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
867
868                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
869                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
870
871                            Value* vZeroDouble = VECTOR_SPLAT(
872                                mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
873
874                            Value* pGatherLo =
875                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
876                            Value* pGatherHi =
877                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);
878
879                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
880                            pGather        = FP_TRUNC(pGather, mSimdFP32Ty);
881
882                            vVertexElements[currentVertexElement++] = pGather;
883                        }
884                        else
885                        {
886                            vVertexElements[currentVertexElement++] =
887                                GenerateCompCtrlVector(compCtrl[i]);
888                        }
889
890                        if (currentVertexElement > 3)
891                        {
892                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
893                            // reset to the next vVertexElement to output
894                            currentVertexElement = 0;
895                        }
896                    }
897
898                    // offset base to the next component  in the vertex to gather
899                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
900                }
901            }
902            break;
903            default:
904                SWR_INVALID("Tried to fetch invalid FP format");
905                break;
906            }
907        }
908        else
909        {
910            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
911            ConversionType       conversionType = CONVERT_NONE;
912
913            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
914                       "Unsupported format for standard gather fetch.");
915
916            switch (info.type[0])
917            {
918            case SWR_TYPE_UNORM:
919                conversionType = CONVERT_NORMALIZED;
920            case SWR_TYPE_UINT:
921                extendCastType = Instruction::CastOps::ZExt;
922                break;
923            case SWR_TYPE_SNORM:
924                conversionType = CONVERT_NORMALIZED;
925            case SWR_TYPE_SINT:
926                extendCastType = Instruction::CastOps::SExt;
927                break;
928            case SWR_TYPE_USCALED:
929                conversionType = CONVERT_USCALED;
930                extendCastType = Instruction::CastOps::UIToFP;
931                break;
932            case SWR_TYPE_SSCALED:
933                conversionType = CONVERT_SSCALED;
934                extendCastType = Instruction::CastOps::SIToFP;
935                break;
936            case SWR_TYPE_SFIXED:
937                conversionType = CONVERT_SFIXED;
938                extendCastType = Instruction::CastOps::SExt;
939                break;
940            default:
941                break;
942            }
943
944            // value substituted when component of gather is masked
945            Value* gatherSrc = VIMMED1(0);
946
947            // Gather components from memory to store in a simdvertex structure
948            switch (bpc)
949            {
950            case 8:
951            {
952                // if we have at least one component to fetch
953                if (compMask)
954                {
955                    Value* vGatherResult = GATHERDD(gatherSrc,
956                                                    pStreamBaseGFX,
957                                                    vOffsets,
958                                                    vGatherMask,
959                                                    1,
960                                                    MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
961                    // e.g. result of an 8x32bit integer gather for 8bit components
962                    // 256i - 0    1    2    3    4    5    6    7
963                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
964
965                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
966                                                                 pVtxOut,
967                                                                 extendCastType,
968                                                                 conversionType,
969                                                                 currentVertexElement,
970                                                                 outputElt,
971                                                                 compMask,
972                                                                 compCtrl,
973                                                                 vVertexElements,
974                                                                 info.swizzle);
975
976                    // Shuffle gathered components into place in simdvertex struct
977                    mVWidth == 16 ? Shuffle8bpcGatherd16(args)
978                                  : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
979                }
980            }
981            break;
982            case 16:
983            {
984                Value* vGatherResult[2];
985
986                // if we have at least one component out of x or y to fetch
987                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
988                {
989                    vGatherResult[0] = GATHERDD(gatherSrc,
990                                                pStreamBaseGFX,
991                                                vOffsets,
992                                                vGatherMask,
993                                                1,
994                                                MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
995                    // e.g. result of first 8x32bit integer gather for 16bit components
996                    // 256i - 0    1    2    3    4    5    6    7
997                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
998                    //
999                }
1000
1001                // if we have at least one component out of z or w to fetch
1002                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1003                {
1004                    // offset base to the next components(zw) in the vertex to gather
1005                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1006
1007                    vGatherResult[1] = GATHERDD(gatherSrc,
1008                                                pStreamBaseGFX,
1009                                                vOffsets,
1010                                                vGatherMask,
1011                                                1,
1012                                                MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1013                    // e.g. result of second 8x32bit integer gather for 16bit components
1014                    // 256i - 0    1    2    3    4    5    6    7
1015                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
1016                    //
1017                }
1018
1019                // if we have at least one component to shuffle into place
1020                if (compMask)
1021                {
1022                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
1023                                                                  pVtxOut,
1024                                                                  extendCastType,
1025                                                                  conversionType,
1026                                                                  currentVertexElement,
1027                                                                  outputElt,
1028                                                                  compMask,
1029                                                                  compCtrl,
1030                                                                  vVertexElements);
1031
1032                    // Shuffle gathered components into place in simdvertex struct
1033                    mVWidth == 16 ? Shuffle16bpcGather16(args)
1034                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
1035                }
1036            }
1037            break;
1038            case 32:
1039            {
1040                // Gathered components into place in simdvertex struct
1041                for (uint32_t i = 0; i < 4; i++)
1042                {
1043                    if (isComponentEnabled(compMask, i))
1044                    {
1045                        // if we need to gather the component
1046                        if (compCtrl[i] == StoreSrc)
1047                        {
1048                            Value* pGather = GATHERDD(gatherSrc,
1049                                                      pStreamBaseGFX,
1050                                                      vOffsets,
1051                                                      vGatherMask,
1052                                                      1,
1053                                                      MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1054
1055                            if (conversionType == CONVERT_USCALED)
1056                            {
1057                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1058                            }
1059                            else if (conversionType == CONVERT_SSCALED)
1060                            {
1061                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1062                            }
1063                            else if (conversionType == CONVERT_SFIXED)
1064                            {
1065                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
1066                                               VBROADCAST(C(1 / 65536.0f)));
1067                            }
1068
1069                            vVertexElements[currentVertexElement++] = pGather;
1070
1071                            // e.g. result of a single 8x32bit integer gather for 32bit components
1072                            // 256i - 0    1    2    3    4    5    6    7
1073                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1074                        }
1075                        else
1076                        {
1077                            vVertexElements[currentVertexElement++] =
1078                                GenerateCompCtrlVector(compCtrl[i]);
1079                        }
1080
1081                        if (currentVertexElement > 3)
1082                        {
1083                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1084
1085                            // reset to the next vVertexElement to output
1086                            currentVertexElement = 0;
1087                        }
1088                    }
1089
1090                    // offset base to the next component  in the vertex to gather
1091                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1092                }
1093            }
1094            break;
1095            }
1096        }
1097    }
1098
1099    // if we have a partially filled vVertexElement struct, output it
1100    if (currentVertexElement > 0)
1101    {
1102        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1103    }
1104}
1105
1106
1107typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData);
1108
1109template <typename T>
1110void GetSimdValidIndicesGfx(gfxptr_t                     indices,
1111                            gfxptr_t                     lastIndex,
1112                            uint32_t                     vWidth,
1113                            PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1114                            void*                        pdc,
1115                            uint32_t*                    outIndices,
1116                            void*                        pWorkerData)
1117{
1118    SWR_ASSERT(outIndices != nullptr);
1119
1120    gfxptr_t indexPtr = indices;
1121    for (int64_t lane = 0; lane < vWidth; lane++)
1122    {
1123        uint32_t index = 0;
1124
1125        if (indexPtr < lastIndex)
1126        {
1127            // translate indexPtr and load from it
1128            T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData);
1129            SWR_ASSERT(addr != nullptr);
1130            index = *addr;
1131        }
1132
1133        // index to 32 bits and insert into the correct simd lane
1134        outIndices[lane] = index;
1135
1136        indexPtr += sizeof(T);
1137    }
1138}
1139
1140void GetSimdValid8bitIndicesGfx(gfxptr_t                     indices,
1141                                gfxptr_t                     lastIndex,
1142                                uint32_t                     vWidth,
1143                                PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1144                                void*                        pdc,
1145                                uint32_t*                    outIndices,
1146                                void*                        pWorkerData)
1147{
1148    GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
1149}
1150
1151void GetSimdValid16bitIndicesGfx(gfxptr_t                     indices,
1152                                 gfxptr_t                     lastIndex,
1153                                 uint32_t                     vWidth,
1154                                 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate,
1155                                 void*                        pdc,
1156                                 uint32_t*                    outIndices,
1157                                 void*                        pWorkerData)
1158{
1159    GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData);
1160}
1161
1162
1163template <typename T>
1164Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
1165{
1166    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
1167               "Function expects gfxptr_t for both input parameters.");
1168
1169    Type* Ty = nullptr;
1170
1171    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
1172                  "Unsupported type for use with GetSimdValidIndicesHelper<T>");
1173    constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
1174    if (bSize)
1175    {
1176        Ty = mInt16PtrTy;
1177    }
1178    else if (sizeof(T) == sizeof(uint8_t))
1179    {
1180        Ty = mInt8PtrTy;
1181    }
1182    else
1183    {
1184        SWR_ASSERT(false, "This should never happen as per static_assert above.");
1185    }
1186
1187    Value* vIndices = VUNDEF_I();
1188
1189    {
1190        // store 0 index on stack to be used to conditionally load from if index address is OOB
1191        Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
1192        STORE(C((T)0), pZeroIndex);
1193
1194        // Load a SIMD of index pointers
1195        for (int64_t lane = 0; lane < mVWidth; lane++)
1196        {
1197            // Calculate the address of the requested index
1198            Value* pIndex = GEP(pIndices, C(lane), Ty);
1199
1200            pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1201
1202            // check if the address is less than the max index,
1203            Value* mask = ICMP_ULT(pIndex, pLastIndex);
1204
1205            // if valid, load the index. if not, load 0 from the stack
1206            Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1207            Value* index  = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1208
1209            // zero extended index to 32 bits and insert into the correct simd lane
1210            index    = Z_EXT(index, mInt32Ty);
1211            vIndices = VINSERT(vIndices, index, lane);
1212        }
1213    }
1214
1215    return vIndices;
1216}
1217
1218//////////////////////////////////////////////////////////////////////////
1219/// @brief Loads a simd of valid indices. OOB indices are set to 0
1220/// *Note* have to do 8bit index checking in scalar until we have AVX-512
1221/// support
1222/// @param pIndices - pointer to 8 bit indices
1223/// @param pLastIndex - pointer to last valid index
1224Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1225{
1226    return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1227}
1228
1229//////////////////////////////////////////////////////////////////////////
1230/// @brief Loads a simd of valid indices. OOB indices are set to 0
1231/// *Note* have to do 16bit index checking in scalar until we have AVX-512
1232/// support
1233/// @param pIndices - pointer to 16 bit indices
1234/// @param pLastIndex - pointer to last valid index
1235Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1236{
1237    return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1238}
1239
1240//////////////////////////////////////////////////////////////////////////
1241/// @brief Loads a simd of valid indices. OOB indices are set to 0
1242/// @param pIndices - pointer to 32 bit indices
1243/// @param pLastIndex - pointer to last valid index
1244Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1245{
1246    DataLayout dL(JM()->mpCurrentModule);
1247    Value*     iLastIndex = pLastIndex;
1248    Value*     iIndices   = pIndices;
1249
1250    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1251    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
1252    numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
1253    numIndicesLeft        = SDIV(numIndicesLeft, C(4));
1254
1255    // create a vector of index counts from the base index ptr passed into the fetch
1256    Constant* vIndexOffsets;
1257    if (mVWidth == 8)
1258    {
1259        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
1260    }
1261    else
1262    {
1263        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1264    }
1265
1266    // compare index count to the max valid index
1267    // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1268    //     vIndexOffsets  0 1 2 3 4 5 6 7
1269    //     ------------------------------
1270    //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
1271    //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1272    Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
1273    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1274
1275    // Load the indices; OOB loads 0
1276    return MASKED_LOAD(pIndices,
1277                       4,
1278                       vIndexMask,
1279                       VIMMED1(0),
1280                       "vIndices",
1281                       PointerType::get(mSimdInt32Ty, 0),
1282                       MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1283}
1284
1285//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult - SIMD of gathered 8bpc vertices
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - cast op selecting sign extend or zero extend
///   @param conversionType - conversion to apply after extending (see ConversionType)
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
///   @param compCtrl - component control val
///   @param vVertexElements[4] - vertex components to output
///   @param swizzle[4] - component swizzle location
1300void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
1301{
1302    // Unpack tuple args
1303    Value*&                    vGatherResult        = std::get<0>(args);
1304    Value*                     pVtxOut              = std::get<1>(args);
1305    const Instruction::CastOps extendType           = std::get<2>(args);
1306    const ConversionType       conversionType       = std::get<3>(args);
1307    uint32_t&                  currentVertexElement = std::get<4>(args);
1308    uint32_t&                  outputElt            = std::get<5>(args);
1309    const ComponentEnable      compMask             = std::get<6>(args);
1310    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1311    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1312    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1313
1314    // cast types
1315    Type* vGatherTy = getVectorType(mInt32Ty, 8);
1316    Type* v32x8Ty   = getVectorType(mInt8Ty, 32);
1317
1318    // have to do extra work for sign extending
1319    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
1320    {
1321        Type* v16x8Ty = getVectorType(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
1322        Type* v128Ty  = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1323
1324        // shuffle mask, including any swizzling
1325        const char x          = (char)swizzle[0];
1326        const char y          = (char)swizzle[1];
1327        const char z          = (char)swizzle[2];
1328        const char w          = (char)swizzle[3];
1329        Value*     vConstMask = C<char>(
1330            {char(x),     char(x + 4),  char(x + 8), char(x + 12), char(y),     char(y + 4),
1331             char(y + 8), char(y + 12), char(z),     char(z + 4),  char(z + 8), char(z + 12),
1332             char(w),     char(w + 4),  char(w + 8), char(w + 12), char(x),     char(x + 4),
1333             char(x + 8), char(x + 12), char(y),     char(y + 4),  char(y + 8), char(y + 12),
1334             char(z),     char(z + 4),  char(z + 8), char(z + 12), char(w),     char(w + 4),
1335             char(w + 8), char(w + 12)});
1336
1337        // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
1338
1339        Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1340        Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1341
1342        Value* vShufResult_lo =
1343            BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1344        Value* vShufResult_hi =
1345            BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1346
1347        // after pshufb: group components together in each 128bit lane
1348        // 256i - 0    1    2    3    4    5    6    7
1349        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
1350
1351        Value* vi128XY_lo = nullptr;
1352        Value* vi128XY_hi = nullptr;
1353        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1354        {
1355            vi128XY_lo = BITCAST(
1356                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1357                v128Ty);
1358            vi128XY_hi = BITCAST(
1359                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
1360                v128Ty);
1361
1362            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
1363            // 256i - 0    1    2    3    4    5    6    7
1364            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
1365        }
1366
1367        // do the same for zw components
1368        Value* vi128ZW_lo = nullptr;
1369        Value* vi128ZW_hi = nullptr;
1370        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1371        {
1372            vi128ZW_lo = BITCAST(
1373                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1374                v128Ty);
1375            vi128ZW_hi = BITCAST(
1376                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
1377                v128Ty);
1378        }
1379
1380        // init denormalize variables if needed
1381        Instruction::CastOps fpCast;
1382        Value*               conversionFactor;
1383
1384        switch (conversionType)
1385        {
1386        case CONVERT_NORMALIZED:
1387            fpCast           = Instruction::CastOps::SIToFP;
1388            conversionFactor = VIMMED1((float)(1.0 / 127.0));
1389            break;
1390        case CONVERT_SSCALED:
1391            fpCast           = Instruction::CastOps::SIToFP;
1392            conversionFactor = VIMMED1((float)(1.0));
1393            break;
1394        case CONVERT_USCALED:
1395            assert(false && "Type should not be sign extended!");
1396            conversionFactor = nullptr;
1397            break;
1398        default:
1399            assert(conversionType == CONVERT_NONE);
1400            conversionFactor = nullptr;
1401            break;
1402        }
1403
1404        // sign extend all enabled components. If we have a fill vVertexElements, output to current
1405        // simdvertex
1406        for (uint32_t i = 0; i < 4; i++)
1407        {
1408            if (isComponentEnabled(compMask, i))
1409            {
1410                if (compCtrl[i] == ComponentControl::StoreSrc)
1411                {
1412                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1413                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1414                    // if x or y, use vi128XY permute result, else use vi128ZW
1415                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1416                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1417
1418                    // sign extend
1419                    Value* temp_lo =
1420                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
1421                    Value* temp_hi =
1422                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
1423
1424                    Value* temp = JOIN_16(temp_lo, temp_hi);
1425
1426                    // denormalize if needed
1427                    if (conversionType != CONVERT_NONE)
1428                    {
1429                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1430                    }
1431
1432                    vVertexElements[currentVertexElement] = temp;
1433
1434                    currentVertexElement += 1;
1435                }
1436                else
1437                {
1438                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1439                }
1440
1441                if (currentVertexElement > 3)
1442                {
1443                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1444                    // reset to the next vVertexElement to output
1445                    currentVertexElement = 0;
1446                }
1447            }
1448        }
1449    }
1450    // else zero extend
1451    else if ((extendType == Instruction::CastOps::ZExt) ||
1452             (extendType == Instruction::CastOps::UIToFP))
1453    {
1454        // init denormalize variables if needed
1455        Instruction::CastOps fpCast;
1456        Value*               conversionFactor;
1457
1458        switch (conversionType)
1459        {
1460        case CONVERT_NORMALIZED:
1461            fpCast           = Instruction::CastOps::UIToFP;
1462            conversionFactor = VIMMED1((float)(1.0 / 255.0));
1463            break;
1464        case CONVERT_USCALED:
1465            fpCast           = Instruction::CastOps::UIToFP;
1466            conversionFactor = VIMMED1((float)(1.0));
1467            break;
1468        case CONVERT_SSCALED:
1469            assert(false && "Type should not be zero extended!");
1470            conversionFactor = nullptr;
1471            break;
1472        default:
1473            assert(conversionType == CONVERT_NONE);
1474            conversionFactor = nullptr;
1475            break;
1476        }
1477
1478        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
1479        for (uint32_t i = 0; i < 4; i++)
1480        {
1481            if (isComponentEnabled(compMask, i))
1482            {
1483                if (compCtrl[i] == ComponentControl::StoreSrc)
1484                {
1485                    // pshufb masks for each component
1486                    Value* vConstMask;
1487                    switch (swizzle[i])
1488                    {
1489                    case 0:
1490                        // x shuffle mask
1491                        vConstMask =
1492                            C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
1493                                     0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
1494                        break;
1495                    case 1:
1496                        // y shuffle mask
1497                        vConstMask =
1498                            C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
1499                                     1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
1500                        break;
1501                    case 2:
1502                        // z shuffle mask
1503                        vConstMask =
1504                            C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
1505                                     2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
1506                        break;
1507                    case 3:
1508                        // w shuffle mask
1509                        vConstMask =
1510                            C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
1511                                     3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
1512                        break;
1513                    default:
1514                        assert(false && "Invalid component");
1515                        vConstMask = nullptr;
1516                        break;
1517                    }
1518
1519                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
1520                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
1521
1522                    Value* temp_lo =
1523                        BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
1524                    Value* temp_hi =
1525                        BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
1526
1527                    // after pshufb for x channel
1528                    // 256i - 0    1    2    3    4    5    6    7
1529                    //        x000 x000 x000 x000 x000 x000 x000 x000
1530
1531                    Value* temp = JOIN_16(temp_lo, temp_hi);
1532
1533                    // denormalize if needed
1534                    if (conversionType != CONVERT_NONE)
1535                    {
1536                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1537                    }
1538
1539                    vVertexElements[currentVertexElement] = temp;
1540
1541                    currentVertexElement += 1;
1542                }
1543                else
1544                {
1545                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1546                }
1547
1548                if (currentVertexElement > 3)
1549                {
1550                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1551                    // reset to the next vVertexElement to output
1552                    currentVertexElement = 0;
1553                }
1554            }
1555        }
1556    }
1557    else
1558    {
1559        SWR_INVALID("Unsupported conversion type");
1560    }
1561}
1562
1563void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
1564{
1565    // Unpack tuple args
1566    Value*&                    vGatherResult        = std::get<0>(args);
1567    Value*                     pVtxOut              = std::get<1>(args);
1568    const Instruction::CastOps extendType           = std::get<2>(args);
1569    const ConversionType       conversionType       = std::get<3>(args);
1570    uint32_t&                  currentVertexElement = std::get<4>(args);
1571    uint32_t&                  outputElt            = std::get<5>(args);
1572    const ComponentEnable      compMask             = std::get<6>(args);
1573    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1574    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1575    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1576
1577    // cast types
1578    Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1579
1580    for (uint32_t i = 0; i < 4; i++)
1581    {
1582        if (!isComponentEnabled(compMask, i))
1583            continue;
1584
1585        if (compCtrl[i] == ComponentControl::StoreSrc)
1586        {
1587#if LLVM_VERSION_MAJOR >= 11
1588            using MaskType = int32_t;
1589#else
1590            using MaskType = uint32_t;
1591#endif
1592            std::vector<MaskType> vShuffleMasks[4] = {
1593                {0, 4, 8, 12, 16, 20, 24, 28},  // x
1594                {1, 5, 9, 13, 17, 21, 25, 29},  // y
1595                {2, 6, 10, 14, 18, 22, 26, 30}, // z
1596                {3, 7, 11, 15, 19, 23, 27, 31}, // w
1597            };
1598
1599            Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1600                                  UndefValue::get(v32x8Ty),
1601                                  vShuffleMasks[swizzle[i]]);
1602
1603            if ((extendType == Instruction::CastOps::SExt) ||
1604                (extendType == Instruction::CastOps::SIToFP))
1605            {
1606                switch (conversionType)
1607                {
1608                case CONVERT_NORMALIZED:
1609                    val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1610                    break;
1611                case CONVERT_SSCALED:
1612                    val = SI_TO_FP(val, mSimdFP32Ty);
1613                    break;
1614                case CONVERT_USCALED:
1615                    SWR_INVALID("Type should not be sign extended!");
1616                    break;
1617                default:
1618                    SWR_ASSERT(conversionType == CONVERT_NONE);
1619                    val = S_EXT(val, mSimdInt32Ty);
1620                    break;
1621                }
1622            }
1623            else if ((extendType == Instruction::CastOps::ZExt) ||
1624                     (extendType == Instruction::CastOps::UIToFP))
1625            {
1626                switch (conversionType)
1627                {
1628                case CONVERT_NORMALIZED:
1629                    val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1630                    break;
1631                case CONVERT_SSCALED:
1632                    SWR_INVALID("Type should not be zero extended!");
1633                    break;
1634                case CONVERT_USCALED:
1635                    val = UI_TO_FP(val, mSimdFP32Ty);
1636                    break;
1637                default:
1638                    SWR_ASSERT(conversionType == CONVERT_NONE);
1639                    val = Z_EXT(val, mSimdInt32Ty);
1640                    break;
1641                }
1642            }
1643            else
1644            {
1645                SWR_INVALID("Unsupported conversion type");
1646            }
1647
1648            vVertexElements[currentVertexElement++] = val;
1649        }
1650        else
1651        {
1652            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1653        }
1654
1655        if (currentVertexElement > 3)
1656        {
1657            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1658            // reset to the next vVertexElement to output
1659            currentVertexElement = 0;
1660        }
1661    }
1662}
1663
1664//////////////////////////////////////////////////////////////////////////
1665/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1666/// denormalizes if needed, converts to F32 if needed, and positions in
1667//  the proper SIMD rows to be output to the simdvertex structure
1668/// @param args: (tuple of args, listed below)
1669///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
1670///   @param pVtxOut - base pointer to output simdvertex struct
1671///   @param extendType - sign extend or zero extend
1672///   @param bNormalized - do we need to denormalize?
1673///   @param currentVertexElement - reference to the current vVertexElement
1674///   @param outputElt - reference to the current offset from simdvertex we're o
1675///   @param compMask - component packing mask
1676///   @param compCtrl - component control val
1677///   @param vVertexElements[4] - vertex components to output
1678void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
1679{
1680    // Unpack tuple args
1681    Value*(&vGatherResult)[2]                       = std::get<0>(args);
1682    Value*                     pVtxOut              = std::get<1>(args);
1683    const Instruction::CastOps extendType           = std::get<2>(args);
1684    const ConversionType       conversionType       = std::get<3>(args);
1685    uint32_t&                  currentVertexElement = std::get<4>(args);
1686    uint32_t&                  outputElt            = std::get<5>(args);
1687    const ComponentEnable      compMask             = std::get<6>(args);
1688    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1689    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1690
1691    // cast types
1692    Type* vGatherTy = getVectorType(mInt32Ty, 8);
1693    Type* v32x8Ty   = getVectorType(mInt8Ty, 32);
1694
1695    // have to do extra work for sign extending
1696    if ((extendType == Instruction::CastOps::SExt) ||
1697        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1698    {
1699        // is this PP float?
1700        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1701
1702        Type* v8x16Ty   = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
1703        Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2);
1704
1705        // shuffle mask
1706        Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1707                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1708        Value* vi128XY_lo = nullptr;
1709        Value* vi128XY_hi = nullptr;
1710        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1711        {
1712            // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for
1713            // now..
1714
1715            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
1716            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
1717
1718            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1719            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1720
1721            // after pshufb: group components together in each 128bit lane
1722            // 256i - 0    1    2    3    4    5    6    7
1723            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1724
1725            vi128XY_lo = BITCAST(
1726                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1727                v128bitTy);
1728            vi128XY_hi = BITCAST(
1729                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1730                v128bitTy);
1731
1732            // after PERMD: move and pack xy components into each 128bit lane
1733            // 256i - 0    1    2    3    4    5    6    7
1734            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1735        }
1736
1737        // do the same for zw components
1738        Value* vi128ZW_lo = nullptr;
1739        Value* vi128ZW_hi = nullptr;
1740        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1741        {
1742            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
1743            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
1744
1745            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
1746            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
1747
1748            vi128ZW_lo = BITCAST(
1749                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1750                v128bitTy);
1751            vi128ZW_hi = BITCAST(
1752                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
1753                v128bitTy);
1754        }
1755
1756        // init denormalize variables if needed
1757        Instruction::CastOps IntToFpCast;
1758        Value*               conversionFactor;
1759
1760        switch (conversionType)
1761        {
1762        case CONVERT_NORMALIZED:
1763            IntToFpCast      = Instruction::CastOps::SIToFP;
1764            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
1765            break;
1766        case CONVERT_SSCALED:
1767            IntToFpCast      = Instruction::CastOps::SIToFP;
1768            conversionFactor = VIMMED1((float)(1.0));
1769            break;
1770        case CONVERT_USCALED:
1771            assert(false && "Type should not be sign extended!");
1772            conversionFactor = nullptr;
1773            break;
1774        default:
1775            assert(conversionType == CONVERT_NONE);
1776            conversionFactor = nullptr;
1777            break;
1778        }
1779
1780        // sign extend all enabled components. If we have a fill vVertexElements, output to current
1781        // simdvertex
1782        for (uint32_t i = 0; i < 4; i++)
1783        {
1784            if (isComponentEnabled(compMask, i))
1785            {
1786                if (compCtrl[i] == ComponentControl::StoreSrc)
1787                {
1788                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
1789                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
1790                    // if x or y, use vi128XY permute result, else use vi128ZW
1791                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
1792                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
1793
1794                    if (bFP)
1795                    {
1796                        // extract 128 bit lanes to sign extend each component
1797                        Value* temp_lo =
1798                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1799                        Value* temp_hi =
1800                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1801
1802                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
1803                    }
1804                    else
1805                    {
1806                        // extract 128 bit lanes to sign extend each component
1807                        Value* temp_lo =
1808                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
1809                        Value* temp_hi =
1810                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
1811
1812                        Value* temp = JOIN_16(temp_lo, temp_hi);
1813
1814                        // denormalize if needed
1815                        if (conversionType != CONVERT_NONE)
1816                        {
1817                            temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
1818                        }
1819
1820                        vVertexElements[currentVertexElement] = temp;
1821                    }
1822
1823                    currentVertexElement += 1;
1824                }
1825                else
1826                {
1827                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1828                }
1829
1830                if (currentVertexElement > 3)
1831                {
1832                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1833                    // reset to the next vVertexElement to output
1834                    currentVertexElement = 0;
1835                }
1836            }
1837        }
1838    }
1839    // else zero extend
1840    else if ((extendType == Instruction::CastOps::ZExt) ||
1841             (extendType == Instruction::CastOps::UIToFP))
1842    {
1843        // pshufb masks for each component
1844        Value* vConstMask[2];
1845
1846        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
1847        {
1848            // x/z shuffle mask
1849            vConstMask[0] = C<char>({
1850                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1851                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
1852            });
1853        }
1854
1855        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
1856        {
1857            // y/w shuffle mask
1858            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
1859                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
1860        }
1861
1862        // init denormalize variables if needed
1863        Instruction::CastOps fpCast;
1864        Value*               conversionFactor;
1865
1866        switch (conversionType)
1867        {
1868        case CONVERT_NORMALIZED:
1869            fpCast           = Instruction::CastOps::UIToFP;
1870            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
1871            break;
1872        case CONVERT_USCALED:
1873            fpCast           = Instruction::CastOps::UIToFP;
1874            conversionFactor = VIMMED1((float)(1.0f));
1875            break;
1876        case CONVERT_SSCALED:
1877            SWR_INVALID("Type should not be zero extended!");
1878            conversionFactor = nullptr;
1879            break;
1880        default:
1881            SWR_ASSERT(conversionType == CONVERT_NONE);
1882            conversionFactor = nullptr;
1883            break;
1884        }
1885
1886        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
1887        for (uint32_t i = 0; i < 4; i++)
1888        {
1889            if (isComponentEnabled(compMask, i))
1890            {
1891                if (compCtrl[i] == ComponentControl::StoreSrc)
1892                {
1893                    // select correct constMask for x/z or y/w pshufb
1894                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
1895                    // if x or y, use vi128XY permute result, else use vi128ZW
1896                    uint32_t selectedGather = (i < 2) ? 0 : 1;
1897
1898                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL,
1899                    // for now..
1900
1901                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
1902                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
1903
1904                    Value* temp_lo = BITCAST(
1905                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
1906                        vGatherTy);
1907                    Value* temp_hi = BITCAST(
1908                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
1909                        vGatherTy);
1910
1911                    // after pshufb mask for x channel; z uses the same shuffle from the second
1912                    // gather 256i - 0    1    2    3    4    5    6    7
1913                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
1914
1915                    Value* temp = JOIN_16(temp_lo, temp_hi);
1916
1917                    // denormalize if needed
1918                    if (conversionType != CONVERT_NONE)
1919                    {
1920                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
1921                    }
1922
1923                    vVertexElements[currentVertexElement] = temp;
1924
1925                    currentVertexElement += 1;
1926                }
1927                else
1928                {
1929                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1930                }
1931
1932                if (currentVertexElement > 3)
1933                {
1934                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1935                    // reset to the next vVertexElement to output
1936                    currentVertexElement = 0;
1937                }
1938            }
1939        }
1940    }
1941    else
1942    {
1943        SWR_INVALID("Unsupported conversion type");
1944    }
1945}
1946
1947void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
1948{
1949    // Unpack tuple args
1950    Value*(&vGatherResult)[2]                       = std::get<0>(args);
1951    Value*                     pVtxOut              = std::get<1>(args);
1952    const Instruction::CastOps extendType           = std::get<2>(args);
1953    const ConversionType       conversionType       = std::get<3>(args);
1954    uint32_t&                  currentVertexElement = std::get<4>(args);
1955    uint32_t&                  outputElt            = std::get<5>(args);
1956    const ComponentEnable      compMask             = std::get<6>(args);
1957    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1958    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1959
1960    // cast types
1961    Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
1962    Type* v32x8Ty   = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1963
1964    // have to do extra work for sign extending
1965    if ((extendType == Instruction::CastOps::SExt) ||
1966        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
1967    {
1968        // is this PP float?
1969        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
1970
1971        Type* v8x16Ty   = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane
1972        Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128),
1973                                          mVWidth / 4); // vwidth is units of 32 bits
1974
1975        // shuffle mask
1976        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
1977                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
1978        Value* vi128XY    = nullptr;
1979        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
1980        {
1981            Value* vShufResult =
1982                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
1983            // after pshufb: group components together in each 128bit lane
1984            // 256i - 0    1    2    3    4    5    6    7
1985            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
1986
1987            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
1988            // after PERMD: move and pack xy components into each 128bit lane
1989            // 256i - 0    1    2    3    4    5    6    7
1990            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
1991        }
1992
1993        // do the same for zw components
1994        Value* vi128ZW = nullptr;
1995        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
1996        {
1997            Value* vShufResult =
1998                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
1999            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
2000        }
2001
2002        // init denormalize variables if needed
2003        Instruction::CastOps IntToFpCast;
2004        Value*               conversionFactor;
2005
2006        switch (conversionType)
2007        {
2008        case CONVERT_NORMALIZED:
2009            IntToFpCast      = Instruction::CastOps::SIToFP;
2010            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
2011            break;
2012        case CONVERT_SSCALED:
2013            IntToFpCast      = Instruction::CastOps::SIToFP;
2014            conversionFactor = VIMMED1((float)(1.0));
2015            break;
2016        case CONVERT_USCALED:
2017            SWR_INVALID("Type should not be sign extended!");
2018            conversionFactor = nullptr;
2019            break;
2020        default:
2021            SWR_ASSERT(conversionType == CONVERT_NONE);
2022            conversionFactor = nullptr;
2023            break;
2024        }
2025
2026        // sign extend all enabled components. If we have a fill vVertexElements, output to current
2027        // simdvertex
2028        for (uint32_t i = 0; i < 4; i++)
2029        {
2030            if (isComponentEnabled(compMask, i))
2031            {
2032                if (compCtrl[i] == ComponentControl::StoreSrc)
2033                {
2034                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
2035                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
2036                    // if x or y, use vi128XY permute result, else use vi128ZW
2037                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
2038
2039                    if (bFP)
2040                    {
2041                        // extract 128 bit lanes to sign extend each component
2042                        vVertexElements[currentVertexElement] =
2043                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2044                    }
2045                    else
2046                    {
2047                        // extract 128 bit lanes to sign extend each component
2048                        vVertexElements[currentVertexElement] =
2049                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
2050
2051                        // denormalize if needed
2052                        if (conversionType != CONVERT_NONE)
2053                        {
2054                            vVertexElements[currentVertexElement] =
2055                                FMUL(CAST(IntToFpCast,
2056                                          vVertexElements[currentVertexElement],
2057                                          mSimdFP32Ty),
2058                                     conversionFactor);
2059                        }
2060                    }
2061                    currentVertexElement++;
2062                }
2063                else
2064                {
2065                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2066                }
2067
2068                if (currentVertexElement > 3)
2069                {
2070                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2071                    // reset to the next vVertexElement to output
2072                    currentVertexElement = 0;
2073                }
2074            }
2075        }
2076    }
2077    // else zero extend
2078    else if ((extendType == Instruction::CastOps::ZExt) ||
2079             (extendType == Instruction::CastOps::UIToFP))
2080    {
2081        // pshufb masks for each component
2082        Value* vConstMask[2];
2083        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
2084        {
2085            // x/z shuffle mask
2086            vConstMask[0] = C<char>({
2087                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2088                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
2089            });
2090        }
2091
2092        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
2093        {
2094            // y/w shuffle mask
2095            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
2096                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
2097        }
2098
2099        // init denormalize variables if needed
2100        Instruction::CastOps fpCast;
2101        Value*               conversionFactor;
2102
2103        switch (conversionType)
2104        {
2105        case CONVERT_NORMALIZED:
2106            fpCast           = Instruction::CastOps::UIToFP;
2107            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
2108            break;
2109        case CONVERT_USCALED:
2110            fpCast           = Instruction::CastOps::UIToFP;
2111            conversionFactor = VIMMED1((float)(1.0f));
2112            break;
2113        case CONVERT_SSCALED:
2114            SWR_INVALID("Type should not be zero extended!");
2115            conversionFactor = nullptr;
2116            break;
2117        default:
2118            SWR_ASSERT(conversionType == CONVERT_NONE);
2119            conversionFactor = nullptr;
2120            break;
2121        }
2122
2123        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
2124        for (uint32_t i = 0; i < 4; i++)
2125        {
2126            if (isComponentEnabled(compMask, i))
2127            {
2128                if (compCtrl[i] == ComponentControl::StoreSrc)
2129                {
2130                    // select correct constMask for x/z or y/w pshufb
2131                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
2132                    // if x or y, use vi128XY permute result, else use vi128ZW
2133                    uint32_t selectedGather = (i < 2) ? 0 : 1;
2134
2135                    vVertexElements[currentVertexElement] =
2136                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
2137                                       vConstMask[selectedMask]),
2138                                vGatherTy);
2139                    // after pshufb mask for x channel; z uses the same shuffle from the second
2140                    // gather 256i - 0    1    2    3    4    5    6    7
2141                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
2142
2143                    // denormalize if needed
2144                    if (conversionType != CONVERT_NONE)
2145                    {
2146                        vVertexElements[currentVertexElement] =
2147                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
2148                                 conversionFactor);
2149                    }
2150                    currentVertexElement++;
2151                }
2152                else
2153                {
2154                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
2155                }
2156
2157                if (currentVertexElement > 3)
2158                {
2159                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
2160                    // reset to the next vVertexElement to output
2161                    currentVertexElement = 0;
2162                }
2163            }
2164        }
2165    }
2166    else
2167    {
2168        SWR_INVALID("Unsupported conversion type");
2169    }
2170}
2171
2172//////////////////////////////////////////////////////////////////////////
2173/// @brief Output a simdvertex worth of elements to the current outputElt
2174/// @param pVtxOut - base address of VIN output struct
2175/// @param outputElt - simdvertex offset in VIN to write to
2176/// @param numEltsToStore - number of simdvertex rows to write out
2177/// @param vVertexElements - LLVM Value*[] simdvertex to write out
2178void FetchJit::StoreVertexElements(Value*         pVtxOut,
2179                                   const uint32_t outputElt,
2180                                   const uint32_t numEltsToStore,
2181                                   Value* (&vVertexElements)[4])
2182{
2183    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2184
2185    for (uint32_t c = 0; c < numEltsToStore; ++c)
2186    {
2187        // STORE expects FP32 x vWidth type, just bitcast if needed
2188        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2189        {
2190#if FETCH_DUMP_VERTEX
2191            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2192#endif
2193            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2194        }
2195#if FETCH_DUMP_VERTEX
2196        else
2197        {
2198            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2199        }
2200#endif
2201        // outputElt * 4 = offsetting by the size of a simdvertex
2202        // + c offsets to a 32bit x vWidth row within the current vertex
2203        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
2204        STORE(vVertexElements[c], dest);
2205    }
2206}
2207
2208//////////////////////////////////////////////////////////////////////////
2209/// @brief Generates a constant vector of values based on the
2210/// ComponentControl value
2211/// @param ctrl - ComponentControl value
2212Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2213{
2214    switch (ctrl)
2215    {
2216    case NoStore:
2217        return VUNDEF_I();
2218    case Store0:
2219        return VIMMED1(0);
2220    case Store1Fp:
2221        return VIMMED1(1.0f);
2222    case Store1Int:
2223        return VIMMED1(1);
2224    case StoreVertexId:
2225    {
2226        if (mVWidth == 16)
2227        {
2228            Type*  pSimd8FPTy = getVectorType(mFP32Ty, 8);
2229            Value* pIdLo =
2230                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
2231            Value* pIdHi =
2232                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
2233            return JOIN_16(pIdLo, pIdHi);
2234        }
2235        else
2236        {
2237            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
2238        }
2239    }
2240    case StoreInstanceId:
2241    {
2242        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
2243        return VBROADCAST(pId);
2244    }
2245
2246
2247    case StoreSrc:
2248    default:
2249        SWR_INVALID("Invalid component control");
2250        return VUNDEF_I();
2251    }
2252}
2253
2254//////////////////////////////////////////////////////////////////////////
2255/// @brief Returns the enable mask for the specified component.
2256/// @param enableMask - enable bits
2257/// @param component - component to check if enabled.
2258bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2259{
2260    switch (component)
2261    {
2262        // X
2263    case 0:
2264        return (enableMask & ComponentEnable::X);
2265        // Y
2266    case 1:
2267        return (enableMask & ComponentEnable::Y);
2268        // Z
2269    case 2:
2270        return (enableMask & ComponentEnable::Z);
2271        // W
2272    case 3:
2273        return (enableMask & ComponentEnable::W);
2274
2275    default:
2276        return false;
2277    }
2278}
2279
// Serializes fetch shader code generation. Two threads must not compile the
// same fetch shader simultaneously because the JIT cache implementation has
// problems with that. This is only a problem for fetch right now.
// Locked by JitFetchFunc() around the function address lookup and the asm dump.
static std::mutex gFetchCodegenMutex;
2284
2285//////////////////////////////////////////////////////////////////////////
2286/// @brief JITs from fetch shader IR
2287/// @param hJitMgr - JitManager handle
2288/// @param func   - LLVM function IR
2289/// @return PFN_FETCH_FUNC - pointer to fetch code
2290PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2291{
2292    const llvm::Function* func    = (const llvm::Function*)hFunc;
2293    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2294    PFN_FETCH_FUNC        pfnFetch;
2295
2296    gFetchCodegenMutex.lock();
2297    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2298    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
2299    // add new IR to the module
2300    pJitMgr->mIsModuleFinalized = true;
2301
2302#if defined(KNOB_SWRC_TRACING)
2303    char        fName[1024];
2304    const char* funcName = func->getName().data();
2305    sprintf(fName, "%s.bin", funcName);
2306    FILE* fd = fopen(fName, "wb");
2307    fwrite((void*)pfnFetch, 1, 2048, fd);
2308    fclose(fd);
2309#endif
2310
2311    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2312    gFetchCodegenMutex.unlock();
2313
2314
2315    return pfnFetch;
2316}
2317
2318//////////////////////////////////////////////////////////////////////////
2319/// @brief JIT compiles fetch shader
2320/// @param hJitMgr - JitManager handle
2321/// @param state   - fetch state to build function from
2322extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2323{
2324    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2325
2326    pJitMgr->SetupNewModule();
2327
2328    FetchJit theJit(pJitMgr);
2329    HANDLE   hFunc = theJit.Create(state);
2330
2331    return JitFetchFunc(hJitMgr, hFunc);
2332}
2333