1/****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file fetch_jit.cpp
24 *
25 * @brief Implementation of the fetch jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30#include "jit_pch.hpp"
31#include "builder_gfx_mem.h"
32#include "jit_api.h"
33#include "fetch_jit.h"
34#include "gen_state_llvm.h"
35#include "functionpasses/passes.h"
36
37//#define FETCH_DUMP_VERTEX 1
38using namespace llvm;
39using namespace SwrJit;
40
41bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);
42
// How fetched integer data is converted to float before being stored into
// the output simdvertex.
enum ConversionType
{
    CONVERT_NONE,       // store raw bits, no numeric conversion
    CONVERT_NORMALIZED, // scale to [0,1] (UNORM) or [-1,1] (SNORM)
    CONVERT_USCALED,    // unsigned int converted to float, unscaled
    CONVERT_SSCALED,    // signed int converted to float, unscaled
    CONVERT_SFIXED,     // signed fixed-point converted to float
};
51
52//////////////////////////////////////////////////////////////////////////
53/// Interface to Jitting a fetch shader
54//////////////////////////////////////////////////////////////////////////
struct FetchJit : public BuilderGfxMem
{
    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}

    // Builds, verifies and optimizes the fetch shader function for the given
    // compile state; returns the llvm::Function ready for codegen.
    Function* Create(const FETCH_COMPILE_STATE& fetchState);

    // Load a SIMD of indices from the index buffer with out-of-bounds
    // protection (lanes past pLastIndex are not read).
    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
    template <typename T>
    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);

    // package up Shuffle*bpcGatherd args into a tuple for convenience
    typedef std::tuple<Value*&,                     // gather result in / shuffled result out
                       Value*,                      // pVtxOut
                       const Instruction::CastOps,  // extension op (sext/zext/fpext)
                       const ConversionType,        // numeric conversion to apply
                       uint32_t&,                   // currentVertexElement (updated)
                       uint32_t&,                   // outputElt (updated)
                       const ComponentEnable,       // component mask
                       const ComponentControl (&)[4], // per-component control
                       Value* (&)[4],               // vVertexElements out
                       const uint32_t (&)[4]>       // swizzle
        Shuffle8bpcArgs;

    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);

    typedef std::tuple<Value* (&)[2],               // gather results (xy and zw halves)
                       Value*,                      // pVtxOut
                       const Instruction::CastOps,  // extension op
                       const ConversionType,        // numeric conversion to apply
                       uint32_t&,                   // currentVertexElement (updated)
                       uint32_t&,                   // outputElt (updated)
                       const ComponentEnable,       // component mask
                       const ComponentControl (&)[4], // per-component control
                       Value* (&)[4]>               // vVertexElements out
        Shuffle16bpcArgs;

    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
    void Shuffle16bpcGather(Shuffle16bpcArgs& args);

    // Writes numEltsToStore accumulated components out to the simdvertex
    // struct at outputElt.
    void StoreVertexElements(Value*         pVtxOut,
                             const uint32_t outputElt,
                             const uint32_t numEltsToStore,
                             Value* (&vVertexElements)[4]);

    // Materializes a SIMD constant (0, 1, vertexID, etc.) for a component
    // that is not fetched from memory.
    Value* GenerateCompCtrlVector(const ComponentControl ctrl);

    // Main worker: gathers all enabled attributes for a SIMD of vertices and
    // stores them to the output simdvertex.
    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                           Value*                     streams,
                           Value*                     vIndices,
                           Value*                     pVtxOut);

    bool IsOddFormat(SWR_FORMAT format);
    bool IsUniformFormat(SWR_FORMAT format);
    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
    void CreateGatherOddFormats(
        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);

    Value* mpWorkerData; // 2nd fetch-shader argument, opaque worker context
    Value* mpFetchInfo;  // 3rd fetch-shader argument, SWR_FETCH_CONTEXT*
};
119
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
{
    // Name the function "FCH_<crc of compile state>" so identical states hash
    // to the same shader name.
    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));

    Function* fetch = Function::Create(
        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);

    fetch->getParent()->setModuleIdentifier(fetch->getName());

    IRB()->SetInsertPoint(entry);

    auto argitr = fetch->arg_begin();

    // Fetch shader arguments: (privateContext, pWorkerData, fetchInfo, vtxOutput)
    Value* privateContext = &*argitr;
    ++argitr;
    privateContext->setName("privateContext");
    SetPrivateContext(privateContext);

    mpWorkerData = &*argitr;
    ++argitr;
    mpWorkerData->setName("pWorkerData");
    mpFetchInfo = &*argitr;
    ++argitr;
    mpFetchInfo->setName("fetchInfo");
    Value* pVtxOut = &*argitr;
    pVtxOut->setName("vtxOutput");

    // Remember the native width so it can be restored after the SIMD16
    // override below.
    uint32_t baseWidth = mVWidth;

    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);

    // Override builder target width to force 16-wide SIMD
#if USE_SIMD16_SHADERS
    SetTargetWidth(16);
#endif

    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));

    // SWR_FETCH_CONTEXT::pStreams
    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
    streams->setName("pStreams");

    // SWR_FETCH_CONTEXT::pIndices
    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
    indices->setName("pIndices");

    // SWR_FETCH_CONTEXT::pLastIndex
    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
    pLastIndex->setName("pLastIndex");

    // Load a SIMD of indices, widened to 32-bit. With the OOB check disabled
    // we can do a single straight vector load; otherwise the GetSimdValid*
    // helpers clamp reads at pLastIndex.
    Value* vIndices;
    switch (fetchState.indexType)
    {
    case R8_UINT:
        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
        }
        break;
    case R16_UINT:
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
        }
        break;
    case R32_UINT:
        (fetchState.bDisableIndexOOBCheck)
            ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
        break; // incoming type is already 32bit int
    default:
        SWR_INVALID("Unsupported index type");
        vIndices = nullptr;
        break;
    }

    if (fetchState.bForceSequentialAccessEnable)
    {
        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});

        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vIndices = ADD(vIndices, pOffsets);
    }

    Value* vVertexId = vIndices;
    if (fetchState.bVertexIDOffsetEnable)
    {
        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
        // correct
        Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vVertexId           = ADD(vIndices, vBaseVertex);
        vVertexId           = ADD(vVertexId, vStartVertex);
    }

    // store out vertex IDs
    if (mVWidth == 16)
    {
        // store out in simd8 halves until core supports 16-wide natively
        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
    }
    else if (mVWidth == 8)
    {
        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
    }

    // store out cut mask if enabled
    if (fetchState.bEnableCutIndex)
    {
        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
        Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));

        if (mVWidth == 16)
        {
            auto cutMaskLo = EXTRACT_16(cutMask, 0);
            auto cutMaskHi = EXTRACT_16(cutMask, 1);
            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
        }
        else if (mVWidth == 8)
        {
            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
        }
    }

    // Fetch attributes from memory and output to a simdvertex struct
    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

    RET_VOID();

    JitManager::DumpToFile(fetch, "src");

#if defined(_DEBUG)
    verifyFunction(*fetch);
#endif

    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);

    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    setupPasses.add(createBreakCriticalEdgesPass());
    setupPasses.add(createCFGSimplificationPass());
    setupPasses.add(createEarlyCSEPass());
    setupPasses.add(createPromoteMemoryToRegisterPass());

    setupPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "se");

    ::FunctionPassManager optPasses(JM()->mpCurrentModule);

    ///@todo Haven't touched these either. Need to remove some of these and add others.
    optPasses.add(createCFGSimplificationPass());
    optPasses.add(createEarlyCSEPass());
    optPasses.add(createInstructionCombiningPass());
    optPasses.add(createConstantPropagationPass());
    optPasses.add(createSCCPPass());
    optPasses.add(createAggressiveDCEPass());

    optPasses.run(*fetch);

    // Lower the SWR x86 intrinsics after generic optimization, then re-run
    // the pipeline (now including LowerX86) to clean up the lowered IR.
    optPasses.add(createLowerX86Pass(this));
    optPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "opt");


    // Revert 16-wide override
#if USE_SIMD16_SHADERS
    SetTargetWidth(baseWidth);
#endif

    return fetch;
}
316
317// returns true for odd formats that require special state.gather handling
318bool FetchJit::IsOddFormat(SWR_FORMAT format)
319{
320    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
321    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
322    {
323        return true;
324    }
325    return false;
326}
327
328// format is uniform if all components are the same size and type
329bool FetchJit::IsUniformFormat(SWR_FORMAT format)
330{
331    const SWR_FORMAT_INFO& info  = GetFormatInfo(format);
332    uint32_t               bpc0  = info.bpc[0];
333    uint32_t               type0 = info.type[0];
334
335    for (uint32_t c = 1; c < info.numComps; ++c)
336    {
337        if (bpc0 != info.bpc[c] || type0 != info.type[c])
338        {
339            return false;
340        }
341    }
342    return true;
343}
344
345// unpacks components based on format
346// foreach component in the pixel
347//   mask off everything but this component
348//   shift component to LSB
349void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
350{
351    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
352
353    uint32_t bitOffset = 0;
354    for (uint32_t c = 0; c < info.numComps; ++c)
355    {
356        uint32_t swizzledIndex = info.swizzle[c];
357        uint32_t compBits      = info.bpc[c];
358        uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
359        Value*   comp          = AND(vInput, bitmask);
360        comp                   = LSHR(comp, bitOffset);
361
362        result[swizzledIndex] = comp;
363        bitOffset += compBits;
364    }
365}
366
367// gather for odd component size formats
368// gather SIMD full pixels per lane then shift/mask to move each component to their
369// own vector
370void FetchJit::CreateGatherOddFormats(
371    SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
372{
373    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
374
375    // only works if pixel size is <= 32bits
376    SWR_ASSERT(info.bpp <= 32);
377
378    Value* pGather;
379    if (info.bpp == 32)
380    {
381        pGather = GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
382    }
383    else
384    {
385        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
386        Value* pMem = ALLOCA(mSimdInt32Ty);
387        STORE(VIMMED1(0u), pMem);
388
389        Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);
390
391        for (uint32_t lane = 0; lane < mVWidth; ++lane)
392        {
393            // Get index
394            Value* index = VEXTRACT(pOffsets, C(lane));
395            Value* mask  = VEXTRACT(pMask, C(lane));
396
397            // use branch around load based on mask
398            // Needed to avoid page-faults on unmasked lanes
399            BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
400            BasicBlock* pMaskedLoadBlock =
401                BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
402            BasicBlock* pEndLoadBB = BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());
403
404            COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);
405
406            JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);
407
408            switch (info.bpp)
409            {
410            case 8:
411            {
412                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
413                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
414                STORE(LOAD(xpSrc, "", mInt8PtrTy, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
415                break;
416            }
417
418            case 16:
419            {
420                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
421                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
422                STORE(LOAD(xpSrc, "", mInt16PtrTy, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
423                break;
424            }
425            break;
426
427            case 24:
428            {
429                // First 16-bits of data
430                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
431                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
432                STORE(LOAD(xpSrc, "", mInt16PtrTy, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
433
434                // Last 8-bits of data
435                pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
436                xpSrc = ADD(xpSrc, C(2));
437                STORE(LOAD(xpSrc, "", mInt8PtrTy, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
438                break;
439            }
440
441            default:
442                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
443                break;
444            }
445
446            BR(pEndLoadBB);
447            JM()->mBuilder.SetInsertPoint(pEndLoadBB);
448        }
449
450        pGather = LOAD(pMem);
451    }
452
453    for (uint32_t comp = 0; comp < 4; ++comp)
454    {
455        pResult[comp] = VIMMED1((int)info.defaults[comp]);
456    }
457
458    UnpackComponents(format, pGather, pResult);
459
460    // cast to fp32
461    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
462    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
463    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
464    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
465}
466
// Applies numeric conversion to gathered texels in place. Only normalized
// (UNORM/SNORM) components are converted here; UNUSED components are skipped
// and all other types pass through untouched.
//
// @param format - source format describing per-component type/size
// @param texels - four SIMD component vectors, converted in place to fp32
void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        // per-format swizzle maps storage order to output component slot
        uint32_t compIndex = info.swizzle[c];

        // skip any conversion on UNUSED components
        if (info.type[c] == SWR_TYPE_UNUSED)
        {
            continue;
        }

        if (info.isNormalized[c])
        {
            if (info.type[c] == SWR_TYPE_SNORM)
            {
                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
                /// -1.0f.

                /// result = c * (1.0f / (2^(n-1) - 1);
                uint32_t n        = info.bpc[c];
                uint32_t pow2     = 1 << (n - 1);
                float    scale    = 1.0f / (float)(pow2 - 1);
                Value*   vScale   = VIMMED1(scale);
                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                texels[compIndex] = FMUL(texels[compIndex], vScale);
            }
            else
            {
                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);

                /// result = c * (1.0f / (2^n - 1))
                uint32_t n    = info.bpc[c];
                uint32_t pow2 = 1 << n;
                // special case 24bit unorm format, which requires a full divide to meet ULP
                // requirement
                if (n == 24)
                {
                    float  scale      = (float)(pow2 - 1);
                    Value* vScale     = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FDIV(texels[compIndex], vScale);
                }
                else
                {
                    // multiply by reciprocal is exact enough for n < 24
                    float  scale      = 1.0f / (float)(pow2 - 1);
                    Value* vScale     = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FMUL(texels[compIndex], vScale);
                }
            }
            continue;
        }
    }
}
527
528//////////////////////////////////////////////////////////////////////////
529/// @brief Loads attributes from memory using AVX2 GATHER(s)
530/// @param fetchState - info about attributes to be fetched from memory
531/// @param streams - value pointer to the current vertex stream
532/// @param vIndices - vector value of indices to gather
533/// @param pVtxOut - value pointer to output simdvertex struct
534void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
535                                 Value*                     streams,
536                                 Value*                     vIndices,
537                                 Value*                     pVtxOut)
538{
539    uint32_t currentVertexElement = 0;
540    uint32_t outputElt            = 0;
541    Value*   vVertexElements[4];
542
543    Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
544    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
545    Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
546    Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
547    curInstance->setName("curInstance");
548
549    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
550    {
551        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
552
553        // skip element if all components are disabled
554        if (ied.ComponentPacking == ComponentEnable::NONE)
555        {
556            continue;
557        }
558
559        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
560        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
561        uint32_t bpc =
562            info.bpp /
563            info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
564
565        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
566
567        Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
568        Value* vStride = VBROADCAST(stride);
569
570        // max vertex index that is fully in bounds
571        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
572        maxVertex        = LOAD(maxVertex);
573
574        Value* minVertex = NULL;
575        if (fetchState.bPartialVertexBuffer)
576        {
577            // min vertex index for low bounds OOB checking
578            minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
579            minVertex = LOAD(minVertex);
580        }
581
582        if (fetchState.bInstanceIDOffsetEnable)
583        {
584            // the InstanceID (curInstance) value is offset by StartInstanceLocation
585            curInstance = ADD(curInstance, startInstance);
586        }
587
588        Value* vCurIndices;
589        Value* startOffset;
590        Value* vInstanceStride = VIMMED1(0);
591
592        if (ied.InstanceEnable)
593        {
594            Value* stepRate = C(ied.InstanceAdvancementState);
595
596            // prevent a div by 0 for 0 step rate
597            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
598            stepRate             = SELECT(isNonZeroStep, stepRate, C(1));
599
600            // calc the current offset into instanced data buffer
601            Value* calcInstance = UDIV(curInstance, stepRate);
602
603            // if step rate is 0, every instance gets instance 0
604            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
605
606            vCurIndices = VBROADCAST(calcInstance);
607            startOffset = startInstance;
608        }
609        else if (ied.InstanceStrideEnable)
610        {
611            // grab the instance advancement state, determines stride in bytes from one instance to
612            // the next
613            Value* stepRate = C(ied.InstanceAdvancementState);
614            vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
615
616            // offset indices by baseVertex
617            vCurIndices = ADD(vIndices, vBaseVertex);
618
619            startOffset = startVertex;
620            SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
621        }
622        else
623        {
624            // offset indices by baseVertex
625            vCurIndices = ADD(vIndices, vBaseVertex);
626            startOffset = startVertex;
627        }
628
629        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
630        // do 64bit address offset calculations.
631
632        // calculate byte offset to the start of the VB
633        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
634
635        // VGATHER* takes an *i8 src pointer so that's what stream is
636        Value* pStreamBaseGFX = ADD(stream, baseOffset);
637
638        // if we have a start offset, subtract from max vertex. Used for OOB check
639        maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
640        Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
641        // if we have a negative value, we're already OOB. clamp at 0.
642        maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
643
644        if (fetchState.bPartialVertexBuffer)
645        {
646            // similary for min vertex
647            minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
648            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
649            minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
650        }
651
652        // Load the in bounds size of a partially valid vertex
653        Value* partialInboundsSize =
654            GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
655        partialInboundsSize       = LOAD(partialInboundsSize);
656        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
657        Value* vBpp               = VBROADCAST(C(info.Bpp));
658        Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));
659
660        // is the element is <= the partially valid size
661        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
662
663        // override cur indices with 0 if pitch is 0
664        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
665        vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
666
667        // are vertices partially OOB?
668        Value* vMaxVertex      = VBROADCAST(maxVertex);
669        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
670
671        // are vertices fully in bounds?
672        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
673
674        Value* vGatherMask;
675        if (fetchState.bPartialVertexBuffer)
676        {
677            // are vertices below minVertex limit?
678            Value* vMinVertex     = VBROADCAST(minVertex);
679            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
680
681            // only fetch lanes that pass both tests
682            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
683        }
684        else
685        {
686            vGatherMask = vMaxGatherMask;
687        }
688
689        // blend in any partially OOB indices that have valid elements
690        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
691
692        // calculate the actual offsets into the VB
693        Value* vOffsets = MUL(vCurIndices, vStride);
694        vOffsets        = ADD(vOffsets, vAlignmentOffsets);
695
696        // if instance stride enable is:
697        //  true  - add product of the instanceID and advancement state to the offst into the VB
698        //  false - value of vInstanceStride has been initialialized to zero
699        vOffsets = ADD(vOffsets, vInstanceStride);
700
701        // Packing and component control
702        ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
703        const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
704                                           (ComponentControl)ied.ComponentControl1,
705                                           (ComponentControl)ied.ComponentControl2,
706                                           (ComponentControl)ied.ComponentControl3};
707
708        // Special gather/conversion for formats without equal component sizes
709        if (IsOddFormat((SWR_FORMAT)ied.Format))
710        {
711            Value* pResults[4];
712            CreateGatherOddFormats(
713                (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
714            ConvertFormat((SWR_FORMAT)ied.Format, pResults);
715
716            for (uint32_t c = 0; c < 4; c += 1)
717            {
718                if (isComponentEnabled(compMask, c))
719                {
720                    vVertexElements[currentVertexElement++] = pResults[c];
721                    if (currentVertexElement > 3)
722                    {
723                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
724                        // reset to the next vVertexElement to output
725                        currentVertexElement = 0;
726                    }
727                }
728            }
729        }
730        else if (info.type[0] == SWR_TYPE_FLOAT)
731        {
732            ///@todo: support 64 bit vb accesses
733            Value* gatherSrc = VIMMED1(0.0f);
734
735            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
736                       "Unsupported format for standard gather fetch.");
737
738            // Gather components from memory to store in a simdvertex structure
739            switch (bpc)
740            {
741            case 16:
742            {
743                Value* vGatherResult[2];
744
745                // if we have at least one component out of x or y to fetch
746                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
747                {
748                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask);
749                    // e.g. result of first 8x32bit integer gather for 16bit components
750                    // 256i - 0    1    2    3    4    5    6    7
751                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
752                    //
753                }
754
755                // if we have at least one component out of z or w to fetch
756                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
757                {
758                    // offset base to the next components(zw) in the vertex to gather
759                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
760
761                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask);
762                    // e.g. result of second 8x32bit integer gather for 16bit components
763                    // 256i - 0    1    2    3    4    5    6    7
764                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
765                    //
766                }
767
768                // if we have at least one component to shuffle into place
769                if (compMask)
770                {
771                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
772                                                                  pVtxOut,
773                                                                  Instruction::CastOps::FPExt,
774                                                                  CONVERT_NONE,
775                                                                  currentVertexElement,
776                                                                  outputElt,
777                                                                  compMask,
778                                                                  compCtrl,
779                                                                  vVertexElements);
780
781                    // Shuffle gathered components into place in simdvertex struct
782                    mVWidth == 16 ? Shuffle16bpcGather16(args)
783                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
784                }
785            }
786            break;
787            case 32:
788            {
789                for (uint32_t i = 0; i < 4; i += 1)
790                {
791                    if (isComponentEnabled(compMask, i))
792                    {
793                        // if we need to gather the component
794                        if (compCtrl[i] == StoreSrc)
795                        {
796                            // Gather a SIMD of vertices
797                            // APIs allow a 4GB range for offsets
798                            // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
799                            // Add 2GB to the base pointer and 2GB to the offsets.  This makes
800                            // "negative" (large) offsets into positive offsets and small offsets
801                            // into negative offsets.
802                            Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
803                            vVertexElements[currentVertexElement++] =
804                                GATHERPS(gatherSrc,
805                                         ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
806                                         vNewOffsets,
807                                         vGatherMask,
808                                         1,
809                                         JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
810                        }
811                        else
812                        {
813                            vVertexElements[currentVertexElement++] =
814                                GenerateCompCtrlVector(compCtrl[i]);
815                        }
816
817                        if (currentVertexElement > 3)
818                        {
819                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
820                            // reset to the next vVertexElement to output
821                            currentVertexElement = 0;
822                        }
823                    }
824
825                    // offset base to the next component in the vertex to gather
826                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
827                }
828            }
829            break;
830            case 64:
831            {
832                for (uint32_t i = 0; i < 4; i += 1)
833                {
834                    if (isComponentEnabled(compMask, i))
835                    {
836                        // if we need to gather the component
837                        if (compCtrl[i] == StoreSrc)
838                        {
839                            Value* vShufLo;
840                            Value* vShufHi;
841                            Value* vShufAll;
842
843                            if (mVWidth == 8)
844                            {
845                                vShufLo  = C({0, 1, 2, 3});
846                                vShufHi  = C({4, 5, 6, 7});
847                                vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
848                            }
849                            else
850                            {
851                                SWR_ASSERT(mVWidth == 16);
852                                vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
853                                vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
854                                vShufAll =
855                                    C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
856                            }
857
858                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
859                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
860
861                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
862                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
863
864                            Value* vZeroDouble = VECTOR_SPLAT(
865                                mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
866
867                            Value* pGatherLo =
868                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
869                            Value* pGatherHi =
870                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);
871
872                            pGatherLo = VCVTPD2PS(pGatherLo);
873                            pGatherHi = VCVTPD2PS(pGatherHi);
874
875                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
876
877                            vVertexElements[currentVertexElement++] = pGather;
878                        }
879                        else
880                        {
881                            vVertexElements[currentVertexElement++] =
882                                GenerateCompCtrlVector(compCtrl[i]);
883                        }
884
885                        if (currentVertexElement > 3)
886                        {
887                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
888                            // reset to the next vVertexElement to output
889                            currentVertexElement = 0;
890                        }
891                    }
892
893                    // offset base to the next component  in the vertex to gather
894                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
895                }
896            }
897            break;
898            default:
899                SWR_INVALID("Tried to fetch invalid FP format");
900                break;
901            }
902        }
903        else
904        {
905            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
906            ConversionType       conversionType = CONVERT_NONE;
907
908            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
909                       "Unsupported format for standard gather fetch.");
910
911            switch (info.type[0])
912            {
913            case SWR_TYPE_UNORM:
914                conversionType = CONVERT_NORMALIZED;
915            case SWR_TYPE_UINT:
916                extendCastType = Instruction::CastOps::ZExt;
917                break;
918            case SWR_TYPE_SNORM:
919                conversionType = CONVERT_NORMALIZED;
920            case SWR_TYPE_SINT:
921                extendCastType = Instruction::CastOps::SExt;
922                break;
923            case SWR_TYPE_USCALED:
924                conversionType = CONVERT_USCALED;
925                extendCastType = Instruction::CastOps::UIToFP;
926                break;
927            case SWR_TYPE_SSCALED:
928                conversionType = CONVERT_SSCALED;
929                extendCastType = Instruction::CastOps::SIToFP;
930                break;
931            case SWR_TYPE_SFIXED:
932                conversionType = CONVERT_SFIXED;
933                extendCastType = Instruction::CastOps::SExt;
934                break;
935            default:
936                break;
937            }
938
939            // value substituted when component of gather is masked
940            Value* gatherSrc = VIMMED1(0);
941
942            // Gather components from memory to store in a simdvertex structure
943            switch (bpc)
944            {
945            case 8:
946            {
947                // if we have at least one component to fetch
948                if (compMask)
949                {
950                    Value* vGatherResult = GATHERDD(
951                        gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
952                    // e.g. result of an 8x32bit integer gather for 8bit components
953                    // 256i - 0    1    2    3    4    5    6    7
954                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
955
956                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
957                                                                 pVtxOut,
958                                                                 extendCastType,
959                                                                 conversionType,
960                                                                 currentVertexElement,
961                                                                 outputElt,
962                                                                 compMask,
963                                                                 compCtrl,
964                                                                 vVertexElements,
965                                                                 info.swizzle);
966
967                    // Shuffle gathered components into place in simdvertex struct
968                    mVWidth == 16 ? Shuffle8bpcGatherd16(args)
969                                  : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
970                }
971            }
972            break;
973            case 16:
974            {
975                Value* vGatherResult[2];
976
977                // if we have at least one component out of x or y to fetch
978                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
979                {
980                    vGatherResult[0] = GATHERDD(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
981                    // e.g. result of first 8x32bit integer gather for 16bit components
982                    // 256i - 0    1    2    3    4    5    6    7
983                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
984                    //
985                }
986
987                // if we have at least one component out of z or w to fetch
988                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
989                {
990                    // offset base to the next components(zw) in the vertex to gather
991                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
992
993                    vGatherResult[1] = GATHERDD(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
994                    // e.g. result of second 8x32bit integer gather for 16bit components
995                    // 256i - 0    1    2    3    4    5    6    7
996                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
997                    //
998                }
999
1000                // if we have at least one component to shuffle into place
1001                if (compMask)
1002                {
1003                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
1004                                                                  pVtxOut,
1005                                                                  extendCastType,
1006                                                                  conversionType,
1007                                                                  currentVertexElement,
1008                                                                  outputElt,
1009                                                                  compMask,
1010                                                                  compCtrl,
1011                                                                  vVertexElements);
1012
1013                    // Shuffle gathered components into place in simdvertex struct
1014                    mVWidth == 16 ? Shuffle16bpcGather16(args)
1015                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
1016                }
1017            }
1018            break;
1019            case 32:
1020            {
1021                // Gathered components into place in simdvertex struct
1022                for (uint32_t i = 0; i < 4; i++)
1023                {
1024                    if (isComponentEnabled(compMask, i))
1025                    {
1026                        // if we need to gather the component
1027                        if (compCtrl[i] == StoreSrc)
1028                        {
1029                            Value* pGather =
1030                                GATHERDD(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1031
1032                            if (conversionType == CONVERT_USCALED)
1033                            {
1034                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
1035                            }
1036                            else if (conversionType == CONVERT_SSCALED)
1037                            {
1038                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
1039                            }
1040                            else if (conversionType == CONVERT_SFIXED)
1041                            {
1042                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
1043                                               VBROADCAST(C(1 / 65536.0f)));
1044                            }
1045
1046                            vVertexElements[currentVertexElement++] = pGather;
1047
1048                            // e.g. result of a single 8x32bit integer gather for 32bit components
1049                            // 256i - 0    1    2    3    4    5    6    7
1050                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
1051                        }
1052                        else
1053                        {
1054                            vVertexElements[currentVertexElement++] =
1055                                GenerateCompCtrlVector(compCtrl[i]);
1056                        }
1057
1058                        if (currentVertexElement > 3)
1059                        {
1060                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1061
1062                            // reset to the next vVertexElement to output
1063                            currentVertexElement = 0;
1064                        }
1065                    }
1066
1067                    // offset base to the next component  in the vertex to gather
1068                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
1069                }
1070            }
1071            break;
1072            }
1073        }
1074    }
1075
1076    // if we have a partially filled vVertexElement struct, output it
1077    if (currentVertexElement > 0)
1078    {
1079        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
1080    }
1081}
1082
1083template <typename T>
1084Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
1085{
1086    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
1087               "Function expects gfxptr_t for both input parameters.");
1088
1089    Type* Ty = nullptr;
1090
1091    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
1092                  "Unsupported type for use with GetSimdValidIndicesHelper<T>");
1093    constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
1094    if (bSize)
1095    {
1096        Ty = mInt16PtrTy;
1097    }
1098    else if (sizeof(T) == sizeof(uint8_t))
1099    {
1100        Ty = mInt8PtrTy;
1101    }
1102    else
1103    {
1104        SWR_ASSERT(false, "This should never happen as per static_assert above.");
1105    }
1106
1107    Value* vIndices = VUNDEF_I();
1108
1109    {
1110        // store 0 index on stack to be used to conditionally load from if index address is OOB
1111        Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
1112        STORE(C((T)0), pZeroIndex);
1113
1114        // Load a SIMD of index pointers
1115        for (int64_t lane = 0; lane < mVWidth; lane++)
1116        {
1117            // Calculate the address of the requested index
1118            Value* pIndex = GEP(pIndices, C(lane), Ty);
1119
1120            pLastIndex = INT_TO_PTR(pLastIndex, Ty);
1121
1122            // check if the address is less than the max index,
1123            Value* mask = ICMP_ULT(pIndex, pLastIndex);
1124
1125            // if valid, load the index. if not, load 0 from the stack
1126            Value* pValid = SELECT(mask, pIndex, pZeroIndex);
1127            Value* index  = LOAD(pValid, "valid index", Ty, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1128
1129            // zero extended index to 32 bits and insert into the correct simd lane
1130            index    = Z_EXT(index, mInt32Ty);
1131            vIndices = VINSERT(vIndices, index, lane);
1132        }
1133    }
1134
1135    return vIndices;
1136}
1137
1138//////////////////////////////////////////////////////////////////////////
1139/// @brief Loads a simd of valid indices. OOB indices are set to 0
1140/// *Note* have to do 8bit index checking in scalar until we have AVX-512
1141/// support
1142/// @param pIndices - pointer to 8 bit indices
1143/// @param pLastIndex - pointer to last valid index
1144Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
1145{
1146    return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
1147}
1148
1149//////////////////////////////////////////////////////////////////////////
1150/// @brief Loads a simd of valid indices. OOB indices are set to 0
1151/// *Note* have to do 16bit index checking in scalar until we have AVX-512
1152/// support
1153/// @param pIndices - pointer to 16 bit indices
1154/// @param pLastIndex - pointer to last valid index
1155Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
1156{
1157    return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
1158}
1159
1160//////////////////////////////////////////////////////////////////////////
1161/// @brief Loads a simd of valid indices. OOB indices are set to 0
1162/// @param pIndices - pointer to 32 bit indices
1163/// @param pLastIndex - pointer to last valid index
1164Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
1165{
1166    DataLayout dL(JM()->mpCurrentModule);
1167    Value*     iLastIndex = pLastIndex;
1168    Value*     iIndices   = pIndices;
1169
1170    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
1171    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
1172    numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
1173    numIndicesLeft        = SDIV(numIndicesLeft, C(4));
1174
1175    // create a vector of index counts from the base index ptr passed into the fetch
1176    Constant* vIndexOffsets;
1177    if (mVWidth == 8)
1178    {
1179        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
1180    }
1181    else
1182    {
1183        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
1184    }
1185
1186    // compare index count to the max valid index
1187    // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
1188    //     vIndexOffsets  0 1 2 3 4 5 6 7
1189    //     ------------------------------
1190    //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
1191    //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
1192    Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
1193    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
1194
1195    // Load the indices; OOB loads 0
1196    return MASKED_LOAD(pIndices,
1197                       4,
1198                       vIndexMask,
1199                       VIMMED1(0),
1200                       "vIndices",
1201                       PointerType::get(mSimdInt32Ty, 0),
1202                       JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
1203}
1204
1205//////////////////////////////////////////////////////////////////////////
1206/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
1207/// denormalizes if needed, converts to F32 if needed, and positions in
1208//  the proper SIMD rows to be output to the simdvertex structure
1209/// @param args: (tuple of args, listed below)
1210///   @param vGatherResult - 8 gathered 8bpc vertices
1211///   @param pVtxOut - base pointer to output simdvertex struct
1212///   @param extendType - sign extend or zero extend
1213///   @param bNormalized - do we need to denormalize?
1214///   @param currentVertexElement - reference to the current vVertexElement
1215///   @param outputElt - reference to the current offset from simdvertex we're o
1216///   @param compMask - component packing mask
1217///   @param compCtrl - component control val
1218///   @param vVertexElements[4] - vertex components to output
1219///   @param swizzle[4] - component swizzle location
void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
{
    // Unpack tuple args
    Value*&                    vGatherResult        = std::get<0>(args);
    Value*                     pVtxOut              = std::get<1>(args);
    const Instruction::CastOps extendType           = std::get<2>(args);
    const ConversionType       conversionType       = std::get<3>(args);
    uint32_t&                  currentVertexElement = std::get<4>(args);
    uint32_t&                  outputElt            = std::get<5>(args);
    const ComponentEnable      compMask             = std::get<6>(args);
    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
    Value*(&vVertexElements)[4]                     = std::get<8>(args);
    const uint32_t(&swizzle)[4]                     = std::get<9>(args);

    // cast types
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
    {
        Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
        Type* v128Ty  = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // shuffle mask, including any swizzling
        const char x          = (char)swizzle[0];
        const char y          = (char)swizzle[1];
        const char z          = (char)swizzle[2];
        const char w          = (char)swizzle[3];
        // Byte indices for PSHUFB: pick the swizzled component out of each of
        // the four gathered 32-bit vertices in a 128-bit lane, grouping like
        // components together (xxxx yyyy zzzz wwww); repeated for both lanes.
        Value*     vConstMask = C<char>(
            {char(x),     char(x + 4),  char(x + 8), char(x + 12), char(y),     char(y + 4),
             char(y + 8), char(y + 12), char(z),     char(z + 4),  char(z + 8), char(z + 12),
             char(w),     char(w + 4),  char(w + 8), char(w + 12), char(x),     char(x + 4),
             char(x + 8), char(x + 12), char(y),     char(y + 4),  char(y + 8), char(y + 12),
             char(z),     char(z + 4),  char(z + 8), char(z + 12), char(w),     char(w + 4),
             char(w + 8), char(w + 12)});

        // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..

        Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
        Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);

        Value* vShufResult_lo =
            BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
        Value* vShufResult_hi =
            BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

        // after pshufb: group components together in each 128bit lane
        // 256i - 0    1    2    3    4    5    6    7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        // Permute results are computed lazily: only for the component pairs
        // that are actually enabled in compMask.
        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            vi128XY_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
                v128Ty);
            vi128XY_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
                v128Ty);

            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            vi128ZW_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
                v128Ty);
            vi128ZW_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
                v128Ty);
        }

        // init denormalize variables if needed
        // NOTE(review): fpCast is only assigned for CONVERT_NORMALIZED and
        // CONVERT_SSCALED; it is read below only when conversionType !=
        // CONVERT_NONE, so CONVERT_USCALED (flagged invalid here) would read
        // it uninitialized — presumably unreachable for sign-extended
        // formats; confirm against callers.
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 127.0)); // SNORM8 scale
            break;
        case CONVERT_SSCALED:
            fpCast           = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a fill vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    // sign extend
                    Value* temp_lo =
                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
                    Value* temp_hi =
                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));

                    // recombine the two SIMD8 halves into a SIMD16 result
                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                // flush a full set of 4 accumulated elements to the simdvertex
                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // init denormalize variables if needed
        // NOTE(review): as in the sign-extend path, fpCast is left unset for
        // CONVERT_SSCALED (flagged invalid) and CONVERT_NONE; it is only read
        // when a conversion is requested.
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 255.0)); // UNORM8 scale
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // pshufb masks for each component
                    // (-1 lanes are zeroed by PSHUFB, which provides the zero
                    // extension to 32 bits)
                    Value* vConstMask;
                    switch (swizzle[i])
                    {
                    case 0:
                        // x shuffle mask
                        vConstMask =
                            C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                     0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                        break;
                    case 1:
                        // y shuffle mask
                        vConstMask =
                            C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                     1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                        break;
                    case 2:
                        // z shuffle mask
                        vConstMask =
                            C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                     2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                        break;
                    case 3:
                        // w shuffle mask
                        vConstMask =
                            C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                     3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                        break;
                    default:
                        // presumably unreachable for valid swizzles 0-3;
                        // PSHUFB below would dereference nullptr — TODO confirm
                        vConstMask = nullptr;
                        break;
                    }

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);

                    Value* temp_lo =
                        BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
                    Value* temp_hi =
                        BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);

                    // after pshufb for x channel
                    // 256i - 0    1    2    3    4    5    6    7
                    //        x000 x000 x000 x000 x000 x000 x000 x000

                    // recombine the two SIMD8 halves into a SIMD16 result
                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                // flush a full set of 4 accumulated elements to the simdvertex
                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
1481
1482void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
1483{
1484    // Unpack tuple args
1485    Value*&                    vGatherResult        = std::get<0>(args);
1486    Value*                     pVtxOut              = std::get<1>(args);
1487    const Instruction::CastOps extendType           = std::get<2>(args);
1488    const ConversionType       conversionType       = std::get<3>(args);
1489    uint32_t&                  currentVertexElement = std::get<4>(args);
1490    uint32_t&                  outputElt            = std::get<5>(args);
1491    const ComponentEnable      compMask             = std::get<6>(args);
1492    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
1493    Value*(&vVertexElements)[4]                     = std::get<8>(args);
1494    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
1495
1496    // cast types
1497    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
1498
1499    for (uint32_t i = 0; i < 4; i++)
1500    {
1501        if (!isComponentEnabled(compMask, i))
1502            continue;
1503
1504        if (compCtrl[i] == ComponentControl::StoreSrc)
1505        {
1506            std::vector<uint32_t> vShuffleMasks[4] = {
1507                {0, 4, 8, 12, 16, 20, 24, 28},  // x
1508                {1, 5, 9, 13, 17, 21, 25, 29},  // y
1509                {2, 6, 10, 14, 18, 22, 26, 30}, // z
1510                {3, 7, 11, 15, 19, 23, 27, 31}, // w
1511            };
1512
1513            Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
1514                                  UndefValue::get(v32x8Ty),
1515                                  vShuffleMasks[swizzle[i]]);
1516
1517            if ((extendType == Instruction::CastOps::SExt) ||
1518                (extendType == Instruction::CastOps::SIToFP))
1519            {
1520                switch (conversionType)
1521                {
1522                case CONVERT_NORMALIZED:
1523                    val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0)));
1524                    break;
1525                case CONVERT_SSCALED:
1526                    val = SI_TO_FP(val, mSimdFP32Ty);
1527                    break;
1528                case CONVERT_USCALED:
1529                    SWR_INVALID("Type should not be sign extended!");
1530                    break;
1531                default:
1532                    SWR_ASSERT(conversionType == CONVERT_NONE);
1533                    val = S_EXT(val, mSimdInt32Ty);
1534                    break;
1535                }
1536            }
1537            else if ((extendType == Instruction::CastOps::ZExt) ||
1538                     (extendType == Instruction::CastOps::UIToFP))
1539            {
1540                switch (conversionType)
1541                {
1542                case CONVERT_NORMALIZED:
1543                    val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0)));
1544                    break;
1545                case CONVERT_SSCALED:
1546                    SWR_INVALID("Type should not be zero extended!");
1547                    break;
1548                case CONVERT_USCALED:
1549                    val = UI_TO_FP(val, mSimdFP32Ty);
1550                    break;
1551                default:
1552                    SWR_ASSERT(conversionType == CONVERT_NONE);
1553                    val = Z_EXT(val, mSimdInt32Ty);
1554                    break;
1555                }
1556            }
1557            else
1558            {
1559                SWR_INVALID("Unsupported conversion type");
1560            }
1561
1562            vVertexElements[currentVertexElement++] = val;
1563        }
1564        else
1565        {
1566            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
1567        }
1568
1569        if (currentVertexElement > 3)
1570        {
1571            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
1572            // reset to the next vVertexElement to output
1573            currentVertexElement = 0;
1574        }
1575    }
1576}
1577
1578//////////////////////////////////////////////////////////////////////////
1579/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
1580/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - sign extend or zero extend
///   @param conversionType - conversion to apply (normalized/scaled/none)
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
1590///   @param compCtrl - component control val
1591///   @param vVertexElements[4] - vertex components to output
void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value*(&vGatherResult)[2]                       = std::get<0>(args);
    Value*                     pVtxOut              = std::get<1>(args);
    const Instruction::CastOps extendType           = std::get<2>(args);
    const ConversionType       conversionType       = std::get<3>(args);
    uint32_t&                  currentVertexElement = std::get<4>(args);
    uint32_t&                  outputElt            = std::get<5>(args);
    const ComponentEnable      compMask             = std::get<6>(args);
    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
    Value*(&vVertexElements)[4]                     = std::get<8>(args);

    // cast types
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) ||
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this PP float?
        // FPExt means the 16bit data is packed half-precision floats
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // shuffle mask
        // groups the even (x/z) 16bit words into the low 64 bits of each
        // 128bit lane and the odd (y/w) words into the high 64 bits
        Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for
            // now..

            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);

            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);

            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
            vi128XY_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);

            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);

            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);

            vi128ZW_lo = BITCAST(
                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
            vi128ZW_hi = BITCAST(
                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
                v128bitTy);
        }

        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            // SNORM16: int->float then scale by 1/32767
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a fill vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                    if (bFP)
                    {
                        // extract 128 bit lanes to sign extend each component
                        // CVTPH2PS converts the packed half floats to full FP32
                        Value* temp_lo =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        // PMOVSXWD sign extends 8x16bit to 8x32bit
                        Value* temp_lo =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
                        Value* temp_hi =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));

                        Value* temp = JOIN_16(temp_lo, temp_hi);

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor);
                        }

                        vVertexElements[currentVertexElement] = temp;
                    }

                    currentVertexElement += 1;
                }
                else
                {
                    // component is a constant (0 / 1 / vertex id / instance id)
                    vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];

        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            // UNORM16: int->float then scale by 1/65535
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL,
                    // for now..

                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);

                    Value* temp_lo = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);
                    Value* temp_hi = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);

                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    // component is a constant (0 / 1 / vertex id / instance id)
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
1860
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
/// the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
///   @param pVtxOut - base pointer to output simdvertex struct
///   @param extendType - sign extend or zero extend
///   @param conversionType - conversion to apply (normalized/scaled/none)
///   @param currentVertexElement - reference to the current vVertexElement
///   @param outputElt - reference to the current offset from simdvertex we're outputting to
///   @param compMask - component packing mask
///   @param compCtrl - component control value
///   @param vVertexElements[4] - vertex components to output
void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value*(&vGatherResult)[2]                       = std::get<0>(args);
    Value*                     pVtxOut              = std::get<1>(args);
    const Instruction::CastOps extendType           = std::get<2>(args);
    const ConversionType       conversionType       = std::get<3>(args);
    uint32_t&                  currentVertexElement = std::get<4>(args);
    uint32_t&                  outputElt            = std::get<5>(args);
    const ComponentEnable      compMask             = std::get<6>(args);
    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
    Value*(&vVertexElements)[4]                     = std::get<8>(args);

    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) ||
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this PP float?
        // FPExt means the 16bit data is packed half-precision floats
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                          mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        // groups the even (x/z) 16bit words into the low 64 bits of each
        // 128bit lane and the odd (y/w) words into the high 64 bits
        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
        Value* vi128XY    = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        }

        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            // SNORM16: int->float then scale by 1/32767
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast      = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a fill vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                    if (bFP)
                    {
                        // extract 128 bit lanes to sign extend each component
                        // CVTPH2PS converts the packed half floats to full FP32
                        vVertexElements[currentVertexElement] =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        // PMOVSXWD sign extends 8x16bit to 8x32bit
                        vVertexElements[currentVertexElement] =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            vVertexElements[currentVertexElement] =
                                FMUL(CAST(IntToFpCast,
                                          vVertexElements[currentVertexElement],
                                          mSimdFP32Ty),
                                     conversionFactor);
                        }
                    }
                    currentVertexElement++;
                }
                else
                {
                    // component is a constant (0 / 1 / vertex id / instance id)
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value*               conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            // UNORM16: int->float then scale by 1/65535
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast           = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    vVertexElements[currentVertexElement] =
                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
                                       vConstMask[selectedMask]),
                                vGatherTy);
                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather 256i - 0    1    2    3    4    5    6    7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vVertexElements[currentVertexElement] =
                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
                                 conversionFactor);
                    }
                    currentVertexElement++;
                }
                else
                {
                    // component is a constant (0 / 1 / vertex id / instance id)
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}
2085
2086//////////////////////////////////////////////////////////////////////////
2087/// @brief Output a simdvertex worth of elements to the current outputElt
2088/// @param pVtxOut - base address of VIN output struct
2089/// @param outputElt - simdvertex offset in VIN to write to
2090/// @param numEltsToStore - number of simdvertex rows to write out
2091/// @param vVertexElements - LLVM Value*[] simdvertex to write out
2092void FetchJit::StoreVertexElements(Value*         pVtxOut,
2093                                   const uint32_t outputElt,
2094                                   const uint32_t numEltsToStore,
2095                                   Value* (&vVertexElements)[4])
2096{
2097    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
2098
2099    for (uint32_t c = 0; c < numEltsToStore; ++c)
2100    {
2101        // STORE expects FP32 x vWidth type, just bitcast if needed
2102        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
2103        {
2104#if FETCH_DUMP_VERTEX
2105            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
2106#endif
2107            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
2108        }
2109#if FETCH_DUMP_VERTEX
2110        else
2111        {
2112            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
2113        }
2114#endif
2115        // outputElt * 4 = offsetting by the size of a simdvertex
2116        // + c offsets to a 32bit x vWidth row within the current vertex
2117        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
2118        STORE(vVertexElements[c], dest);
2119    }
2120}
2121
2122//////////////////////////////////////////////////////////////////////////
2123/// @brief Generates a constant vector of values based on the
2124/// ComponentControl value
2125/// @param ctrl - ComponentControl value
2126Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
2127{
2128    switch (ctrl)
2129    {
2130    case NoStore:
2131        return VUNDEF_I();
2132    case Store0:
2133        return VIMMED1(0);
2134    case Store1Fp:
2135        return VIMMED1(1.0f);
2136    case Store1Int:
2137        return VIMMED1(1);
2138    case StoreVertexId:
2139    {
2140        if (mVWidth == 16)
2141        {
2142            Type*  pSimd8FPTy = VectorType::get(mFP32Ty, 8);
2143            Value* pIdLo =
2144                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
2145            Value* pIdHi =
2146                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
2147            return JOIN_16(pIdLo, pIdHi);
2148        }
2149        else
2150        {
2151            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
2152        }
2153    }
2154    case StoreInstanceId:
2155    {
2156        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
2157        return VBROADCAST(pId);
2158    }
2159
2160
2161    case StoreSrc:
2162    default:
2163        SWR_INVALID("Invalid component control");
2164        return VUNDEF_I();
2165    }
2166}
2167
2168//////////////////////////////////////////////////////////////////////////
2169/// @brief Returns the enable mask for the specified component.
2170/// @param enableMask - enable bits
2171/// @param component - component to check if enabled.
2172bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
2173{
2174    switch (component)
2175    {
2176        // X
2177    case 0:
2178        return (enableMask & ComponentEnable::X);
2179        // Y
2180    case 1:
2181        return (enableMask & ComponentEnable::Y);
2182        // Z
2183    case 2:
2184        return (enableMask & ComponentEnable::Z);
2185        // W
2186    case 3:
2187        return (enableMask & ComponentEnable::W);
2188
2189    default:
2190        return false;
2191    }
2192}
2193
// Serializes fetch-shader codegen across threads: the JIT cache
// implementation has problems when two threads compile the same fetch
// shader simultaneously, so the JitFetchFunc path takes this global lock.
// Fetch is currently the only shader stage affected.
static std::mutex gFetchCodegenMutex;
2198
2199//////////////////////////////////////////////////////////////////////////
2200/// @brief JITs from fetch shader IR
2201/// @param hJitMgr - JitManager handle
2202/// @param func   - LLVM function IR
2203/// @return PFN_FETCH_FUNC - pointer to fetch code
2204PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
2205{
2206    const llvm::Function* func    = (const llvm::Function*)hFunc;
2207    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2208    PFN_FETCH_FUNC        pfnFetch;
2209
2210    gFetchCodegenMutex.lock();
2211    pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
2212    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
2213    // add new IR to the module
2214    pJitMgr->mIsModuleFinalized = true;
2215
2216#if defined(KNOB_SWRC_TRACING)
2217    char        fName[1024];
2218    const char* funcName = func->getName().data();
2219    sprintf(fName, "%s.bin", funcName);
2220    FILE* fd = fopen(fName, "wb");
2221    fwrite((void*)pfnFetch, 1, 2048, fd);
2222    fclose(fd);
2223#endif
2224
2225    pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
2226    gFetchCodegenMutex.unlock();
2227
2228
2229    return pfnFetch;
2230}
2231
2232//////////////////////////////////////////////////////////////////////////
2233/// @brief JIT compiles fetch shader
2234/// @param hJitMgr - JitManager handle
2235/// @param state   - fetch state to build function from
2236extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state)
2237{
2238    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
2239
2240    pJitMgr->SetupNewModule();
2241
2242    FetchJit theJit(pJitMgr);
2243    HANDLE   hFunc = theJit.Create(state);
2244
2245    return JitFetchFunc(hJitMgr, hFunc);
2246}
2247