1/**************************************************************************** 2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file fetch_jit.cpp 24 * 25 * @brief Implementation of the fetch jitter 26 * 27 * Notes: 28 * 29 ******************************************************************************/ 30#include "jit_pch.hpp" 31#include "builder_gfx_mem.h" 32#include "jit_api.h" 33#include "fetch_jit.h" 34#include "gen_state_llvm.h" 35#include "functionpasses/passes.h" 36 37//#define FETCH_DUMP_VERTEX 1 38using namespace llvm; 39using namespace SwrJit; 40 41bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); 42 43enum ConversionType 44{ 45 CONVERT_NONE, 46 CONVERT_NORMALIZED, 47 CONVERT_USCALED, 48 CONVERT_SSCALED, 49 CONVERT_SFIXED, 50}; 51 52////////////////////////////////////////////////////////////////////////// 53/// Interface to Jitting a fetch shader 54////////////////////////////////////////////////////////////////////////// 55struct FetchJit : public BuilderGfxMem 56{ 57 FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr), mpFetchInfo(NULL) {} 58 59 Function* Create(const FETCH_COMPILE_STATE& fetchState); 60 61 Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); 62 Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); 63 Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); 64 template <typename T> 65 Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex); 66 67 // package up Shuffle*bpcGatherd args into a tuple for convenience 68 typedef std::tuple<Value*&, 69 Value*, 70 const Instruction::CastOps, 71 const ConversionType, 72 uint32_t&, 73 uint32_t&, 74 const ComponentEnable, 75 const ComponentControl (&)[4], 76 Value* (&)[4], 77 const uint32_t (&)[4]> 78 Shuffle8bpcArgs; 79 80 void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args); 81 void Shuffle8bpcGatherd(Shuffle8bpcArgs& args); 82 83 typedef std::tuple<Value* (&)[2], 84 Value*, 85 const Instruction::CastOps, 86 const ConversionType, 87 uint32_t&, 88 uint32_t&, 89 const ComponentEnable, 90 const ComponentControl (&)[4], 91 Value* (&)[4]> 92 Shuffle16bpcArgs; 93 94 void Shuffle16bpcGather16(Shuffle16bpcArgs& args); 95 void Shuffle16bpcGather(Shuffle16bpcArgs& args); 96 97 void StoreVertexElements(Value* pVtxOut, 98 const uint32_t outputElt, 99 const uint32_t numEltsToStore, 100 Value* (&vVertexElements)[4]); 101 102 Value* GenerateCompCtrlVector(const ComponentControl ctrl); 103 104 void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState, 105 Value* streams, 106 Value* vIndices, 107 Value* pVtxOut); 108 109 bool IsOddFormat(SWR_FORMAT format); 110 bool IsUniformFormat(SWR_FORMAT format); 111 void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]); 112 void CreateGatherOddFormats( 113 SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]); 114 void ConvertFormat(SWR_FORMAT format, Value* texels[4]); 115 116 Value* mpFetchInfo; 117}; 118 119Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) 120{ 121 std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate); 122 fnName << ComputeCRC(0, &fetchState, sizeof(fetchState)); 123 124 Function* fetch = Function::Create( 125 JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); 126 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch); 127 128 fetch->getParent()->setModuleIdentifier(fetch->getName()); 129 130 IRB()->SetInsertPoint(entry); 131 132 auto argitr = fetch->arg_begin(); 133 134 // Fetch shader arguments 135 Value* privateContext = &*argitr; 136 ++argitr; 137 privateContext->setName("privateContext"); 138 SetPrivateContext(privateContext); 139 140 mpWorkerData = &*argitr; 141 ++argitr; 142 mpWorkerData->setName("pWorkerData"); 143 144 mpFetchInfo = &*argitr; 145 ++argitr; 146 mpFetchInfo->setName("fetchInfo"); 147 Value* pVtxOut = &*argitr; 148 pVtxOut->setName("vtxOutput"); 149 150 uint32_t baseWidth = mVWidth; 151 152 SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth); 153 154 // Override builder target width to force 16-wide SIMD 155#if USE_SIMD16_SHADERS 156 SetTargetWidth(16); 157#endif 158 159 pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0)); 160 161 // SWR_FETCH_CONTEXT::pStreams 162 Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams}); 163 streams->setName("pStreams"); 164 165 // SWR_FETCH_CONTEXT::pIndices 166 Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices}); 167 indices->setName("pIndices"); 168 169 // SWR_FETCH_CONTEXT::pLastIndex 170 Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex}); 171 pLastIndex->setName("pLastIndex"); 172 173 Value* vIndices; 174 switch (fetchState.indexType) 175 { 176 case R8_UINT: 177 indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0)); 178 if (fetchState.bDisableIndexOOBCheck) 179 { 180 vIndices = LOAD( 181 BITCAST(indices, PointerType::get(getVectorType(mInt8Ty, mpJitMgr->mVWidth), 0)), 182 {(uint32_t)0}); 183 vIndices = Z_EXT(vIndices, mSimdInt32Ty); 184 } 185 else 186 { 187 vIndices = GetSimdValid8bitIndices(indices, pLastIndex); 188 } 189 break; 190 case R16_UINT: 191 if (fetchState.bDisableIndexOOBCheck) 192 { 193 vIndices = LOAD( 194 BITCAST(indices, PointerType::get(getVectorType(mInt16Ty, mpJitMgr->mVWidth), 0)), 195 {(uint32_t)0}); 196 vIndices = Z_EXT(vIndices, mSimdInt32Ty); 197 } 198 else 199 { 200 vIndices = GetSimdValid16bitIndices(indices, pLastIndex); 201 } 202 break; 203 case R32_UINT: 204 (fetchState.bDisableIndexOOBCheck) 205 ? vIndices = LOAD(indices, 206 "", 207 PointerType::get(mSimdInt32Ty, 0), 208 MEM_CLIENT::GFX_MEM_CLIENT_FETCH) 209 : vIndices = GetSimdValid32bitIndices(indices, pLastIndex); 210 break; // incoming type is already 32bit int 211 default: 212 vIndices = nullptr; 213 assert(false && "Unsupported index type"); 214 break; 215 } 216 217 if (fetchState.bForceSequentialAccessEnable) 218 { 219 Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7}) 220 : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); 221 222 // VertexData buffers are accessed sequentially, the index is equal to the vertex number 223 vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex})); 224 vIndices = ADD(vIndices, pOffsets); 225 } 226 227 Value* vVertexId = vIndices; 228 if (fetchState.bVertexIDOffsetEnable) 229 { 230 // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally 231 // correct 232 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); 233 Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex})); 234 vVertexId = ADD(vIndices, vBaseVertex); 235 vVertexId = ADD(vVertexId, vStartVertex); 236 } 237 238 // store out vertex IDs 239 if (mVWidth == 16) 240 { 241 // store out in simd8 halves until core supports 16-wide natively 242 auto vVertexIdLo = EXTRACT_16(vVertexId, 0); 243 auto vVertexIdHi = EXTRACT_16(vVertexId, 1); 244 STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})); 245 STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})); 246 } 247 else if (mVWidth == 8) 248 { 249 STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})); 250 } 251 252 // store out cut mask if enabled 253 if (fetchState.bEnableCutIndex) 254 { 255 Value* vCutIndex = VIMMED1(fetchState.cutIndex); 256 Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex)); 257 258 if (mVWidth == 16) 259 { 260 auto cutMaskLo = EXTRACT_16(cutMask, 0); 261 auto cutMaskHi = EXTRACT_16(cutMask, 1); 262 STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask})); 263 STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2})); 264 } 265 else if (mVWidth == 8) 266 { 267 STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask})); 268 } 269 } 270 271 // Fetch attributes from memory and output to a simdvertex struct 272 JitGatherVertices(fetchState, streams, vIndices, pVtxOut); 273 274 RET_VOID(); 275 276 JitManager::DumpToFile(fetch, "src"); 277 278#if defined(_DEBUG) 279 verifyFunction(*fetch); 280#endif 281 282 ::FunctionPassManager setupPasses(JM()->mpCurrentModule); 283 284 ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) 285 setupPasses.add(createBreakCriticalEdgesPass()); 286 setupPasses.add(createCFGSimplificationPass()); 287 setupPasses.add(createEarlyCSEPass()); 288 setupPasses.add(createPromoteMemoryToRegisterPass()); 289 290 setupPasses.run(*fetch); 291 292 JitManager::DumpToFile(fetch, "se"); 293 294 ::FunctionPassManager optPasses(JM()->mpCurrentModule); 295 296 ///@todo Haven't touched these either. Need to remove some of these and add others. 297 optPasses.add(createCFGSimplificationPass()); 298 optPasses.add(createEarlyCSEPass()); 299 optPasses.add(createInstructionCombiningPass()); 300#if LLVM_VERSION_MAJOR <= 11 301 optPasses.add(createConstantPropagationPass()); 302#endif 303 optPasses.add(createSCCPPass()); 304 optPasses.add(createAggressiveDCEPass()); 305 306 optPasses.run(*fetch); 307 308 optPasses.add(createLowerX86Pass(this)); 309 optPasses.run(*fetch); 310 311 JitManager::DumpToFile(fetch, "opt"); 312 313 314 // Revert 16-wide override 315#if USE_SIMD16_SHADERS 316 SetTargetWidth(baseWidth); 317#endif 318 319 return fetch; 320} 321 322// returns true for odd formats that require special state.gather handling 323bool FetchJit::IsOddFormat(SWR_FORMAT format) 324{ 325 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 326 if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64) 327 { 328 return true; 329 } 330 return false; 331} 332 333// format is uniform if all components are the same size and type 334bool FetchJit::IsUniformFormat(SWR_FORMAT format) 335{ 336 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 337 uint32_t bpc0 = info.bpc[0]; 338 uint32_t type0 = info.type[0]; 339 340 for (uint32_t c = 1; c < info.numComps; ++c) 341 { 342 if (bpc0 != info.bpc[c] || type0 != info.type[c]) 343 { 344 return false; 345 } 346 } 347 return true; 348} 349 350// unpacks components based on format 351// foreach component in the pixel 352// mask off everything but this component 353// shift component to LSB 354void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]) 355{ 356 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 357 358 uint32_t bitOffset = 0; 359 for (uint32_t c = 0; c < info.numComps; ++c) 360 { 361 uint32_t swizzledIndex = info.swizzle[c]; 362 uint32_t compBits = info.bpc[c]; 363 uint32_t bitmask = ((1 << compBits) - 1) << bitOffset; 364 Value* comp = AND(vInput, bitmask); 365 comp = LSHR(comp, bitOffset); 366 367 result[swizzledIndex] = comp; 368 bitOffset += compBits; 369 } 370} 371 372// gather for odd component size formats 373// gather SIMD full pixels per lane then shift/mask to move each component to their 374// own vector 375void FetchJit::CreateGatherOddFormats( 376 SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4]) 377{ 378 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 379 380 // only works if pixel size is <= 32bits 381 SWR_ASSERT(info.bpp <= 32); 382 383 Value* pGather; 384 if (info.bpp == 32) 385 { 386 pGather = 387 GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 388 } 389 else 390 { 391 // Can't use 32-bit gather for items less than 32-bits, could cause page faults. 392 Value* pMem = ALLOCA(mSimdInt32Ty); 393 STORE(VIMMED1(0u), pMem); 394 395 Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy); 396 397 for (uint32_t lane = 0; lane < mVWidth; ++lane) 398 { 399 // Get index 400 Value* index = VEXTRACT(pOffsets, C(lane)); 401 Value* mask = VEXTRACT(pMask, C(lane)); 402 403 // use branch around load based on mask 404 // Needed to avoid page-faults on unmasked lanes 405 BasicBlock* pCurrentBB = IRB()->GetInsertBlock(); 406 BasicBlock* pMaskedLoadBlock = 407 BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent()); 408 BasicBlock* pEndLoadBB = 409 BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent()); 410 411 COND_BR(mask, pMaskedLoadBlock, pEndLoadBB); 412 413 JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock); 414 415 switch (info.bpp) 416 { 417 case 8: 418 { 419 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0)); 420 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType())); 421 STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst); 422 break; 423 } 424 425 case 16: 426 { 427 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0)); 428 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType())); 429 STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst); 430 break; 431 } 432 break; 433 434 case 24: 435 { 436 // First 16-bits of data 437 Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0)); 438 Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType())); 439 STORE(LOAD(xpSrc, "", mInt16PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst); 440 441 // Last 8-bits of data 442 pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0)); 443 xpSrc = ADD(xpSrc, C((int64_t)2)); 444 STORE(LOAD(xpSrc, "", mInt8PtrTy, MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst); 445 break; 446 } 447 448 default: 449 SWR_INVALID("Shouldn't have BPP = %d now", info.bpp); 450 break; 451 } 452 453 BR(pEndLoadBB); 454 JM()->mBuilder.SetInsertPoint(pEndLoadBB); 455 } 456 457 pGather = LOAD(pMem); 458 } 459 460 for (uint32_t comp = 0; comp < 4; ++comp) 461 { 462 pResult[comp] = VIMMED1((int)info.defaults[comp]); 463 } 464 465 UnpackComponents(format, pGather, pResult); 466 467 // cast to fp32 468 pResult[0] = BITCAST(pResult[0], mSimdFP32Ty); 469 pResult[1] = BITCAST(pResult[1], mSimdFP32Ty); 470 pResult[2] = BITCAST(pResult[2], mSimdFP32Ty); 471 pResult[3] = BITCAST(pResult[3], mSimdFP32Ty); 472} 473 474void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4]) 475{ 476 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 477 478 for (uint32_t c = 0; c < info.numComps; ++c) 479 { 480 uint32_t compIndex = info.swizzle[c]; 481 482 // skip any conversion on UNUSED components 483 if (info.type[c] == SWR_TYPE_UNUSED) 484 { 485 continue; 486 } 487 488 if (info.isNormalized[c]) 489 { 490 if (info.type[c] == SWR_TYPE_SNORM) 491 { 492 /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to 493 /// -1.0f. 494 495 /// result = c * (1.0f / (2^(n-1) - 1); 496 uint32_t n = info.bpc[c]; 497 uint32_t pow2 = 1 << (n - 1); 498 float scale = 1.0f / (float)(pow2 - 1); 499 Value* vScale = VIMMED1(scale); 500 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); 501 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty); 502 texels[compIndex] = FMUL(texels[compIndex], vScale); 503 } 504 else 505 { 506 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM); 507 508 /// result = c * (1.0f / (2^n - 1)) 509 uint32_t n = info.bpc[c]; 510 uint32_t pow2 = 1 << n; 511 // special case 24bit unorm format, which requires a full divide to meet ULP 512 // requirement 513 if (n == 24) 514 { 515 float scale = (float)(pow2 - 1); 516 Value* vScale = VIMMED1(scale); 517 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); 518 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty); 519 texels[compIndex] = FDIV(texels[compIndex], vScale); 520 } 521 else 522 { 523 float scale = 1.0f / (float)(pow2 - 1); 524 Value* vScale = VIMMED1(scale); 525 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty); 526 texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty); 527 texels[compIndex] = FMUL(texels[compIndex], vScale); 528 } 529 } 530 continue; 531 } 532 } 533} 534 535////////////////////////////////////////////////////////////////////////// 536/// @brief Loads attributes from memory using AVX2 GATHER(s) 537/// @param fetchState - info about attributes to be fetched from memory 538/// @param streams - value pointer to the current vertex stream 539/// @param vIndices - vector value of indices to gather 540/// @param pVtxOut - value pointer to output simdvertex struct 541void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState, 542 Value* streams, 543 Value* vIndices, 544 Value* pVtxOut) 545{ 546 uint32_t currentVertexElement = 0; 547 uint32_t outputElt = 0; 548 Value* vVertexElements[4]; 549 550 Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); 551 Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); 552 Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); 553 Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); 554 curInstance->setName("curInstance"); 555 556 for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1) 557 { 558 const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt]; 559 560 // skip element if all components are disabled 561 if (ied.ComponentPacking == ComponentEnable::NONE) 562 { 563 continue; 564 } 565 566 const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format); 567 SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices."); 568 uint32_t bpc = 569 info.bpp / 570 info.numComps; ///@todo Code below assumes all components are same size. Need to fix. 571 572 Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData}); 573 574 Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); 575 Value* vStride = VBROADCAST(stride); 576 577 // max vertex index that is fully in bounds 578 Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); 579 maxVertex = LOAD(maxVertex); 580 581 Value* minVertex = NULL; 582 if (fetchState.bPartialVertexBuffer) 583 { 584 // min vertex index for low bounds OOB checking 585 minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)}); 586 minVertex = LOAD(minVertex); 587 } 588 589 if (fetchState.bInstanceIDOffsetEnable) 590 { 591 // the InstanceID (curInstance) value is offset by StartInstanceLocation 592 curInstance = ADD(curInstance, startInstance); 593 } 594 595 Value* vCurIndices; 596 Value* startOffset; 597 Value* vInstanceStride = VIMMED1(0); 598 599 if (ied.InstanceEnable) 600 { 601 Value* stepRate = C(ied.InstanceAdvancementState); 602 603 // prevent a div by 0 for 0 step rate 604 Value* isNonZeroStep = ICMP_UGT(stepRate, C(0)); 605 stepRate = SELECT(isNonZeroStep, stepRate, C(1)); 606 607 // calc the current offset into instanced data buffer 608 Value* calcInstance = UDIV(curInstance, stepRate); 609 610 // if step rate is 0, every instance gets instance 0 611 calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); 612 613 vCurIndices = VBROADCAST(calcInstance); 614 startOffset = startInstance; 615 } 616 else if (ied.InstanceStrideEnable) 617 { 618 // grab the instance advancement state, determines stride in bytes from one instance to 619 // the next 620 Value* stepRate = C(ied.InstanceAdvancementState); 621 vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); 622 623 // offset indices by baseVertex 624 vCurIndices = ADD(vIndices, vBaseVertex); 625 626 startOffset = startVertex; 627 SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); 628 } 629 else 630 { 631 // offset indices by baseVertex 632 vCurIndices = ADD(vIndices, vBaseVertex); 633 startOffset = startVertex; 634 } 635 636 // All of the OOB calculations are in vertices, not VB offsets, to prevent having to 637 // do 64bit address offset calculations. 638 639 // calculate byte offset to the start of the VB 640 Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty)); 641 642 // VGATHER* takes an *i8 src pointer so that's what stream is 643 Value* pStreamBaseGFX = ADD(stream, baseOffset); 644 645 // if we have a start offset, subtract from max vertex. Used for OOB check 646 maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); 647 Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0)); 648 // if we have a negative value, we're already OOB. clamp at 0. 649 maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty)); 650 651 if (fetchState.bPartialVertexBuffer) 652 { 653 // similary for min vertex 654 minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); 655 Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0)); 656 minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty)); 657 } 658 659 // Load the in bounds size of a partially valid vertex 660 Value* partialInboundsSize = 661 GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); 662 partialInboundsSize = LOAD(partialInboundsSize); 663 Value* vPartialVertexSize = VBROADCAST(partialInboundsSize); 664 Value* vBpp = VBROADCAST(C(info.Bpp)); 665 Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); 666 667 // is the element is <= the partially valid size 668 Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets)); 669 670 // override cur indices with 0 if pitch is 0 671 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); 672 vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); 673 674 // are vertices partially OOB? 675 Value* vMaxVertex = VBROADCAST(maxVertex); 676 Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); 677 678 // are vertices fully in bounds? 679 Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); 680 681 Value* vGatherMask; 682 if (fetchState.bPartialVertexBuffer) 683 { 684 // are vertices below minVertex limit? 685 Value* vMinVertex = VBROADCAST(minVertex); 686 Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); 687 688 // only fetch lanes that pass both tests 689 vGatherMask = AND(vMaxGatherMask, vMinGatherMask); 690 } 691 else 692 { 693 vGatherMask = vMaxGatherMask; 694 } 695 696 // blend in any partially OOB indices that have valid elements 697 vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); 698 699 // calculate the actual offsets into the VB 700 Value* vOffsets = MUL(vCurIndices, vStride); 701 vOffsets = ADD(vOffsets, vAlignmentOffsets); 702 703 // if instance stride enable is: 704 // true - add product of the instanceID and advancement state to the offset into the VB 705 // false - value of vInstanceStride has been initialized to zero 706 vOffsets = ADD(vOffsets, vInstanceStride); 707 708 // Packing and component control 709 ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; 710 const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0, 711 (ComponentControl)ied.ComponentControl1, 712 (ComponentControl)ied.ComponentControl2, 713 (ComponentControl)ied.ComponentControl3}; 714 715 // Special gather/conversion for formats without equal component sizes 716 if (IsOddFormat((SWR_FORMAT)ied.Format)) 717 { 718 Value* pResults[4]; 719 CreateGatherOddFormats( 720 (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults); 721 ConvertFormat((SWR_FORMAT)ied.Format, pResults); 722 723 for (uint32_t c = 0; c < 4; c += 1) 724 { 725 if (isComponentEnabled(compMask, c)) 726 { 727 vVertexElements[currentVertexElement++] = pResults[c]; 728 if (currentVertexElement > 3) 729 { 730 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 731 // reset to the next vVertexElement to output 732 currentVertexElement = 0; 733 } 734 } 735 } 736 } 737 else if (info.type[0] == SWR_TYPE_FLOAT) 738 { 739 ///@todo: support 64 bit vb accesses 740 Value* gatherSrc = VIMMED1(0.0f); 741 742 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), 743 "Unsupported format for standard gather fetch."); 744 745 // Gather components from memory to store in a simdvertex structure 746 switch (bpc) 747 { 748 case 16: 749 { 750 Value* vGatherResult[2]; 751 752 // if we have at least one component out of x or y to fetch 753 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) 754 { 755 vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 756 // e.g. result of first 8x32bit integer gather for 16bit components 757 // 256i - 0 1 2 3 4 5 6 7 758 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy 759 // 760 } 761 762 // if we have at least one component out of z or w to fetch 763 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) 764 { 765 // offset base to the next components(zw) in the vertex to gather 766 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); 767 768 vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 769 // e.g. result of second 8x32bit integer gather for 16bit components 770 // 256i - 0 1 2 3 4 5 6 7 771 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 772 // 773 } 774 775 // if we have at least one component to shuffle into place 776 if (compMask) 777 { 778 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, 779 pVtxOut, 780 Instruction::CastOps::FPExt, 781 CONVERT_NONE, 782 currentVertexElement, 783 outputElt, 784 compMask, 785 compCtrl, 786 vVertexElements); 787 788 // Shuffle gathered components into place in simdvertex struct 789 mVWidth == 16 ? Shuffle16bpcGather16(args) 790 : Shuffle16bpcGather(args); // outputs to vVertexElements ref 791 } 792 } 793 break; 794 case 32: 795 { 796 for (uint32_t i = 0; i < 4; i += 1) 797 { 798 if (isComponentEnabled(compMask, i)) 799 { 800 // if we need to gather the component 801 if (compCtrl[i] == StoreSrc) 802 { 803 // Gather a SIMD of vertices 804 // APIs allow a 4GB range for offsets 805 // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :( 806 // Add 2GB to the base pointer and 2GB to the offsets. This makes 807 // "negative" (large) offsets into positive offsets and small offsets 808 // into negative offsets. 809 Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000)); 810 vVertexElements[currentVertexElement++] = 811 GATHERPS(gatherSrc, 812 ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)), 813 vNewOffsets, 814 vGatherMask, 815 1, 816 MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 817 } 818 else 819 { 820 vVertexElements[currentVertexElement++] = 821 GenerateCompCtrlVector(compCtrl[i]); 822 } 823 824 if (currentVertexElement > 3) 825 { 826 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 827 // reset to the next vVertexElement to output 828 currentVertexElement = 0; 829 } 830 } 831 832 // offset base to the next component in the vertex to gather 833 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); 834 } 835 } 836 break; 837 case 64: 838 { 839 for (uint32_t i = 0; i < 4; i += 1) 840 { 841 if (isComponentEnabled(compMask, i)) 842 { 843 // if we need to gather the component 844 if (compCtrl[i] == StoreSrc) 845 { 846 Value* vShufLo; 847 Value* vShufHi; 848 Value* vShufAll; 849 850 if (mVWidth == 8) 851 { 852 vShufLo = C({0, 1, 2, 3}); 853 vShufHi = C({4, 5, 6, 7}); 854 vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7}); 855 } 856 else 857 { 858 SWR_ASSERT(mVWidth == 16); 859 vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7}); 860 vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15}); 861 vShufAll = 862 C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); 863 } 864 865 Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo); 866 Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi); 867 868 Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo); 869 Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi); 870 871 Value* vZeroDouble = VECTOR_SPLAT( 872 mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); 873 874 Value* pGatherLo = 875 GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo); 876 Value* pGatherHi = 877 GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi); 878 879 Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll); 880 pGather = FP_TRUNC(pGather, mSimdFP32Ty); 881 882 vVertexElements[currentVertexElement++] = pGather; 883 } 884 else 885 { 886 vVertexElements[currentVertexElement++] = 887 GenerateCompCtrlVector(compCtrl[i]); 888 } 889 890 if (currentVertexElement > 3) 891 { 892 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 893 // reset to the next vVertexElement to output 894 currentVertexElement = 0; 895 } 896 } 897 898 // offset base to the next component in the vertex to gather 899 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8)); 900 } 901 } 902 break; 903 default: 904 SWR_INVALID("Tried to fetch invalid FP format"); 905 break; 906 } 907 } 908 else 909 { 910 Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd; 911 ConversionType conversionType = CONVERT_NONE; 912 913 SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), 914 "Unsupported format for standard gather fetch."); 915 916 switch (info.type[0]) 917 { 918 case SWR_TYPE_UNORM: 919 conversionType = CONVERT_NORMALIZED; 920 case SWR_TYPE_UINT: 921 extendCastType = Instruction::CastOps::ZExt; 922 break; 923 case SWR_TYPE_SNORM: 924 conversionType = CONVERT_NORMALIZED; 925 case SWR_TYPE_SINT: 926 extendCastType = Instruction::CastOps::SExt; 927 break; 928 case SWR_TYPE_USCALED: 929 conversionType = CONVERT_USCALED; 930 extendCastType = Instruction::CastOps::UIToFP; 931 break; 932 case SWR_TYPE_SSCALED: 933 conversionType = CONVERT_SSCALED; 934 extendCastType = Instruction::CastOps::SIToFP; 935 break; 936 case SWR_TYPE_SFIXED: 937 conversionType = CONVERT_SFIXED; 938 extendCastType = Instruction::CastOps::SExt; 939 break; 940 default: 941 break; 942 } 943 944 // value substituted when component of gather is masked 945 Value* gatherSrc = VIMMED1(0); 946 947 // Gather components from memory to store in a simdvertex structure 948 switch (bpc) 949 { 950 case 8: 951 { 952 // if we have at least one component to fetch 953 if (compMask) 954 { 955 Value* vGatherResult = GATHERDD(gatherSrc, 956 pStreamBaseGFX, 957 vOffsets, 958 vGatherMask, 959 1, 960 MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 961 // e.g. result of an 8x32bit integer gather for 8bit components 962 // 256i - 0 1 2 3 4 5 6 7 963 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 964 965 Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, 966 pVtxOut, 967 extendCastType, 968 conversionType, 969 currentVertexElement, 970 outputElt, 971 compMask, 972 compCtrl, 973 vVertexElements, 974 info.swizzle); 975 976 // Shuffle gathered components into place in simdvertex struct 977 mVWidth == 16 ? Shuffle8bpcGatherd16(args) 978 : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref 979 } 980 } 981 break; 982 case 16: 983 { 984 Value* vGatherResult[2]; 985 986 // if we have at least one component out of x or y to fetch 987 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) 988 { 989 vGatherResult[0] = GATHERDD(gatherSrc, 990 pStreamBaseGFX, 991 vOffsets, 992 vGatherMask, 993 1, 994 MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 995 // e.g. result of first 8x32bit integer gather for 16bit components 996 // 256i - 0 1 2 3 4 5 6 7 997 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy 998 // 999 } 1000 1001 // if we have at least one component out of z or w to fetch 1002 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) 1003 { 1004 // offset base to the next components(zw) in the vertex to gather 1005 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); 1006 1007 vGatherResult[1] = GATHERDD(gatherSrc, 1008 pStreamBaseGFX, 1009 vOffsets, 1010 vGatherMask, 1011 1, 1012 MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 1013 // e.g. result of second 8x32bit integer gather for 16bit components 1014 // 256i - 0 1 2 3 4 5 6 7 1015 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 1016 // 1017 } 1018 1019 // if we have at least one component to shuffle into place 1020 if (compMask) 1021 { 1022 Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, 1023 pVtxOut, 1024 extendCastType, 1025 conversionType, 1026 currentVertexElement, 1027 outputElt, 1028 compMask, 1029 compCtrl, 1030 vVertexElements); 1031 1032 // Shuffle gathered components into place in simdvertex struct 1033 mVWidth == 16 ? Shuffle16bpcGather16(args) 1034 : Shuffle16bpcGather(args); // outputs to vVertexElements ref 1035 } 1036 } 1037 break; 1038 case 32: 1039 { 1040 // Gathered components into place in simdvertex struct 1041 for (uint32_t i = 0; i < 4; i++) 1042 { 1043 if (isComponentEnabled(compMask, i)) 1044 { 1045 // if we need to gather the component 1046 if (compCtrl[i] == StoreSrc) 1047 { 1048 Value* pGather = GATHERDD(gatherSrc, 1049 pStreamBaseGFX, 1050 vOffsets, 1051 vGatherMask, 1052 1, 1053 MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 1054 1055 if (conversionType == CONVERT_USCALED) 1056 { 1057 pGather = UI_TO_FP(pGather, mSimdFP32Ty); 1058 } 1059 else if (conversionType == CONVERT_SSCALED) 1060 { 1061 pGather = SI_TO_FP(pGather, mSimdFP32Ty); 1062 } 1063 else if (conversionType == CONVERT_SFIXED) 1064 { 1065 pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), 1066 VBROADCAST(C(1 / 65536.0f))); 1067 } 1068 1069 vVertexElements[currentVertexElement++] = pGather; 1070 1071 // e.g. result of a single 8x32bit integer gather for 32bit components 1072 // 256i - 0 1 2 3 4 5 6 7 1073 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx 1074 } 1075 else 1076 { 1077 vVertexElements[currentVertexElement++] = 1078 GenerateCompCtrlVector(compCtrl[i]); 1079 } 1080 1081 if (currentVertexElement > 3) 1082 { 1083 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1084 1085 // reset to the next vVertexElement to output 1086 currentVertexElement = 0; 1087 } 1088 } 1089 1090 // offset base to the next component in the vertex to gather 1091 pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4)); 1092 } 1093 } 1094 break; 1095 } 1096 } 1097 } 1098 1099 // if we have a partially filled vVertexElement struct, output it 1100 if (currentVertexElement > 0) 1101 { 1102 StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements); 1103 } 1104} 1105 1106 1107typedef void* (*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va, bool* out_pbNullTileAccessed, void* pWorkerData); 1108 1109template <typename T> 1110void GetSimdValidIndicesGfx(gfxptr_t indices, 1111 gfxptr_t lastIndex, 1112 uint32_t vWidth, 1113 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, 1114 void* pdc, 1115 uint32_t* outIndices, 1116 void* pWorkerData) 1117{ 1118 SWR_ASSERT(outIndices != nullptr); 1119 1120 gfxptr_t indexPtr = indices; 1121 for (int64_t lane = 0; lane < vWidth; lane++) 1122 { 1123 uint32_t index = 0; 1124 1125 if (indexPtr < lastIndex) 1126 { 1127 // translate indexPtr and load from it 1128 T* addr = (T*)pfnTranslate(pdc, indexPtr, nullptr, pWorkerData); 1129 SWR_ASSERT(addr != nullptr); 1130 index = *addr; 1131 } 1132 1133 // index to 32 bits and insert into the correct simd lane 1134 outIndices[lane] = index; 1135 1136 indexPtr += sizeof(T); 1137 } 1138} 1139 1140void GetSimdValid8bitIndicesGfx(gfxptr_t indices, 1141 gfxptr_t lastIndex, 1142 uint32_t vWidth, 1143 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, 1144 void* pdc, 1145 uint32_t* outIndices, 1146 void* pWorkerData) 1147{ 1148 GetSimdValidIndicesGfx<uint8_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData); 1149} 1150 1151void GetSimdValid16bitIndicesGfx(gfxptr_t indices, 1152 gfxptr_t lastIndex, 1153 uint32_t vWidth, 1154 PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, 1155 void* pdc, 1156 uint32_t* outIndices, 1157 void* pWorkerData) 1158{ 1159 GetSimdValidIndicesGfx<uint16_t>(indices, lastIndex, vWidth, pfnTranslate, pdc, outIndices, pWorkerData); 1160} 1161 1162 1163template <typename T> 1164Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex) 1165{ 1166 SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, 1167 "Function expects gfxptr_t for both input parameters."); 1168 1169 Type* Ty = nullptr; 1170 1171 static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), 1172 "Unsupported type for use with GetSimdValidIndicesHelper<T>"); 1173 constexpr bool bSize = (sizeof(T) == sizeof(uint16_t)); 1174 if (bSize) 1175 { 1176 Ty = mInt16PtrTy; 1177 } 1178 else if (sizeof(T) == sizeof(uint8_t)) 1179 { 1180 Ty = mInt8PtrTy; 1181 } 1182 else 1183 { 1184 SWR_ASSERT(false, "This should never happen as per static_assert above."); 1185 } 1186 1187 Value* vIndices = VUNDEF_I(); 1188 1189 { 1190 // store 0 index on stack to be used to conditionally load from if index address is OOB 1191 Value* pZeroIndex = ALLOCA(Ty->getPointerElementType()); 1192 STORE(C((T)0), pZeroIndex); 1193 1194 // Load a SIMD of index pointers 1195 for (int64_t lane = 0; lane < mVWidth; lane++) 1196 { 1197 // Calculate the address of the requested index 1198 Value* pIndex = GEP(pIndices, C(lane), Ty); 1199 1200 pLastIndex = INT_TO_PTR(pLastIndex, Ty); 1201 1202 // check if the address is less than the max index, 1203 Value* mask = ICMP_ULT(pIndex, pLastIndex); 1204 1205 // if valid, load the index. if not, load 0 from the stack 1206 Value* pValid = SELECT(mask, pIndex, pZeroIndex); 1207 Value* index = LOAD(pValid, "valid index", Ty, MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 1208 1209 // zero extended index to 32 bits and insert into the correct simd lane 1210 index = Z_EXT(index, mInt32Ty); 1211 vIndices = VINSERT(vIndices, index, lane); 1212 } 1213 } 1214 1215 return vIndices; 1216} 1217 1218////////////////////////////////////////////////////////////////////////// 1219/// @brief Loads a simd of valid indices. OOB indices are set to 0 1220/// *Note* have to do 8bit index checking in scalar until we have AVX-512 1221/// support 1222/// @param pIndices - pointer to 8 bit indices 1223/// @param pLastIndex - pointer to last valid index 1224Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) 1225{ 1226 return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex); 1227} 1228 1229////////////////////////////////////////////////////////////////////////// 1230/// @brief Loads a simd of valid indices. OOB indices are set to 0 1231/// *Note* have to do 16bit index checking in scalar until we have AVX-512 1232/// support 1233/// @param pIndices - pointer to 16 bit indices 1234/// @param pLastIndex - pointer to last valid index 1235Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) 1236{ 1237 return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex); 1238} 1239 1240////////////////////////////////////////////////////////////////////////// 1241/// @brief Loads a simd of valid indices. OOB indices are set to 0 1242/// @param pIndices - pointer to 32 bit indices 1243/// @param pLastIndex - pointer to last valid index 1244Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) 1245{ 1246 DataLayout dL(JM()->mpCurrentModule); 1247 Value* iLastIndex = pLastIndex; 1248 Value* iIndices = pIndices; 1249 1250 // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index) 1251 Value* numIndicesLeft = SUB(iLastIndex, iIndices); 1252 numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty); 1253 numIndicesLeft = SDIV(numIndicesLeft, C(4)); 1254 1255 // create a vector of index counts from the base index ptr passed into the fetch 1256 Constant* vIndexOffsets; 1257 if (mVWidth == 8) 1258 { 1259 vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7}); 1260 } 1261 else 1262 { 1263 vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); 1264 } 1265 1266 // compare index count to the max valid index 1267 // e.g vMaxIndex 4 4 4 4 4 4 4 4 : 4 indices left to load 1268 // vIndexOffsets 0 1 2 3 4 5 6 7 1269 // ------------------------------ 1270 // vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass 1271 // vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0 1272 Value* vMaxIndex = VBROADCAST(numIndicesLeft); 1273 Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets); 1274 1275 // Load the indices; OOB loads 0 1276 return MASKED_LOAD(pIndices, 1277 4, 1278 vIndexMask, 1279 VIMMED1(0), 1280 "vIndices", 1281 PointerType::get(mSimdInt32Ty, 0), 1282 MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 1283} 1284 1285////////////////////////////////////////////////////////////////////////// 1286/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends, 1287/// denormalizes if needed, converts to F32 if needed, and positions in 1288// the proper SIMD rows to be output to the simdvertex structure 1289/// @param args: (tuple of args, listed below) 1290/// @param vGatherResult - 8 gathered 8bpc vertices 1291/// @param pVtxOut - base pointer to output simdvertex struct 1292/// @param extendType - sign extend or zero extend 1293/// @param bNormalized - do we need to denormalize? 1294/// @param currentVertexElement - reference to the current vVertexElement 1295/// @param outputElt - reference to the current offset from simdvertex we're o 1296/// @param compMask - component packing mask 1297/// @param compCtrl - component control val 1298/// @param vVertexElements[4] - vertex components to output 1299/// @param swizzle[4] - component swizzle location 1300void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args) 1301{ 1302 // Unpack tuple args 1303 Value*& vGatherResult = std::get<0>(args); 1304 Value* pVtxOut = std::get<1>(args); 1305 const Instruction::CastOps extendType = std::get<2>(args); 1306 const ConversionType conversionType = std::get<3>(args); 1307 uint32_t& currentVertexElement = std::get<4>(args); 1308 uint32_t& outputElt = std::get<5>(args); 1309 const ComponentEnable compMask = std::get<6>(args); 1310 const ComponentControl(&compCtrl)[4] = std::get<7>(args); 1311 Value*(&vVertexElements)[4] = std::get<8>(args); 1312 const uint32_t(&swizzle)[4] = std::get<9>(args); 1313 1314 // cast types 1315 Type* vGatherTy = getVectorType(mInt32Ty, 8); 1316 Type* v32x8Ty = getVectorType(mInt8Ty, 32); 1317 1318 // have to do extra work for sign extending 1319 if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)) 1320 { 1321 Type* v16x8Ty = getVectorType(mInt8Ty, 16); // 8x16bit ints in a 128bit lane 1322 Type* v128Ty = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2); 1323 1324 // shuffle mask, including any swizzling 1325 const char x = (char)swizzle[0]; 1326 const char y = (char)swizzle[1]; 1327 const char z = (char)swizzle[2]; 1328 const char w = (char)swizzle[3]; 1329 Value* vConstMask = C<char>( 1330 {char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4), 1331 char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12), 1332 char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4), 1333 char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12), 1334 char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4), 1335 char(w + 8), char(w + 12)}); 1336 1337 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. 1338 1339 Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0); 1340 Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1); 1341 1342 Value* vShufResult_lo = 1343 BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); 1344 Value* vShufResult_hi = 1345 BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); 1346 1347 // after pshufb: group components together in each 128bit lane 1348 // 256i - 0 1 2 3 4 5 6 7 1349 // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww 1350 1351 Value* vi128XY_lo = nullptr; 1352 Value* vi128XY_hi = nullptr; 1353 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) 1354 { 1355 vi128XY_lo = BITCAST( 1356 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), 1357 v128Ty); 1358 vi128XY_hi = BITCAST( 1359 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), 1360 v128Ty); 1361 1362 // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane 1363 // 256i - 0 1 2 3 4 5 6 7 1364 // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) 1365 } 1366 1367 // do the same for zw components 1368 Value* vi128ZW_lo = nullptr; 1369 Value* vi128ZW_hi = nullptr; 1370 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) 1371 { 1372 vi128ZW_lo = BITCAST( 1373 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), 1374 v128Ty); 1375 vi128ZW_hi = BITCAST( 1376 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), 1377 v128Ty); 1378 } 1379 1380 // init denormalize variables if needed 1381 Instruction::CastOps fpCast; 1382 Value* conversionFactor; 1383 1384 switch (conversionType) 1385 { 1386 case CONVERT_NORMALIZED: 1387 fpCast = Instruction::CastOps::SIToFP; 1388 conversionFactor = VIMMED1((float)(1.0 / 127.0)); 1389 break; 1390 case CONVERT_SSCALED: 1391 fpCast = Instruction::CastOps::SIToFP; 1392 conversionFactor = VIMMED1((float)(1.0)); 1393 break; 1394 case CONVERT_USCALED: 1395 assert(false && "Type should not be sign extended!"); 1396 conversionFactor = nullptr; 1397 break; 1398 default: 1399 assert(conversionType == CONVERT_NONE); 1400 conversionFactor = nullptr; 1401 break; 1402 } 1403 1404 // sign extend all enabled components. If we have a fill vVertexElements, output to current 1405 // simdvertex 1406 for (uint32_t i = 0; i < 4; i++) 1407 { 1408 if (isComponentEnabled(compMask, i)) 1409 { 1410 if (compCtrl[i] == ComponentControl::StoreSrc) 1411 { 1412 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 1413 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 1414 // if x or y, use vi128XY permute result, else use vi128ZW 1415 Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; 1416 Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; 1417 1418 // sign extend 1419 Value* temp_lo = 1420 PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty)); 1421 Value* temp_hi = 1422 PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty)); 1423 1424 Value* temp = JOIN_16(temp_lo, temp_hi); 1425 1426 // denormalize if needed 1427 if (conversionType != CONVERT_NONE) 1428 { 1429 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor); 1430 } 1431 1432 vVertexElements[currentVertexElement] = temp; 1433 1434 currentVertexElement += 1; 1435 } 1436 else 1437 { 1438 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 1439 } 1440 1441 if (currentVertexElement > 3) 1442 { 1443 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1444 // reset to the next vVertexElement to output 1445 currentVertexElement = 0; 1446 } 1447 } 1448 } 1449 } 1450 // else zero extend 1451 else if ((extendType == Instruction::CastOps::ZExt) || 1452 (extendType == Instruction::CastOps::UIToFP)) 1453 { 1454 // init denormalize variables if needed 1455 Instruction::CastOps fpCast; 1456 Value* conversionFactor; 1457 1458 switch (conversionType) 1459 { 1460 case CONVERT_NORMALIZED: 1461 fpCast = Instruction::CastOps::UIToFP; 1462 conversionFactor = VIMMED1((float)(1.0 / 255.0)); 1463 break; 1464 case CONVERT_USCALED: 1465 fpCast = Instruction::CastOps::UIToFP; 1466 conversionFactor = VIMMED1((float)(1.0)); 1467 break; 1468 case CONVERT_SSCALED: 1469 assert(false && "Type should not be zero extended!"); 1470 conversionFactor = nullptr; 1471 break; 1472 default: 1473 assert(conversionType == CONVERT_NONE); 1474 conversionFactor = nullptr; 1475 break; 1476 } 1477 1478 // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits 1479 for (uint32_t i = 0; i < 4; i++) 1480 { 1481 if (isComponentEnabled(compMask, i)) 1482 { 1483 if (compCtrl[i] == ComponentControl::StoreSrc) 1484 { 1485 // pshufb masks for each component 1486 Value* vConstMask; 1487 switch (swizzle[i]) 1488 { 1489 case 0: 1490 // x shuffle mask 1491 vConstMask = 1492 C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, 1493 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); 1494 break; 1495 case 1: 1496 // y shuffle mask 1497 vConstMask = 1498 C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, 1499 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); 1500 break; 1501 case 2: 1502 // z shuffle mask 1503 vConstMask = 1504 C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 1505 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); 1506 break; 1507 case 3: 1508 // w shuffle mask 1509 vConstMask = 1510 C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, 1511 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); 1512 break; 1513 default: 1514 assert(false && "Invalid component"); 1515 vConstMask = nullptr; 1516 break; 1517 } 1518 1519 Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0); 1520 Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1); 1521 1522 Value* temp_lo = 1523 BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); 1524 Value* temp_hi = 1525 BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); 1526 1527 // after pshufb for x channel 1528 // 256i - 0 1 2 3 4 5 6 7 1529 // x000 x000 x000 x000 x000 x000 x000 x000 1530 1531 Value* temp = JOIN_16(temp_lo, temp_hi); 1532 1533 // denormalize if needed 1534 if (conversionType != CONVERT_NONE) 1535 { 1536 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor); 1537 } 1538 1539 vVertexElements[currentVertexElement] = temp; 1540 1541 currentVertexElement += 1; 1542 } 1543 else 1544 { 1545 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 1546 } 1547 1548 if (currentVertexElement > 3) 1549 { 1550 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1551 // reset to the next vVertexElement to output 1552 currentVertexElement = 0; 1553 } 1554 } 1555 } 1556 } 1557 else 1558 { 1559 SWR_INVALID("Unsupported conversion type"); 1560 } 1561} 1562 1563void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args) 1564{ 1565 // Unpack tuple args 1566 Value*& vGatherResult = std::get<0>(args); 1567 Value* pVtxOut = std::get<1>(args); 1568 const Instruction::CastOps extendType = std::get<2>(args); 1569 const ConversionType conversionType = std::get<3>(args); 1570 uint32_t& currentVertexElement = std::get<4>(args); 1571 uint32_t& outputElt = std::get<5>(args); 1572 const ComponentEnable compMask = std::get<6>(args); 1573 const ComponentControl(&compCtrl)[4] = std::get<7>(args); 1574 Value*(&vVertexElements)[4] = std::get<8>(args); 1575 const uint32_t(&swizzle)[4] = std::get<9>(args); 1576 1577 // cast types 1578 Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits 1579 1580 for (uint32_t i = 0; i < 4; i++) 1581 { 1582 if (!isComponentEnabled(compMask, i)) 1583 continue; 1584 1585 if (compCtrl[i] == ComponentControl::StoreSrc) 1586 { 1587#if LLVM_VERSION_MAJOR >= 11 1588 using MaskType = int32_t; 1589#else 1590 using MaskType = uint32_t; 1591#endif 1592 std::vector<MaskType> vShuffleMasks[4] = { 1593 {0, 4, 8, 12, 16, 20, 24, 28}, // x 1594 {1, 5, 9, 13, 17, 21, 25, 29}, // y 1595 {2, 6, 10, 14, 18, 22, 26, 30}, // z 1596 {3, 7, 11, 15, 19, 23, 27, 31}, // w 1597 }; 1598 1599 Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), 1600 UndefValue::get(v32x8Ty), 1601 vShuffleMasks[swizzle[i]]); 1602 1603 if ((extendType == Instruction::CastOps::SExt) || 1604 (extendType == Instruction::CastOps::SIToFP)) 1605 { 1606 switch (conversionType) 1607 { 1608 case CONVERT_NORMALIZED: 1609 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0))); 1610 break; 1611 case CONVERT_SSCALED: 1612 val = SI_TO_FP(val, mSimdFP32Ty); 1613 break; 1614 case CONVERT_USCALED: 1615 SWR_INVALID("Type should not be sign extended!"); 1616 break; 1617 default: 1618 SWR_ASSERT(conversionType == CONVERT_NONE); 1619 val = S_EXT(val, mSimdInt32Ty); 1620 break; 1621 } 1622 } 1623 else if ((extendType == Instruction::CastOps::ZExt) || 1624 (extendType == Instruction::CastOps::UIToFP)) 1625 { 1626 switch (conversionType) 1627 { 1628 case CONVERT_NORMALIZED: 1629 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0))); 1630 break; 1631 case CONVERT_SSCALED: 1632 SWR_INVALID("Type should not be zero extended!"); 1633 break; 1634 case CONVERT_USCALED: 1635 val = UI_TO_FP(val, mSimdFP32Ty); 1636 break; 1637 default: 1638 SWR_ASSERT(conversionType == CONVERT_NONE); 1639 val = Z_EXT(val, mSimdInt32Ty); 1640 break; 1641 } 1642 } 1643 else 1644 { 1645 SWR_INVALID("Unsupported conversion type"); 1646 } 1647 1648 vVertexElements[currentVertexElement++] = val; 1649 } 1650 else 1651 { 1652 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 1653 } 1654 1655 if (currentVertexElement > 3) 1656 { 1657 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1658 // reset to the next vVertexElement to output 1659 currentVertexElement = 0; 1660 } 1661 } 1662} 1663 1664////////////////////////////////////////////////////////////////////////// 1665/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, 1666/// denormalizes if needed, converts to F32 if needed, and positions in 1667// the proper SIMD rows to be output to the simdvertex structure 1668/// @param args: (tuple of args, listed below) 1669/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index 1670/// @param pVtxOut - base pointer to output simdvertex struct 1671/// @param extendType - sign extend or zero extend 1672/// @param bNormalized - do we need to denormalize? 1673/// @param currentVertexElement - reference to the current vVertexElement 1674/// @param outputElt - reference to the current offset from simdvertex we're o 1675/// @param compMask - component packing mask 1676/// @param compCtrl - component control val 1677/// @param vVertexElements[4] - vertex components to output 1678void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args) 1679{ 1680 // Unpack tuple args 1681 Value*(&vGatherResult)[2] = std::get<0>(args); 1682 Value* pVtxOut = std::get<1>(args); 1683 const Instruction::CastOps extendType = std::get<2>(args); 1684 const ConversionType conversionType = std::get<3>(args); 1685 uint32_t& currentVertexElement = std::get<4>(args); 1686 uint32_t& outputElt = std::get<5>(args); 1687 const ComponentEnable compMask = std::get<6>(args); 1688 const ComponentControl(&compCtrl)[4] = std::get<7>(args); 1689 Value*(&vVertexElements)[4] = std::get<8>(args); 1690 1691 // cast types 1692 Type* vGatherTy = getVectorType(mInt32Ty, 8); 1693 Type* v32x8Ty = getVectorType(mInt8Ty, 32); 1694 1695 // have to do extra work for sign extending 1696 if ((extendType == Instruction::CastOps::SExt) || 1697 (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) 1698 { 1699 // is this PP float? 1700 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; 1701 1702 Type* v8x16Ty = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane 1703 Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 2); 1704 1705 // shuffle mask 1706 Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 1707 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); 1708 Value* vi128XY_lo = nullptr; 1709 Value* vi128XY_hi = nullptr; 1710 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) 1711 { 1712 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for 1713 // now.. 1714 1715 Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty); 1716 Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty); 1717 1718 Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); 1719 Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); 1720 1721 // after pshufb: group components together in each 128bit lane 1722 // 256i - 0 1 2 3 4 5 6 7 1723 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy 1724 1725 vi128XY_lo = BITCAST( 1726 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), 1727 v128bitTy); 1728 vi128XY_hi = BITCAST( 1729 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), 1730 v128bitTy); 1731 1732 // after PERMD: move and pack xy components into each 128bit lane 1733 // 256i - 0 1 2 3 4 5 6 7 1734 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy 1735 } 1736 1737 // do the same for zw components 1738 Value* vi128ZW_lo = nullptr; 1739 Value* vi128ZW_hi = nullptr; 1740 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) 1741 { 1742 Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty); 1743 Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty); 1744 1745 Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); 1746 Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); 1747 1748 vi128ZW_lo = BITCAST( 1749 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), 1750 v128bitTy); 1751 vi128ZW_hi = BITCAST( 1752 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), 1753 v128bitTy); 1754 } 1755 1756 // init denormalize variables if needed 1757 Instruction::CastOps IntToFpCast; 1758 Value* conversionFactor; 1759 1760 switch (conversionType) 1761 { 1762 case CONVERT_NORMALIZED: 1763 IntToFpCast = Instruction::CastOps::SIToFP; 1764 conversionFactor = VIMMED1((float)(1.0 / 32767.0)); 1765 break; 1766 case CONVERT_SSCALED: 1767 IntToFpCast = Instruction::CastOps::SIToFP; 1768 conversionFactor = VIMMED1((float)(1.0)); 1769 break; 1770 case CONVERT_USCALED: 1771 assert(false && "Type should not be sign extended!"); 1772 conversionFactor = nullptr; 1773 break; 1774 default: 1775 assert(conversionType == CONVERT_NONE); 1776 conversionFactor = nullptr; 1777 break; 1778 } 1779 1780 // sign extend all enabled components. If we have a fill vVertexElements, output to current 1781 // simdvertex 1782 for (uint32_t i = 0; i < 4; i++) 1783 { 1784 if (isComponentEnabled(compMask, i)) 1785 { 1786 if (compCtrl[i] == ComponentControl::StoreSrc) 1787 { 1788 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 1789 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 1790 // if x or y, use vi128XY permute result, else use vi128ZW 1791 Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; 1792 Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; 1793 1794 if (bFP) 1795 { 1796 // extract 128 bit lanes to sign extend each component 1797 Value* temp_lo = 1798 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); 1799 Value* temp_hi = 1800 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); 1801 1802 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); 1803 } 1804 else 1805 { 1806 // extract 128 bit lanes to sign extend each component 1807 Value* temp_lo = 1808 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); 1809 Value* temp_hi = 1810 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); 1811 1812 Value* temp = JOIN_16(temp_lo, temp_hi); 1813 1814 // denormalize if needed 1815 if (conversionType != CONVERT_NONE) 1816 { 1817 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor); 1818 } 1819 1820 vVertexElements[currentVertexElement] = temp; 1821 } 1822 1823 currentVertexElement += 1; 1824 } 1825 else 1826 { 1827 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 1828 } 1829 1830 if (currentVertexElement > 3) 1831 { 1832 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1833 // reset to the next vVertexElement to output 1834 currentVertexElement = 0; 1835 } 1836 } 1837 } 1838 } 1839 // else zero extend 1840 else if ((extendType == Instruction::CastOps::ZExt) || 1841 (extendType == Instruction::CastOps::UIToFP)) 1842 { 1843 // pshufb masks for each component 1844 Value* vConstMask[2]; 1845 1846 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) 1847 { 1848 // x/z shuffle mask 1849 vConstMask[0] = C<char>({ 1850 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 1851 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 1852 }); 1853 } 1854 1855 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) 1856 { 1857 // y/w shuffle mask 1858 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, 1859 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); 1860 } 1861 1862 // init denormalize variables if needed 1863 Instruction::CastOps fpCast; 1864 Value* conversionFactor; 1865 1866 switch (conversionType) 1867 { 1868 case CONVERT_NORMALIZED: 1869 fpCast = Instruction::CastOps::UIToFP; 1870 conversionFactor = VIMMED1((float)(1.0 / 65535.0)); 1871 break; 1872 case CONVERT_USCALED: 1873 fpCast = Instruction::CastOps::UIToFP; 1874 conversionFactor = VIMMED1((float)(1.0f)); 1875 break; 1876 case CONVERT_SSCALED: 1877 SWR_INVALID("Type should not be zero extended!"); 1878 conversionFactor = nullptr; 1879 break; 1880 default: 1881 SWR_ASSERT(conversionType == CONVERT_NONE); 1882 conversionFactor = nullptr; 1883 break; 1884 } 1885 1886 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits 1887 for (uint32_t i = 0; i < 4; i++) 1888 { 1889 if (isComponentEnabled(compMask, i)) 1890 { 1891 if (compCtrl[i] == ComponentControl::StoreSrc) 1892 { 1893 // select correct constMask for x/z or y/w pshufb 1894 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; 1895 // if x or y, use vi128XY permute result, else use vi128ZW 1896 uint32_t selectedGather = (i < 2) ? 0 : 1; 1897 1898 // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, 1899 // for now.. 1900 1901 Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0); 1902 Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1); 1903 1904 Value* temp_lo = BITCAST( 1905 PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), 1906 vGatherTy); 1907 Value* temp_hi = BITCAST( 1908 PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), 1909 vGatherTy); 1910 1911 // after pshufb mask for x channel; z uses the same shuffle from the second 1912 // gather 256i - 0 1 2 3 4 5 6 7 1913 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 1914 1915 Value* temp = JOIN_16(temp_lo, temp_hi); 1916 1917 // denormalize if needed 1918 if (conversionType != CONVERT_NONE) 1919 { 1920 temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor); 1921 } 1922 1923 vVertexElements[currentVertexElement] = temp; 1924 1925 currentVertexElement += 1; 1926 } 1927 else 1928 { 1929 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 1930 } 1931 1932 if (currentVertexElement > 3) 1933 { 1934 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1935 // reset to the next vVertexElement to output 1936 currentVertexElement = 0; 1937 } 1938 } 1939 } 1940 } 1941 else 1942 { 1943 SWR_INVALID("Unsupported conversion type"); 1944 } 1945} 1946 1947void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args) 1948{ 1949 // Unpack tuple args 1950 Value*(&vGatherResult)[2] = std::get<0>(args); 1951 Value* pVtxOut = std::get<1>(args); 1952 const Instruction::CastOps extendType = std::get<2>(args); 1953 const ConversionType conversionType = std::get<3>(args); 1954 uint32_t& currentVertexElement = std::get<4>(args); 1955 uint32_t& outputElt = std::get<5>(args); 1956 const ComponentEnable compMask = std::get<6>(args); 1957 const ComponentControl(&compCtrl)[4] = std::get<7>(args); 1958 Value*(&vVertexElements)[4] = std::get<8>(args); 1959 1960 // cast types 1961 Type* vGatherTy = getVectorType(IntegerType::getInt32Ty(JM()->mContext), mVWidth); 1962 Type* v32x8Ty = getVectorType(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits 1963 1964 // have to do extra work for sign extending 1965 if ((extendType == Instruction::CastOps::SExt) || 1966 (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) 1967 { 1968 // is this PP float? 1969 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; 1970 1971 Type* v8x16Ty = getVectorType(mInt16Ty, 8); // 8x16bit in a 128bit lane 1972 Type* v128bitTy = getVectorType(IntegerType::getIntNTy(JM()->mContext, 128), 1973 mVWidth / 4); // vwidth is units of 32 bits 1974 1975 // shuffle mask 1976 Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 1977 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); 1978 Value* vi128XY = nullptr; 1979 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) 1980 { 1981 Value* vShufResult = 1982 BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); 1983 // after pshufb: group components together in each 128bit lane 1984 // 256i - 0 1 2 3 4 5 6 7 1985 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy 1986 1987 vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); 1988 // after PERMD: move and pack xy components into each 128bit lane 1989 // 256i - 0 1 2 3 4 5 6 7 1990 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy 1991 } 1992 1993 // do the same for zw components 1994 Value* vi128ZW = nullptr; 1995 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) 1996 { 1997 Value* vShufResult = 1998 BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); 1999 vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); 2000 } 2001 2002 // init denormalize variables if needed 2003 Instruction::CastOps IntToFpCast; 2004 Value* conversionFactor; 2005 2006 switch (conversionType) 2007 { 2008 case CONVERT_NORMALIZED: 2009 IntToFpCast = Instruction::CastOps::SIToFP; 2010 conversionFactor = VIMMED1((float)(1.0 / 32767.0)); 2011 break; 2012 case CONVERT_SSCALED: 2013 IntToFpCast = Instruction::CastOps::SIToFP; 2014 conversionFactor = VIMMED1((float)(1.0)); 2015 break; 2016 case CONVERT_USCALED: 2017 SWR_INVALID("Type should not be sign extended!"); 2018 conversionFactor = nullptr; 2019 break; 2020 default: 2021 SWR_ASSERT(conversionType == CONVERT_NONE); 2022 conversionFactor = nullptr; 2023 break; 2024 } 2025 2026 // sign extend all enabled components. If we have a fill vVertexElements, output to current 2027 // simdvertex 2028 for (uint32_t i = 0; i < 4; i++) 2029 { 2030 if (isComponentEnabled(compMask, i)) 2031 { 2032 if (compCtrl[i] == ComponentControl::StoreSrc) 2033 { 2034 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 2035 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 2036 // if x or y, use vi128XY permute result, else use vi128ZW 2037 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; 2038 2039 if (bFP) 2040 { 2041 // extract 128 bit lanes to sign extend each component 2042 vVertexElements[currentVertexElement] = 2043 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); 2044 } 2045 else 2046 { 2047 // extract 128 bit lanes to sign extend each component 2048 vVertexElements[currentVertexElement] = 2049 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); 2050 2051 // denormalize if needed 2052 if (conversionType != CONVERT_NONE) 2053 { 2054 vVertexElements[currentVertexElement] = 2055 FMUL(CAST(IntToFpCast, 2056 vVertexElements[currentVertexElement], 2057 mSimdFP32Ty), 2058 conversionFactor); 2059 } 2060 } 2061 currentVertexElement++; 2062 } 2063 else 2064 { 2065 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 2066 } 2067 2068 if (currentVertexElement > 3) 2069 { 2070 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 2071 // reset to the next vVertexElement to output 2072 currentVertexElement = 0; 2073 } 2074 } 2075 } 2076 } 2077 // else zero extend 2078 else if ((extendType == Instruction::CastOps::ZExt) || 2079 (extendType == Instruction::CastOps::UIToFP)) 2080 { 2081 // pshufb masks for each component 2082 Value* vConstMask[2]; 2083 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) 2084 { 2085 // x/z shuffle mask 2086 vConstMask[0] = C<char>({ 2087 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 2088 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 2089 }); 2090 } 2091 2092 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) 2093 { 2094 // y/w shuffle mask 2095 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, 2096 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); 2097 } 2098 2099 // init denormalize variables if needed 2100 Instruction::CastOps fpCast; 2101 Value* conversionFactor; 2102 2103 switch (conversionType) 2104 { 2105 case CONVERT_NORMALIZED: 2106 fpCast = Instruction::CastOps::UIToFP; 2107 conversionFactor = VIMMED1((float)(1.0 / 65535.0)); 2108 break; 2109 case CONVERT_USCALED: 2110 fpCast = Instruction::CastOps::UIToFP; 2111 conversionFactor = VIMMED1((float)(1.0f)); 2112 break; 2113 case CONVERT_SSCALED: 2114 SWR_INVALID("Type should not be zero extended!"); 2115 conversionFactor = nullptr; 2116 break; 2117 default: 2118 SWR_ASSERT(conversionType == CONVERT_NONE); 2119 conversionFactor = nullptr; 2120 break; 2121 } 2122 2123 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits 2124 for (uint32_t i = 0; i < 4; i++) 2125 { 2126 if (isComponentEnabled(compMask, i)) 2127 { 2128 if (compCtrl[i] == ComponentControl::StoreSrc) 2129 { 2130 // select correct constMask for x/z or y/w pshufb 2131 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; 2132 // if x or y, use vi128XY permute result, else use vi128ZW 2133 uint32_t selectedGather = (i < 2) ? 0 : 1; 2134 2135 vVertexElements[currentVertexElement] = 2136 BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), 2137 vConstMask[selectedMask]), 2138 vGatherTy); 2139 // after pshufb mask for x channel; z uses the same shuffle from the second 2140 // gather 256i - 0 1 2 3 4 5 6 7 2141 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 2142 2143 // denormalize if needed 2144 if (conversionType != CONVERT_NONE) 2145 { 2146 vVertexElements[currentVertexElement] = 2147 FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), 2148 conversionFactor); 2149 } 2150 currentVertexElement++; 2151 } 2152 else 2153 { 2154 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 2155 } 2156 2157 if (currentVertexElement > 3) 2158 { 2159 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 2160 // reset to the next vVertexElement to output 2161 currentVertexElement = 0; 2162 } 2163 } 2164 } 2165 } 2166 else 2167 { 2168 SWR_INVALID("Unsupported conversion type"); 2169 } 2170} 2171 2172////////////////////////////////////////////////////////////////////////// 2173/// @brief Output a simdvertex worth of elements to the current outputElt 2174/// @param pVtxOut - base address of VIN output struct 2175/// @param outputElt - simdvertex offset in VIN to write to 2176/// @param numEltsToStore - number of simdvertex rows to write out 2177/// @param vVertexElements - LLVM Value*[] simdvertex to write out 2178void FetchJit::StoreVertexElements(Value* pVtxOut, 2179 const uint32_t outputElt, 2180 const uint32_t numEltsToStore, 2181 Value* (&vVertexElements)[4]) 2182{ 2183 SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); 2184 2185 for (uint32_t c = 0; c < numEltsToStore; ++c) 2186 { 2187 // STORE expects FP32 x vWidth type, just bitcast if needed 2188 if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) 2189 { 2190#if FETCH_DUMP_VERTEX 2191 PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); 2192#endif 2193 vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); 2194 } 2195#if FETCH_DUMP_VERTEX 2196 else 2197 { 2198 PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); 2199 } 2200#endif 2201 // outputElt * 4 = offsetting by the size of a simdvertex 2202 // + c offsets to a 32bit x vWidth row within the current vertex 2203 Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP"); 2204 STORE(vVertexElements[c], dest); 2205 } 2206} 2207 2208////////////////////////////////////////////////////////////////////////// 2209/// @brief Generates a constant vector of values based on the 2210/// ComponentControl value 2211/// @param ctrl - ComponentControl value 2212Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) 2213{ 2214 switch (ctrl) 2215 { 2216 case NoStore: 2217 return VUNDEF_I(); 2218 case Store0: 2219 return VIMMED1(0); 2220 case Store1Fp: 2221 return VIMMED1(1.0f); 2222 case Store1Int: 2223 return VIMMED1(1); 2224 case StoreVertexId: 2225 { 2226 if (mVWidth == 16) 2227 { 2228 Type* pSimd8FPTy = getVectorType(mFP32Ty, 8); 2229 Value* pIdLo = 2230 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy); 2231 Value* pIdHi = 2232 BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy); 2233 return JOIN_16(pIdLo, pIdHi); 2234 } 2235 else 2236 { 2237 return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty); 2238 } 2239 } 2240 case StoreInstanceId: 2241 { 2242 Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty); 2243 return VBROADCAST(pId); 2244 } 2245 2246 2247 case StoreSrc: 2248 default: 2249 SWR_INVALID("Invalid component control"); 2250 return VUNDEF_I(); 2251 } 2252} 2253 2254////////////////////////////////////////////////////////////////////////// 2255/// @brief Returns the enable mask for the specified component. 2256/// @param enableMask - enable bits 2257/// @param component - component to check if enabled. 2258bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) 2259{ 2260 switch (component) 2261 { 2262 // X 2263 case 0: 2264 return (enableMask & ComponentEnable::X); 2265 // Y 2266 case 1: 2267 return (enableMask & ComponentEnable::Y); 2268 // Z 2269 case 2: 2270 return (enableMask & ComponentEnable::Z); 2271 // W 2272 case 3: 2273 return (enableMask & ComponentEnable::W); 2274 2275 default: 2276 return false; 2277 } 2278} 2279 2280// Don't want two threads compiling the same fetch shader simultaneously 2281// Has problems in the JIT cache implementation 2282// This is only a problem for fetch right now. 2283static std::mutex gFetchCodegenMutex; 2284 2285////////////////////////////////////////////////////////////////////////// 2286/// @brief JITs from fetch shader IR 2287/// @param hJitMgr - JitManager handle 2288/// @param func - LLVM function IR 2289/// @return PFN_FETCH_FUNC - pointer to fetch code 2290PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) 2291{ 2292 const llvm::Function* func = (const llvm::Function*)hFunc; 2293 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 2294 PFN_FETCH_FUNC pfnFetch; 2295 2296 gFetchCodegenMutex.lock(); 2297 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); 2298 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot 2299 // add new IR to the module 2300 pJitMgr->mIsModuleFinalized = true; 2301 2302#if defined(KNOB_SWRC_TRACING) 2303 char fName[1024]; 2304 const char* funcName = func->getName().data(); 2305 sprintf(fName, "%s.bin", funcName); 2306 FILE* fd = fopen(fName, "wb"); 2307 fwrite((void*)pfnFetch, 1, 2048, fd); 2308 fclose(fd); 2309#endif 2310 2311 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final"); 2312 gFetchCodegenMutex.unlock(); 2313 2314 2315 return pfnFetch; 2316} 2317 2318////////////////////////////////////////////////////////////////////////// 2319/// @brief JIT compiles fetch shader 2320/// @param hJitMgr - JitManager handle 2321/// @param state - fetch state to build function from 2322extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state) 2323{ 2324 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 2325 2326 pJitMgr->SetupNewModule(); 2327 2328 FetchJit theJit(pJitMgr); 2329 HANDLE hFunc = theJit.Create(state); 2330 2331 return JitFetchFunc(hJitMgr, hFunc); 2332} 2333