/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file fetch_jit.cpp
 *
 * @brief Implementation of the fetch jitter
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder_gfx_mem.h"
#include "jit_api.h"
#include "fetch_jit.h"
#include "gen_state_llvm.h"
#include "functionpasses/passes.h"

//#define FETCH_DUMP_VERTEX 1
using namespace llvm;
using namespace SwrJit;

// Returns true if the given component index is enabled in the packed enable mask.
// Defined elsewhere in this translation unit / project.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component);

// How a gathered integer component must be converted before it is stored
// into the output simdvertex (see ConvertFormat / the Shuffle* helpers).
enum ConversionType
{
    CONVERT_NONE,       // store raw bits, no conversion
    CONVERT_NORMALIZED, // UNORM/SNORM: scale into [0,1] / [-1,1]
    CONVERT_USCALED,    // unsigned int -> float
    CONVERT_SSCALED,    // signed int -> float
    CONVERT_SFIXED,     // signed 16.16 fixed point -> float
};

//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
struct FetchJit : public BuilderGfxMem
{
    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}

    /// Builds and returns the LLVM function for the given fetch state.
    Function* Create(const FETCH_COMPILE_STATE& fetchState);

    // Index-buffer loaders that honor pLastIndex to avoid reading out of bounds.
    Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
    Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
    template <typename T>
    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);

    // package up Shuffle*bpcGatherd args into a tuple for convenience
    typedef std::tuple<Value*&,                      // gather result (in), shuffled output (out)
                       Value*,                       // pVtxOut
                       const Instruction::CastOps,   // extension cast (ZExt/SExt/...)
                       const ConversionType,         // post-gather conversion
                       uint32_t&,                    // currentVertexElement (in/out)
                       uint32_t&,                    // outputElt (in/out)
                       const ComponentEnable,        // component mask
                       const ComponentControl (&)[4],// per-component control
                       Value* (&)[4],                // vVertexElements (in/out)
                       const uint32_t (&)[4]>        // component swizzle
        Shuffle8bpcArgs;

    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);

    typedef std::tuple<Value* (&)[2],                // xy and zw gather results
                       Value*,                       // pVtxOut
                       const Instruction::CastOps,   // extension cast
                       const ConversionType,         // post-gather conversion
                       uint32_t&,                    // currentVertexElement (in/out)
                       uint32_t&,                    // outputElt (in/out)
                       const ComponentEnable,        // component mask
                       const ComponentControl (&)[4],// per-component control
                       Value* (&)[4]>                // vVertexElements (in/out)
        Shuffle16bpcArgs;

    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
    void Shuffle16bpcGather(Shuffle16bpcArgs& args);

    /// Writes numEltsToStore accumulated components to the output simdvertex slot outputElt.
    void StoreVertexElements(Value* pVtxOut,
                             const uint32_t outputElt,
                             const uint32_t numEltsToStore,
                             Value* (&vVertexElements)[4]);

    /// Produces the constant vector implied by a ComponentControl (0, 1, VertexID, ...).
    Value* GenerateCompCtrlVector(const ComponentControl ctrl);

    /// Main worker: gathers all enabled attributes and stores them to pVtxOut.
    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                           Value* streams,
                           Value* vIndices,
                           Value* pVtxOut);

    bool IsOddFormat(SWR_FORMAT format);
    bool IsUniformFormat(SWR_FORMAT format);
    void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
    void CreateGatherOddFormats(
        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);

    Value* mpWorkerData; // opaque per-worker data pointer passed through to callbacks
    Value* mpFetchInfo;  // pointer to the SWR_FETCH_CONTEXT argument
};

//////////////////////////////////////////////////////////////////////////
/// @brief JITs a fetch shader for the given compile state: sets up the
///        function/arguments, computes per-lane vertex indices (with cut-index
///        and VertexID handling), gathers attributes, then runs cleanup and
///        optimization passes over the generated IR.
/// @param fetchState - attribute layout / index type / enables to compile for
/// @return the compiled (unoptimized-name "FCH_<crc>") LLVM Function
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
{
    // Function name is a CRC of the compile state so identical states share a name.
    std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
    fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));

    Function* fetch = Function::Create(
        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);

    fetch->getParent()->setModuleIdentifier(fetch->getName());

    IRB()->SetInsertPoint(entry);

    auto argitr = fetch->arg_begin();

    // Fetch shader arguments: (privateContext, pWorkerData, fetchInfo, vtxOutput)
    Value* privateContext = &*argitr;
    ++argitr;
    privateContext->setName("privateContext");
    SetPrivateContext(privateContext);

    mpWorkerData = &*argitr;
    ++argitr;
    mpWorkerData->setName("pWorkerData");
    mpFetchInfo = &*argitr;
    ++argitr;
    mpFetchInfo->setName("fetchInfo");
    Value* pVtxOut = &*argitr;
    pVtxOut->setName("vtxOutput");

    // remember the builder's native width so it can be restored at the end
    uint32_t baseWidth = mVWidth;
    // NOTE(review): when USE_SIMD16_SHADERS is 0 baseWidth is unused and may
    // trigger an unused-variable warning.

    SWR_ASSERT(mVWidth == 8 || mVWidth == 16, "Unsupported vector width %d", mVWidth);

    // Override builder target width to force 16-wide SIMD
#if USE_SIMD16_SHADERS
    SetTargetWidth(16);
#endif

    pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));

    // SWR_FETCH_CONTEXT::pStreams
    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
    streams->setName("pStreams");

    // SWR_FETCH_CONTEXT::pIndices
    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
    indices->setName("pIndices");

    // SWR_FETCH_CONTEXT::pLastIndex
    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
    pLastIndex->setName("pLastIndex");

    // Load a SIMD of indices, widened to 32 bits. When the OOB check is
    // disabled, a straight (unchecked) vector load is used; otherwise the
    // GetSimdValid* helpers clamp reads at pLastIndex.
    Value* vIndices;
    switch (fetchState.indexType)
    {
    case R8_UINT:
        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
        }
        break;
    case R16_UINT:
        if (fetchState.bDisableIndexOOBCheck)
        {
            vIndices = LOAD(
                BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
                {(uint32_t)0});
            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
        }
        else
        {
            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
        }
        break;
    case R32_UINT:
        (fetchState.bDisableIndexOOBCheck)
            ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH)
            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
        break; // incoming type is already 32bit int
    default:
        SWR_INVALID("Unsupported index type");
        vIndices = nullptr;
        break;
    }

    if (fetchState.bForceSequentialAccessEnable)
    {
        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});

        // VertexData buffers are accessed sequentially, the index is equal to the vertex number
        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vIndices = ADD(vIndices, pOffsets);
    }

    Value* vVertexId = vIndices;
    if (fetchState.bVertexIDOffsetEnable)
    {
        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
        // correct
        Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
        vVertexId = ADD(vIndices, vBaseVertex);
        vVertexId = ADD(vVertexId, vStartVertex);
    }

    // store out vertex IDs
    if (mVWidth == 16)
    {
        // store out in simd8 halves until core supports 16-wide natively
        auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
        auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
    }
    else if (mVWidth == 8)
    {
        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
    }

    // store out cut mask if enabled
    if (fetchState.bEnableCutIndex)
    {
        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
        Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));

        if (mVWidth == 16)
        {
            auto cutMaskLo = EXTRACT_16(cutMask, 0);
            auto cutMaskHi = EXTRACT_16(cutMask, 1);
            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
        }
        else if (mVWidth == 8)
        {
            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
        }
    }

    // Fetch attributes from memory and output to a simdvertex struct
    JitGatherVertices(fetchState, streams, vIndices, pVtxOut);

    RET_VOID();

    JitManager::DumpToFile(fetch, "src");

#if defined(_DEBUG)
    verifyFunction(*fetch);
#endif

    ::FunctionPassManager setupPasses(JM()->mpCurrentModule);

    ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
    setupPasses.add(createBreakCriticalEdgesPass());
    setupPasses.add(createCFGSimplificationPass());
    setupPasses.add(createEarlyCSEPass());
    setupPasses.add(createPromoteMemoryToRegisterPass());

    setupPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "se");

    ::FunctionPassManager optPasses(JM()->mpCurrentModule);

    ///@todo Haven't touched these either. Need to remove some of these and add others.
    optPasses.add(createCFGSimplificationPass());
    optPasses.add(createEarlyCSEPass());
    optPasses.add(createInstructionCombiningPass());
    optPasses.add(createConstantPropagationPass());
    optPasses.add(createSCCPPass());
    optPasses.add(createAggressiveDCEPass());

    optPasses.run(*fetch);

    // NOTE(review): the whole optPasses pipeline is deliberately re-run here so
    // that the x86 lowering sees already-optimized IR; this executes the earlier
    // passes a second time as a side effect.
    optPasses.add(createLowerX86Pass(this));
    optPasses.run(*fetch);

    JitManager::DumpToFile(fetch, "opt");


    // Revert 16-wide override
#if USE_SIMD16_SHADERS
    SetTargetWidth(baseWidth);
#endif

    return fetch;
}

// returns true for odd formats that require special state.gather handling
// ("odd" == any component size that is not a power-of-two byte multiple)
bool FetchJit::IsOddFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    if (info.bpc[0] != 8 && info.bpc[0] != 16 && info.bpc[0] != 32 && info.bpc[0] != 64)
    {
        return true;
    }
    return false;
}

// format is uniform if all components are the same size and type
bool FetchJit::IsUniformFormat(SWR_FORMAT format)
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
    uint32_t bpc0 = info.bpc[0];
    uint32_t type0 = info.type[0];

    // compare every remaining component against component 0
    for (uint32_t c = 1; c < info.numComps; ++c)
    {
        if (bpc0 != info.bpc[c] || type0 != info.type[c])
        {
            return false;
        }
    }
    return true;
}

// unpacks components based on format
// foreach component in the pixel
//   mask off everything but this component
//   shift component to LSB
void FetchJit::UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    uint32_t bitOffset = 0;
    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t swizzledIndex = info.swizzle[c];
        uint32_t compBits = info.bpc[c];
        // NOTE(review): (1 << compBits) is UB for compBits == 32; callers only
        // reach here via odd formats (bpc != 32), so this holds in practice.
        uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
        Value* comp = AND(vInput, bitmask);
        comp = LSHR(comp, bitOffset);

        result[swizzledIndex] = comp;
        bitOffset += compBits;
    }
}

// gather for odd component size formats
// gather SIMD full pixels per lane then shift/mask to move each component to their
// own vector
void FetchJit::CreateGatherOddFormats(
    SWR_FORMAT format, Value* pMask, Value* xpBase, Value* pOffsets, Value* pResult[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    // only works if pixel size is <= 32bits
    SWR_ASSERT(info.bpp <= 32);

    Value* pGather;
    if (info.bpp == 32)
    {
        // full 32-bit pixels can use the hardware gather directly
        pGather = GATHERDD(VIMMED1(0), xpBase, pOffsets, pMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
    }
    else
    {
        // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
        // Instead, emit per-lane scalar loads into a stack buffer and load that
        // buffer back as a SIMD vector.
        Value* pMem = ALLOCA(mSimdInt32Ty);
        STORE(VIMMED1(0u), pMem);

        Value* pDstMem = POINTER_CAST(pMem, mInt32PtrTy);

        for (uint32_t lane = 0; lane < mVWidth; ++lane)
        {
            // Get index
            Value* index = VEXTRACT(pOffsets, C(lane));
            Value* mask = VEXTRACT(pMask, C(lane));

            // use branch around load based on mask
            // Needed to avoid page-faults on unmasked lanes
            BasicBlock* pCurrentBB = IRB()->GetInsertBlock();
            BasicBlock* pMaskedLoadBlock =
                BasicBlock::Create(JM()->mContext, "MaskedLaneLoad", pCurrentBB->getParent());
            BasicBlock* pEndLoadBB = BasicBlock::Create(JM()->mContext, "AfterMaskedLoad", pCurrentBB->getParent());

            COND_BR(mask, pMaskedLoadBlock, pEndLoadBB);

            JM()->mBuilder.SetInsertPoint(pMaskedLoadBlock);

            // load exactly bpp bits for this lane
            switch (info.bpp)
            {
            case 8:
            {
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt8PtrTy, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }

            case 16:
            {
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt16PtrTy, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }
            // NOTE(review): stray unreachable 'break' left over from an edit;
            // harmless dead code, candidate for removal.
            break;

            case 24:
            {
                // First 16-bits of data
                Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
                Value* xpSrc = ADD(xpBase, Z_EXT(index, xpBase->getType()));
                STORE(LOAD(xpSrc, "", mInt16PtrTy, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);

                // Last 8-bits of data
                pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
                xpSrc = ADD(xpSrc, C(2));
                STORE(LOAD(xpSrc, "", mInt8PtrTy, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH), pDst);
                break;
            }

            default:
                SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
                break;
            }

            BR(pEndLoadBB);
            JM()->mBuilder.SetInsertPoint(pEndLoadBB);
        }

        pGather = LOAD(pMem);
    }

    // seed each output with the format's default value; UnpackComponents
    // overwrites the components the format actually provides
    for (uint32_t comp = 0; comp < 4; ++comp)
    {
        pResult[comp] = VIMMED1((int)info.defaults[comp]);
    }

    UnpackComponents(format, pGather, pResult);

    // cast to fp32
    pResult[0] = BITCAST(pResult[0], mSimdFP32Ty);
    pResult[1] = BITCAST(pResult[1], mSimdFP32Ty);
    pResult[2] = BITCAST(pResult[2], mSimdFP32Ty);
    pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Applies normalized (UNORM/SNORM) scaling to gathered components.
///        Components that are UNUSED or not normalized are left untouched.
/// @param format - source format describing each component
/// @param texels - in/out: per-component SIMD values (raw int bits in, fp32 out
///                 for converted components)
void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
{
    const SWR_FORMAT_INFO& info = GetFormatInfo(format);

    for (uint32_t c = 0; c < info.numComps; ++c)
    {
        uint32_t compIndex = info.swizzle[c];

        // skip any conversion on UNUSED components
        if (info.type[c] == SWR_TYPE_UNUSED)
        {
            continue;
        }

        if (info.isNormalized[c])
        {
            if (info.type[c] == SWR_TYPE_SNORM)
            {
                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
                /// -1.0f.

                /// result = c * (1.0f / (2^(n-1) - 1);
                uint32_t n = info.bpc[c];
                uint32_t pow2 = 1 << (n - 1);
                float scale = 1.0f / (float)(pow2 - 1);
                Value* vScale = VIMMED1(scale);
                texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                texels[compIndex] = FMUL(texels[compIndex], vScale);
            }
            else
            {
                SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);

                /// result = c * (1.0f / (2^n - 1))
                uint32_t n = info.bpc[c];
                uint32_t pow2 = 1 << n;
                // special case 24bit unorm format, which requires a full divide to meet ULP
                // requirement
                if (n == 24)
                {
                    float scale = (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FDIV(texels[compIndex], vScale);
                }
                else
                {
                    float scale = 1.0f / (float)(pow2 - 1);
                    Value* vScale = VIMMED1(scale);
                    texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                    texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
                    texels[compIndex] = FMUL(texels[compIndex], vScale);
                }
            }
            continue;
        }
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Loads attributes from memory using AVX2 GATHER(s)
/// @param fetchState - info about attributes to be fetched from memory
/// @param streams - value pointer to the current vertex stream
/// @param vIndices - vector value of indices to gather
/// @param pVtxOut - value pointer to output simdvertex struct
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
                                 Value* streams,
                                 Value* vIndices,
                                 Value* pVtxOut)
{
    // components are accumulated 4-at-a-time into vVertexElements and flushed
    // to the output simdvertex via StoreVertexElements
    uint32_t currentVertexElement = 0;
    uint32_t outputElt = 0;
    Value* vVertexElements[4];

    Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
    Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
    Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
    Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
    curInstance->setName("curInstance");

    for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
    {
        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];

        // skip element if all components are disabled
        if (ied.ComponentPacking == ComponentEnable::NONE)
        {
            continue;
        }

        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
        SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
        uint32_t bpc =
            info.bpp /
            info.numComps; ///@todo Code below assumes all components are same size. Need to fix.

        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});

        Value* stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
        Value* vStride = VBROADCAST(stride);

        // max vertex index that is fully in bounds
        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
        maxVertex = LOAD(maxVertex);

        Value* minVertex = NULL;
        if (fetchState.bPartialVertexBuffer)
        {
            // min vertex index for low bounds OOB checking
            minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)});
            minVertex = LOAD(minVertex);
        }

        if (fetchState.bInstanceIDOffsetEnable)
        {
            // the InstanceID (curInstance) value is offset by StartInstanceLocation
            // NOTE(review): this ADD is inside the per-attribute loop, so curInstance
            // accumulates startInstance once per instanced attribute — confirm intended.
            curInstance = ADD(curInstance, startInstance);
        }

        Value* vCurIndices;
        Value* startOffset;
        Value* vInstanceStride = VIMMED1(0);

        if (ied.InstanceEnable)
        {
            Value* stepRate = C(ied.InstanceAdvancementState);

            // prevent a div by 0 for 0 step rate
            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
            stepRate = SELECT(isNonZeroStep, stepRate, C(1));

            // calc the current offset into instanced data buffer
            Value* calcInstance = UDIV(curInstance, stepRate);

            // if step rate is 0, every instance gets instance 0
            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));

            vCurIndices = VBROADCAST(calcInstance);
            startOffset = startInstance;
        }
        else if (ied.InstanceStrideEnable)
        {
            // grab the instance advancement state, determines stride in bytes from one instance to
            // the next
            Value* stepRate = C(ied.InstanceAdvancementState);
            vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));

            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);

            startOffset = startVertex;
            SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
        }
        else
        {
            // offset indices by baseVertex
            vCurIndices = ADD(vIndices, vBaseVertex);
            startOffset = startVertex;
        }

        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
        // do 64bit address offset calculations.

        // calculate byte offset to the start of the VB
        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));

        // VGATHER* takes an *i8 src pointer so that's what stream is
        Value* pStreamBaseGFX = ADD(stream, baseOffset);

        // if we have a start offset, subtract from max vertex. Used for OOB check
        maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
        Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
        // if we have a negative value, we're already OOB. clamp at 0.
        maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));

        if (fetchState.bPartialVertexBuffer)
        {
            // similary for min vertex
            minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
            minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
        }

        // Load the in bounds size of a partially valid vertex
        Value* partialInboundsSize =
            GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
        partialInboundsSize = LOAD(partialInboundsSize);
        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
        Value* vBpp = VBROADCAST(C(info.Bpp));
        Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));

        // is the element is <= the partially valid size
        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));

        // override cur indices with 0 if pitch is 0
        Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
        vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);

        // are vertices partially OOB?
        Value* vMaxVertex = VBROADCAST(maxVertex);
        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);

        // are vertices fully in bounds?
        Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);

        Value* vGatherMask;
        if (fetchState.bPartialVertexBuffer)
        {
            // are vertices below minVertex limit?
            Value* vMinVertex = VBROADCAST(minVertex);
            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);

            // only fetch lanes that pass both tests
            vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
        }
        else
        {
            vGatherMask = vMaxGatherMask;
        }

        // blend in any partially OOB indices that have valid elements
        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);

        // calculate the actual offsets into the VB
        Value* vOffsets = MUL(vCurIndices, vStride);
        vOffsets = ADD(vOffsets, vAlignmentOffsets);

        // if instance stride enable is:
        //  true  - add product of the instanceID and advancement state to the offst into the VB
        //  false - value of vInstanceStride has been initialialized to zero
        vOffsets = ADD(vOffsets, vInstanceStride);

        // Packing and component control
        ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
        const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
                                           (ComponentControl)ied.ComponentControl1,
                                           (ComponentControl)ied.ComponentControl2,
                                           (ComponentControl)ied.ComponentControl3};

        // Special gather/conversion for formats without equal component sizes
        if (IsOddFormat((SWR_FORMAT)ied.Format))
        {
            Value* pResults[4];
            CreateGatherOddFormats(
                (SWR_FORMAT)ied.Format, vGatherMask, pStreamBaseGFX, vOffsets, pResults);
            ConvertFormat((SWR_FORMAT)ied.Format, pResults);

            for (uint32_t c = 0; c < 4; c += 1)
            {
                if (isComponentEnabled(compMask, c))
                {
                    vVertexElements[currentVertexElement++] = pResults[c];
                    if (currentVertexElement > 3)
                    {
                        StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                        // reset to the next vVertexElement to output
                        currentVertexElement = 0;
                    }
                }
            }
        }
        else if (info.type[0] == SWR_TYPE_FLOAT)
        {
            ///@todo: support 64 bit vb accesses
            Value* gatherSrc = VIMMED1(0.0f);

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                       "Unsupported format for standard gather fetch.");

            // Gather components from memory to store in a simdvertex structure
            switch (bpc)
            {
            case 16:
            {
                Value* vGatherResult[2];

                // if we have at least one component out of x or y to fetch
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                {
                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask);
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                    //
                }

                // if we have at least one component out of z or w to fetch
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));

                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                    //
                }

                // if we have at least one component to shuffle into place
                if (compMask)
                {
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                  pVtxOut,
                                                                  Instruction::CastOps::FPExt,
                                                                  CONVERT_NONE,
                                                                  currentVertexElement,
                                                                  outputElt,
                                                                  compMask,
                                                                  compCtrl,
                                                                  vVertexElements);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle16bpcGather16(args)
                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 32:
            {
                for (uint32_t i = 0; i < 4; i += 1)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            // Gather a SIMD of vertices
                            // APIs allow a 4GB range for offsets
                            // However, GATHERPS uses signed 32-bit offsets, so +/- 2GB range :(
                            // Add 2GB to the base pointer and 2GB to the offsets.  This makes
                            // "negative" (large) offsets into positive offsets and small offsets
                            // into negative offsets.
                            Value* vNewOffsets = ADD(vOffsets, VIMMED1(0x80000000));
                            vVertexElements[currentVertexElement++] =
                                GATHERPS(gatherSrc,
                                         ADD(pStreamBaseGFX, C((uintptr_t)0x80000000U)),
                                         vNewOffsets,
                                         vGatherMask,
                                         1,
                                         JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
                }
            }
            break;
            case 64:
            {
                for (uint32_t i = 0; i < 4; i += 1)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            // 64-bit doubles are gathered in two half-width halves,
                            // converted to fp32, then recombined into one SIMD
                            Value* vShufLo;
                            Value* vShufHi;
                            Value* vShufAll;

                            if (mVWidth == 8)
                            {
                                vShufLo = C({0, 1, 2, 3});
                                vShufHi = C({4, 5, 6, 7});
                                vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
                            }
                            else
                            {
                                SWR_ASSERT(mVWidth == 16);
                                vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
                                vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
                                vShufAll =
                                    C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
                            }

                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);

                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);

                            Value* vZeroDouble = VECTOR_SPLAT(
                                mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));

                            Value* pGatherLo =
                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsLo, vMaskLo);
                            Value* pGatherHi =
                                GATHERPD(vZeroDouble, pStreamBaseGFX, vOffsetsHi, vMaskHi);

                            pGatherLo = VCVTPD2PS(pGatherLo);
                            pGatherHi = VCVTPD2PS(pGatherHi);

                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);

                            vVertexElements[currentVertexElement++] = pGather;
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component  in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)8));
                }
            }
            break;
            default:
                SWR_INVALID("Tried to fetch invalid FP format");
                break;
            }
        }
        else
        {
            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
            ConversionType conversionType = CONVERT_NONE;

            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
                       "Unsupported format for standard gather fetch.");

            switch (info.type[0])
            {
            case SWR_TYPE_UNORM:
                conversionType = CONVERT_NORMALIZED;
                // fallthrough: UNORM extends with ZExt like UINT
            case SWR_TYPE_UINT:
                extendCastType = Instruction::CastOps::ZExt;
                break;
            case SWR_TYPE_SNORM:
                conversionType = CONVERT_NORMALIZED;
                // fallthrough: SNORM extends with SExt like SINT
            case SWR_TYPE_SINT:
                extendCastType = Instruction::CastOps::SExt;
                break;
            case SWR_TYPE_USCALED:
                conversionType = CONVERT_USCALED;
                extendCastType = Instruction::CastOps::UIToFP;
                break;
            case SWR_TYPE_SSCALED:
                conversionType = CONVERT_SSCALED;
                extendCastType = Instruction::CastOps::SIToFP;
                break;
            case SWR_TYPE_SFIXED:
                conversionType = CONVERT_SFIXED;
                extendCastType = Instruction::CastOps::SExt;
                break;
            default:
                break;
            }

            // value substituted when component of gather is masked
            Value* gatherSrc = VIMMED1(0);

            // Gather components from memory to store in a simdvertex structure
            // NOTE(review): no 'case 64' / default here — 64-bit integer formats
            // fall through this switch without gathering anything.
            switch (bpc)
            {
            case 8:
            {
                // if we have at least one component to fetch
                if (compMask)
                {
                    Value* vGatherResult = GATHERDD(
                        gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                    // e.g. result of an 8x32bit integer gather for 8bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                 pVtxOut,
                                                                 extendCastType,
                                                                 conversionType,
                                                                 currentVertexElement,
                                                                 outputElt,
                                                                 compMask,
                                                                 compCtrl,
                                                                 vVertexElements,
                                                                 info.swizzle);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle8bpcGatherd16(args)
                                  : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 16:
            {
                Value* vGatherResult[2];

                // if we have at least one component out of x or y to fetch
                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                {
                    vGatherResult[0] = GATHERDD(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                    // e.g. result of first 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
                    //
                }

                // if we have at least one component out of z or w to fetch
                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                {
                    // offset base to the next components(zw) in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));

                    vGatherResult[1] = GATHERDD(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
                    // e.g. result of second 8x32bit integer gather for 16bit components
                    // 256i - 0    1    2    3    4    5    6    7
                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                    //
                }

                // if we have at least one component to shuffle into place
                if (compMask)
                {
                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
                                                                  pVtxOut,
                                                                  extendCastType,
                                                                  conversionType,
                                                                  currentVertexElement,
                                                                  outputElt,
                                                                  compMask,
                                                                  compCtrl,
                                                                  vVertexElements);

                    // Shuffle gathered components into place in simdvertex struct
                    mVWidth == 16 ? Shuffle16bpcGather16(args)
                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
                }
            }
            break;
            case 32:
            {
                // Gathered components into place in simdvertex struct
                for (uint32_t i = 0; i < 4; i++)
                {
                    if (isComponentEnabled(compMask, i))
                    {
                        // if we need to gather the component
                        if (compCtrl[i] == StoreSrc)
                        {
                            Value* pGather =
                                GATHERDD(gatherSrc, pStreamBaseGFX, vOffsets, vGatherMask, 1, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);

                            if (conversionType == CONVERT_USCALED)
                            {
                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
                            }
                            else if (conversionType == CONVERT_SSCALED)
                            {
                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
                            }
                            else if (conversionType == CONVERT_SFIXED)
                            {
                                // 16.16 fixed point -> float
                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
                                               VBROADCAST(C(1 / 65536.0f)));
                            }

                            vVertexElements[currentVertexElement++] = pGather;

                            // e.g. result of a single 8x32bit integer gather for 32bit components
                            // 256i - 0    1    2    3    4    5    6    7
                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
                        }
                        else
                        {
                            vVertexElements[currentVertexElement++] =
                                GenerateCompCtrlVector(compCtrl[i]);
                        }

                        if (currentVertexElement > 3)
                        {
                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);

                            // reset to the next vVertexElement to output
                            currentVertexElement = 0;
                        }
                    }

                    // offset base to the next component in the vertex to gather
                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
                }
            }
            break;
            }
        }
    }

    // if we have a partially filled vVertexElement struct, output it
    if (currentVertexElement > 0)
    {
        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
    }
}

template <typename T>
Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
{
    SWR_ASSERT(pIndices->getType() == mInt64Ty &&
pLastIndex->getType() == mInt64Ty, 1087 "Function expects gfxptr_t for both input parameters."); 1088 1089 Type* Ty = nullptr; 1090 1091 static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), 1092 "Unsupported type for use with GetSimdValidIndicesHelper<T>"); 1093 constexpr bool bSize = (sizeof(T) == sizeof(uint16_t)); 1094 if (bSize) 1095 { 1096 Ty = mInt16PtrTy; 1097 } 1098 else if (sizeof(T) == sizeof(uint8_t)) 1099 { 1100 Ty = mInt8PtrTy; 1101 } 1102 else 1103 { 1104 SWR_ASSERT(false, "This should never happen as per static_assert above."); 1105 } 1106 1107 Value* vIndices = VUNDEF_I(); 1108 1109 { 1110 // store 0 index on stack to be used to conditionally load from if index address is OOB 1111 Value* pZeroIndex = ALLOCA(Ty->getPointerElementType()); 1112 STORE(C((T)0), pZeroIndex); 1113 1114 // Load a SIMD of index pointers 1115 for (int64_t lane = 0; lane < mVWidth; lane++) 1116 { 1117 // Calculate the address of the requested index 1118 Value* pIndex = GEP(pIndices, C(lane), Ty); 1119 1120 pLastIndex = INT_TO_PTR(pLastIndex, Ty); 1121 1122 // check if the address is less than the max index, 1123 Value* mask = ICMP_ULT(pIndex, pLastIndex); 1124 1125 // if valid, load the index. if not, load 0 from the stack 1126 Value* pValid = SELECT(mask, pIndex, pZeroIndex); 1127 Value* index = LOAD(pValid, "valid index", Ty, JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH); 1128 1129 // zero extended index to 32 bits and insert into the correct simd lane 1130 index = Z_EXT(index, mInt32Ty); 1131 vIndices = VINSERT(vIndices, index, lane); 1132 } 1133 } 1134 1135 return vIndices; 1136} 1137 1138////////////////////////////////////////////////////////////////////////// 1139/// @brief Loads a simd of valid indices. 
OOB indices are set to 0
/// *Note* have to do 8bit index checking in scalar until we have AVX-512
/// support
/// @param pIndices - pointer to 8 bit indices
/// @param pLastIndex - pointer to last valid index
/// @return simd of zero-extended 32bit indices (OOB lanes read as 0)
Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
{
    // delegate to the shared scalar-checked helper, typed for 8 bit indices
    return GetSimdValidIndicesHelper<uint8_t>(pIndices, pLastIndex);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices. OOB indices are set to 0
/// *Note* have to do 16bit index checking in scalar until we have AVX-512
/// support
/// @param pIndices - pointer to 16 bit indices
/// @param pLastIndex - pointer to last valid index
/// @return simd of zero-extended 32bit indices (OOB lanes read as 0)
Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
{
    // delegate to the shared scalar-checked helper, typed for 16 bit indices
    return GetSimdValidIndicesHelper<uint16_t>(pIndices, pLastIndex);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Loads a simd of valid indices.
OOB indices are set to 0
/// @param pIndices - pointer to 32 bit indices
/// @param pLastIndex - pointer to last valid index
/// @return simd of 32bit indices; lanes past pLastIndex are masked to 0
Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
{
    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
    // NOTE: the previously-constructed DataLayout and the iLastIndex/iIndices
    // aliases were unused and have been removed.
    Value* numIndicesLeft = SUB(pLastIndex, pIndices);
    numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
    numIndicesLeft        = SDIV(numIndicesLeft, C(4));

    // create a vector of index counts from the base index ptr passed into the fetch
    Constant* vIndexOffsets;
    if (mVWidth == 8)
    {
        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
    }
    else
    {
        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
    }

    // compare index count to the max valid index
    // e.g vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
    //     vIndexOffsets  0 1 2 3 4 5 6 7
    //     ------------------------------
    //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
    //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
    Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
    Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);

    // Load the indices; OOB loads 0
    return MASKED_LOAD(pIndices,
                       4,
                       vIndexMask,
                       VIMMED1(0),
                       "vIndices",
                       PointerType::get(mSimdInt32Ty, 0),
                       JIT_MEM_CLIENT::GFX_MEM_CLIENT_FETCH);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
//  the proper SIMD rows to be output to the simdvertex structure
/// @param args: (tuple of args, listed below)
/// @param vGatherResult - 8 gathered 8bpc vertices
/// @param pVtxOut - base
pointer to output simdvertex struct
/// @param extendType - sign extend or zero extend
/// @param bNormalized - do we need to denormalize?
/// @param currentVertexElement - reference to the current vVertexElement
/// @param outputElt - reference to the current offset from simdvertex we're o
/// @param compMask - component packing mask
/// @param compCtrl - component control val
/// @param vVertexElements[4] - vertex components to output
/// @param swizzle[4] - component swizzle location
void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
{
    // Unpack tuple args
    Value*& vGatherResult                  = std::get<0>(args);
    Value* pVtxOut                         = std::get<1>(args);
    const Instruction::CastOps extendType  = std::get<2>(args);
    const ConversionType conversionType    = std::get<3>(args);
    uint32_t& currentVertexElement         = std::get<4>(args);
    uint32_t& outputElt                    = std::get<5>(args);
    const ComponentEnable compMask         = std::get<6>(args);
    const ComponentControl(&compCtrl)[4]   = std::get<7>(args);
    Value*(&vVertexElements)[4]            = std::get<8>(args);
    const uint32_t(&swizzle)[4]            = std::get<9>(args);

    // cast types
    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);

    // sign extension (or signed int-to-float) needs the extra pshufb/permute work
    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
    {
        Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
        Type* v128Ty  = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);

        // byte shuffle mask, folding in any component swizzle
        const char x = (char)swizzle[0];
        const char y = (char)swizzle[1];
        const char z = (char)swizzle[2];
        const char w = (char)swizzle[3];
        Value* vShuffleMask = C<char>(
            {char(x), char(x + 4), char(x + 8), char(x + 12), char(y), char(y + 4),
             char(y + 8), char(y + 12), char(z), char(z + 4), char(z + 8), char(z + 12),
             char(w), char(w + 4), char(w + 8), char(w + 12), char(x), char(x + 4),
             char(x + 8), char(x + 12), char(y), char(y + 4), char(y + 8), char(y + 12),
             char(z), char(z + 4), char(z + 8), char(z + 12), char(w), char(w + 4),
             char(w + 8), char(w + 12)});

        // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
        Value* vGatherLo = EXTRACT_16(vGatherResult, 0);
        Value* vGatherHi = EXTRACT_16(vGatherResult, 1);

        Value* vShufLo = BITCAST(PSHUFB(BITCAST(vGatherLo, v32x8Ty), vShuffleMask), vGatherTy);
        Value* vShufHi = BITCAST(PSHUFB(BITCAST(vGatherHi, v32x8Ty), vShuffleMask), vGatherTy);

        // after pshufb: group components together in each 128bit lane
        // 256i - 0 1 2 3 4 5 6 7
        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

        Value* vi128XY_lo = nullptr;
        Value* vi128XY_hi = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            vi128XY_lo = BITCAST(
                VSHUFFLE(vShufLo, vShufLo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            vi128XY_hi = BITCAST(
                VSHUFFLE(vShufHi, vShufHi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);

            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0 1 2 3 4 5 6 7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
        }

        // do the same for zw components
        Value* vi128ZW_lo = nullptr;
        Value* vi128ZW_hi = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            vi128ZW_lo = BITCAST(
                VSHUFFLE(vShufLo, vShufLo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            vi128ZW_hi = BITCAST(
                VSHUFFLE(vShufHi, vShufHi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
        }

        // init denormalize variables if needed
        Instruction::CastOps toFpCast;
        Value* convFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            toFpCast   = Instruction::CastOps::SIToFP;
            convFactor = VIMMED1((float)(1.0 / 127.0));
            break;
        case CONVERT_SSCALED:
            toFpCast   = Instruction::CastOps::SIToFP;
            convFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            convFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            convFactor = nullptr;
            break;
        }

        // sign extend all enabled components; flush a full vVertexElements to the
        // current simdvertex as we go
        for (uint32_t i = 0; i < 4; i++)
        {
            if (!isComponentEnabled(compMask, i))
            {
                continue;
            }

            if (compCtrl[i] == ComponentControl::StoreSrc)
            {
                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                const uint32_t permLane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* vPermLo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
                Value* vPermHi = (i < 2) ? vi128XY_hi : vi128ZW_hi;

                // sign extend
                Value* vCompLo = PMOVSXBD(BITCAST(VEXTRACT(vPermLo, C(permLane)), v16x8Ty));
                Value* vCompHi = PMOVSXBD(BITCAST(VEXTRACT(vPermHi, C(permLane)), v16x8Ty));

                Value* vComp = JOIN_16(vCompLo, vCompHi);

                // denormalize if needed
                if (conversionType != CONVERT_NONE)
                {
                    vComp = FMUL(CAST(toFpCast, vComp, mSimdFP32Ty), convFactor);
                }

                vVertexElements[currentVertexElement] = vComp;
                currentVertexElement += 1;
            }
            else
            {
                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
            }

            if (currentVertexElement > 3)
            {
                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                // reset to the next vVertexElement to output
                currentVertexElement = 0;
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // init denormalize variables if needed
        Instruction::CastOps toFpCast;
        Value* convFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            toFpCast   = Instruction::CastOps::UIToFP;
            convFactor = VIMMED1((float)(1.0 / 255.0));
            break;
        case CONVERT_USCALED:
            toFpCast   = Instruction::CastOps::UIToFP;
            convFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            convFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            convFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // pshufb masks for each component
                    Value* vShuffleMask;
                    switch (swizzle[i])
                    {
                    case 0:
                        // x shuffle mask
                        vShuffleMask =
                            C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                     0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                        break;
                    case 1:
                        // y shuffle mask
                        vShuffleMask =
                            C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                     1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                        break;
                    case 2:
                        // z shuffle mask
                        vShuffleMask =
                            C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                     2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                        break;
                    case 3:
                        // w shuffle mask
                        vShuffleMask =
                            C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                     3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                        break;
                    default:
                        vShuffleMask = nullptr;
                        break;
                    }

                    // SIMD16 PSHUFB split into SIMD8 halves (not part of AVX-512F)
                    Value* vGatherLo = EXTRACT_16(vGatherResult, 0);
                    Value* vGatherHi = EXTRACT_16(vGatherResult, 1);

                    Value* vCompLo =
                        BITCAST(PSHUFB(BITCAST(vGatherLo, v32x8Ty), vShuffleMask), vGatherTy);
                    Value* vCompHi =
                        BITCAST(PSHUFB(BITCAST(vGatherHi, v32x8Ty), vShuffleMask), vGatherTy);

                    // after pshufb for x channel
                    // 256i - 0 1 2 3 4 5 6 7
                    //        x000 x000 x000 x000 x000 x000 x000 x000

                    Value* vComp = JOIN_16(vCompLo, vCompHi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vComp = FMUL(CAST(toFpCast, vComp, mSimdFP32Ty), convFactor);
                    }

                    vVertexElements[currentVertexElement] = vComp;
                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
currentVertexElement = 0; 1472 } 1473 } 1474 } 1475 } 1476 else 1477 { 1478 SWR_INVALID("Unsupported conversion type"); 1479 } 1480} 1481 1482void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args) 1483{ 1484 // Unpack tuple args 1485 Value*& vGatherResult = std::get<0>(args); 1486 Value* pVtxOut = std::get<1>(args); 1487 const Instruction::CastOps extendType = std::get<2>(args); 1488 const ConversionType conversionType = std::get<3>(args); 1489 uint32_t& currentVertexElement = std::get<4>(args); 1490 uint32_t& outputElt = std::get<5>(args); 1491 const ComponentEnable compMask = std::get<6>(args); 1492 const ComponentControl(&compCtrl)[4] = std::get<7>(args); 1493 Value*(&vVertexElements)[4] = std::get<8>(args); 1494 const uint32_t(&swizzle)[4] = std::get<9>(args); 1495 1496 // cast types 1497 Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits 1498 1499 for (uint32_t i = 0; i < 4; i++) 1500 { 1501 if (!isComponentEnabled(compMask, i)) 1502 continue; 1503 1504 if (compCtrl[i] == ComponentControl::StoreSrc) 1505 { 1506 std::vector<uint32_t> vShuffleMasks[4] = { 1507 {0, 4, 8, 12, 16, 20, 24, 28}, // x 1508 {1, 5, 9, 13, 17, 21, 25, 29}, // y 1509 {2, 6, 10, 14, 18, 22, 26, 30}, // z 1510 {3, 7, 11, 15, 19, 23, 27, 31}, // w 1511 }; 1512 1513 Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), 1514 UndefValue::get(v32x8Ty), 1515 vShuffleMasks[swizzle[i]]); 1516 1517 if ((extendType == Instruction::CastOps::SExt) || 1518 (extendType == Instruction::CastOps::SIToFP)) 1519 { 1520 switch (conversionType) 1521 { 1522 case CONVERT_NORMALIZED: 1523 val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0))); 1524 break; 1525 case CONVERT_SSCALED: 1526 val = SI_TO_FP(val, mSimdFP32Ty); 1527 break; 1528 case CONVERT_USCALED: 1529 SWR_INVALID("Type should not be sign extended!"); 1530 break; 1531 default: 1532 SWR_ASSERT(conversionType == CONVERT_NONE); 1533 val = S_EXT(val, mSimdInt32Ty); 1534 break; 1535 } 1536 } 1537 else if 
((extendType == Instruction::CastOps::ZExt) || 1538 (extendType == Instruction::CastOps::UIToFP)) 1539 { 1540 switch (conversionType) 1541 { 1542 case CONVERT_NORMALIZED: 1543 val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0))); 1544 break; 1545 case CONVERT_SSCALED: 1546 SWR_INVALID("Type should not be zero extended!"); 1547 break; 1548 case CONVERT_USCALED: 1549 val = UI_TO_FP(val, mSimdFP32Ty); 1550 break; 1551 default: 1552 SWR_ASSERT(conversionType == CONVERT_NONE); 1553 val = Z_EXT(val, mSimdInt32Ty); 1554 break; 1555 } 1556 } 1557 else 1558 { 1559 SWR_INVALID("Unsupported conversion type"); 1560 } 1561 1562 vVertexElements[currentVertexElement++] = val; 1563 } 1564 else 1565 { 1566 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 1567 } 1568 1569 if (currentVertexElement > 3) 1570 { 1571 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1572 // reset to the next vVertexElement to output 1573 currentVertexElement = 0; 1574 } 1575 } 1576} 1577 1578////////////////////////////////////////////////////////////////////////// 1579/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, 1580/// denormalizes if needed, converts to F32 if needed, and positions in 1581// the proper SIMD rows to be output to the simdvertex structure 1582/// @param args: (tuple of args, listed below) 1583/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index 1584/// @param pVtxOut - base pointer to output simdvertex struct 1585/// @param extendType - sign extend or zero extend 1586/// @param bNormalized - do we need to denormalize? 
1587/// @param currentVertexElement - reference to the current vVertexElement 1588/// @param outputElt - reference to the current offset from simdvertex we're o 1589/// @param compMask - component packing mask 1590/// @param compCtrl - component control val 1591/// @param vVertexElements[4] - vertex components to output 1592void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args) 1593{ 1594 // Unpack tuple args 1595 Value*(&vGatherResult)[2] = std::get<0>(args); 1596 Value* pVtxOut = std::get<1>(args); 1597 const Instruction::CastOps extendType = std::get<2>(args); 1598 const ConversionType conversionType = std::get<3>(args); 1599 uint32_t& currentVertexElement = std::get<4>(args); 1600 uint32_t& outputElt = std::get<5>(args); 1601 const ComponentEnable compMask = std::get<6>(args); 1602 const ComponentControl(&compCtrl)[4] = std::get<7>(args); 1603 Value*(&vVertexElements)[4] = std::get<8>(args); 1604 1605 // cast types 1606 Type* vGatherTy = VectorType::get(mInt32Ty, 8); 1607 Type* v32x8Ty = VectorType::get(mInt8Ty, 32); 1608 1609 // have to do extra work for sign extending 1610 if ((extendType == Instruction::CastOps::SExt) || 1611 (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) 1612 { 1613 // is this PP float? 1614 bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; 1615 1616 Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane 1617 Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2); 1618 1619 // shuffle mask 1620 Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, 1621 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); 1622 Value* vi128XY_lo = nullptr; 1623 Value* vi128XY_hi = nullptr; 1624 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) 1625 { 1626 // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for 1627 // now.. 
1628 1629 Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty); 1630 Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty); 1631 1632 Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); 1633 Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); 1634 1635 // after pshufb: group components together in each 128bit lane 1636 // 256i - 0 1 2 3 4 5 6 7 1637 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy 1638 1639 vi128XY_lo = BITCAST( 1640 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), 1641 v128bitTy); 1642 vi128XY_hi = BITCAST( 1643 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), 1644 v128bitTy); 1645 1646 // after PERMD: move and pack xy components into each 128bit lane 1647 // 256i - 0 1 2 3 4 5 6 7 1648 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy 1649 } 1650 1651 // do the same for zw components 1652 Value* vi128ZW_lo = nullptr; 1653 Value* vi128ZW_hi = nullptr; 1654 if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) 1655 { 1656 Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty); 1657 Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty); 1658 1659 Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy); 1660 Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy); 1661 1662 vi128ZW_lo = BITCAST( 1663 VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), 1664 v128bitTy); 1665 vi128ZW_hi = BITCAST( 1666 VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), 1667 v128bitTy); 1668 } 1669 1670 // init denormalize variables if needed 1671 Instruction::CastOps IntToFpCast; 1672 Value* conversionFactor; 1673 1674 switch (conversionType) 1675 { 1676 case CONVERT_NORMALIZED: 1677 IntToFpCast = Instruction::CastOps::SIToFP; 1678 conversionFactor = 
VIMMED1((float)(1.0 / 32767.0)); 1679 break; 1680 case CONVERT_SSCALED: 1681 IntToFpCast = Instruction::CastOps::SIToFP; 1682 conversionFactor = VIMMED1((float)(1.0)); 1683 break; 1684 case CONVERT_USCALED: 1685 SWR_INVALID("Type should not be sign extended!"); 1686 conversionFactor = nullptr; 1687 break; 1688 default: 1689 SWR_ASSERT(conversionType == CONVERT_NONE); 1690 conversionFactor = nullptr; 1691 break; 1692 } 1693 1694 // sign extend all enabled components. If we have a fill vVertexElements, output to current 1695 // simdvertex 1696 for (uint32_t i = 0; i < 4; i++) 1697 { 1698 if (isComponentEnabled(compMask, i)) 1699 { 1700 if (compCtrl[i] == ComponentControl::StoreSrc) 1701 { 1702 // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 1703 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; 1704 // if x or y, use vi128XY permute result, else use vi128ZW 1705 Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; 1706 Value* selectedPermute_hi = (i < 2) ? 
vi128XY_hi : vi128ZW_hi; 1707 1708 if (bFP) 1709 { 1710 // extract 128 bit lanes to sign extend each component 1711 Value* temp_lo = 1712 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); 1713 Value* temp_hi = 1714 CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); 1715 1716 vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); 1717 } 1718 else 1719 { 1720 // extract 128 bit lanes to sign extend each component 1721 Value* temp_lo = 1722 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); 1723 Value* temp_hi = 1724 PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); 1725 1726 Value* temp = JOIN_16(temp_lo, temp_hi); 1727 1728 // denormalize if needed 1729 if (conversionType != CONVERT_NONE) 1730 { 1731 temp = FMUL(CAST(IntToFpCast, temp, mSimdFP32Ty), conversionFactor); 1732 } 1733 1734 vVertexElements[currentVertexElement] = temp; 1735 } 1736 1737 currentVertexElement += 1; 1738 } 1739 else 1740 { 1741 vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); 1742 } 1743 1744 if (currentVertexElement > 3) 1745 { 1746 StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); 1747 // reset to the next vVertexElement to output 1748 currentVertexElement = 0; 1749 } 1750 } 1751 } 1752 } 1753 // else zero extend 1754 else if ((extendType == Instruction::CastOps::ZExt) || 1755 (extendType == Instruction::CastOps::UIToFP)) 1756 { 1757 // pshufb masks for each component 1758 Value* vConstMask[2]; 1759 1760 if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) 1761 { 1762 // x/z shuffle mask 1763 vConstMask[0] = C<char>({ 1764 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 1765 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 1766 }); 1767 } 1768 1769 if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) 1770 { 1771 // y/w shuffle mask 1772 vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, 
-1, 1773 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); 1774 } 1775 1776 // init denormalize variables if needed 1777 Instruction::CastOps fpCast; 1778 Value* conversionFactor; 1779 1780 switch (conversionType) 1781 { 1782 case CONVERT_NORMALIZED: 1783 fpCast = Instruction::CastOps::UIToFP; 1784 conversionFactor = VIMMED1((float)(1.0 / 65535.0)); 1785 break; 1786 case CONVERT_USCALED: 1787 fpCast = Instruction::CastOps::UIToFP; 1788 conversionFactor = VIMMED1((float)(1.0f)); 1789 break; 1790 case CONVERT_SSCALED: 1791 SWR_INVALID("Type should not be zero extended!"); 1792 conversionFactor = nullptr; 1793 break; 1794 default: 1795 SWR_ASSERT(conversionType == CONVERT_NONE); 1796 conversionFactor = nullptr; 1797 break; 1798 } 1799 1800 // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits 1801 for (uint32_t i = 0; i < 4; i++) 1802 { 1803 if (isComponentEnabled(compMask, i)) 1804 { 1805 if (compCtrl[i] == ComponentControl::StoreSrc) 1806 { 1807 // select correct constMask for x/z or y/w pshufb 1808 uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; 1809 // if x or y, use vi128XY permute result, else use vi128ZW 1810 uint32_t selectedGather = (i < 2) ? 0 : 1; 1811 1812 // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, 1813 // for now.. 
                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);

                    Value* temp_lo = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);
                    Value* temp_hi = BITCAST(
                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
                        vGatherTy);

                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather 256i - 0 1 2 3 4 5 6 7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // rejoin the two 8-wide halves into the full 16-wide result
                    Value* temp = JOIN_16(temp_lo, temp_hi);

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        temp = FMUL(CAST(fpCast, temp, mSimdFP32Ty), conversionFactor);
                    }

                    vVertexElements[currentVertexElement] = temp;

                    currentVertexElement += 1;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                // flush a completed simdvertex (4 elements) to memory
                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, sign or zero extends each
///        component to 32 bits, denormalizes / converts to FP32 where the
///        conversion type requires it, and accumulates results into the
///        current simdvertex, flushing to memory each time 4 elements fill.
/// @param args - Shuffle16bpcArgs tuple:
///        get<0> vGatherResult[2]   - raw gather results (xy in [0], zw in [1])
///        get<1> pVtxOut            - base pointer of the vertex output struct
///        get<2> extendType         - cast op selecting sign/zero/fp extension
///        get<3> conversionType     - CONVERT_* normalization to apply
///        get<4> currentVertexElement (in/out) - element slot being filled
///        get<5> outputElt          (in/out) - simdvertex row to store next
///        get<6> compMask           - which of x/y/z/w are enabled
///        get<7> compCtrl[4]        - per-component source/fill controls
///        get<8> vVertexElements[4] - accumulated output elements
void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
{
    // Unpack tuple args
    Value*(&vGatherResult)[2] = std::get<0>(args);
    Value* pVtxOut = std::get<1>(args);
    const Instruction::CastOps extendType = std::get<2>(args);
    const ConversionType conversionType = std::get<3>(args);
    uint32_t& currentVertexElement = std::get<4>(args);
    uint32_t& outputElt = std::get<5>(args);
    const ComponentEnable compMask = std::get<6>(args);
    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
    Value*(&vVertexElements)[4] = std::get<8>(args);

    // cast types
    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

    // have to do extra work for sign extending
    if ((extendType == Instruction::CastOps::SExt) ||
        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
    {
        // is this PP float? (16-bit float source, expanded via CVTPH2PS below)
        bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;

        Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                          mVWidth / 4); // vwidth is units of 32 bits

        // shuffle mask
        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
        Value* vi128XY = nullptr;
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0 1 2 3 4 5 6 7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0 1 2 3 4 5 6 7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
        }

        // do the same for zw components
        Value* vi128ZW = nullptr;
        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
        {
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
        }

        // init denormalize variables if needed
        Instruction::CastOps IntToFpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            // SNORM16 scale: 32767 is the max positive int16
            IntToFpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
            break;
        case CONVERT_SSCALED:
            IntToFpCast = Instruction::CastOps::SIToFP;
            conversionFactor = VIMMED1((float)(1.0));
            break;
        case CONVERT_USCALED:
            SWR_INVALID("Type should not be sign extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // sign extend all enabled components. If we have a fill vVertexElements, output to current
        // simdvertex
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                    uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use vi128XY permute result, else use vi128ZW
                    Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                    if (bFP)
                    {
                        // extract 128 bit lanes; expand half floats to full FP32
                        vVertexElements[currentVertexElement] =
                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                    }
                    else
                    {
                        // extract 128 bit lanes to sign extend each component
                        vVertexElements[currentVertexElement] =
                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));

                        // denormalize if needed
                        if (conversionType != CONVERT_NONE)
                        {
                            vVertexElements[currentVertexElement] =
                                FMUL(CAST(IntToFpCast,
                                          vVertexElements[currentVertexElement],
                                          mSimdFP32Ty),
                                     conversionFactor);
                        }
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                // flush a completed simdvertex (4 elements) to memory
                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    // else zero extend
    else if ((extendType == Instruction::CastOps::ZExt) ||
             (extendType == Instruction::CastOps::UIToFP))
    {
        // pshufb masks for each component
        Value* vConstMask[2];
        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
        {
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });
        }

        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
        {
            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
        }

        // init denormalize variables if needed
        Instruction::CastOps fpCast;
        Value* conversionFactor;

        switch (conversionType)
        {
        case CONVERT_NORMALIZED:
            // UNORM16 scale: 65535 is the max uint16
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
            break;
        case CONVERT_USCALED:
            fpCast = Instruction::CastOps::UIToFP;
            conversionFactor = VIMMED1((float)(1.0f));
            break;
        case CONVERT_SSCALED:
            SWR_INVALID("Type should not be zero extended!");
            conversionFactor = nullptr;
            break;
        default:
            SWR_ASSERT(conversionType == CONVERT_NONE);
            conversionFactor = nullptr;
            break;
        }

        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
        for (uint32_t i = 0; i < 4; i++)
        {
            if (isComponentEnabled(compMask, i))
            {
                if (compCtrl[i] == ComponentControl::StoreSrc)
                {
                    // select correct constMask for x/z or y/w pshufb
                    uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                    // if x or y, use the first gather result, else the second (zw)
                    uint32_t selectedGather = (i < 2) ? 0 : 1;

                    vVertexElements[currentVertexElement] =
                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
                                       vConstMask[selectedMask]),
                                vGatherTy);
                    // after pshufb mask for x channel; z uses the same shuffle from the second
                    // gather 256i - 0 1 2 3 4 5 6 7
                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00

                    // denormalize if needed
                    if (conversionType != CONVERT_NONE)
                    {
                        vVertexElements[currentVertexElement] =
                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
                                 conversionFactor);
                    }
                    currentVertexElement++;
                }
                else
                {
                    vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
                }

                // flush a completed simdvertex (4 elements) to memory
                if (currentVertexElement > 3)
                {
                    StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
                    // reset to the next vVertexElement to output
                    currentVertexElement = 0;
                }
            }
        }
    }
    else
    {
        SWR_INVALID("Unsupported conversion type");
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
/// @param outputElt - simdvertex offset in VIN to write to
/// @param numEltsToStore - number of simdvertex rows to write out
/// @param vVertexElements - LLVM Value*[] simdvertex to write out
/// @note May mutate vVertexElements in place: non-float elements are
///       bitcast to the FP32 SIMD type before being stored.
void FetchJit::StoreVertexElements(Value* pVtxOut,
                                   const uint32_t outputElt,
                                   const uint32_t numEltsToStore,
                                   Value* (&vVertexElements)[4])
{
    SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");

    for (uint32_t c = 0; c < numEltsToStore; ++c)
    {
        // STORE expects FP32 x vWidth type, just bitcast if needed
        if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
        {
#if FETCH_DUMP_VERTEX
            // debug-only: emit IR that prints the raw integer bits
            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
#endif
            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
        }
#if FETCH_DUMP_VERTEX
        else
        {
            // debug-only: emit IR that prints the float value
            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
        }
#endif
        // outputElt * 4 = offsetting by the size of a simdvertex
        // + c offsets to a 32bit x vWidth row within the current vertex
        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), nullptr, "destGEP");
        STORE(vVertexElements[c], dest);
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
/// @return SIMD-wide Value* holding the fill value (undef, 0, 1, 1.0f,
///         vertex id, or instance id) for a disabled/overridden component.
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
{
    switch (ctrl)
    {
    case NoStore:
        return VUNDEF_I();
    case Store0:
        return VIMMED1(0);
    case Store1Fp:
        return VIMMED1(1.0f);
    case Store1Int:
        return VIMMED1(1);
    case StoreVertexId:
    {
        if (mVWidth == 16)
        {
            // 16-wide: the fetch context carries vertex ids in two 8-wide
            // halves (VertexID / VertexID2); load both and join them
            Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
            Value* pIdLo =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
            Value* pIdHi =
                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
            return JOIN_16(pIdLo, pIdHi);
        }
        else
        {
            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
        }
    }
    case StoreInstanceId:
    {
        // instance id is scalar in the fetch context; broadcast to all lanes
        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
        return VBROADCAST(pId);
    }


    case StoreSrc:
    default:
        // StoreSrc is handled by the gather/shuffle paths, never here
        SWR_INVALID("Invalid component control");
        return VUNDEF_I();
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Returns the enable mask for the specified component.
/// @param enableMask - enable bits
/// @param component - component to check if enabled (0=X, 1=Y, 2=Z, 3=W).
/// @return true if the component's enable bit is set; false for any
///         component index outside 0..3.
bool isComponentEnabled(ComponentEnable enableMask, uint8_t component)
{
    switch (component)
    {
    // X
    case 0:
        return (enableMask & ComponentEnable::X);
    // Y
    case 1:
        return (enableMask & ComponentEnable::Y);
    // Z
    case 2:
        return (enableMask & ComponentEnable::Z);
    // W
    case 3:
        return (enableMask & ComponentEnable::W);

    default:
        return false;
    }
}

// Don't want two threads compiling the same fetch shader simultaneously
// Has problems in the JIT cache implementation
// This is only a problem for fetch right now.
2197static std::mutex gFetchCodegenMutex; 2198 2199////////////////////////////////////////////////////////////////////////// 2200/// @brief JITs from fetch shader IR 2201/// @param hJitMgr - JitManager handle 2202/// @param func - LLVM function IR 2203/// @return PFN_FETCH_FUNC - pointer to fetch code 2204PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) 2205{ 2206 const llvm::Function* func = (const llvm::Function*)hFunc; 2207 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 2208 PFN_FETCH_FUNC pfnFetch; 2209 2210 gFetchCodegenMutex.lock(); 2211 pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); 2212 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot 2213 // add new IR to the module 2214 pJitMgr->mIsModuleFinalized = true; 2215 2216#if defined(KNOB_SWRC_TRACING) 2217 char fName[1024]; 2218 const char* funcName = func->getName().data(); 2219 sprintf(fName, "%s.bin", funcName); 2220 FILE* fd = fopen(fName, "wb"); 2221 fwrite((void*)pfnFetch, 1, 2048, fd); 2222 fclose(fd); 2223#endif 2224 2225 pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final"); 2226 gFetchCodegenMutex.unlock(); 2227 2228 2229 return pfnFetch; 2230} 2231 2232////////////////////////////////////////////////////////////////////////// 2233/// @brief JIT compiles fetch shader 2234/// @param hJitMgr - JitManager handle 2235/// @param state - fetch state to build function from 2236extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state) 2237{ 2238 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 2239 2240 pJitMgr->SetupNewModule(); 2241 2242 FetchJit theJit(pJitMgr); 2243 HANDLE hFunc = theJit.Create(state); 2244 2245 return JitFetchFunc(hJitMgr, hFunc); 2246} 2247