/****************************************************************************
 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file builder_misc.cpp
 *
 * @brief Implementation for miscellaneous builder functions
 *
 * Notes:
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "builder.h"

#include <cstdarg>

namespace SwrJit
{
    void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
    {
        SWR_ASSERT(
            ptr->getType() != mInt64Ty,
            "Address appears to be GFX access. Requires translation through BuilderGfxMem.");
    }

    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, Idx, Name);
    }

    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return GEPA(ptr, indices);
    }

    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return GEPA(ptr, indices);
    }

    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ptr, IdxList, Name);
    }

    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
    {
        return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(i);
        return IN_BOUNDS_GEP(ptr, indices);
    }

    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
    {
        std::vector<Value*> indices;
        for (auto i : indexList)
            indices.push_back(C(i));
        return IN_BOUNDS_GEP(ptr, indices);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, Name);
    }

    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ty, Ptr, Name);
    }

    LoadInst*
    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(Ptr, usage);
        return IRB()->CreateLoad(Ptr, isVolatile, Name);
    }

    LoadInst* Builder::LOAD(Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name,
                            Type*                                  Ty,
                            JIT_MEM_CLIENT                         usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return Builder::LOAD(GEPA(basePtr, valIndices), name);
    }

    LoadInst* Builder::LOADV(Value*                               basePtr,
                             const std::initializer_list<Value*>& indices,
                             const llvm::Twine&                   name)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return LOAD(GEPA(basePtr, valIndices), name);
    }

    StoreInst* Builder::STORE(Value*                                 val,
                              Value*                                 basePtr,
                              const std::initializer_list<uint32_t>& indices,
                              Type*                                  Ty,
                              JIT_MEM_CLIENT                         usage)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(C(i));
        return STORE(val, GEPA(basePtr, valIndices));
    }

    StoreInst*
    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
    {
        std::vector<Value*> valIndices;
        for (auto i : indices)
            valIndices.push_back(i);
        return STORE(val, GEPA(basePtr, valIndices));
    }

    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
    {
        return GEP(base, offset);
    }

    Value* Builder::MEM_ADD(Value*                                 i32Incr,
                            Value*                                 basePtr,
                            const std::initializer_list<uint32_t>& indices,
                            const llvm::Twine&                     name)
    {
        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
        Value* i32Result = ADD(i32Value, i32Incr);
        return STORE(i32Result, GEP(basePtr, indices));
    }
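
    //////////////////////////////////////////////////////////////////////////
    /// Usage sketch for the helpers above (illustrative only, not called by
    /// real code; 'pCounters' is a hypothetical pointer to a struct whose
    /// field 1 is an i32). The wrappers forward to LLVM's IRBuilder, so they
    /// compose the same way the raw builder calls do:
    ///
    ///     Value* pCount = GEP(pCounters, {0, 1});  // index to field 1
    ///     Value* count  = LOAD(pCount, "count");   // asserts non-GFX address
    ///     STORE(ADD(count, C(1)), pCount);         // same effect as
    ///                                              // MEM_ADD(C(1), pCounters, {0, 1})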

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that provides the result for masked-off lanes
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERPS(Value*         vSrc,
                             Value*         pBase,
                             Value*         vIndices,
                             Value*         vMask,
                             uint8_t        scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that provides the result for masked-off lanes
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value* Builder::GATHERDD(Value*         vSrc,
                             Value*         pBase,
                             Value*         vIndices,
                             Value*         vMask,
                             uint8_t        scale,
                             JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pBase, usage);

        return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Generate a masked gather operation in LLVM IR. If not
    /// supported on the underlying platform, emulate it with loads
    /// @param vSrc - SIMD wide value that provides the result for masked-off lanes
    /// @param pBase - Int8* base VB address pointer value
    /// @param vIndices - SIMD wide value of VB byte offsets
    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
    /// @param scale - value to scale indices by
    Value*
    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
    {
        return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Alternative masked gather where the source is a vector of pointers
    /// @param pVecSrcPtr - SIMD wide vector of pointers
    /// @param pVecMask - SIMD active lanes
    /// @param pVecPassthru - SIMD wide vector of values to use when a lane is inactive
    Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
    {
        return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
    }

    //////////////////////////////////////////////////////////////////////////
    /// @brief Gather 4 components of a formatted element, routing to the float
    ///        (GATHER4PS) or integer (GATHER4DD) path based on the format info
    void Builder::Gather4(const SWR_FORMAT format,
                          Value*           pSrcBase,
                          Value*           byteOffsets,
                          Value*           mask,
                          Value*           vGatherComponents[],
                          bool             bPackedOutput,
                          JIT_MEM_CLIENT   usage)
    {
        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
        if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
        {
            GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
        else
        {
            GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
        }
    }
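
    //////////////////////////////////////////////////////////////////////////
    /// Illustrative Gather4 call (hypothetical values): for a format such as
    /// R32G32B32A32_FLOAT, info.type[0] == SWR_TYPE_FLOAT and info.bpc[0] == 32,
    /// so the dispatch above lands in GATHER4PS:
    ///
    ///     Value* vComponents[4];
    ///     Gather4(R32G32B32A32_FLOAT, pVB, vByteOffsets, vActiveMask,
    ///             vComponents, false /*bPackedOutput*/);
    ///     // vComponents[0..3] now hold x, y, z, w, one SIMD register each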

    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            JIT_MEM_CLIENT         usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((float)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERPS(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("Invalid float format");
            break;
        }
    }
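
    //////////////////////////////////////////////////////////////////////////
    /// Note on the C((intptr_t)4) steps in GATHER4PS above: each 32-bit gather
    /// pulls either one 32-bit component or a packed pair of 16-bit components,
    /// so in both cases the next data to fetch begins 4 bytes past the current
    /// base pointer.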

    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
                            Value*                 pSrcBase,
                            Value*                 byteOffsets,
                            Value*                 vMask,
                            Value*                 vGatherComponents[],
                            bool                   bPackedOutput,
                            JIT_MEM_CLIENT         usage)
    {
        switch (info.bpp / info.numComps)
        {
        case 8:
        {
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);
            Value* vGatherResult =
                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of an 8x32bit integer gather for 8bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw

            Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 16:
        {
            Value* vGatherResult[2];

            // TODO: vGatherMaskedVal
            Value* vGatherMaskedVal = VIMMED1((int32_t)0);

            // always have at least one component out of x or y to fetch

            vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
            // e.g. result of first 8x32bit integer gather for 16bit components
            // 256i - 0    1    2    3    4    5    6    7
            //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
            //

            // if we have at least one component out of z or w to fetch
            if (info.numComps > 2)
            {
                // offset base to the next components (zw) in the vertex to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));

                vGatherResult[1] =
                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                // e.g. result of second 8x32bit integer gather for 16bit components
                // 256i - 0    1    2    3    4    5    6    7
                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                //
            }
            else
            {
                vGatherResult[1] = vGatherMaskedVal;
            }

            // Shuffle gathered components into place, each row is a component
            Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
        }
        break;
        case 32:
        {
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherComponents[i] = VIMMED1((int)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // Gather a SIMD of components
                vGatherComponents[swizzleIndex] = GATHERDD(
                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);

                // offset base to the next component to gather
                pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
            }
        }
        break;
        default:
            SWR_INVALID("unsupported format");
            break;
        }
    }
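
    //////////////////////////////////////////////////////////////////////////
    /// The Shuffle*Gather4 helpers below transpose the interleaved gather
    /// results produced above: vGatherInput holds per-lane packed xy/zw pairs
    /// (16bpc) or xyzw quads (8bpc), and vGatherOutput receives one SIMD
    /// register per component, placed according to the format's swizzle.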

    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
                                      Value*                 vGatherInput[2],
                                      Value*                 vGatherOutput[4],
                                      bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        // input could either be float or int vector; do shuffle work in int
        vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
        vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);

        if (bPackedOutput)
        {
            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                              mVWidth / 4); // vwidth is units of 32 bits

            // shuffle mask
            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            // after PERMD: move and pack xy components into each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                Value* vShufResult =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
            }

            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        else
        {
            // pshufb masks for each component
            Value* vConstMask[2];
            // x/z shuffle mask
            vConstMask[0] = C<char>({
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
            });

            // y/w shuffle mask
            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});

            // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // select correct constMask for x/z or y/w pshufb
                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, shuffle the first gather result, else the second (zw)
                uint32_t selectedGather = (i < 2) ? 0 : 1;

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
                                   vConstMask[selectedMask]),
                            vGatherTy);
                // after pshufb mask for x channel; z uses the same shuffle from the second gather
                // 256i - 0    1    2    3    4    5    6    7
                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
            }
        }
    }
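
    //////////////////////////////////////////////////////////////////////////
    /// Note: the -1 entries in the pshufb masks above and below rely on the
    /// PSHUFB rule that a control byte with its high bit set produces a zero
    /// byte, so the selected 16-bit (or 8-bit) values arrive in the low bits
    /// of each 32-bit lane already zero-extended.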

    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
                                     Value*                 vGatherInput,
                                     Value*                 vGatherOutput[],
                                     bool                   bPackedOutput)
    {
        // cast types
        Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits

        if (bPackedOutput)
        {
            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
                                           mVWidth / 4); // vwidth is units of 32 bits
            // shuffle mask
            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
            Value* vShufResult =
                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
            // after pshufb: group components together in each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww

            Value* vi128XY =
                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
            // 256i - 0    1    2    3    4    5    6    7
            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)

            // do the same for zw components
            Value* vi128ZW = nullptr;
            if (info.numComps > 2)
            {
                vi128ZW =
                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
            }

            // extract the enabled components from the permute results, one row per component
            for (uint32_t i = 0; i < 4; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];
                // todo: fix for packed
                Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i]));
                if (i >= info.numComps)
                {
                    // set the default component val
                    vGatherOutput[swizzleIndex] = vGatherMaskedVal;
                    continue;
                }

                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                // if x or y, use vi128XY permute result, else use vi128ZW
                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;

                // extract packed component 128 bit lanes
                vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
            }
        }
        // else zero extend
        else
        {
            // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
            // apply defaults
            for (uint32_t i = 0; i < 4; ++i)
            {
                vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
            }

            for (uint32_t i = 0; i < info.numComps; i++)
            {
                uint32_t swizzleIndex = info.swizzle[i];

                // pshufb masks for each component
                Value* vConstMask;
                switch (i)
                {
                case 0:
                    // x shuffle mask
                    vConstMask =
                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                    break;
                case 1:
                    // y shuffle mask
                    vConstMask =
                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                    break;
                case 2:
                    // z shuffle mask
                    vConstMask =
                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                    break;
                case 3:
                    // w shuffle mask
                    vConstMask =
                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                    break;
                default:
                    vConstMask = nullptr;
                    break;
                }

                vGatherOutput[swizzleIndex] =
                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                // after pshufb for x channel
                // 256i - 0    1    2    3    4    5    6    7
                //        x000 x000 x000 x000 x000 x000 x000 x000
            }
        }
    }
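
    //////////////////////////////////////////////////////////////////////////
    /// SCATTERPS below expands the scatter into a scalar store loop: x86 has
    /// no scatter instruction prior to AVX-512, so the source and offset
    /// vectors are spilled to the stack and the active lanes are stored out
    /// one at a time.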

    //////////////////////////////////////////////////////////////////////////
    /// @brief Emulates a scatter operation.
    /// @param pDst - pointer to destination
    /// @param vSrc - vector of src data to scatter
    /// @param vOffsets - vector of byte offsets from pDst
    /// @param vMask - mask of valid lanes
    void Builder::SCATTERPS(
        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage)
    {
        AssertMemoryUsageParams(pDst, usage);

        /* Scatter algorithm

           while (Index = BitScanForward(mask))
               srcElem    = srcVector[Index]
               offsetElem = offsetVector[Index]
               *(pDst + offsetElem) = srcElem
               mask &= ~(1 << Index)
        */

        BasicBlock* pCurBB = IRB()->GetInsertBlock();
        Function*   pFunc  = pCurBB->getParent();
        Type*       pSrcTy = vSrc->getType()->getVectorElementType();

        // Store vectors on stack
        if (pScatterStackSrc == nullptr)
        {
            // Save off stack allocations and reuse per scatter. Significantly reduces stack
            // requirements for shaders with a lot of scatters.
            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
        }

        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
        Value* pOffsetsArrayPtr = pScatterStackOffsets;
        STORE(vSrc, pSrcArrayPtr);
        STORE(vOffsets, pOffsetsArrayPtr);

        // Cast to pointers for random access
        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));

        Value* pMask = VMOVMSK(vMask);
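
        // Note: CTTZ with its second argument (is_zero_undef) set to false is
        // defined to return the operand bit width, 32 here, when the input is
        // zero; the ICMP_EQ(pIndex, C(32)) comparisons below use that as the
        // "no lanes left" sentinel.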

        // Setup loop basic block
        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);

        // compute first set bit
        Value* pIndex = CTTZ(pMask, C(false));

        Value* pIsUndef = ICMP_EQ(pIndex, C(32));

        // Split current block or create new one if building inline
        BasicBlock* pPostLoop;
        if (pCurBB->getTerminator())
        {
            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());

            // Remove unconditional jump created by splitBasicBlock
            pCurBB->getTerminator()->eraseFromParent();

            // Add terminator to end of original block
            IRB()->SetInsertPoint(pCurBB);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }
        else
        {
            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);

            // Add conditional branch
            COND_BR(pIsUndef, pPostLoop, pLoop);
        }

        // Add loop basic block contents
        IRB()->SetInsertPoint(pLoop);
        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);

        pIndexPhi->addIncoming(pIndex, pCurBB);
        pMaskPhi->addIncoming(pMask, pCurBB);

        // Extract elements for this index
        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});

        // GEP to this offset in dst
        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
        STORE(pSrcElem, pCurDst);

        // Update the mask
        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));

        // Terminator
        Value* pNewIndex = CTTZ(pNewMask, C(false));

        pIsUndef = ICMP_EQ(pNewIndex, C(32));
        COND_BR(pIsUndef, pPostLoop, pLoop);

        // Update phi edges
        pIndexPhi->addIncoming(pNewIndex, pLoop);
        pMaskPhi->addIncoming(pNewMask, pLoop);

        // Move builder to beginning of post loop
        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
    }
} // namespace SwrJit