1/************************************************************************** 2 * 3 * Copyright 2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, 18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 20 * USE OR OTHER DEALINGS IN THE SOFTWARE. 21 * 22 * The above copyright notice and this permission notice (including the 23 * next paragraph) shall be included in all copies or substantial portions 24 * of the Software. 25 * 26 **************************************************************************/ 27 28 29#include "util/u_debug.h" 30#include "util/u_cpu_detect.h" 31#include "util/u_math.h" 32#include "lp_bld_debug.h" 33#include "lp_bld_const.h" 34#include "lp_bld_format.h" 35#include "lp_bld_gather.h" 36#include "lp_bld_swizzle.h" 37#include "lp_bld_type.h" 38#include "lp_bld_init.h" 39#include "lp_bld_intr.h" 40#include "lp_bld_pack.h" 41 42 43/** 44 * Get the pointer to one element from scatter positions in memory. 45 * 46 * @sa lp_build_gather() 47 */ 48LLVMValueRef 49lp_build_gather_elem_ptr(struct gallivm_state *gallivm, 50 unsigned length, 51 LLVMValueRef base_ptr, 52 LLVMValueRef offsets, 53 unsigned i) 54{ 55 LLVMValueRef offset; 56 LLVMValueRef ptr; 57 58 assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); 59 60 if (length == 1) { 61 assert(i == 0); 62 offset = offsets; 63 } else { 64 LLVMValueRef index = lp_build_const_int32(gallivm, i); 65 offset = LLVMBuildExtractElement(gallivm->builder, offsets, index, ""); 66 } 67 68 ptr = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, ""); 69 70 return ptr; 71} 72 73 74/** 75 * Gather one element from scatter positions in memory. 76 * 77 * @sa lp_build_gather() 78 */ 79LLVMValueRef 80lp_build_gather_elem(struct gallivm_state *gallivm, 81 unsigned length, 82 unsigned src_width, 83 unsigned dst_width, 84 boolean aligned, 85 LLVMValueRef base_ptr, 86 LLVMValueRef offsets, 87 unsigned i, 88 boolean vector_justify) 89{ 90 LLVMTypeRef src_type = LLVMIntTypeInContext(gallivm->context, src_width); 91 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); 92 LLVMTypeRef dst_elem_type = LLVMIntTypeInContext(gallivm->context, dst_width); 93 LLVMValueRef ptr; 94 LLVMValueRef res; 95 96 assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); 97 98 ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); 99 ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); 100 res = LLVMBuildLoad(gallivm->builder, ptr, ""); 101 102 /* XXX 103 * On some archs we probably really want to avoid having to deal 104 * with alignments lower than 4 bytes (if fetch size is a power of 105 * two >= 32). On x86 it doesn't matter, however. 106 * We should be able to guarantee full alignment for any kind of texture 107 * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch 108 * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends 109 * but I don't think that's quite what we wanted). 110 * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT 111 * looks like a good fit, but it seems this cap bit (and OpenGL) aren't 112 * enforcing what we want (which is what d3d10 does, the offset needs to 113 * be aligned to element size, but GL has bytes regardless of element 114 * size which would only leave us with minimum alignment restriction of 16 115 * which doesn't make much sense if the type isn't 4x32bit). Due to 116 * translation of offsets to first_elem in sampler_views it actually seems 117 * gallium could not do anything else except 16 no matter what... 118 */ 119 if (!aligned) { 120 LLVMSetAlignment(res, 1); 121 } else if (!util_is_power_of_two_or_zero(src_width)) { 122 /* 123 * Full alignment is impossible, assume the caller really meant 124 * the individual elements were aligned (e.g. 3x32bit format). 125 * And yes the generated code may otherwise crash, llvm will 126 * really assume 128bit alignment with a 96bit fetch (I suppose 127 * that makes sense as it can just assume the upper 32bit to be 128 * whatever). 129 * Maybe the caller should be able to explicitly set this, but 130 * this should cover all the 3-channel formats. 131 */ 132 if (((src_width / 24) * 24 == src_width) && 133 util_is_power_of_two_or_zero(src_width / 24)) { 134 LLVMSetAlignment(res, src_width / 24); 135 } else { 136 LLVMSetAlignment(res, 1); 137 } 138 } 139 140 assert(src_width <= dst_width); 141 if (src_width < dst_width) { 142 res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); 143 if (vector_justify) { 144#ifdef PIPE_ARCH_BIG_ENDIAN 145 res = LLVMBuildShl(gallivm->builder, res, 146 LLVMConstInt(dst_elem_type, dst_width - src_width, 0), ""); 147#endif 148 } 149 } 150 151 return res; 152} 153 154 155/** 156 * Gather one element from scatter positions in memory. 157 * Nearly the same as above, however the individual elements 158 * may be vectors themselves, and fetches may be float type. 159 * Can also do pad vector instead of ZExt. 160 * 161 * @sa lp_build_gather() 162 */ 163static LLVMValueRef 164lp_build_gather_elem_vec(struct gallivm_state *gallivm, 165 unsigned length, 166 unsigned src_width, 167 LLVMTypeRef src_type, 168 struct lp_type dst_type, 169 boolean aligned, 170 LLVMValueRef base_ptr, 171 LLVMValueRef offsets, 172 unsigned i, 173 boolean vector_justify) 174{ 175 LLVMValueRef ptr, res; 176 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); 177 assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); 178 179 ptr = lp_build_gather_elem_ptr(gallivm, length, base_ptr, offsets, i); 180 ptr = LLVMBuildBitCast(gallivm->builder, ptr, src_ptr_type, ""); 181 res = LLVMBuildLoad(gallivm->builder, ptr, ""); 182 183 /* XXX 184 * On some archs we probably really want to avoid having to deal 185 * with alignments lower than 4 bytes (if fetch size is a power of 186 * two >= 32). On x86 it doesn't matter, however. 187 * We should be able to guarantee full alignment for any kind of texture 188 * fetch (except ARB_texture_buffer_range, oops), but not vertex fetch 189 * (there's PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY and friends 190 * but I don't think that's quite what we wanted). 191 * For ARB_texture_buffer_range, PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT 192 * looks like a good fit, but it seems this cap bit (and OpenGL) aren't 193 * enforcing what we want (which is what d3d10 does, the offset needs to 194 * be aligned to element size, but GL has bytes regardless of element 195 * size which would only leave us with minimum alignment restriction of 16 196 * which doesn't make much sense if the type isn't 4x32bit). Due to 197 * translation of offsets to first_elem in sampler_views it actually seems 198 * gallium could not do anything else except 16 no matter what... 199 */ 200 if (!aligned) { 201 LLVMSetAlignment(res, 1); 202 } else if (!util_is_power_of_two_or_zero(src_width)) { 203 /* 204 * Full alignment is impossible, assume the caller really meant 205 * the individual elements were aligned (e.g. 3x32bit format). 206 * And yes the generated code may otherwise crash, llvm will 207 * really assume 128bit alignment with a 96bit fetch (I suppose 208 * that makes sense as it can just assume the upper 32bit to be 209 * whatever). 210 * Maybe the caller should be able to explicitly set this, but 211 * this should cover all the 3-channel formats. 212 */ 213 if (((src_width / 24) * 24 == src_width) && 214 util_is_power_of_two_or_zero(src_width / 24)) { 215 LLVMSetAlignment(res, src_width / 24); 216 } else { 217 LLVMSetAlignment(res, 1); 218 } 219 } 220 221 assert(src_width <= dst_type.width * dst_type.length); 222 if (src_width < dst_type.width * dst_type.length) { 223 if (dst_type.length > 1) { 224 res = lp_build_pad_vector(gallivm, res, dst_type.length); 225 /* 226 * vector_justify hopefully a non-issue since we only deal 227 * with src_width >= 32 here? 228 */ 229 } else { 230 LLVMTypeRef dst_elem_type = lp_build_vec_type(gallivm, dst_type); 231 232 /* 233 * Only valid if src_ptr_type is int type... 234 */ 235 res = LLVMBuildZExt(gallivm->builder, res, dst_elem_type, ""); 236 237#ifdef PIPE_ARCH_BIG_ENDIAN 238 if (vector_justify) { 239 res = LLVMBuildShl(gallivm->builder, res, 240 LLVMConstInt(dst_elem_type, 241 dst_type.width - src_width, 0), ""); 242 } 243 if (src_width == 48) { 244 /* Load 3x16 bit vector. 245 * The sequence of loads on big-endian hardware proceeds as follows. 246 * 16-bit fields are denoted by X, Y, Z, and 0. In memory, the sequence 247 * of three fields appears in the order X, Y, Z. 248 * 249 * Load 32-bit word: 0.0.X.Y 250 * Load 16-bit halfword: 0.0.0.Z 251 * Rotate left: 0.X.Y.0 252 * Bitwise OR: 0.X.Y.Z 253 * 254 * The order in which we need the fields in the result is 0.Z.Y.X, 255 * the same as on little-endian; permute 16-bit fields accordingly 256 * within 64-bit register: 257 */ 258 LLVMValueRef shuffles[4] = { 259 lp_build_const_int32(gallivm, 2), 260 lp_build_const_int32(gallivm, 1), 261 lp_build_const_int32(gallivm, 0), 262 lp_build_const_int32(gallivm, 3), 263 }; 264 res = LLVMBuildBitCast(gallivm->builder, res, 265 lp_build_vec_type(gallivm, lp_type_uint_vec(16, 4*16)), ""); 266 res = LLVMBuildShuffleVector(gallivm->builder, res, res, LLVMConstVector(shuffles, 4), ""); 267 res = LLVMBuildBitCast(gallivm->builder, res, dst_elem_type, ""); 268 } 269#endif 270 } 271 } 272 return res; 273} 274 275 276 277 278static LLVMValueRef 279lp_build_gather_avx2(struct gallivm_state *gallivm, 280 unsigned length, 281 unsigned src_width, 282 struct lp_type dst_type, 283 LLVMValueRef base_ptr, 284 LLVMValueRef offsets) 285{ 286 LLVMBuilderRef builder = gallivm->builder; 287 LLVMTypeRef src_type, src_vec_type; 288 LLVMValueRef res; 289 struct lp_type res_type = dst_type; 290 res_type.length *= length; 291 292 if (dst_type.floating) { 293 src_type = src_width == 64 ? LLVMDoubleTypeInContext(gallivm->context) : 294 LLVMFloatTypeInContext(gallivm->context); 295 } else { 296 src_type = LLVMIntTypeInContext(gallivm->context, src_width); 297 } 298 src_vec_type = LLVMVectorType(src_type, length); 299 300 /* XXX should allow hw scaling (can handle i8, i16, i32, i64 for x86) */ 301 assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0)); 302 303 if (0) { 304 /* 305 * XXX: This will cause LLVM pre 3.7 to hang; it works on LLVM 3.8 but 306 * will not use the AVX2 gather instrinsics (even with llvm 4.0), at 307 * least with Haswell. See 308 * http://lists.llvm.org/pipermail/llvm-dev/2016-January/094448.html 309 * And the generated code doing the emulation is quite a bit worse 310 * than what we get by doing it ourselves too. 311 */ 312 LLVMTypeRef i32_type = LLVMIntTypeInContext(gallivm->context, 32); 313 LLVMTypeRef i32_vec_type = LLVMVectorType(i32_type, length); 314 LLVMTypeRef i1_type = LLVMIntTypeInContext(gallivm->context, 1); 315 LLVMTypeRef i1_vec_type = LLVMVectorType(i1_type, length); 316 LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); 317 LLVMValueRef src_ptr; 318 319 base_ptr = LLVMBuildBitCast(builder, base_ptr, src_ptr_type, ""); 320 321 /* Rescale offsets from bytes to elements */ 322 LLVMValueRef scale = LLVMConstInt(i32_type, src_width/8, 0); 323 scale = lp_build_broadcast(gallivm, i32_vec_type, scale); 324 assert(LLVMTypeOf(offsets) == i32_vec_type); 325 offsets = LLVMBuildSDiv(builder, offsets, scale, ""); 326 327 src_ptr = LLVMBuildGEP(builder, base_ptr, &offsets, 1, "vector-gep"); 328 329 char intrinsic[64]; 330 util_snprintf(intrinsic, sizeof intrinsic, "llvm.masked.gather.v%u%s%u", 331 length, dst_type.floating ? "f" : "i", src_width); 332 LLVMValueRef alignment = LLVMConstInt(i32_type, src_width/8, 0); 333 LLVMValueRef mask = LLVMConstAllOnes(i1_vec_type); 334 LLVMValueRef passthru = LLVMGetUndef(src_vec_type); 335 336 LLVMValueRef args[] = { src_ptr, alignment, mask, passthru }; 337 338 res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 4, 0); 339 } else { 340 LLVMTypeRef i8_type = LLVMIntTypeInContext(gallivm->context, 8); 341 const char *intrinsic = NULL; 342 unsigned l_idx = 0; 343 344 assert(src_width == 32 || src_width == 64); 345 if (src_width == 32) { 346 assert(length == 4 || length == 8); 347 } else { 348 assert(length == 2 || length == 4); 349 } 350 351 static const char *intrinsics[2][2][2] = { 352 353 {{"llvm.x86.avx2.gather.d.d", 354 "llvm.x86.avx2.gather.d.d.256"}, 355 {"llvm.x86.avx2.gather.d.q", 356 "llvm.x86.avx2.gather.d.q.256"}}, 357 358 {{"llvm.x86.avx2.gather.d.ps", 359 "llvm.x86.avx2.gather.d.ps.256"}, 360 {"llvm.x86.avx2.gather.d.pd", 361 "llvm.x86.avx2.gather.d.pd.256"}}, 362 }; 363 364 if ((src_width == 32 && length == 8) || 365 (src_width == 64 && length == 4)) { 366 l_idx = 1; 367 } 368 intrinsic = intrinsics[dst_type.floating][src_width == 64][l_idx]; 369 370 LLVMValueRef passthru = LLVMGetUndef(src_vec_type); 371 LLVMValueRef mask = LLVMConstAllOnes(src_vec_type); 372 mask = LLVMConstBitCast(mask, src_vec_type); 373 LLVMValueRef scale = LLVMConstInt(i8_type, 1, 0); 374 375 LLVMValueRef args[] = { passthru, base_ptr, offsets, mask, scale }; 376 377 res = lp_build_intrinsic(builder, intrinsic, src_vec_type, args, 5, 0); 378 } 379 res = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, res_type), ""); 380 381 return res; 382} 383 384 385/** 386 * Gather elements from scatter positions in memory into a single vector. 387 * Use for fetching texels from a texture. 388 * For SSE, typical values are length=4, src_width=32, dst_width=32. 389 * 390 * When src_width < dst_width, the return value can be justified in 391 * one of two ways: 392 * "integer justification" is used when the caller treats the destination 393 * as a packed integer bitmask, as described by the channels' "shift" and 394 * "width" fields; 395 * "vector justification" is used when the caller casts the destination 396 * to a vector and needs channel X to be in vector element 0. 397 * 398 * @param length length of the offsets 399 * @param src_width src element width in bits 400 * @param dst_type result element type (src will be expanded to fit, 401 * but truncation is not allowed) 402 * (this may be a vector, must be pot sized) 403 * @param aligned whether the data is guaranteed to be aligned (to src_width) 404 * @param base_ptr base pointer, needs to be a i8 pointer type. 405 * @param offsets vector with offsets 406 * @param vector_justify select vector rather than integer justification 407 */ 408LLVMValueRef 409lp_build_gather(struct gallivm_state *gallivm, 410 unsigned length, 411 unsigned src_width, 412 struct lp_type dst_type, 413 boolean aligned, 414 LLVMValueRef base_ptr, 415 LLVMValueRef offsets, 416 boolean vector_justify) 417{ 418 LLVMValueRef res; 419 boolean need_expansion = src_width < dst_type.width * dst_type.length; 420 boolean vec_fetch; 421 struct lp_type fetch_type, fetch_dst_type; 422 LLVMTypeRef src_type; 423 424 assert(src_width <= dst_type.width * dst_type.length); 425 426 /* 427 * This is quite a mess... 428 * Figure out if the fetch should be done as: 429 * a) scalar or vector 430 * b) float or int 431 * 432 * As an example, for a 96bit fetch expanded into 4x32bit, it is better 433 * to use (3x32bit) vector type (then pad the vector). Otherwise, the 434 * zext will cause extra instructions. 435 * However, the same isn't true for 3x16bit (the codegen for that is 436 * completely worthless on x86 simd, and for 3x8bit is is way worse 437 * still, don't try that... (To get really good code out of llvm for 438 * these cases, the only way is to decompose the fetches manually 439 * into 1x32bit/1x16bit, or 1x16/1x8bit respectively, although the latter 440 * case requires sse41, otherwise simple scalar zext is way better. 441 * But probably not important enough, so don't bother.) 442 * Also, we try to honor the floating bit of destination (but isn't 443 * possible if caller asks for instance for 2x32bit dst_type with 444 * 48bit fetch - the idea would be to use 3x16bit fetch, pad and 445 * cast to 2x32f type, so the fetch is always int and on top of that 446 * we avoid the vec pad and use scalar zext due the above mentioned 447 * issue). 448 * Note this is optimized for x86 sse2 and up backend. Could be tweaked 449 * for other archs if necessary... 450 */ 451 if (((src_width % 32) == 0) && ((src_width % dst_type.width) == 0) && 452 (dst_type.length > 1)) { 453 /* use vector fetch (if dst_type is vector) */ 454 vec_fetch = TRUE; 455 if (dst_type.floating) { 456 fetch_type = lp_type_float_vec(dst_type.width, src_width); 457 } else { 458 fetch_type = lp_type_int_vec(dst_type.width, src_width); 459 } 460 /* intentionally not using lp_build_vec_type here */ 461 src_type = LLVMVectorType(lp_build_elem_type(gallivm, fetch_type), 462 fetch_type.length); 463 fetch_dst_type = fetch_type; 464 fetch_dst_type.length = dst_type.length; 465 } else { 466 /* use scalar fetch */ 467 vec_fetch = FALSE; 468 if (dst_type.floating && ((src_width == 32) || (src_width == 64))) { 469 fetch_type = lp_type_float(src_width); 470 } else { 471 fetch_type = lp_type_int(src_width); 472 } 473 src_type = lp_build_vec_type(gallivm, fetch_type); 474 fetch_dst_type = fetch_type; 475 fetch_dst_type.width = dst_type.width * dst_type.length; 476 } 477 478 if (length == 1) { 479 /* Scalar */ 480 res = lp_build_gather_elem_vec(gallivm, length, 481 src_width, src_type, fetch_dst_type, 482 aligned, base_ptr, offsets, 0, 483 vector_justify); 484 return LLVMBuildBitCast(gallivm->builder, res, 485 lp_build_vec_type(gallivm, dst_type), ""); 486 /* 487 * Excluding expansion from these paths because if you need it for 488 * 32bit/64bit fetches you're doing it wrong (this is gather, not 489 * conversion) and it would be awkward for floats. 490 */ 491 } else if (util_cpu_caps.has_avx2 && !need_expansion && 492 src_width == 32 && (length == 4 || length == 8)) { 493 return lp_build_gather_avx2(gallivm, length, src_width, dst_type, 494 base_ptr, offsets); 495 /* 496 * This looks bad on paper wrt throughtput/latency on Haswell. 497 * Even on Broadwell it doesn't look stellar. 498 * Albeit no measurements were done (but tested to work). 499 * Should definitely enable on Skylake. 500 * (In general, should be more of a win if the fetch is 256bit wide - 501 * this is true for the 32bit case above too.) 502 */ 503 } else if (0 && util_cpu_caps.has_avx2 && !need_expansion && 504 src_width == 64 && (length == 2 || length == 4)) { 505 return lp_build_gather_avx2(gallivm, length, src_width, dst_type, 506 base_ptr, offsets); 507 } else { 508 /* Vector */ 509 510 LLVMValueRef elems[LP_MAX_VECTOR_WIDTH / 8]; 511 unsigned i; 512 boolean vec_zext = FALSE; 513 struct lp_type res_type, gather_res_type; 514 LLVMTypeRef res_t, gather_res_t; 515 516 res_type = fetch_dst_type; 517 res_type.length *= length; 518 gather_res_type = res_type; 519 520 if (src_width == 16 && dst_type.width == 32 && dst_type.length == 1) { 521 /* 522 * Note that llvm is never able to optimize zext/insert combos 523 * directly (i.e. zero the simd reg, then place the elements into 524 * the appropriate place directly). (I think this has to do with 525 * scalar/vector transition.) And scalar 16->32bit zext simd loads 526 * aren't possible (instead loading to scalar reg first). 527 * No idea about other archs... 528 * We could do this manually, but instead we just use a vector 529 * zext, which is simple enough (and, in fact, llvm might optimize 530 * this away). 531 * (We're not trying that with other bit widths as that might not be 532 * easier, in particular with 8 bit values at least with only sse2.) 533 */ 534 assert(vec_fetch == FALSE); 535 gather_res_type.width /= 2; 536 fetch_dst_type = fetch_type; 537 src_type = lp_build_vec_type(gallivm, fetch_type); 538 vec_zext = TRUE; 539 } 540 res_t = lp_build_vec_type(gallivm, res_type); 541 gather_res_t = lp_build_vec_type(gallivm, gather_res_type); 542 res = LLVMGetUndef(gather_res_t); 543 for (i = 0; i < length; ++i) { 544 LLVMValueRef index = lp_build_const_int32(gallivm, i); 545 elems[i] = lp_build_gather_elem_vec(gallivm, length, 546 src_width, src_type, fetch_dst_type, 547 aligned, base_ptr, offsets, i, 548 vector_justify); 549 if (!vec_fetch) { 550 res = LLVMBuildInsertElement(gallivm->builder, res, elems[i], index, ""); 551 } 552 } 553 if (vec_zext) { 554 res = LLVMBuildZExt(gallivm->builder, res, res_t, ""); 555 if (vector_justify) { 556#ifdef PIPE_ARCH_BIG_ENDIAN 557 unsigned sv = dst_type.width - src_width; 558 res = LLVMBuildShl(gallivm->builder, res, 559 lp_build_const_int_vec(gallivm, res_type, sv), ""); 560#endif 561 } 562 } 563 if (vec_fetch) { 564 /* 565 * Do bitcast now otherwise llvm might get some funny ideas wrt 566 * float/int types... 567 */ 568 for (i = 0; i < length; i++) { 569 elems[i] = LLVMBuildBitCast(gallivm->builder, elems[i], 570 lp_build_vec_type(gallivm, dst_type), ""); 571 } 572 res = lp_build_concat(gallivm, elems, dst_type, length); 573 } else { 574 struct lp_type really_final_type = dst_type; 575 assert(res_type.length * res_type.width == 576 dst_type.length * dst_type.width * length); 577 really_final_type.length *= length; 578 res = LLVMBuildBitCast(gallivm->builder, res, 579 lp_build_vec_type(gallivm, really_final_type), ""); 580 } 581 } 582 583 return res; 584} 585 586LLVMValueRef 587lp_build_gather_values(struct gallivm_state * gallivm, 588 LLVMValueRef * values, 589 unsigned value_count) 590{ 591 LLVMTypeRef vec_type = LLVMVectorType(LLVMTypeOf(values[0]), value_count); 592 LLVMBuilderRef builder = gallivm->builder; 593 LLVMValueRef vec = LLVMGetUndef(vec_type); 594 unsigned i; 595 596 for (i = 0; i < value_count; i++) { 597 LLVMValueRef index = lp_build_const_int32(gallivm, i); 598 vec = LLVMBuildInsertElement(builder, vec, values[i], index, ""); 599 } 600 return vec; 601} 602