1/**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 ****************************************************************************/ 23 24#ifndef __SWR_SIMDINTRIN_H__ 25#define __SWR_SIMDINTRIN_H__ 26 27#include "common/intrin.h" 28#include "common/simdlib.hpp" 29 30#if KNOB_SIMD_WIDTH == 8 31typedef SIMD256 SIMD; 32#else 33#error Unsupported vector width 34#endif // KNOB_SIMD16_WIDTH == 16 35 36#define _simd128_maskstore_ps SIMD128::maskstore_ps 37#define _simd128_fmadd_ps SIMD128::fmadd_ps 38 39#define _simd_load_ps SIMD::load_ps 40#define _simd_load1_ps SIMD::broadcast_ss 41#define _simd_loadu_ps SIMD::loadu_ps 42#define _simd_setzero_ps SIMD::setzero_ps 43#define _simd_set1_ps SIMD::set1_ps 44#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b) 45#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b) 46#define _simd_blendv_ps SIMD::blendv_ps 47#define _simd_store_ps SIMD::store_ps 48#define _simd_mul_ps SIMD::mul_ps 49#define _simd_add_ps SIMD::add_ps 50#define _simd_sub_ps SIMD::sub_ps 51#define _simd_rsqrt_ps SIMD::rsqrt_ps 52#define _simd_min_ps SIMD::min_ps 53#define _simd_max_ps SIMD::max_ps 54#define _simd_movemask_ps SIMD::movemask_ps 55#define _simd_cvtps_epi32 SIMD::cvtps_epi32 56#define _simd_cvttps_epi32 SIMD::cvttps_epi32 57#define _simd_cvtepi32_ps SIMD::cvtepi32_ps 58#define _simd_cmplt_ps SIMD::cmplt_ps 59#define _simd_cmpgt_ps SIMD::cmpgt_ps 60#define _simd_cmpneq_ps SIMD::cmpneq_ps 61#define _simd_cmpeq_ps SIMD::cmpeq_ps 62#define _simd_cmpge_ps SIMD::cmpge_ps 63#define _simd_cmple_ps SIMD::cmple_ps 64#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b) 65#define _simd_and_ps SIMD::and_ps 66#define _simd_or_ps SIMD::or_ps 67#define _simd_rcp_ps SIMD::rcp_ps 68#define _simd_div_ps SIMD::div_ps 69#define _simd_castsi_ps SIMD::castsi_ps 70#define _simd_castps_pd SIMD::castps_pd 71#define _simd_castpd_ps SIMD::castpd_ps 72#define _simd_andnot_ps SIMD::andnot_ps 73#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a) 74#define _simd_castpd_ps SIMD::castpd_ps 75#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a)) 76#define _simd_stream_ps SIMD::stream_ps 77 78#define _simd_movemask_pd SIMD::movemask_pd 79#define _simd_castsi_pd SIMD::castsi_pd 80 81#define _simd_mul_epi32 SIMD::mul_epi32 82#define _simd_mullo_epi32 SIMD::mullo_epi32 83#define _simd_sub_epi32 SIMD::sub_epi32 84#define _simd_sub_epi64 SIMD::sub_epi64 85#define _simd_min_epi32 SIMD::min_epi32 86#define _simd_min_epu32 SIMD::min_epu32 87#define _simd_max_epi32 SIMD::max_epi32 88#define _simd_max_epu32 SIMD::max_epu32 89#define _simd_add_epi32 SIMD::add_epi32 90#define _simd_and_si SIMD::and_si 91#define _simd_andnot_si SIMD::andnot_si 92#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32 93#define _simd_cmplt_epi32 SIMD::cmplt_epi32 94#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32 95#define _simd_or_si SIMD::or_si 96#define _simd_xor_si SIMD::xor_si 97#define _simd_castps_si SIMD::castps_si 98#define _simd_adds_epu8 SIMD::adds_epu8 99#define _simd_subs_epu8 SIMD::subs_epu8 100#define _simd_add_epi8 SIMD::add_epi8 101#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64 102#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64 103#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8 104#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8 105#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16 106#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16 107#define _simd_movemask_epi8 SIMD::movemask_epi8 108#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a) 109#define _simd_permute_ps SIMD::permute_ps 110#define _simd_permute_epi32 SIMD::permute_epi32 111#define _simd_srlv_epi32 SIMD::srlv_epi32 112#define _simd_sllv_epi32 SIMD::sllv_epi32 113 114#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8 115#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8 116#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16 117#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16 118#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32 119#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32 120#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64 121#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64 122 123#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a) 124#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a) 125#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a) 126#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a) 127 128#define _simd_fmadd_ps SIMD::fmadd_ps 129#define _simd_fmsub_ps SIMD::fmsub_ps 130#define _simd_shuffle_epi8 SIMD::shuffle_epi8 131 132#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o) 133#define _simd_mask_i32gather_ps(r, p, o, m, s) \ 134 SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m) 135#define _simd_abs_epi32 SIMD::abs_epi32 136 137#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16 138#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32 139#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32 140#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64 141#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64 142 143#define _simd_packus_epi16 SIMD::packus_epi16 144#define _simd_packs_epi16 SIMD::packs_epi16 145#define _simd_packus_epi32 SIMD::packus_epi32 146#define _simd_packs_epi32 SIMD::packs_epi32 147 148#define _simd_unpacklo_ps SIMD::unpacklo_ps 149#define _simd_unpackhi_ps SIMD::unpackhi_ps 150#define _simd_unpacklo_pd SIMD::unpacklo_pd 151#define _simd_unpackhi_pd SIMD::unpackhi_pd 152#define _simd_insertf128_ps SIMD::insertf128_ps 153#define _simd_insertf128_pd SIMD::insertf128_pd 154#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b) 155#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a) 156#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a) 157#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a) 158#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b) 159#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b) 160#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b) 161#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b) 162#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b) 163#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b) 164#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b) 165#define _simd_set1_epi32 SIMD::set1_epi32 166#define _simd_set_epi32 SIMD::set_epi32 167#define _simd_set_ps SIMD::set_ps 168#define _simd_set1_epi8 SIMD::set1_epi8 169#define _simd_setzero_si SIMD::setzero_si 170#define _simd_cvttps_epi32 SIMD::cvttps_epi32 171#define _simd_store_si SIMD::store_si 172#define _simd_broadcast_ss SIMD::broadcast_ss 173#define _simd_maskstore_ps SIMD::maskstore_ps 174#define _simd_load_si SIMD::load_si 175#define _simd_loadu_si SIMD::loadu_si 176#define _simd_sub_ps SIMD::sub_ps 177#define _simd_testz_ps SIMD::testz_ps 178#define _simd_testz_si SIMD::testz_si 179#define _simd_xor_ps SIMD::xor_ps 180 181#define _simd_loadu2_si SIMD::loadu2_si 182#define _simd_storeu2_si SIMD::storeu2_si 183 184#define _simd_blendv_epi32 SIMD::blendv_epi32 185#define _simd_vmask_ps SIMD::vmask_ps 186 187template <int mask> 188SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b) 189{ 190 return SIMD128::castps_si( 191 SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b))); 192} 193 194SIMDINLINE 195void _simd_mov(simdscalar& r, unsigned int rlane, simdscalar& s, unsigned int slane) 196{ 197 OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH]; 198 SIMD256::store_ps(rArray, r); 199 SIMD256::store_ps(sArray, s); 200 rArray[rlane] = sArray[slane]; 201 r = SIMD256::load_ps(rArray); 202} 203 204// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww. 205#define _simdvec_load_ps SIMD::vec4_load1_ps 206 207SIMDINLINE 208void _simdvec_mov(simdvector& r, const simdscalar& s) 209{ 210 SIMD::vec4_set1_vps(r, s); 211} 212 213SIMDINLINE 214void _simdvec_mov(simdvector& r, const simdvector& v) 215{ 216 r = v; 217} 218 219#if 0 220// just move a lane from the source simdvector to dest simdvector 221SIMDINLINE 222void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) 223{ 224 _simd_mov(r[0], rlane, s[0], slane); 225 _simd_mov(r[1], rlane, s[1], slane); 226 _simd_mov(r[2], rlane, s[2], slane); 227 _simd_mov(r[3], rlane, s[3], slane); 228} 229 230#endif 231 232#define _simdvec_dp3_ps SIMD::vec4_dp3_ps 233#define _simdvec_dp4_ps SIMD::vec4_dp4_ps 234#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps 235#define _simdvec_normalize_ps SIMD::vec4_normalize_ps 236#define _simdvec_mul_ps SIMD::vec4_mul_ps 237#define _simdvec_add_ps SIMD::vec4_add_ps 238#define _simdvec_min_ps SIMD::vec4_min_ps 239#define _simdvec_max_ps SIMD::vec4_max_ps 240#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply 241#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply 242#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply 243#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply 244 245////////////////////////////////////////////////////////////////////////// 246/// @brief Compute plane equation vA * vX + vB * vY + vC 247SIMDINLINE simdscalar vplaneps(simdscalar const& vA, 248 simdscalar const& vB, 249 simdscalar const& vC, 250 simdscalar const& vX, 251 simdscalar const& vY) 252{ 253 simdscalar vOut = _simd_fmadd_ps(vA, vX, vC); 254 vOut = _simd_fmadd_ps(vB, vY, vOut); 255 return vOut; 256} 257 258////////////////////////////////////////////////////////////////////////// 259/// @brief Compute plane equation vA * vX + vB * vY + vC 260SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA, 261 simd4scalar const& vB, 262 simd4scalar const& vC, 263 simd4scalar const& vX, 264 simd4scalar const& vY) 265{ 266 simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC); 267 vOut = _simd128_fmadd_ps(vB, vY, vOut); 268 return vOut; 269} 270 271////////////////////////////////////////////////////////////////////////// 272/// @brief Interpolates a single component. 273/// @param vI - barycentric I 274/// @param vJ - barycentric J 275/// @param pInterpBuffer - pointer to attribute barycentric coeffs 276template <UINT Attrib, UINT Comp, UINT numComponents = 4> 277static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI, 278 simdscalar const& vJ, 279 const float* pInterpBuffer) 280{ 281 const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; 282 const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp]; 283 const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp]; 284 285 simdscalar vA = _simd_broadcast_ss(pInterpA); 286 simdscalar vB = _simd_broadcast_ss(pInterpB); 287 simdscalar vC = _simd_broadcast_ss(pInterpC); 288 289 simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ); 290 vC = _simd_mul_ps(vk, vC); 291 292 return vplaneps(vA, vB, vC, vI, vJ); 293} 294 295////////////////////////////////////////////////////////////////////////// 296/// @brief Interpolates a single component (flat shade). 297/// @param pInterpBuffer - pointer to attribute barycentric coeffs 298template <UINT Attrib, UINT Comp, UINT numComponents = 4> 299static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer) 300{ 301 const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; 302 303 simdscalar vA = _simd_broadcast_ss(pInterpA); 304 305 return vA; 306} 307 308////////////////////////////////////////////////////////////////////////// 309/// @brief Interpolates a single component (flat shade). 310/// @param pInterpBuffer - pointer to attribute barycentric coeffs 311template <UINT Attrib, UINT Comp, UINT numComponents = 4> 312static SIMDINLINE simdscalari InterpolateComponentFlatInt(const uint32_t* pInterpBuffer) 313{ 314 const uint32_t interpA = pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; 315 316 simdscalari vA = _simd_set1_epi32(interpA); 317 318 return vA; 319} 320 321////////////////////////////////////////////////////////////////////////// 322/// @brief Interpolates a single component. 323/// @param vI - barycentric I 324/// @param vJ - barycentric J 325/// @param pInterpBuffer - pointer to attribute barycentric coeffs 326template <UINT Attrib, UINT Comp, UINT numComponents = 4> 327static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI, 328 simd4scalar const& vJ, 329 const float* pInterpBuffer) 330{ 331 const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp]; 332 const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp]; 333 const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp]; 334 335 simd4scalar vA = SIMD128::broadcast_ss(pInterpA); 336 simd4scalar vB = SIMD128::broadcast_ss(pInterpB); 337 simd4scalar vC = SIMD128::broadcast_ss(pInterpC); 338 339 simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ); 340 vC = SIMD128::mul_ps(vk, vC); 341 342 return vplaneps(vA, vB, vC, vI, vJ); 343} 344 345static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a) 346{ 347 simd4scalari ai = SIMD128::castps_si(a); 348 return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff))); 349} 350 351static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a) 352{ 353 simdscalari ai = _simd_castps_si(a); 354 return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff))); 355} 356 357#include "simd16intrin.h" 358 359#endif //__SWR_SIMDINTRIN_H__ 360