1/**************************************************************************** 2 * Copyright (C) 2017 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 ****************************************************************************/ 23#pragma once 24 25#include "simdlib_types.hpp" 26 27// For documentation, please see the following include... 28// #include "simdlib_interface.hpp" 29 30namespace SIMDImpl 31{ 32 namespace SIMD128Impl 33 { 34#if SIMD_ARCH >= SIMD_ARCH_AVX 35 struct AVXImpl 36 { 37#define __SIMD_LIB_AVX_HPP__ 38#include "simdlib_128_avx.inl" 39#undef __SIMD_LIB_AVX_HPP__ 40 }; // struct AVXImpl 41#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX 42 43#if SIMD_ARCH >= SIMD_ARCH_AVX2 44 struct AVX2Impl : AVXImpl 45 { 46#define __SIMD_LIB_AVX2_HPP__ 47#include "simdlib_128_avx2.inl" 48#undef __SIMD_LIB_AVX2_HPP__ 49 }; // struct AVX2Impl 50#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 51 52#if SIMD_ARCH >= SIMD_ARCH_AVX512 53 struct AVX512Impl : AVX2Impl 54 { 55#if defined(SIMD_OPT_128_AVX512) 56#define __SIMD_LIB_AVX512_HPP__ 57#include "simdlib_128_avx512.inl" 58#if defined(SIMD_ARCH_KNIGHTS) 59#include "simdlib_128_avx512_knights.inl" 60#else // optimize for core 61#include "simdlib_128_avx512_core.inl" 62#endif // defined(SIMD_ARCH_KNIGHTS) 63#undef __SIMD_LIB_AVX512_HPP__ 64#endif // SIMD_OPT_128_AVX512 65 }; // struct AVX2Impl 66#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 67 68 struct Traits : SIMDImpl::Traits 69 { 70#if SIMD_ARCH == SIMD_ARCH_AVX 71 using IsaImpl = AVXImpl; 72#elif SIMD_ARCH == SIMD_ARCH_AVX2 73 using IsaImpl = AVX2Impl; 74#elif SIMD_ARCH == SIMD_ARCH_AVX512 75 using IsaImpl = AVX512Impl; 76#else 77#error Invalid value for SIMD_ARCH 78#endif 79 80 using Float = SIMD128Impl::Float; 81 using Double = SIMD128Impl::Double; 82 using Integer = SIMD128Impl::Integer; 83 using Vec4 = SIMD128Impl::Vec4; 84 using Mask = SIMD128Impl::Mask; 85 }; 86 } // namespace SIMD128Impl 87 88 namespace SIMD256Impl 89 { 90#if SIMD_ARCH >= SIMD_ARCH_AVX 91 struct AVXImpl 92 { 93#define __SIMD_LIB_AVX_HPP__ 94#include "simdlib_256_avx.inl" 95#undef __SIMD_LIB_AVX_HPP__ 96 }; // struct AVXImpl 97#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX 98 99#if SIMD_ARCH >= SIMD_ARCH_AVX2 100 struct AVX2Impl : AVXImpl 101 { 102#define __SIMD_LIB_AVX2_HPP__ 103#include "simdlib_256_avx2.inl" 104#undef __SIMD_LIB_AVX2_HPP__ 105 }; // struct AVX2Impl 106#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 107 108#if SIMD_ARCH >= SIMD_ARCH_AVX512 109 struct AVX512Impl : AVX2Impl 110 { 111#if defined(SIMD_OPT_256_AVX512) 112#define __SIMD_LIB_AVX512_HPP__ 113#include "simdlib_256_avx512.inl" 114#if defined(SIMD_ARCH_KNIGHTS) 115#include "simdlib_256_avx512_knights.inl" 116#else // optimize for core 117#include "simdlib_256_avx512_core.inl" 118#endif // defined(SIMD_ARCH_KNIGHTS) 119#undef __SIMD_LIB_AVX512_HPP__ 120#endif // SIMD_OPT_256_AVX512 121 }; // struct AVX2Impl 122#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 123 124 struct Traits : SIMDImpl::Traits 125 { 126#if SIMD_ARCH == SIMD_ARCH_AVX 127 using IsaImpl = AVXImpl; 128#elif SIMD_ARCH == SIMD_ARCH_AVX2 129 using IsaImpl = AVX2Impl; 130#elif SIMD_ARCH == SIMD_ARCH_AVX512 131 using IsaImpl = AVX512Impl; 132#else 133#error Invalid value for SIMD_ARCH 134#endif 135 136 using Float = SIMD256Impl::Float; 137 using Double = SIMD256Impl::Double; 138 using Integer = SIMD256Impl::Integer; 139 using Vec4 = SIMD256Impl::Vec4; 140 using Mask = SIMD256Impl::Mask; 141 }; 142 } // namespace SIMD256Impl 143 144 namespace SIMD512Impl 145 { 146#if SIMD_ARCH >= SIMD_ARCH_AVX 147 template <typename SIMD256T> 148 struct AVXImplBase 149 { 150#define __SIMD_LIB_AVX_HPP__ 151#include "simdlib_512_emu.inl" 152#include "simdlib_512_emu_masks.inl" 153#undef __SIMD_LIB_AVX_HPP__ 154 }; // struct AVXImplBase 155 using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>; 156#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX 157 158#if SIMD_ARCH >= SIMD_ARCH_AVX2 159 using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>; 160#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2 161 162#if SIMD_ARCH >= SIMD_ARCH_AVX512 163 struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl> 164 { 165#define __SIMD_LIB_AVX512_HPP__ 166#include "simdlib_512_avx512.inl" 167#include "simdlib_512_avx512_masks.inl" 168#if defined(SIMD_ARCH_KNIGHTS) 169#include "simdlib_512_avx512_knights.inl" 170#include "simdlib_512_avx512_masks_knights.inl" 171#else // optimize for core 172#include "simdlib_512_avx512_core.inl" 173#include "simdlib_512_avx512_masks_core.inl" 174#endif // defined(SIMD_ARCH_KNIGHTS) 175#undef __SIMD_LIB_AVX512_HPP__ 176 }; // struct AVX512ImplBase 177#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512 178 179 struct Traits : SIMDImpl::Traits 180 { 181#if SIMD_ARCH == SIMD_ARCH_AVX 182 using IsaImpl = AVXImpl; 183#elif SIMD_ARCH == SIMD_ARCH_AVX2 184 using IsaImpl = AVX2Impl; 185#elif SIMD_ARCH == SIMD_ARCH_AVX512 186 using IsaImpl = AVX512Impl; 187#else 188#error Invalid value for SIMD_ARCH 189#endif 190 191 using Float = SIMD512Impl::Float; 192 using Double = SIMD512Impl::Double; 193 using Integer = SIMD512Impl::Integer; 194 using Vec4 = SIMD512Impl::Vec4; 195 using Mask = SIMD512Impl::Mask; 196 }; 197 } // namespace SIMD512Impl 198} // namespace SIMDImpl 199 200template <typename Traits> 201struct SIMDBase : Traits::IsaImpl 202{ 203 using CompareType = typename Traits::CompareType; 204 using ScaleFactor = typename Traits::ScaleFactor; 205 using RoundMode = typename Traits::RoundMode; 206 using SIMD = typename Traits::IsaImpl; 207 using Float = typename Traits::Float; 208 using Double = typename Traits::Double; 209 using Integer = typename Traits::Integer; 210 using Vec4 = typename Traits::Vec4; 211 using Mask = typename Traits::Mask; 212 213 static const size_t VECTOR_BYTES = sizeof(Float); 214 215 // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww. 216 static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p) 217 { 218 r[0] = SIMD::set1_ps(p[0]); 219 r[1] = SIMD::set1_ps(p[1]); 220 r[2] = SIMD::set1_ps(p[2]); 221 r[3] = SIMD::set1_ps(p[3]); 222 } 223 224 static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s) 225 { 226 r[0] = s; 227 r[1] = s; 228 r[2] = s; 229 r[3] = s; 230 } 231 232 static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1) 233 { 234 Float tmp, r; 235 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x) 236 237 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y) 238 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) 239 240 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z) 241 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) 242 243 return r; 244 } 245 246 static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1) 247 { 248 Float tmp, r; 249 r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x) 250 251 tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y) 252 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) 253 254 tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z) 255 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) 256 257 tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w) 258 r = SIMD::add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) 259 260 return r; 261 } 262 263 static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v) 264 { 265 Float length = vec4_dp4_ps(v, v); 266 return SIMD::rsqrt_ps(length); 267 } 268 269 static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v) 270 { 271 Float rcpLength = vec4_rcp_length_ps(v); 272 273 r[0] = SIMD::mul_ps(v[0], rcpLength); 274 r[1] = SIMD::mul_ps(v[1], rcpLength); 275 r[2] = SIMD::mul_ps(v[2], rcpLength); 276 r[3] = SIMD::mul_ps(v[3], rcpLength); 277 } 278 279 static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s) 280 { 281 r[0] = SIMD::mul_ps(v[0], s); 282 r[1] = SIMD::mul_ps(v[1], s); 283 r[2] = SIMD::mul_ps(v[2], s); 284 r[3] = SIMD::mul_ps(v[3], s); 285 } 286 287 static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1) 288 { 289 r[0] = SIMD::mul_ps(v0[0], v1[0]); 290 r[1] = SIMD::mul_ps(v0[1], v1[1]); 291 r[2] = SIMD::mul_ps(v0[2], v1[2]); 292 r[3] = SIMD::mul_ps(v0[3], v1[3]); 293 } 294 295 static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s) 296 { 297 r[0] = SIMD::add_ps(v0[0], s); 298 r[1] = SIMD::add_ps(v0[1], s); 299 r[2] = SIMD::add_ps(v0[2], s); 300 r[3] = SIMD::add_ps(v0[3], s); 301 } 302 303 static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1) 304 { 305 r[0] = SIMD::add_ps(v0[0], v1[0]); 306 r[1] = SIMD::add_ps(v0[1], v1[1]); 307 r[2] = SIMD::add_ps(v0[2], v1[2]); 308 r[3] = SIMD::add_ps(v0[3], v1[3]); 309 } 310 311 static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s) 312 { 313 r[0] = SIMD::min_ps(v0[0], s); 314 r[1] = SIMD::min_ps(v0[1], s); 315 r[2] = SIMD::min_ps(v0[2], s); 316 r[3] = SIMD::min_ps(v0[3], s); 317 } 318 319 static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s) 320 { 321 r[0] = SIMD::max_ps(v0[0], s); 322 r[1] = SIMD::max_ps(v0[1], s); 323 r[2] = SIMD::max_ps(v0[2], s); 324 r[3] = SIMD::max_ps(v0[3], s); 325 } 326 327 // Matrix4x4 * Vector4 328 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w) 329 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w) 330 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w) 331 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) 332 static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4& result, 333 const float* pMatrix, 334 const Vec4& v) 335 { 336 Float m; 337 Float r0; 338 Float r1; 339 340 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] 341 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 342 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] 343 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 344 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 345 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] 346 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 347 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 348 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] 349 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) 350 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) 351 result[0] = r0; 352 353 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] 354 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 355 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] 356 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 357 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 358 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] 359 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 360 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 361 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] 362 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) 363 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) 364 result[1] = r0; 365 366 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] 367 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 368 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] 369 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 370 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 371 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] 372 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 373 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 374 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] 375 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) 376 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) 377 result[2] = r0; 378 379 m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0] 380 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 381 m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1] 382 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 383 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 384 m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2] 385 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 386 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 387 m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3] 388 r1 = SIMD::mul_ps(m, v[3]); // (m3 * v.z) 389 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) 390 result[3] = r0; 391 } 392 393 // Matrix4x4 * Vector3 - Direction Vector where w = 0. 394 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0) 395 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0) 396 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0) 397 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) 398 static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4& result, 399 const float* pMatrix, 400 const Vec4& v) 401 { 402 Float m; 403 Float r0; 404 Float r1; 405 406 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] 407 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 408 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] 409 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 410 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 411 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] 412 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 413 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 414 result[0] = r0; 415 416 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] 417 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 418 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] 419 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 420 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 421 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] 422 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 423 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 424 result[1] = r0; 425 426 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] 427 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 428 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] 429 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 430 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 431 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] 432 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 433 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 434 result[2] = r0; 435 436 result[3] = SIMD::setzero_ps(); 437 } 438 439 // Matrix4x4 * Vector3 - Position vector where w = 1. 440 // outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1) 441 // outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1) 442 // outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1) 443 // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) 444 static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4& result, 445 const float* pMatrix, 446 const Vec4& v) 447 { 448 Float m; 449 Float r0; 450 Float r1; 451 452 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] 453 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 454 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] 455 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 456 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 457 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] 458 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 459 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 460 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] 461 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) 462 result[0] = r0; 463 464 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] 465 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 466 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] 467 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 468 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 469 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] 470 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 471 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 472 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] 473 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) 474 result[1] = r0; 475 476 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] 477 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 478 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] 479 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 480 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 481 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] 482 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 483 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 484 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] 485 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) 486 result[2] = r0; 487 488 m = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0] 489 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 490 m = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1] 491 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 492 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 493 m = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2] 494 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 495 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 496 m = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3] 497 result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) 498 } 499 500 static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4& result, 501 const float* pMatrix, 502 const Vec4& v) 503 { 504 Float m; 505 Float r0; 506 Float r1; 507 508 m = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0] 509 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 510 m = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1] 511 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 512 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 513 m = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2] 514 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 515 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 516 m = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3] 517 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) 518 result[0] = r0; 519 520 m = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0] 521 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 522 m = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1] 523 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 524 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 525 m = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2] 526 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 527 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 528 m = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3] 529 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) 530 result[1] = r0; 531 532 m = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0] 533 r0 = SIMD::mul_ps(m, v[0]); // (m00 * v.x) 534 m = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1] 535 r1 = SIMD::mul_ps(m, v[1]); // (m1 * v.y) 536 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) 537 m = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2] 538 r1 = SIMD::mul_ps(m, v[2]); // (m2 * v.z) 539 r0 = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) 540 m = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3] 541 r0 = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) 542 result[2] = r0; 543 result[3] = SIMD::set1_ps(1.0f); 544 } 545}; // struct SIMDBase 546 547using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>; 548using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>; 549using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>; 550 551template <typename SIMD_T> 552using CompareType = typename SIMD_T::CompareType; 553template <typename SIMD_T> 554using ScaleFactor = typename SIMD_T::ScaleFactor; 555template <typename SIMD_T> 556using RoundMode = typename SIMD_T::RoundMode; 557template <typename SIMD_T> 558using Float = typename SIMD_T::Float; 559template <typename SIMD_T> 560using Double = typename SIMD_T::Double; 561template <typename SIMD_T> 562using Integer = typename SIMD_T::Integer; 563template <typename SIMD_T> 564using Vec4 = typename SIMD_T::Vec4; 565template <typename SIMD_T> 566using Mask = typename SIMD_T::Mask; 567 568