/****************************************************************************
 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 ****************************************************************************/
23#pragma once
24
25#include "simdlib_types.hpp"
26
27// For documentation, please see the following include...
28// #include "simdlib_interface.hpp"
29
30namespace SIMDImpl
31{
32    namespace SIMD128Impl
33    {
34#if SIMD_ARCH >= SIMD_ARCH_AVX
35        struct AVXImpl
36        {
37#define __SIMD_LIB_AVX_HPP__
38#include "simdlib_128_avx.inl"
39#undef __SIMD_LIB_AVX_HPP__
40        }; // struct AVXImpl
41#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX
42
43#if SIMD_ARCH >= SIMD_ARCH_AVX2
44        struct AVX2Impl : AVXImpl
45        {
46#define __SIMD_LIB_AVX2_HPP__
47#include "simdlib_128_avx2.inl"
48#undef __SIMD_LIB_AVX2_HPP__
49        }; // struct AVX2Impl
50#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX2
51
52#if SIMD_ARCH >= SIMD_ARCH_AVX512
53        struct AVX512Impl : AVX2Impl
54        {
55#if defined(SIMD_OPT_128_AVX512)
56#define __SIMD_LIB_AVX512_HPP__
57#include "simdlib_128_avx512.inl"
58#if defined(SIMD_ARCH_KNIGHTS)
59#include "simdlib_128_avx512_knights.inl"
60#else // optimize for core
61#include "simdlib_128_avx512_core.inl"
62#endif // defined(SIMD_ARCH_KNIGHTS)
63#undef __SIMD_LIB_AVX512_HPP__
64#endif     // SIMD_OPT_128_AVX512
65        }; // struct AVX2Impl
66#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
67
68        struct Traits : SIMDImpl::Traits
69        {
70#if SIMD_ARCH == SIMD_ARCH_AVX
71            using IsaImpl = AVXImpl;
72#elif SIMD_ARCH == SIMD_ARCH_AVX2
73            using IsaImpl = AVX2Impl;
74#elif SIMD_ARCH == SIMD_ARCH_AVX512
75            using IsaImpl = AVX512Impl;
76#else
77#error Invalid value for SIMD_ARCH
78#endif
79
80            using Float   = SIMD128Impl::Float;
81            using Double  = SIMD128Impl::Double;
82            using Integer = SIMD128Impl::Integer;
83            using Vec4    = SIMD128Impl::Vec4;
84            using Mask    = SIMD128Impl::Mask;
85        };
86    } // namespace SIMD128Impl
87
88    namespace SIMD256Impl
89    {
90#if SIMD_ARCH >= SIMD_ARCH_AVX
91        struct AVXImpl
92        {
93#define __SIMD_LIB_AVX_HPP__
94#include "simdlib_256_avx.inl"
95#undef __SIMD_LIB_AVX_HPP__
96        }; // struct AVXImpl
97#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX
98
99#if SIMD_ARCH >= SIMD_ARCH_AVX2
100        struct AVX2Impl : AVXImpl
101        {
102#define __SIMD_LIB_AVX2_HPP__
103#include "simdlib_256_avx2.inl"
104#undef __SIMD_LIB_AVX2_HPP__
105        }; // struct AVX2Impl
106#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX2
107
108#if SIMD_ARCH >= SIMD_ARCH_AVX512
109        struct AVX512Impl : AVX2Impl
110        {
111#if defined(SIMD_OPT_256_AVX512)
112#define __SIMD_LIB_AVX512_HPP__
113#include "simdlib_256_avx512.inl"
114#if defined(SIMD_ARCH_KNIGHTS)
115#include "simdlib_256_avx512_knights.inl"
116#else // optimize for core
117#include "simdlib_256_avx512_core.inl"
118#endif // defined(SIMD_ARCH_KNIGHTS)
119#undef __SIMD_LIB_AVX512_HPP__
120#endif     // SIMD_OPT_256_AVX512
121        }; // struct AVX2Impl
122#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
123
124        struct Traits : SIMDImpl::Traits
125        {
126#if SIMD_ARCH == SIMD_ARCH_AVX
127            using IsaImpl = AVXImpl;
128#elif SIMD_ARCH == SIMD_ARCH_AVX2
129            using IsaImpl = AVX2Impl;
130#elif SIMD_ARCH == SIMD_ARCH_AVX512
131            using IsaImpl = AVX512Impl;
132#else
133#error Invalid value for SIMD_ARCH
134#endif
135
136            using Float   = SIMD256Impl::Float;
137            using Double  = SIMD256Impl::Double;
138            using Integer = SIMD256Impl::Integer;
139            using Vec4    = SIMD256Impl::Vec4;
140            using Mask    = SIMD256Impl::Mask;
141        };
142    } // namespace SIMD256Impl
143
144    namespace SIMD512Impl
145    {
146#if SIMD_ARCH >= SIMD_ARCH_AVX
147        template <typename SIMD256T>
148        struct AVXImplBase
149        {
150#define __SIMD_LIB_AVX_HPP__
151#include "simdlib_512_emu.inl"
152#include "simdlib_512_emu_masks.inl"
153#undef __SIMD_LIB_AVX_HPP__
154        }; // struct AVXImplBase
155        using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
156#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
157
158#if SIMD_ARCH >= SIMD_ARCH_AVX2
159        using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
160#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
161
162#if SIMD_ARCH >= SIMD_ARCH_AVX512
163        struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
164        {
165#define __SIMD_LIB_AVX512_HPP__
166#include "simdlib_512_avx512.inl"
167#include "simdlib_512_avx512_masks.inl"
168#if defined(SIMD_ARCH_KNIGHTS)
169#include "simdlib_512_avx512_knights.inl"
170#include "simdlib_512_avx512_masks_knights.inl"
171#else // optimize for core
172#include "simdlib_512_avx512_core.inl"
173#include "simdlib_512_avx512_masks_core.inl"
174#endif // defined(SIMD_ARCH_KNIGHTS)
175#undef __SIMD_LIB_AVX512_HPP__
176        }; // struct AVX512ImplBase
177#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
178
179        struct Traits : SIMDImpl::Traits
180        {
181#if SIMD_ARCH == SIMD_ARCH_AVX
182            using IsaImpl = AVXImpl;
183#elif SIMD_ARCH == SIMD_ARCH_AVX2
184            using IsaImpl = AVX2Impl;
185#elif SIMD_ARCH == SIMD_ARCH_AVX512
186            using IsaImpl = AVX512Impl;
187#else
188#error Invalid value for SIMD_ARCH
189#endif
190
191            using Float   = SIMD512Impl::Float;
192            using Double  = SIMD512Impl::Double;
193            using Integer = SIMD512Impl::Integer;
194            using Vec4    = SIMD512Impl::Vec4;
195            using Mask    = SIMD512Impl::Mask;
196        };
197    } // namespace SIMD512Impl
198} // namespace SIMDImpl
199
// SIMDBase layers Vec4 and matrix helpers on top of the ISA implementation
// chosen by Traits::IsaImpl (which it also inherits from, so every IsaImpl
// member is reachable through SIMDBase as well).
//
// All helpers are written purely in terms of IsaImpl primitives (set1_ps,
// load1_ps, setzero_ps, mul_ps, add_ps, min_ps, max_ps, rsqrt_ps), so the same
// source serves every instantiation.
//
// A Vec4 is indexed 0..3; the comments below call those entries x, y, z, w.
// Each entry is a Float.
//
// Matrices are passed as a plain float pointer and read as m[row][col] at
// pMatrix[row * 4 + col], i.e. four floats per row, even for the 3x3 / 4x3
// variants.
template <typename Traits>
struct SIMDBase : Traits::IsaImpl
{
    using CompareType = typename Traits::CompareType;
    using ScaleFactor = typename Traits::ScaleFactor;
    using RoundMode   = typename Traits::RoundMode;
    using SIMD        = typename Traits::IsaImpl;
    using Float       = typename Traits::Float;
    using Double      = typename Traits::Double;
    using Integer     = typename Traits::Integer;
    using Vec4        = typename Traits::Vec4;
    using Mask        = typename Traits::Mask;

    // Size in bytes of one Float.
    static const size_t VECTOR_BYTES = sizeof(Float);

    // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
    // Reads p[0] .. p[3].
    static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p)
    {
        r[0] = SIMD::set1_ps(p[0]);
        r[1] = SIMD::set1_ps(p[1]);
        r[2] = SIMD::set1_ps(p[2]);
        r[3] = SIMD::set1_ps(p[3]);
    }

    // Copies the same Float s into all four components of r.
    static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s)
    {
        r[0] = s;
        r[1] = s;
        r[2] = s;
        r[3] = s;
    }

    // 3-component dot product: (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z).
    // The w components are ignored.
    static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
    {
        Float r = SIMD::mul_ps(v0[0], v1[0]);            // (v0.x*v1.x)
        r       = SIMD::add_ps(r, SIMD::mul_ps(v0[1], v1[1])); // ... + (v0.y*v1.y)
        r       = SIMD::add_ps(r, SIMD::mul_ps(v0[2], v1[2])); // ... + (v0.z*v1.z)
        return r;
    }

    // 4-component dot product:
    //   (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
    // Terms are accumulated left to right, x first.
    static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
    {
        return SIMD::add_ps(vec4_dp3_ps(v0, v1), SIMD::mul_ps(v0[3], v1[3]));
    }

    // Returns rsqrt_ps of the 4-component dot product of v with itself, i.e.
    // the reciprocal length of v with w included.
    static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v)
    {
        Float lengthSq = vec4_dp4_ps(v, v); // squared length, not the length itself
        return SIMD::rsqrt_ps(lengthSq);
    }

    // r = v scaled by vec4_rcp_length_ps(v); all four components are scaled.
    // The reciprocal length is computed before r is written.
    static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v)
    {
        Float rcpLength = vec4_rcp_length_ps(v);

        r[0] = SIMD::mul_ps(v[0], rcpLength);
        r[1] = SIMD::mul_ps(v[1], rcpLength);
        r[2] = SIMD::mul_ps(v[2], rcpLength);
        r[3] = SIMD::mul_ps(v[3], rcpLength);
    }

    // r = v * s, with the same Float s applied to every component.
    static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s)
    {
        r[0] = SIMD::mul_ps(v[0], s);
        r[1] = SIMD::mul_ps(v[1], s);
        r[2] = SIMD::mul_ps(v[2], s);
        r[3] = SIMD::mul_ps(v[3], s);
    }

    // r = v0 * v1, component by component.
    static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        r[0] = SIMD::mul_ps(v0[0], v1[0]);
        r[1] = SIMD::mul_ps(v0[1], v1[1]);
        r[2] = SIMD::mul_ps(v0[2], v1[2]);
        r[3] = SIMD::mul_ps(v0[3], v1[3]);
    }

    // r = v0 + s, with the same Float s applied to every component.
    static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s)
    {
        r[0] = SIMD::add_ps(v0[0], s);
        r[1] = SIMD::add_ps(v0[1], s);
        r[2] = SIMD::add_ps(v0[2], s);
        r[3] = SIMD::add_ps(v0[3], s);
    }

    // r = v0 + v1, component by component.
    static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
    {
        r[0] = SIMD::add_ps(v0[0], v1[0]);
        r[1] = SIMD::add_ps(v0[1], v1[1]);
        r[2] = SIMD::add_ps(v0[2], v1[2]);
        r[3] = SIMD::add_ps(v0[3], v1[3]);
    }

    // r = min_ps(v0, s) applied to every component.
    static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s)
    {
        r[0] = SIMD::min_ps(v0[0], s);
        r[1] = SIMD::min_ps(v0[1], s);
        r[2] = SIMD::min_ps(v0[2], s);
        r[3] = SIMD::min_ps(v0[3], s);
    }

    // r = max_ps(v0, s) applied to every component.
    static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s)
    {
        r[0] = SIMD::max_ps(v0[0], s);
        r[1] = SIMD::max_ps(v0[1], s);
        r[2] = SIMD::max_ps(v0[2], s);
        r[3] = SIMD::max_ps(v0[3], s);
    }

    // Matrix4x4 * Vector4
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
    // Reads pMatrix[0] .. pMatrix[15]. result may be the same object as v.
    static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4&        result,
                                                         const float* pMatrix,
                                                         const Vec4&  v)
    {
        // Finish every read of v before the first store so that an aliased
        // result does not feed already-transformed components into later rows.
        const Float x = mat_row_dot4(pMatrix + 0 * 4, v);
        const Float y = mat_row_dot4(pMatrix + 1 * 4, v);
        const Float z = mat_row_dot4(pMatrix + 2 * 4, v);
        const Float w = mat_row_dot4(pMatrix + 3 * 4, v);

        result[0] = x;
        result[1] = y;
        result[2] = z;
        result[3] = w;
    }

    // Matrix3x3 * Vector3 - Direction Vector where w = 0.
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
    //   outVec.w = 0
    // Only the upper-left 3x3 of the matrix is read, but rows are still four
    // floats apart. v.w is ignored. result may be the same object as v.
    static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4&        result,
                                                            const float* pMatrix,
                                                            const Vec4&  v)
    {
        const Float x = mat_row_dot3(pMatrix + 0 * 4, v);
        const Float y = mat_row_dot3(pMatrix + 1 * 4, v);
        const Float z = mat_row_dot3(pMatrix + 2 * 4, v);

        result[0] = x;
        result[1] = y;
        result[2] = z;
        result[3] = SIMD::setzero_ps();
    }

    // Matrix4x4 * Vector3 - Position vector where w = 1.
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
    // Reads pMatrix[0] .. pMatrix[15]. v.w is ignored (treated as 1).
    // result may be the same object as v.
    static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4&        result,
                                                            const float* pMatrix,
                                                            const Vec4&  v)
    {
        const Float x = mat_row_affine3(pMatrix + 0 * 4, v);
        const Float y = mat_row_affine3(pMatrix + 1 * 4, v);
        const Float z = mat_row_affine3(pMatrix + 2 * 4, v);
        const Float w = mat_row_affine3(pMatrix + 3 * 4, v);

        result[0] = x;
        result[1] = y;
        result[2] = z;
        result[3] = w;
    }

    // Matrix4x3 * Vector3 - Position vector where w = 1; only matrix rows
    // 0..2 are read and the output w is forced to 1.
    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
    //   outVec.w = 1
    // Reads pMatrix[0] .. pMatrix[11]. v.w is ignored (treated as 1).
    // result may be the same object as v.
    static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4&        result,
                                                            const float* pMatrix,
                                                            const Vec4&  v)
    {
        const Float x = mat_row_affine3(pMatrix + 0 * 4, v);
        const Float y = mat_row_affine3(pMatrix + 1 * 4, v);
        const Float z = mat_row_affine3(pMatrix + 2 * 4, v);

        result[0] = x;
        result[1] = y;
        result[2] = z;
        result[3] = SIMD::set1_ps(1.0f);
    }

private:
    // (row[0] * v.x) + (row[1] * v.y) + (row[2] * v.z), accumulated left to
    // right. Each matrix entry is broadcast with load1_ps.
    static SIMDINLINE Float SIMDCALL mat_row_dot3(const float* pRow, const Vec4& v)
    {
        Float acc = SIMD::mul_ps(SIMD::load1_ps(pRow + 0), v[0]);
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 1), v[1]));
        acc       = SIMD::add_ps(acc, SIMD::mul_ps(SIMD::load1_ps(pRow + 2), v[2]));
        return acc;
    }

    // mat_row_dot3(row, v) + (row[3] * v.w)
    static SIMDINLINE Float SIMDCALL mat_row_dot4(const float* pRow, const Vec4& v)
    {
        return SIMD::add_ps(mat_row_dot3(pRow, v),
                            SIMD::mul_ps(SIMD::load1_ps(pRow + 3), v[3]));
    }

    // mat_row_dot3(row, v) + row[3]; the w == 1 case, so the multiply by w is
    // skipped.
    static SIMDINLINE Float SIMDCALL mat_row_affine3(const float* pRow, const Vec4& v)
    {
        return SIMD::add_ps(mat_row_dot3(pRow, v), SIMD::load1_ps(pRow + 3));
    }
}; // struct SIMDBase
546
547using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
548using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
549using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
550
// Shorthand for the nested types of a SIMD type: Float<SIMD_T> is
// `typename SIMD_T::Float`, and likewise for the others. SIMD_T can be any
// type that exposes the corresponding nested name.
template <typename SIMD_T>
using CompareType = typename SIMD_T::CompareType;
template <typename SIMD_T>
using ScaleFactor = typename SIMD_T::ScaleFactor;
template <typename SIMD_T>
using RoundMode = typename SIMD_T::RoundMode;
template <typename SIMD_T>
using Float = typename SIMD_T::Float;
template <typename SIMD_T>
using Double = typename SIMD_T::Double;
template <typename SIMD_T>
using Integer = typename SIMD_T::Integer;
template <typename SIMD_T>
using Vec4 = typename SIMD_T::Vec4;
template <typename SIMD_T>
using Mask = typename SIMD_T::Mask;
567
568