1/****************************************************************************
2 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23#pragma once
24#if 0
25//===========================================================================
26// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
27//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
    // Reference declaration of the static SIMD wrapper interface.
    //
    // This struct is compiled out by the surrounding "#if 0" and exists only
    // as documentation: it lists every operation, its signature, and a
    // scalar-style description of what is computed per element.  All
    // operations are static member functions; the struct carries no state.

    //=======================================================================
    // SIMD Types
    //
    // These aliases are examples for the 256-bit width.  Implementations of
    // other widths use different base types with this same naming.
    //=======================================================================
    using Float     = __m256;  // Packed single-precision float vector
    using Double    = __m256d; // Packed double-precision float vector
    using Integer   = __m256i; // Packed integer vector (mutable element widths)
    using Mask      = uint8_t; // Integer representing mask bits

    //=======================================================================
    // Standard interface
    // (available in both SIMD256 and SIMD16 widths)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Single precision floating point arithmetic operations
    //-----------------------------------------------------------------------
    static Float    add_ps(Float a, Float b);               // return a + b
    static Float    div_ps(Float a, Float b);               // return a / b
    static Float    fmadd_ps(Float a, Float b, Float c);    // return (a * b) + c
    static Float    fmsub_ps(Float a, Float b, Float c);    // return (a * b) - c
    static Float    max_ps(Float a, Float b);               // return (a > b) ? a : b
    static Float    min_ps(Float a, Float b);               // return (a < b) ? a : b
    static Float    mul_ps(Float a, Float b);               // return a * b
    static Float    rcp_ps(Float a);                        // return 1.0f / a        (approximation)
    static Float    rsqrt_ps(Float a);                      // return 1.0f / sqrt(a)  (approximation)
    static Float    sub_ps(Float a, Float b);               // return a - b

    // Rounding control for round_ps.
    //
    // A mode is a rounding direction (low bits) combined with an exception
    // behavior (bit 3).  The named combinations at the bottom are the values
    // intended to be passed as the template argument.
    enum class RoundMode
    {
        // Rounding direction
        TO_NEAREST_INT  = 0x00, // Round to nearest integer (ties round to the even integer)
        TO_NEG_INF      = 0x01, // Round to negative infinity
        TO_POS_INF      = 0x02, // Round to positive infinity
        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register

        // Exception behavior
        RAISE_EXC       = 0x00, // Raise exception on overflow
        NO_EXC          = 0x08, // Suppress exceptions

        // Named combinations (direction | exception behavior).  Inside the
        // enumerator list the enumerators have the underlying type (int), so
        // they can be combined directly.
        NINT            = TO_NEAREST_INT | RAISE_EXC,
        NINT_NOEXC      = TO_NEAREST_INT | NO_EXC,
        FLOOR           = TO_NEG_INF     | RAISE_EXC,
        FLOOR_NOEXC     = TO_NEG_INF     | NO_EXC,
        CEIL            = TO_POS_INF     | RAISE_EXC,
        CEIL_NOEXC      = TO_POS_INF     | NO_EXC,
        TRUNC           = TO_ZERO        | RAISE_EXC,
        TRUNC_NOEXC     = TO_ZERO        | NO_EXC,
        RINT            = CUR_DIRECTION  | RAISE_EXC,
        NEARBYINT       = CUR_DIRECTION  | NO_EXC,
    };

    // return round_func(a)
    //
    // round_func is chosen by the RMT template parameter.  See the
    // documentation for the RoundMode enumeration above.
    template <RoundMode RMT>
    static Float    round_ps(Float a);


    //-----------------------------------------------------------------------
    // Integer (various width) arithmetic operations
    //-----------------------------------------------------------------------
    static Integer  abs_epi32(Integer a);               // return absolute_value(a) (int32)
    static Integer  add_epi32(Integer a, Integer b);    // return a + b (int32)
    static Integer  add_epi8(Integer a, Integer b);     // return a + b (int8)
    static Integer  adds_epu8(Integer a, Integer b);    // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    static Integer  max_epi32(Integer a, Integer b);    // return (a > b) ? a : b (int32)
    static Integer  max_epu32(Integer a, Integer b);    // return (a > b) ? a : b (uint32)
    static Integer  min_epi32(Integer a, Integer b);    // return (a < b) ? a : b (int32)
    static Integer  min_epu32(Integer a, Integer b);    // return (a < b) ? a : b (uint32)

    // return a * b (widening: low int32 of each 64-bit element --> int64)
    //
    // See documentation for _mm256_mul_epi32 and _mm512_mul_epi32.
    static Integer  mul_epi32(Integer a, Integer b);

    // return (a * b) & 0xFFFFFFFF
    //
    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
    // and store the low 32 bits of the intermediate integers in dst.
    static Integer  mullo_epi32(Integer a, Integer b);

    static Integer  sub_epi32(Integer a, Integer b);    // return a - b (int32)
    static Integer  sub_epi64(Integer a, Integer b);    // return a - b (int64)
    static Integer  subs_epu8(Integer a, Integer b);    // return (b > a) ? 0 : (a - b) (uint8)

    //-----------------------------------------------------------------------
    // Logical operations
    //-----------------------------------------------------------------------
    static Float    and_ps(Float a, Float b);           // return a & b       (float treated as int)
    static Integer  and_si(Integer a, Integer b);       // return a & b       (int)
    static Float    andnot_ps(Float a, Float b);        // return (~a) & b    (float treated as int)
    static Integer  andnot_si(Integer a, Integer b);    // return (~a) & b    (int)
    static Float    or_ps(Float a, Float b);            // return a | b       (float treated as int)
    static Integer  or_si(Integer a, Integer b);        // return a | b       (int)
    static Float    xor_ps(Float a, Float b);           // return a ^ b       (float treated as int)
    static Integer  xor_si(Integer a, Integer b);       // return a ^ b       (int)

    //-----------------------------------------------------------------------
    // Shift operations
    //
    // The *i* variants take the shift count as a compile-time immediate
    // (ImmT); the *v* variants take a per-element count in vector b.
    //-----------------------------------------------------------------------
    template<int ImmT>
    static Integer  slli_epi32(Integer a);              // return a << ImmT
    static Integer  sllv_epi32(Integer a, Integer b);   // return a << b
    template<int ImmT>
    static Integer  srai_epi32(Integer a);              // return a >> ImmT   (int32)
    template<int ImmT>
    static Integer  srli_epi32(Integer a);              // return a >> ImmT   (uint32)

    // for each 128-bit lane: return a >> (ImmT*8) (uint), i.e. a byte shift
    template<int ImmT>
    static Integer  srli_si(Integer a);

    // same as srli_si, but with Float cast to int
    template<int ImmT>
    static Float    srlisi_ps(Float a);

    static Integer  srlv_epi32(Integer a, Integer b);   // return a >> b      (uint32)

    //-----------------------------------------------------------------------
    // Conversion operations
    //
    // cast* reinterpret the bits without changing them; cvt* convert values.
    //-----------------------------------------------------------------------
    static Float    castpd_ps(Double a);                // return *(Float*)(&a)
    static Integer  castps_si(Float a);                 // return *(Integer*)(&a)
    static Double   castsi_pd(Integer a);               // return *(Double*)(&a)
    static Double   castps_pd(Float a);                 // return *(Double*)(&a)
    static Float    castsi_ps(Integer a);               // return *(Float*)(&a)
    static Float    cvtepi32_ps(Integer a);             // return (float)a    (int32 --> float)
    static Integer  cvtepu8_epi16(Integer a);           // return (int16)a    (uint8 --> int16)
    static Integer  cvtepu8_epi32(Integer a);           // return (int32)a    (uint8 --> int32)
    static Integer  cvtepu16_epi32(Integer a);          // return (int32)a    (uint16 --> int32)
    static Integer  cvtepu16_epi64(Integer a);          // return (int64)a    (uint16 --> int64)
    static Integer  cvtepu32_epi64(Integer a);          // return (int64)a    (uint32 --> int64)
    static Integer  cvtps_epi32(Float a);               // return (int32)a    (float --> int32)
    static Integer  cvttps_epi32(Float a);              // return (int32)a    (rnd_to_zero(float) --> int32)

    //-----------------------------------------------------------------------
    // Comparison operations
    //-----------------------------------------------------------------------

    // Comparison types used with cmp_ps:
    //   - ordered comparisons are always false if either operand is NaN
    //   - unordered comparisons are always true if either operand is NaN
    //   - signaling comparisons raise an exception if either operand is NaN
    //   - non-signaling comparisons will never raise an exception
    //
    // Ordered:     return (a != NaN) && (b != NaN) && (a cmp b)
    // Unordered:   return (a == NaN) || (b == NaN) || (a cmp b)
    enum class CompareType
    {
        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
        LT_OS      = 0x01, // Less-than (ordered, signaling)
        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q    = 0x03, // Unordered (nonsignaling)
        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q      = 0x07, // Ordered (nonsignaling)
        EQ_UQ      = 0x08, // Equal (unordered, nonsignaling)
        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ     = 0x0C, // Not-equal (ordered, nonsignaling)
        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ    = 0x0F, // True (unordered, nonsignaling)
        EQ_OS      = 0x10, // Equal (ordered, signaling)
        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S    = 0x13, // Unordered (signaling)
        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S      = 0x17, // Ordered (signaling)
        EQ_US      = 0x18, // Equal (unordered, signaling)
        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS   = 0x1B, // False (ordered, signaling)
        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US    = 0x1F, // True (unordered, signaling)
    };

    // return a (CmpTypeT) b (float)
    //
    // See documentation for CompareType above for valid values for CmpTypeT.
    template<CompareType CmpTypeT>
    static Float    cmp_ps(Float a, Float b);           // return a (CmpTypeT) b (see above)
    static Float    cmpgt_ps(Float a, Float b);         // return cmp_ps<CompareType::GT_OQ>(a, b)
    static Float    cmple_ps(Float a, Float b);         // return cmp_ps<CompareType::LE_OQ>(a, b)
    static Float    cmplt_ps(Float a, Float b);         // return cmp_ps<CompareType::LT_OQ>(a, b)
    static Float    cmpneq_ps(Float a, Float b);        // return cmp_ps<CompareType::NEQ_OQ>(a, b)
    static Float    cmpeq_ps(Float a, Float b);         // return cmp_ps<CompareType::EQ_OQ>(a, b)
    static Float    cmpge_ps(Float a, Float b);         // return cmp_ps<CompareType::GE_OQ>(a, b)
    static Integer  cmpeq_epi8(Integer a, Integer b);   // return a == b (int8)
    static Integer  cmpeq_epi16(Integer a, Integer b);  // return a == b (int16)
    static Integer  cmpeq_epi32(Integer a, Integer b);  // return a == b (int32)
    static Integer  cmpeq_epi64(Integer a, Integer b);  // return a == b (int64)
    static Integer  cmpgt_epi8(Integer a, Integer b);   // return a > b (int8)
    static Integer  cmpgt_epi16(Integer a, Integer b);  // return a > b (int16)
    static Integer  cmpgt_epi32(Integer a, Integer b);  // return a > b (int32)
    static Integer  cmpgt_epi64(Integer a, Integer b);  // return a > b (int64)
    static Integer  cmplt_epi32(Integer a, Integer b);  // return a < b (int32)
    static bool     testz_ps(Float a, Float b);         // return all_lanes_zero(a & b) ? 1 : 0 (float)
    static bool     testz_si(Integer a, Integer b);     // return all_lanes_zero(a & b) ? 1 : 0 (int)

    //-----------------------------------------------------------------------
    // Blend / shuffle / permute operations
    //
    // The return type of every operation matches the element family of its
    // data operands (Float for _ps, Double for _pd, Integer for _epi*).
    //-----------------------------------------------------------------------
    template<int ImmT>
    static Float    blend_ps(Float a, Float b);                     // return ImmT ? b : a  (float)
    static Integer  blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
    static Float    blendv_ps(Float a, Float b, Float mask);        // return mask ? b : a (float)
    static Float    broadcast_ss(float const *p);                   // return *p (all elements in vector get same value)
    static Integer  packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
    static Integer  packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
    static Integer  packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
    static Integer  packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
    static Integer  permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
    static Float    permute_ps(Float a, Integer swiz);              // return a[swiz[i]] for each 32-bit lane i (float)

    // Shuffles: SwizT is the compile-time element selector; shuffle_epi8
    // takes its selector from vector b instead.
    template<int SwizT>
    static Integer  shuffle_epi32(Integer a, Integer b);
    template<int SwizT>
    static Integer  shuffle_epi64(Integer a, Integer b);
    static Integer  shuffle_epi8(Integer a, Integer b);
    template<int SwizT>
    static Double   shuffle_pd(Double a, Double b);
    template<int SwizT>
    static Float    shuffle_ps(Float a, Float b);

    // Unpack (interleave) the high / low elements of a and b.
    static Integer  unpackhi_epi16(Integer a, Integer b);
    static Integer  unpackhi_epi32(Integer a, Integer b);
    static Integer  unpackhi_epi64(Integer a, Integer b);
    static Integer  unpackhi_epi8(Integer a, Integer b);
    static Double   unpackhi_pd(Double a, Double b);
    static Float    unpackhi_ps(Float a, Float b);
    static Integer  unpacklo_epi16(Integer a, Integer b);
    static Integer  unpacklo_epi32(Integer a, Integer b);
    static Integer  unpacklo_epi64(Integer a, Integer b);
    static Integer  unpacklo_epi8(Integer a, Integer b);
    static Double   unpacklo_pd(Double a, Double b);
    static Float    unpacklo_ps(Float a, Float b);

    //-----------------------------------------------------------------------
    // Load / store operations
    //-----------------------------------------------------------------------

    // Byte multiplier applied to each gather index.  Each enumerator's value
    // is the multiplier itself, so it can be used directly as "idx * ScaleT".
    enum class ScaleFactor
    {
        SF_1 = 1,   // No scaling
        SF_2 = 2,   // Scale offset by 2
        SF_4 = 4,   // Scale offset by 4
        SF_8 = 8,   // Scale offset by 8
    };

    template<ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float    i32gather_ps(float const* p, Integer idx);  // return *(float*)(((int8*)p) + (idx * ScaleT))
    static Float    load1_ps(float const *p);                   // return *p    (broadcast 1 value to all elements)
    static Float    load_ps(float const *p);                    // return *p    (loads SIMD width elements from memory)
    static Integer  load_si(Integer const *p);                  // return *p
    static Float    loadu_ps(float const *p);                   // return *p    (same as load_ps but allows for unaligned mem)
    static Integer  loadu_si(Integer const *p);                 // return *p    (same as load_si but allows for unaligned mem)

    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
    //
    // ScaleT uses the same ScaleFactor type as i32gather_ps so that it can
    // be forwarded to it unchanged.
    template<ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);

    static void     maskstore_ps(float *p, Integer mask, Float src);    // See documentation for _mm256_maskstore_ps
    static int      movemask_epi8(Integer a);                   // See documentation for _mm256_movemask_epi8
    static int      movemask_pd(Double a);                      // See documentation for _mm256_movemask_pd
    static int      movemask_ps(Float a);                       // See documentation for _mm256_movemask_ps
    static Integer  set1_epi32(int i);                          // return i (all elements are same value)
    static Integer  set1_epi8(char i);                          // return i (all elements are same value)
    static Float    set1_ps(float f);                           // return f (all elements are same value)
    static Float    setzero_ps();                               // return 0 (float)
    static Integer  setzero_si();                               // return 0 (integer)
    static void     store_ps(float *p, Float a);                // *p = a   (stores all elements contiguously in memory)
    static void     store_si(Integer *p, Integer a);            // *p = a
    static void     stream_ps(float *p, Float a);               // *p = a   (same as store_ps, but doesn't keep memory in cache)

    //=======================================================================
    // Legacy interface (available only in SIMD256 width)
    //
    // These operate on, or in terms of, the two 128-bit halves of a 256-bit
    // vector.  ImmT is the compile-time immediate of the corresponding
    // _mm256_* intrinsic.
    //=======================================================================

    static Float    broadcast_ps(__m128 const *p);
    template<int ImmT>
    static __m128d  extractf128_pd(Double a);
    template<int ImmT>
    static __m128   extractf128_ps(Float a);
    template<int ImmT>
    static __m128i  extractf128_si(Integer a);
    template<int ImmT>
    static Double   insertf128_pd(Double a, __m128d b);
    template<int ImmT>
    static Float    insertf128_ps(Float a, __m128 b);
    template<int ImmT>
    static Integer  insertf128_si(Integer a, __m128i b);

    // Integer load of two 128-bit halves; pointer types mirror storeu2_si.
    static Integer  loadu2_si(__m128i const* phi, __m128i const* plo);
    template<int ImmT>
    static Double   permute2f128_pd(Double a, Double b);
    template<int ImmT>
    static Float    permute2f128_ps(Float a, Float b);
    template<int ImmT>
    static Integer  permute2f128_si(Integer a, Integer b);
    static Integer  set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
    static void     storeu2_si(__m128i *phi, __m128i *plo, Integer src);

    //=======================================================================
    // Advanced masking interface (currently available only in SIMD16 width)
    //=======================================================================
};
332#endif // #if 0
333