1/****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23
24#ifndef __SWR_SIMDINTRIN_H__
25#define __SWR_SIMDINTRIN_H__
26
27#include "common/intrin.h"
28#include "common/simdlib.hpp"
29
30#if KNOB_SIMD_WIDTH == 8
31typedef SIMD256 SIMD;
32#else
33#error Unsupported vector width
34#endif // KNOB_SIMD16_WIDTH == 16
35
// Aliases for operations provided by SIMD128 (used by the simd4scalar helpers in this file).
36#define _simd128_maskstore_ps SIMD128::maskstore_ps
37#define _simd128_fmadd_ps SIMD128::fmadd_ps
38
// ps-suffixed wrappers: every _simd_* name below forwards to the member of SIMD with the
// matching name (except _simd_load1_ps, which maps onto SIMD::broadcast_ss).
// Function-like macros (blend, cmp, round) hand their immediate operand to the
// implementation as a template argument; cmp and round first convert it to
// SIMD::CompareType / SIMD::RoundMode.
// Each name is defined exactly once in this list.
#define _simd_load_ps SIMD::load_ps
#define _simd_load1_ps SIMD::broadcast_ss
#define _simd_loadu_ps SIMD::loadu_ps
#define _simd_setzero_ps SIMD::setzero_ps
#define _simd_set1_ps SIMD::set1_ps
#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b)
#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b)
#define _simd_blendv_ps SIMD::blendv_ps
#define _simd_store_ps SIMD::store_ps
#define _simd_mul_ps SIMD::mul_ps
#define _simd_add_ps SIMD::add_ps
#define _simd_sub_ps SIMD::sub_ps
#define _simd_rsqrt_ps SIMD::rsqrt_ps
#define _simd_min_ps SIMD::min_ps
#define _simd_max_ps SIMD::max_ps
#define _simd_movemask_ps SIMD::movemask_ps
#define _simd_cvtps_epi32 SIMD::cvtps_epi32
#define _simd_cvttps_epi32 SIMD::cvttps_epi32
#define _simd_cvtepi32_ps SIMD::cvtepi32_ps
#define _simd_cmplt_ps SIMD::cmplt_ps
#define _simd_cmpgt_ps SIMD::cmpgt_ps
#define _simd_cmpneq_ps SIMD::cmpneq_ps
#define _simd_cmpeq_ps SIMD::cmpeq_ps
#define _simd_cmpge_ps SIMD::cmpge_ps
#define _simd_cmple_ps SIMD::cmple_ps
#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
#define _simd_and_ps SIMD::and_ps
#define _simd_or_ps SIMD::or_ps
#define _simd_rcp_ps SIMD::rcp_ps
#define _simd_div_ps SIMD::div_ps
#define _simd_castsi_ps SIMD::castsi_ps
#define _simd_castps_pd SIMD::castps_pd
#define _simd_castpd_ps SIMD::castpd_ps
#define _simd_andnot_ps SIMD::andnot_ps
#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a)
// The argument is reinterpreted as a pointer to SIMD128::Float before being broadcast.
#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a))
#define _simd_stream_ps SIMD::stream_ps
77
// Aliases for the pd-suffixed members of SIMD.
78#define _simd_movemask_pd SIMD::movemask_pd
79#define _simd_castsi_pd SIMD::castsi_pd
80
// Aliases for the epi/epu/si-suffixed members of SIMD, plus the permute and
// variable-shift (srlv/sllv) members. Every name forwards to the SIMD member of the
// same name.
81#define _simd_mul_epi32 SIMD::mul_epi32
82#define _simd_mullo_epi32 SIMD::mullo_epi32
83#define _simd_sub_epi32 SIMD::sub_epi32
84#define _simd_sub_epi64 SIMD::sub_epi64
85#define _simd_min_epi32 SIMD::min_epi32
86#define _simd_min_epu32 SIMD::min_epu32
87#define _simd_max_epi32 SIMD::max_epi32
88#define _simd_max_epu32 SIMD::max_epu32
89#define _simd_add_epi32 SIMD::add_epi32
90#define _simd_and_si SIMD::and_si
91#define _simd_andnot_si SIMD::andnot_si
92#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32
93#define _simd_cmplt_epi32 SIMD::cmplt_epi32
94#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32
95#define _simd_or_si SIMD::or_si
96#define _simd_xor_si SIMD::xor_si
97#define _simd_castps_si SIMD::castps_si
98#define _simd_adds_epu8 SIMD::adds_epu8
99#define _simd_subs_epu8 SIMD::subs_epu8
100#define _simd_add_epi8 SIMD::add_epi8
101#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64
102#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64
103#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8
104#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8
105#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16
106#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16
107#define _simd_movemask_epi8 SIMD::movemask_epi8
// _simd_permute_ps_i passes its control value i as a template argument, while
// _simd_permute_ps names SIMD::permute_ps directly so the caller supplies all arguments.
108#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a)
109#define _simd_permute_ps SIMD::permute_ps
110#define _simd_permute_epi32 SIMD::permute_epi32
111#define _simd_srlv_epi32 SIMD::srlv_epi32
112#define _simd_sllv_epi32 SIMD::sllv_epi32
113
// Aliases for the unpacklo/unpackhi members of SIMD at the epi8, epi16, epi32 and epi64
// suffixes.
114#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8
115#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8
116#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16
117#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16
118#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32
119#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32
120#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64
121#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64
122
// Immediate-shift wrappers: i is forwarded as a template argument, so it has to be a
// compile-time constant at the call site.
123#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a)
124#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a)
125#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a)
126#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a)
127
// Aliases for SIMD::fmadd_ps / SIMD::fmsub_ps and SIMD::shuffle_epi8.
// vplaneps() in this file uses _simd_fmadd_ps(a, b, c) to evaluate a * b + c.
128#define _simd_fmadd_ps SIMD::fmadd_ps
129#define _simd_fmsub_ps SIMD::fmsub_ps
130#define _simd_shuffle_epi8 SIMD::shuffle_epi8
131
// Gather wrappers: the scale argument s is converted to SIMD::ScaleFactor and passed as a
// template argument; the remaining arguments are forwarded unchanged and in order.
132#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
133#define _simd_mask_i32gather_ps(r, p, o, m, s) \
134    SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
135#define _simd_abs_epi32 SIMD::abs_epi32
136
// Aliases for the cvtepu* members of SIMD.
137#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16
138#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32
139#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32
140#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64
141#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64
142
// Aliases for the packus/packs members of SIMD.
143#define _simd_packus_epi16 SIMD::packus_epi16
144#define _simd_packs_epi16 SIMD::packs_epi16
145#define _simd_packus_epi32 SIMD::packus_epi32
146#define _simd_packs_epi32 SIMD::packs_epi32
147
// Unpack, 128-bit-half insert/extract/permute, shuffle, set, load/store and test wrappers.
// Every name forwards to the member of SIMD with the matching name. The function-like
// macros pass their trailing immediate (i / imm8) as a template argument.
// _simd_cvttps_epi32 and _simd_sub_ps are intentionally not repeated here: they are
// already defined in the ps-suffixed wrapper list earlier in this header.
#define _simd_unpacklo_ps SIMD::unpacklo_ps
#define _simd_unpackhi_ps SIMD::unpackhi_ps
#define _simd_unpacklo_pd SIMD::unpacklo_pd
#define _simd_unpackhi_pd SIMD::unpackhi_pd
#define _simd_insertf128_ps SIMD::insertf128_ps
#define _simd_insertf128_pd SIMD::insertf128_pd
#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b)
#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a)
#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a)
#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a)
#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b)
#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b)
#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b)
#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b)
#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b)
#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b)
#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b)
#define _simd_set1_epi32 SIMD::set1_epi32
#define _simd_set_epi32 SIMD::set_epi32
#define _simd_set_ps SIMD::set_ps
#define _simd_set1_epi8 SIMD::set1_epi8
#define _simd_setzero_si SIMD::setzero_si
#define _simd_store_si SIMD::store_si
#define _simd_broadcast_ss SIMD::broadcast_ss
#define _simd_maskstore_ps SIMD::maskstore_ps
#define _simd_load_si SIMD::load_si
#define _simd_loadu_si SIMD::loadu_si
#define _simd_testz_ps SIMD::testz_ps
#define _simd_testz_si SIMD::testz_si
#define _simd_xor_ps SIMD::xor_ps
180
// Aliases for SIMD::loadu2_si / SIMD::storeu2_si.
181#define _simd_loadu2_si SIMD::loadu2_si
182#define _simd_storeu2_si SIMD::storeu2_si
183
// Aliases for SIMD::blendv_epi32 and SIMD::vmask_ps.
184#define _simd_blendv_epi32 SIMD::blendv_epi32
185#define _simd_vmask_ps SIMD::vmask_ps
186
187template <int mask>
188SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b)
189{
190    return SIMD128::castps_si(
191        SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
192}
193
194SIMDINLINE
195void _simd_mov(simdscalar& r, unsigned int rlane, simdscalar& s, unsigned int slane)
196{
197    OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
198    SIMD256::store_ps(rArray, r);
199    SIMD256::store_ps(sArray, s);
200    rArray[rlane] = sArray[slane];
201    r             = SIMD256::load_ps(rArray);
202}
203
204// Populates a simdvector from a vector, so p = xyzw becomes xxxx yyyy zzzz wwww.
// Alias for SIMD::vec4_load1_ps.
205#define _simdvec_load_ps SIMD::vec4_load1_ps
206
207SIMDINLINE
208void _simdvec_mov(simdvector& r, const simdscalar& s)
209{
210    SIMD::vec4_set1_vps(r, s);
211}
212
213SIMDINLINE
214void _simdvec_mov(simdvector& r, const simdvector& v)
215{
216    r = v;
217}
218
// NOTE: this overload is compiled out by the #if 0 guard below.
219#if 0
220// Moves one lane (slane) of the source simdvector into lane rlane of the dest simdvector.
// The move is applied to components 0..3 one at a time through _simd_mov.
221SIMDINLINE
222void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
223{
224    _simd_mov(r[0], rlane, s[0], slane);
225    _simd_mov(r[1], rlane, s[1], slane);
226    _simd_mov(r[2], rlane, s[2], slane);
227    _simd_mov(r[3], rlane, s[3], slane);
228}
229
230#endif
231
// simdvector helpers: each _simdvec_* name is an alias for the vec4_* member of SIMD with
// the matching suffix.
232#define _simdvec_dp3_ps SIMD::vec4_dp3_ps
233#define _simdvec_dp4_ps SIMD::vec4_dp4_ps
234#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps
235#define _simdvec_normalize_ps SIMD::vec4_normalize_ps
236#define _simdvec_mul_ps SIMD::vec4_mul_ps
237#define _simdvec_add_ps SIMD::vec4_add_ps
238#define _simdvec_min_ps SIMD::vec4_min_ps
239#define _simdvec_max_ps SIMD::vec4_max_ps
// Matrix * vector helpers: aliases for the mat*_multiply members of SIMD.
240#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply
241#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply
242#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply
243#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply
244
245//////////////////////////////////////////////////////////////////////////
246/// @brief Compute plane equation vA * vX + vB * vY + vC
247SIMDINLINE simdscalar vplaneps(simdscalar const& vA,
248                               simdscalar const& vB,
249                               simdscalar const& vC,
250                               simdscalar const& vX,
251                               simdscalar const& vY)
252{
253    simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
254    vOut            = _simd_fmadd_ps(vB, vY, vOut);
255    return vOut;
256}
257
258//////////////////////////////////////////////////////////////////////////
259/// @brief Compute plane equation vA * vX + vB * vY + vC
260SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA,
261                                simd4scalar const& vB,
262                                simd4scalar const& vC,
263                                simd4scalar const& vX,
264                                simd4scalar const& vY)
265{
266    simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
267    vOut             = _simd128_fmadd_ps(vB, vY, vOut);
268    return vOut;
269}
270
271//////////////////////////////////////////////////////////////////////////
272/// @brief Interpolates a single component.
273/// @param vI - barycentric I
274/// @param vJ - barycentric J
275/// @param pInterpBuffer - pointer to attribute barycentric coeffs
276template <UINT Attrib, UINT Comp, UINT numComponents = 4>
277static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI,
278                                                  simdscalar const& vJ,
279                                                  const float*      pInterpBuffer)
280{
281    const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
282    const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
283    const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
284
285    simdscalar vA = _simd_broadcast_ss(pInterpA);
286    simdscalar vB = _simd_broadcast_ss(pInterpB);
287    simdscalar vC = _simd_broadcast_ss(pInterpC);
288
289    simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
290    vC            = _simd_mul_ps(vk, vC);
291
292    return vplaneps(vA, vB, vC, vI, vJ);
293}
294
295//////////////////////////////////////////////////////////////////////////
296/// @brief Interpolates a single component (flat shade).
297/// @param pInterpBuffer - pointer to attribute barycentric coeffs
298template <UINT Attrib, UINT Comp, UINT numComponents = 4>
299static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer)
300{
301    const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
302
303    simdscalar vA = _simd_broadcast_ss(pInterpA);
304
305    return vA;
306}
307
308//////////////////////////////////////////////////////////////////////////
309/// @brief Interpolates a single component (flat shade).
310/// @param pInterpBuffer - pointer to attribute barycentric coeffs
311template <UINT Attrib, UINT Comp, UINT numComponents = 4>
312static SIMDINLINE simdscalari InterpolateComponentFlatInt(const uint32_t* pInterpBuffer)
313{
314    const uint32_t interpA = pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
315
316    simdscalari vA = _simd_set1_epi32(interpA);
317
318    return vA;
319}
320
321//////////////////////////////////////////////////////////////////////////
322/// @brief Interpolates a single component.
323/// @param vI - barycentric I
324/// @param vJ - barycentric J
325/// @param pInterpBuffer - pointer to attribute barycentric coeffs
326template <UINT Attrib, UINT Comp, UINT numComponents = 4>
327static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI,
328                                                   simd4scalar const& vJ,
329                                                   const float*       pInterpBuffer)
330{
331    const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
332    const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
333    const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
334
335    simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
336    simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
337    simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
338
339    simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
340    vC             = SIMD128::mul_ps(vk, vC);
341
342    return vplaneps(vA, vB, vC, vI, vJ);
343}
344
345static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a)
346{
347    simd4scalari ai = SIMD128::castps_si(a);
348    return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
349}
350
351static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a)
352{
353    simdscalari ai = _simd_castps_si(a);
354    return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
355}
356
357#include "simd16intrin.h"
358
359#endif //__SWR_SIMDINTRIN_H__
360