1/****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 ****************************************************************************/
23
24#ifndef __SWR_INTRIN_H__
25#define __SWR_INTRIN_H__
26
27#include "os.h"
28
// Fall back to KNOB_ARCH for SIMD_ARCH when nothing has defined it yet.
#ifndef SIMD_ARCH
#define SIMD_ARCH KNOB_ARCH
#endif
32
33#include "simdlib_types.hpp"
34
35typedef SIMDImpl::SIMD128Impl::Float   simd4scalar;
36typedef SIMDImpl::SIMD128Impl::Double  simd4scalard;
37typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
38typedef SIMDImpl::SIMD128Impl::Vec4    simd4vector;
39typedef SIMDImpl::SIMD128Impl::Mask    simd4mask;
40
41typedef SIMDImpl::SIMD256Impl::Float   simd8scalar;
42typedef SIMDImpl::SIMD256Impl::Double  simd8scalard;
43typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
44typedef SIMDImpl::SIMD256Impl::Vec4    simd8vector;
45typedef SIMDImpl::SIMD256Impl::Mask    simd8mask;
46
47typedef SIMDImpl::SIMD512Impl::Float   simd16scalar;
48typedef SIMDImpl::SIMD512Impl::Double  simd16scalard;
49typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
50typedef SIMDImpl::SIMD512Impl::Vec4    simd16vector;
51typedef SIMDImpl::SIMD512Impl::Mask    simd16mask;
52
53#if KNOB_SIMD_WIDTH == 8
54typedef simd8scalar  simdscalar;
55typedef simd8scalard simdscalard;
56typedef simd8scalari simdscalari;
57typedef simd8vector  simdvector;
58typedef simd8mask    simdmask;
59#else
60#error Unsupported vector width
61#endif
62
63INLINE
64UINT pdep_u32(UINT a, UINT mask)
65{
66#if KNOB_ARCH >= KNOB_ARCH_AVX2
67    return _pdep_u32(a, mask);
68#else
69    UINT result = 0;
70
71    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
72    // using bsf instead of funky loop
73    DWORD maskIndex;
74    while (_BitScanForward(&maskIndex, mask))
75    {
76        // 1. isolate lowest set bit of mask
77        const UINT lowest = 1 << maskIndex;
78
79        // 2. populate LSB from src
80        const UINT LSB = (UINT)((int)(a << 31) >> 31);
81
82        // 3. copy bit from mask
83        result |= LSB & lowest;
84
85        // 4. clear lowest bit
86        mask &= ~lowest;
87
88        // 5. prepare for next iteration
89        a >>= 1;
90    }
91
92    return result;
93#endif
94}
95
96INLINE
97UINT pext_u32(UINT a, UINT mask)
98{
99#if KNOB_ARCH >= KNOB_ARCH_AVX2
100    return _pext_u32(a, mask);
101#else
102    UINT     result = 0;
103    DWORD    maskIndex;
104    uint32_t currentBit = 0;
105    while (_BitScanForward(&maskIndex, mask))
106    {
107        // 1. isolate lowest set bit of mask
108        const UINT lowest = 1 << maskIndex;
109
110        // 2. copy bit from mask
111        result |= ((a & lowest) > 0) << currentBit++;
112
113        // 3. clear lowest bit
114        mask &= ~lowest;
115    }
116    return result;
117#endif
118}
119
120#endif //__SWR_INTRIN_H__
121