/*
 * Copyright 2015 Red Hat Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Author: Oded Gabbay <oded.gabbay@redhat.com>
 */

/**
 * @file
 * POWER8 intrinsics portability header.
 *
 */

#ifndef U_PWR8_H_
#define U_PWR8_H_

#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN

#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))

typedef VECTOR_ALIGN_16 vector unsigned char __m128i;

typedef VECTOR_ALIGN_16 union m128i {
   __m128i m128i;
   vector signed int m128si;
   vector unsigned int m128ui;
   ubyte ub[16];
   ushort us[8];
   int i[4];
   uint ui[4];
} __m128i_union;

static inline __m128i
vec_set_epi32 (int i3, int i2, int i1, int i0)
{
   __m128i_union vdst;

#if UTIL_ARCH_LITTLE_ENDIAN
   vdst.i[0] = i0;
   vdst.i[1] = i1;
   vdst.i[2] = i2;
   vdst.i[3] = i3;
#else
   vdst.i[3] = i0;
   vdst.i[2] = i1;
   vdst.i[1] = i2;
   vdst.i[0] = i3;
#endif

   return (__m128i) vdst.m128si;
}

static inline __m128i
vec_setr_epi32 (int i0, int i1, int i2, int i3)
{
   return vec_set_epi32 (i3, i2, i1, i0);
}

static inline __m128i
vec_unpacklo_epi32 (__m128i even, __m128i odd)
{
   static const __m128i perm_mask =
#if UTIL_ARCH_LITTLE_ENDIAN
      { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
#else
      {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
#endif

   return vec_perm (even, odd, perm_mask);
}

static inline __m128i
vec_unpackhi_epi32 (__m128i even, __m128i odd)
{
   static const __m128i perm_mask =
#if UTIL_ARCH_LITTLE_ENDIAN
      { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
#else
      {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
#endif

   return vec_perm (even, odd, perm_mask);
}

static inline __m128i
vec_unpacklo_epi64 (__m128i even, __m128i odd)
{
   static const __m128i perm_mask =
#if UTIL_ARCH_LITTLE_ENDIAN
      { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
#else
      {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
#endif

   return vec_perm (even, odd, perm_mask);
}

static inline __m128i
vec_unpackhi_epi64 (__m128i even, __m128i odd)
{
   static const __m128i perm_mask =
#if UTIL_ARCH_LITTLE_ENDIAN
      { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
#else
      {16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7};
#endif

   return vec_perm (even, odd, perm_mask);
}

static inline __m128i
vec_add_epi32 (__m128i a, __m128i b)
{
   return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
}

static inline __m128i
vec_sub_epi32 (__m128i a, __m128i b)
{
   return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
}

/* Call this function ONLY on POWER8 and newer platforms */
static inline __m128i
vec_mullo_epi32 (__m128i a, __m128i b)
{
   __m128i v;

   __asm__(
           "vmuluwm %0, %1, %2   \n"
           : "=v" (v)
           : "v" (a), "v" (b)
           );

   return v;
}

static inline __m128i
vec_andnot_si128 (__m128i a, __m128i b)
{
   return vec_andc (b, a);
}

static inline void
transpose4_epi32(const __m128i * restrict a,
                 const __m128i * restrict b,
                 const __m128i * restrict c,
                 const __m128i * restrict d,
                 __m128i * restrict o,
                 __m128i * restrict p,
                 __m128i * restrict q,
                 __m128i * restrict r)
{
   __m128i t0 = vec_unpacklo_epi32(*a, *b);
   __m128i t1 = vec_unpacklo_epi32(*c, *d);
   __m128i t2 = vec_unpackhi_epi32(*a, *b);
   __m128i t3 = vec_unpackhi_epi32(*c, *d);

   *o = vec_unpacklo_epi64(t0, t1);
   *p = vec_unpackhi_epi64(t0, t1);
   *q = vec_unpacklo_epi64(t2, t3);
   *r = vec_unpackhi_epi64(t2, t3);
}
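
/*
 * Usage sketch (illustrative only, not part of the original header): how the
 * transpose helper above combines the unpack wrappers to turn four row
 * vectors into four column vectors, assuming a POWER8 little-endian build
 * where this header is active.  The names r0..r3 and c0..c3 are hypothetical.
 *
 *    __m128i r0 = vec_setr_epi32 ( 0,  1,  2,  3);
 *    __m128i r1 = vec_setr_epi32 ( 4,  5,  6,  7);
 *    __m128i r2 = vec_setr_epi32 ( 8,  9, 10, 11);
 *    __m128i r3 = vec_setr_epi32 (12, 13, 14, 15);
 *    __m128i c0, c1, c2, c3;
 *
 *    transpose4_epi32 (&r0, &r1, &r2, &r3, &c0, &c1, &c2, &c3);
 *
 * Afterwards c0 holds {0, 4, 8, 12}, c1 holds {1, 5, 9, 13}, and so on,
 * matching the usual SSE2-style 4x4 integer transpose.
 */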

static inline __m128i
vec_slli_epi32 (__m128i vsrc, unsigned int count)
{
   __m128i_union vec_count;

   if (count >= 32)
      return (__m128i) vec_splats (0);
   else if (count == 0)
      return vsrc;

   /* In VMX, all shift count fields must contain the same value */
   vec_count.m128si = (vector signed int) vec_splats (count);
   return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
}

static inline __m128i
vec_srli_epi32 (__m128i vsrc, unsigned int count)
{
   __m128i_union vec_count;

   if (count >= 32)
      return (__m128i) vec_splats (0);
   else if (count == 0)
      return vsrc;

   /* In VMX, all shift count fields must contain the same value */
   vec_count.m128si = (vector signed int) vec_splats (count);
   return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
}

static inline __m128i
vec_srai_epi32 (__m128i vsrc, unsigned int count)
{
   __m128i_union vec_count;

   if (count >= 32)
      return (__m128i) vec_splats (0);
   else if (count == 0)
      return vsrc;

   /* In VMX, all shift count fields must contain the same value */
   vec_count.m128si = (vector signed int) vec_splats (count);
   return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
}

static inline __m128i
vec_cmpeq_epi32 (__m128i a, __m128i b)
{
   return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
}

static inline __m128i
vec_loadu_si128 (const uint32_t* src)
{
   __m128i_union vsrc;

#if UTIL_ARCH_LITTLE_ENDIAN

   vsrc.m128ui = *((vector unsigned int *) src);

#else

   __m128i vmask, tmp1, tmp2;

   vmask = vec_lvsl(0, src);

   tmp1 = (__m128i) vec_ld (0, src);
   tmp2 = (__m128i) vec_ld (15, src);
   vsrc.m128ui = (vector unsigned int) vec_perm (tmp1, tmp2, vmask);

#endif

   return vsrc.m128i;
}

static inline __m128i
vec_load_si128 (const uint32_t* src)
{
   __m128i_union vsrc;

   vsrc.m128ui = *((vector unsigned int *) src);

   return vsrc.m128i;
}

static inline void
vec_store_si128 (uint32_t* dest, __m128i vdata)
{
   vec_st ((vector unsigned int) vdata, 0, dest);
}

/* Call this function ONLY on POWER8 and newer platforms */
static inline int
vec_movemask_epi8 (__m128i vsrc)
{
   __m128i_union vtemp;
   int result;

   vtemp.m128i = vec_vgbbd(vsrc);

#if UTIL_ARCH_LITTLE_ENDIAN
   result = vtemp.ub[15] << 8 | vtemp.ub[7];
#else
   result = vtemp.ub[0] << 8 | vtemp.ub[8];
#endif

   return result;
}

static inline __m128i
vec_packs_epi16 (__m128i a, __m128i b)
{
#if UTIL_ARCH_LITTLE_ENDIAN
   return (__m128i) vec_packs ((vector signed short) a,
                               (vector signed short) b);
#else
   return (__m128i) vec_packs ((vector signed short) b,
                               (vector signed short) a);
#endif
}

static inline __m128i
vec_packs_epi32 (__m128i a, __m128i b)
{
#if UTIL_ARCH_LITTLE_ENDIAN
   return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
#else
   return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
#endif
}

#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */

#endif /* U_PWR8_H_ */