/*
 * Copyright 2015 Red Hat Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Author: Oded Gabbay <oded.gabbay@redhat.com>
 */

/**
 * @file
 * POWER8 intrinsics portability header.
 *
 * Implements a subset of SSE-style 128-bit integer intrinsics on top of
 * VMX/VSX vector operations.
 */

#ifndef U_PWR8_H_
#define U_PWR8_H_

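/* Note: this header assumes that UTIL_ARCH_LITTLE_ENDIAN (defined in Mesa's
 * "util/u_endian.h") and the ubyte/ushort/uint typedefs are already in
 * scope; include the appropriate utility headers before this one. */
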
#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN

#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))

typedef VECTOR_ALIGN_16 vector unsigned char __m128i;

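/* Union for lane-wise access to a 128-bit vector, mirroring the common SSE
 * practice of reinterpreting __m128i at different element widths. */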
typedef VECTOR_ALIGN_16 union m128i {
   __m128i m128i;
   vector signed int m128si;
   vector unsigned int m128ui;
   ubyte ub[16];
   ushort us[8];
   int i[4];
   uint ui[4];
} __m128i_union;

static inline __m128i
vec_set_epi32 (int i3, int i2, int i1, int i0)
{
   __m128i_union vdst;

#if UTIL_ARCH_LITTLE_ENDIAN
   vdst.i[0] = i0;
   vdst.i[1] = i1;
   vdst.i[2] = i2;
   vdst.i[3] = i3;
#else
   vdst.i[3] = i0;
   vdst.i[2] = i1;
   vdst.i[1] = i2;
   vdst.i[0] = i3;
#endif

   return (__m128i) vdst.m128si;
}

static inline __m128i
vec_setr_epi32 (int i0, int i1, int i2, int i3)
{
   return vec_set_epi32 (i3, i2, i1, i0);
}
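
/*
 * Usage sketch for the pair above: as with SSE2 _mm_set_epi32/_mm_setr_epi32,
 * vec_set_epi32 takes elements from the highest lane down and vec_setr_epi32
 * takes them in memory order, so these two produce identical vectors:
 *
 *    __m128i v = vec_setr_epi32 (10, 20, 30, 40);
 *    __m128i w = vec_set_epi32  (40, 30, 20, 10);
 */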

static inline __m128i
vec_unpacklo_epi32 (__m128i even, __m128i odd)
{
   static const __m128i perm_mask =
#if UTIL_ARCH_LITTLE_ENDIAN
      { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
#else
      {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
#endif

   return vec_perm (even, odd, perm_mask);
}

static inline __m128i
vec_unpackhi_epi32 (__m128i even, __m128i odd)
{
   static const __m128i perm_mask =
#if UTIL_ARCH_LITTLE_ENDIAN
      { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
#else
      {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
#endif

   return vec_perm (even, odd, perm_mask);
}

static inline __m128i
vec_unpacklo_epi64 (__m128i even, __m128i odd)
{
   static const __m128i perm_mask =
#if UTIL_ARCH_LITTLE_ENDIAN
      { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
#else
      {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
#endif

   return vec_perm (even, odd, perm_mask);
}

static inline __m128i
vec_unpackhi_epi64 (__m128i even, __m128i odd)
{
   static const __m128i perm_mask =
#if UTIL_ARCH_LITTLE_ENDIAN
      { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
#else
      {16, 17, 18, 19, 20, 21, 22, 23,  0,  1,  2,  3,  4,  5,  6,  7};
#endif

   return vec_perm (even, odd, perm_mask);
}

static inline __m128i
vec_add_epi32 (__m128i a, __m128i b)
{
   return (__m128i) vec_add ((vector signed int) a, (vector signed int) b);
}

static inline __m128i
vec_sub_epi32 (__m128i a, __m128i b)
{
   return (__m128i) vec_sub ((vector signed int) a, (vector signed int) b);
}

/* vmuluwm is a POWER8 (ISA 2.07) instruction; call this function ONLY on POWER8 and newer platforms */
static inline __m128i
vec_mullo_epi32 (__m128i a, __m128i b)
{
   __m128i v;

   __asm__(
           "vmuluwm %0, %1, %2   \n"
           : "=v" (v)
           : "v" (a), "v" (b)
           );

   return v;
}

static inline __m128i
vec_andnot_si128 (__m128i a, __m128i b)
{
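   /* SSE2 _mm_andnot_si128 computes (~a) & b, hence the swapped operands
    * to vec_andc, which computes a & ~b. */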
   return vec_andc (b, a);
}

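/*
 * Effect sketch for transpose4_epi32 below: treating a..d as the rows of a
 * 4x4 matrix of 32-bit ints, o..r receive its columns; e.g. rows {0,1,2,3}
 * {4,5,6,7} {8,9,10,11} {12,13,14,15} come out as {0,4,8,12} {1,5,9,13}
 * {2,6,10,14} {3,7,11,15}.
 */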
static inline void
transpose4_epi32(const __m128i * restrict a,
                 const __m128i * restrict b,
                 const __m128i * restrict c,
                 const __m128i * restrict d,
                 __m128i * restrict o,
                 __m128i * restrict p,
                 __m128i * restrict q,
                 __m128i * restrict r)
{
   __m128i t0 = vec_unpacklo_epi32(*a, *b);
   __m128i t1 = vec_unpacklo_epi32(*c, *d);
   __m128i t2 = vec_unpackhi_epi32(*a, *b);
   __m128i t3 = vec_unpackhi_epi32(*c, *d);

   *o = vec_unpacklo_epi64(t0, t1);
   *p = vec_unpackhi_epi64(t0, t1);
   *q = vec_unpacklo_epi64(t2, t3);
   *r = vec_unpackhi_epi64(t2, t3);
}

static inline __m128i
vec_slli_epi32 (__m128i vsrc, unsigned int count)
{
   __m128i_union vec_count;

   if (count >= 32)
      return (__m128i) vec_splats (0);
   else if (count == 0)
      return vsrc;

   /* In VMX, all shift count fields must contain the same value */
   vec_count.m128si = (vector signed int) vec_splats (count);
   return (__m128i) vec_sl ((vector signed int) vsrc, vec_count.m128ui);
}

static inline __m128i
vec_srli_epi32 (__m128i vsrc, unsigned int count)
{
   __m128i_union vec_count;

   if (count >= 32)
      return (__m128i) vec_splats (0);
   else if (count == 0)
      return vsrc;

   /* In VMX, all shift count fields must contain the same value */
   vec_count.m128si = (vector signed int) vec_splats (count);
   return (__m128i) vec_sr ((vector signed int) vsrc, vec_count.m128ui);
}

static inline __m128i
vec_srai_epi32 (__m128i vsrc, unsigned int count)
{
   __m128i_union vec_count;

   if (count >= 32)
      return (__m128i) vec_splats (0);
   else if (count == 0)
      return vsrc;

   /* In VMX, all shift count fields must contain the same value */
   vec_count.m128si = (vector signed int) vec_splats (count);
   return (__m128i) vec_sra ((vector signed int) vsrc, vec_count.m128ui);
}
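
/*
 * Example of the logical vs. arithmetic distinction in the shift helpers
 * above: with a lane holding -4 (0xFFFFFFFC), vec_srai_epi32 (v, 1) yields
 * -2 (0xFFFFFFFE, sign-filled) while vec_srli_epi32 (v, 1) yields
 * 0x7FFFFFFE (zero-filled). Note that for count >= 32 vec_srai_epi32
 * returns 0, not the sign splat that SSE2 _mm_srai_epi32 would produce.
 */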

static inline __m128i
vec_cmpeq_epi32 (__m128i a, __m128i b)
{
   return (__m128i) vec_cmpeq ((vector signed int) a, (vector signed int) b);
}

static inline __m128i
vec_loadu_si128 (const uint32_t* src)
{
   __m128i_union vsrc;

#if UTIL_ARCH_LITTLE_ENDIAN

   vsrc.m128ui = *((vector unsigned int *) src);

#else

   __m128i vmask, tmp1, tmp2;

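   /* Classic AltiVec unaligned-load idiom: load the two aligned quadwords
    * that straddle src and merge them with the permute mask from lvsl. */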
   vmask = vec_lvsl(0, src);

   tmp1 = (__m128i) vec_ld (0, src);
   tmp2 = (__m128i) vec_ld (15, src);
   vsrc.m128ui = (vector unsigned int) vec_perm (tmp1, tmp2, vmask);

#endif

   return vsrc.m128i;
}

static inline __m128i
vec_load_si128 (const uint32_t* src)
{
   __m128i_union vsrc;

   vsrc.m128ui = *((vector unsigned int *) src);

   return vsrc.m128i;
}

static inline void
vec_store_si128 (uint32_t* dest, __m128i vdata)
{
   vec_st ((vector unsigned int) vdata, 0, dest);
}

/* vgbbd is a POWER8 (ISA 2.07) instruction; call this function ONLY on POWER8 and newer platforms */
static inline int
vec_movemask_epi8 (__m128i vsrc)
{
   __m128i_union vtemp;
   int result;

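   /* vec_vgbbd transposes the 8x8 bit matrix within each doubleword, so the
    * sign bits of the sixteen input bytes are gathered into two result
    * bytes, one per doubleword. */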
   vtemp.m128i = vec_vgbbd(vsrc);

#if UTIL_ARCH_LITTLE_ENDIAN
   result = vtemp.ub[15] << 8 | vtemp.ub[7];
#else
   result = vtemp.ub[0] << 8 | vtemp.ub[8];
#endif

   return result;
}

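/* The two helpers below narrow with signed saturation; the operand swap on
 * big-endian preserves the low/high lane order of the SSE2 counterparts
 * _mm_packs_epi16 and _mm_packs_epi32. */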
static inline __m128i
vec_packs_epi16 (__m128i a, __m128i b)
{
#if UTIL_ARCH_LITTLE_ENDIAN
   return (__m128i) vec_packs ((vector signed short) a,
                               (vector signed short) b);
#else
   return (__m128i) vec_packs ((vector signed short) b,
                               (vector signed short) a);
#endif
}

static inline __m128i
vec_packs_epi32 (__m128i a, __m128i b)
{
#if UTIL_ARCH_LITTLE_ENDIAN
   return (__m128i) vec_packs ((vector signed int) a, (vector signed int) b);
#else
   return (__m128i) vec_packs ((vector signed int) b, (vector signed int) a);
#endif
}

#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */

#endif /* U_PWR8_H_ */