/* Copyright (C) 2006-2019 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _VMX2SPU_H_
#define _VMX2SPU_H_	1

#ifdef __cplusplus

#ifdef __SPU__

#include <spu_intrinsics.h>
#include <vec_types.h>

/* This file maps generic VMX intrinsics and predicates to the SPU using
 * overloaded C++ functions.
 */
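
/* A minimal usage sketch (scaled_sum is a hypothetical example, not part
 * of this header): VMX code written against the generic intrinsics, e.g.
 *
 *   static inline vec_float4 scaled_sum(vec_float4 a, vec_float4 b,
 *                                       vec_float4 s)
 *   {
 *     return vec_madd(s, vec_add(a, b), vec_abs(a));
 *   }
 *
 * resolves through the overloads below (here to spu_madd, spu_add, and
 * the vec_abs inline) and so compiles for the SPU without source changes.
 */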

/************************************************************************
 *                        INTRINSICS
 ************************************************************************/

/* vec_abs (vector absolute value)
 * =======
 */
static inline vec_char16 vec_abs(vec_char16 a)
{
  vec_char16 minus_a;

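  /* The SPU has no byte subtract, so -a is built by hand: complement each
     byte, clear bit 7 so the per-byte +1 below cannot carry into the
     neighboring byte of the halfword add, then add 0x0101.  Clearing the
     bit is harmless because only lanes where a is negative (whose
     complement has bit 7 clear anyway) are selected.  */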
  minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abs(vec_short8 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abs(vec_int4 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_float4 vec_abs(vec_float4 a)
{
  return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
}

/* vec_abss (vector absolute value saturate)
 * ========
 */
static inline vec_char16 vec_abss(vec_char16 a)
{
  vec_char16 minus_a;

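  /* Saturating negation: ~a + 1, except that the +1 is applied only where
     (unsigned char)a > 0x80, so -128 maps to ~(-128) = 127 instead of
     wrapping; the halfword add cannot carry across byte lanes.  */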
  minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
				(vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abss(vec_short8 a)
{
  vec_short8 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abss(vec_int4 a)
{
  vec_int4 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}


/* vec_add (vector add)
 * =======
 */
static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
				spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
				spu_splats((unsigned short)(0xFF00)))));
}

static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
{
  return (spu_add((vec_short8)(a), b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
{
  return (spu_add(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
{
  return (spu_add((vec_int4)(a), b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
{
  return (spu_add(a, (vec_int4)(b)));
}

static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
{
  return (spu_add(a, b));
}

/* vec_addc (vector add carryout unsigned word)
 * ========
 */
#define vec_addc(_a, _b)	spu_genc(_a, _b)

/* vec_adds (vector add saturated)
 * ========
 */
static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
{
  vec_uchar16 s1, s2, s, d;

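  /* Sum the even and odd bytes in separate halfword lanes so each byte
     sum keeps its carry, then split the results into sum bytes (d) and
     carry bytes (s) and force lanes whose carry is 1 to 0xFF.  */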
  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
				          8, 24, 10, 26, 12, 28, 14, 30}));
  d  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
				          9, 25, 11, 27, 13, 29, 15, 31}));
  return (spu_or(d, spu_cmpeq(s, 1)));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
				          9, 25, 11, 27, 13, 29, 15, 31}));
  d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
  d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
  return ((vec_char16)(d));
}

static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
{
  return (vec_adds((vec_char16)(a), b));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
{
  return (vec_adds(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 s, d;

  s = spu_add(a, b);
  d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
  return (d);
}

static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
{
  vec_short8 s, d;

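  /* Signed overflow detection from sign bits: positive overflow when both
     operands are non-negative but the sum is negative (clamp to 0x7FFF),
     negative overflow when both are negative but the sum is not (clamp
     to 0x8000); spu_rlmaska(..., -15) smears the sign bit into a mask.  */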
  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
  return (d);
}

static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
{
  return (vec_adds((vec_short8)(a), b));
}

static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
{
  return (vec_adds(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
{
  vec_int4 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
  return (d);
}

static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
{
  return (vec_adds((vec_int4)(a), b));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
{
  return (vec_adds(a, (vec_int4)(b)));
}

/* vec_and (vector logical and)
 * =======
 */
static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
{
  return (spu_and((vec_char16)(a), b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
{
  return (spu_and(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
{
  return (spu_and((vec_short8)(a), b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
{
  return (spu_and(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
{
  return (spu_and((vec_int4)(a), b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_int4)(b)));
}

static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
{
  return (spu_and(a, b));
}

static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
{
  return (spu_and((vec_float4)(a),b));
}

static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_float4)(b)));
}


/* vec_andc (vector logical and with complement)
 * ========
 */
static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
{
  return (spu_andc((vec_char16)(a), b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
{
  return (spu_andc(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
{
  return (spu_andc((vec_short8)(a), b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
{
  return (spu_andc(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
{
  return (spu_andc((vec_int4)(a), b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_int4)(b)));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
{
  return (spu_andc(a,b));
}

static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
{
  return (spu_andc((vec_float4)(a),b));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_float4)(b)));
}

/* vec_avg (vector average)
 * =======
 */
static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_avg(a, b));
}

static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
{
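  /* spu_avg is an unsigned rounded average; the XOR term corrects the
     sign bit of the result in lanes where the operands' signs differ,
     which yields the signed average.  */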
  return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
			       (vec_uchar16)(spu_and(spu_xor(a,b), 0x80)))));
}

static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
		  spu_and(spu_or(a, b), 1)));
}

static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
		  spu_and(spu_or(a, b), 1)));
}

static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
		  spu_and(spu_or(a, b), 1)));
}

static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
		  spu_and(spu_or(a, b), 1)));
}


/* vec_ceil (vector ceiling)
 * ========
 */
static inline vec_float4 vec_ceil(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

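  /* Round up by adding just-under-one (0x3F7FFFFF) to non-negative
     inputs, then truncate toward zero: mask off the fraction bits that
     lie below the binary point, and clear the value entirely when
     |a| < 1.  */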
  a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}


/* vec_cmpb (vector compare bounds floating-point)
 * ========
 */
static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
{
  vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
  vec_int4 b1 = (vec_int4)spu_splats(0x40000000);

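  /* Bounds compare: set bit 31 of each element when a > b and bit 30
     when a < -b (negation of b done by flipping its sign bit with b0).  */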
  return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
		 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
}

/* vec_cmpeq (vector compare equal)
 * =========
 */
#define vec_cmpeq(_a, _b)	spu_cmpeq(_a, _b)


/* vec_cmpge (vector compare greater than or equal)
 * =========
 */
static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(b, a), -1));
}


/* vec_cmpgt (vector compare greater than)
 * =========
 */
#define vec_cmpgt(_a, _b)	spu_cmpgt(_a, _b)


/* vec_cmple (vector compare less than or equal)
 * =========
 */
static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(a, b), -1));
}


/* vec_cmplt (vector compare less than)
 * =========
 */
#define vec_cmplt(_a, _b)	spu_cmpgt(_b, _a)


/* vec_ctf (vector convert from fixed-point word)
 * =======
 */
#define vec_ctf(_a, _b)		spu_convtf(_a, _b)


/* vec_cts (vector convert to signed fixed-point word saturate)
 * =======
 */
#define vec_cts(_a, _b)		spu_convts(_a, _b)


/* vec_ctu (vector convert to unsigned fixed-point word saturate)
 * =======
 */
#define vec_ctu(_a, _b)		spu_convtu(_a, _b)

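/* The VMX data stream (software prefetch) operations that follow have no
 * SPU equivalent; the SPU local store has no cache to prime, so they all
 * expand to nothing.
 */
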
/* vec_dss (vector data stream stop)
 * =======
 */
#define vec_dss(_a)


/* vec_dssall (vector data stream stop all)
 * ==========
 */
#define vec_dssall()


/* vec_dst (vector data stream touch)
 * =======
 */
#define vec_dst(_a, _b, _c)


/* vec_dstst (vector data stream touch for store)
 * =========
 */
#define vec_dstst(_a, _b, _c)


/* vec_dststt (vector data stream touch for store transient)
 * ==========
 */
#define vec_dststt(_a, _b, _c)


/* vec_dstt (vector data stream touch transient)
 * ========
 */
#define vec_dstt(_a, _b, _c)


/* vec_expte (vector 2 raised to the exponent estimate floating-point)
 * =========
 */
static inline vec_float4 vec_expte(vec_float4 a)
{
  vec_float4 bias, frac, exp;
  vec_int4 ia;

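  /* Estimate 2^a as 2^ia * p(frac), with ia = ceil(a) and
     frac = ia - a in [0,1): ia is formed by adding just-under-one to
     non-negative inputs before the truncating convert, 2^ia is built
     directly in the float exponent field, and the quadratic p
     approximates 2^-frac (exact at frac = 0 and frac = 1).  */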
  bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
  ia   = spu_convts(spu_add(a, bias), 0);
  frac = spu_sub(spu_convtf(ia, 0), a);
  exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));

  return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
			   frac, spu_splats(1.0f)), exp));
}


/* vec_floor (vector floor)
 * =========
 */
static inline vec_float4 vec_floor(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

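  /* Mirror image of vec_ceil: subtract just-under-one from negative
     inputs, then truncate toward zero by masking the fraction bits below
     the binary point.  */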
  a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}


/* vec_ld (vector load indexed)
 * ======
 */
static inline vec_uchar16 vec_ld(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
{
  return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
}

static inline vec_char16 vec_ld(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_char16 vec_ld(int a, vec_char16 *b)
{
  return (*((vec_char16 *)((signed char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, vec_short8 *b)
{
  return (*((vec_short8 *)((signed char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, vec_int4 *b)
{
  return (*((vec_int4 *)((signed char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, vec_float4 *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_lde (vector load element indexed)
 * =======
 */
static inline vec_uchar16 vec_lde(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_char16 vec_lde(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_ushort8 vec_lde(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_lde(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}


static inline vec_uint4 vec_lde(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_lde(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}


static inline vec_float4 vec_lde(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_ldl (vector load indexed LRU)
 * =======
 */
#define vec_ldl(_a, _b)		vec_ld(_a, _b)


/* vec_loge (vector log2 estimate floating-point)
 * ========
 */
static inline vec_float4 vec_loge(vec_float4 a)
{
  vec_int4 exp;
  vec_float4 frac;

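  /* exp is the unbiased exponent of a; subtracting exp << 23 from the
     bit pattern forces the exponent to zero, leaving frac in [1,2).
     The quadratic approximates log2(frac) (exact at 1 and 2), and exp
     is added back in.  */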
  exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
  frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));

  return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
		   frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
}


/* vec_lvsl (vector load for shift left)
 * ========
 */
static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
{
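  /* lvsl returns the permute control {x, x+1, ..., x+15}, where x is the
     low four bits of the effective address: splat x into every byte and
     add the byte offsets pairwise as halfwords (no add can carry across
     a byte boundary since x <= 15).  */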
  return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
			       ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
				              0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
}

static inline vec_uchar16 vec_lvsl(int a, signed char *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, float *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}


/* vec_lvsr (vector load for shift right)
 * ========
 */
static inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
{
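  /* lvsr returns {16-x, ..., 31-x}: subtract the splatted alignment from
     the byte constants pairwise as halfwords (no borrow can cross a byte
     boundary since every byte constant is at least 0x10 > x).  */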
  return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
				               0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
				(vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
}

static inline vec_uchar16 vec_lvsr(int a, signed char *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, float *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

/* vec_madd (vector multiply add)
 * ========
 */
#define vec_madd(_a, _b, _c)	spu_madd(_a, _b, _c)



/* vec_madds (vector multiply add saturate)
 * =========
 */
static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
{
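  /* Each 32-bit product must become (a*b) >> 15: even-lane products are
     shifted left 1 so their high halfword holds the result, odd-lane
     products are shifted right 15 into their low halfword, and spu_sel
     interleaves the two before the saturating add with c.  */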
  return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
			      (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
			      ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
}

/* vec_max (vector maximum)
 * =======
 */
static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
{
  return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
{
  return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}


/* vec_mergeh (vector merge high)
 * ==========
 */
static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
				           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
				           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
				           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
				           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
				           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
				           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
				           4, 5, 6, 7, 20, 21, 22, 23})));
}

/* vec_mergel (vector merge low)
 * ==========
 */
static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
				           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
				           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
				           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
				           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
				           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
				           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
				           12, 13, 14, 15, 28, 29, 30, 31})));
}

/* vec_mfvscr (vector move from vector status and control register)
 * ==========
 */
static inline vec_ushort8 vec_mfvscr()
{
  return ((vec_ushort8)spu_splats(0)); 		/* not supported */
}


/* vec_min (vector minimum)
 * =======
 */
static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
{
  return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
{
  return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

/* vec_mladd (vector multiply low and add unsigned half word)
 * =========
 */
static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
{
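  /* spu_madd only multiplies the odd halfword of each word, so the even
     lanes are handled by rotating each word 16 bits first; the shuffle
     then picks the low halfword of every 32-bit result, giving
     modulo-65536 multiply-add in all eight lanes.  */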
  return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
					    (vec_short8)(spu_rl((vec_uint4)(b), -16)),
					    (vec_int4)(spu_rl((vec_uint4)(c), -16))),
				   spu_madd(a, b, spu_extend(c)),
				   ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
					          10, 11, 26, 27, 14, 15, 30, 31}))));
}


static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
{
  return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
}

static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
{
  return (vec_mladd((vec_short8)(a), b, c));
}

static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
{
  return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
}


/* vec_mradds (vector multiply round and add saturate)
 * ==========
 */
static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  vec_int4 round = (vec_int4)spu_splats(0x4000);
  vec_short8 hi, lo;

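  /* Same even/odd lane split as vec_madds, but the rounding constant
     0x4000 is added to each 32-bit product before the shift right by
     15.  */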
  hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
  lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));

  return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
}


/* vec_msum (vector multiply sum)
 * ========
 */
static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
{
  vec_ushort8 a1, a2, b1, b2;
  vec_uint4 p1, p2;

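  /* Each word must accumulate four byte products.  The low bytes (a1, b1)
     and high bytes (a2, b2) of each halfword are multiplied separately;
     spu_mulo covers the odd halfword lanes, and a copy rotated by two
     bytes covers the even lanes.  The four partial products per word are
     then summed with c.  */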
  a1 = spu_and((vec_ushort8)(a), 0xFF);
  a2 = spu_rlmask((vec_ushort8)(a), -8);
  b1 = spu_and((vec_ushort8)(b), 0xFF);
  b2 = spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
{
  vec_short8 a1, a2, b1, b2;
  vec_int4 p1, p2;

  a1 = (vec_short8)(spu_extend(a));
  a2 = spu_rlmaska((vec_short8)(a), -8);
  b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
  b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}


/* vec_msums (vector multiply sum saturate)
 * =========
 */
static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  vec_uint4 p1, p2;

  p1 = spu_mulo(a, b);
  p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));

  return (vec_adds(p2, vec_adds(p1, c)));
}

static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

/* vec_mtvscr (vector move to vector status and control register)
 * ==========
 */
#define vec_mtvscr(_a)		/* not supported */


/* vec_mule (vector multiply even)
 * ========
 */
static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

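  /* Even-byte multiply: bytes 0 and 2 of each word are isolated into
     halfword lanes (hi and lo), multiplied with spu_mulo, and the two
     16-bit results per word are interleaved back by the shuffle.  */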
  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
			     (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
  lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
			     (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){  2,  3, 18, 19,  6,  7, 22, 23,
					      10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
{
  vec_short8 hi, lo;

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
			    (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
  lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
			    (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){  2,  3, 18, 19,  6,  7, 22, 23,
					      10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
		   (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
}

static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
{
  return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
		   (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
}


/* vec_mulo (vector multiply odd)
 * ========
 */
static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
			     (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
  lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));

  return (spu_shuffle(hi, lo, ((vec_uchar16){  2,  3, 18, 19,  6,  7, 22, 23,
					      10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
{
  vec_short8 aa, bb, hi, lo;

  aa = spu_extend(a);
  bb = spu_extend(b);

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
			    (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
  lo = (vec_short8)spu_mulo(aa, bb);
  return (spu_shuffle(hi, lo, ((vec_uchar16){  2,  3, 18, 19,  6,  7, 22, 23,
					      10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo(a, b));
}

static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
{
  return (spu_mulo(a, b));
}

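/* Usage sketch (an illustrative addition, not part of the original
 * mapping): vec_mule multiplies the even-numbered elements of its
 * operands and vec_mulo the odd-numbered ones, each producing
 * double-width results:
 *
 *   vec_ushort8 a = spu_splats((unsigned short)3);
 *   vec_ushort8 b = spu_splats((unsigned short)7);
 *   vec_uint4 even = vec_mule(a, b);  // elements 0,2,4,6 -> {21, 21, 21, 21}
 *   vec_uint4 odd  = vec_mulo(a, b);  // elements 1,3,5,7 -> {21, 21, 21, 21}
 */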

/* vec_nmsub (vector negative multiply subtract)
 * =========
 */
#define vec_nmsub(_a, _b, _c)	spu_nmsub(_a, _b, _c)


/* vec_nor (vector logical nor)
 * =======
 */
#define vec_nor(_a, _b)		spu_nor(_a, _b)


/* vec_or (vector logical or)
 * ======
 */
static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
{
  return (spu_or((vec_char16)(a), b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
{
  return (spu_or(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
{
  return (spu_or((vec_short8)(a), b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
{
  return (spu_or(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
{
  return (spu_or((vec_int4)(a), b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_int4)(b)));
}

static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
{
  return (spu_or(a, b));
}

static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
{
  return (spu_or((vec_float4)(a), b));
}

static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_float4)(b)));
}


/* vec_pack (vector pack)
 * ========
 */
static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){  1,  3,  5,  7,  9, 11, 13, 15,
							 17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
{
  return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){  1,  3,  5,  7,  9, 11, 13, 15,
							17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
{
  return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){  2,  3,  6,  7, 10, 11, 14, 15,
							 18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
{
  return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){  2,  3,  6,  7, 10, 11, 14, 15,
							18, 19, 22, 23, 26, 27, 30, 31})));
}

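/* Usage sketch (an illustrative addition): vec_pack keeps the low-order
 * half of each element, the elements of a preceding those of b; values
 * that do not fit are truncated rather than saturated:
 *
 *   vec_uint4 a = spu_splats((unsigned int)0x12345678);
 *   vec_uint4 b = spu_splats((unsigned int)0x0000BEEF);
 *   vec_ushort8 p = vec_pack(a, b);   // {0x5678 x 4, 0xBEEF x 4}
 */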

/* vec_packpx (vector pack pixel)
 * ==========
 */
static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
  vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));

  return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
					   spu_sl(a, 13), x001F),
				   spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
					   spu_sl(b, 13), x001F),
				   ((vec_uchar16){  0,  1,  4,  5,  8,  9, 12, 13,
						   16, 17, 20, 21, 24, 25, 28, 29}))));
}


/* vec_packs (vector pack saturate)
 * =========
 */
static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 max = spu_splats((unsigned short)0x00FF);

  return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
				    spu_sel(b, max, spu_cmpgt(b, 255)),
				    ((vec_uchar16){  1,  3,  5,  7,  9, 11, 13, 15,
						    17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x007F);
  vec_short8 min = spu_splats((signed short)0xFF80);

  return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
				   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
				   ((vec_uchar16){  1,  3,  5,  7,  9, 11, 13, 15,
						   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);

  return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
				    spu_sel(b, max, spu_cmpgt(b, max)),
				    ((vec_uchar16){  2,  3,  6,  7, 10, 11, 14, 15,
						    18, 19, 22, 23, 26, 27, 30, 31}))));
}

static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x00007FFF);
  vec_int4 min = spu_splats((signed int)0xFFFF8000);

  return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
				   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
				   ((vec_uchar16){  2,  3,  6,  7, 10, 11, 14, 15,
						   18, 19, 22, 23, 26, 27, 30, 31}))));
}

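/* Usage sketch (an illustrative addition): unlike vec_pack, vec_packs
 * clamps each element to the range of the narrower type before packing:
 *
 *   vec_int4 a = spu_splats((signed int)70000);
 *   vec_int4 b = spu_splats((signed int)-70000);
 *   vec_short8 p = vec_packs(a, b);   // {32767 x 4, -32768 x 4}
 */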

/* vec_packsu (vector pack saturate unsigned)
 * ==========
 */
static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
				   spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
				   ((vec_uchar16){  1,  3,  5,  7,  9, 11, 13, 15,
						   17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x00FF);
  vec_short8 min = spu_splats((signed short)0x0000);

  return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
				    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
				    ((vec_uchar16){  1,  3,  5,  7,  9, 11, 13, 15,
						    17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0xFFFF);

  return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
				   spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
				   ((vec_uchar16){  2,  3,  6,  7, 10, 11, 14, 15,
						   18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x0000FFFF);
  vec_int4 min = spu_splats((signed int)0x00000000);

  return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
				    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
				    ((vec_uchar16){  2,  3,  6,  7, 10, 11, 14, 15,
						    18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_perm (vector permute)
 * ========
 */
static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
{
  return (spu_shuffle(a, b, spu_and(c, 0x1F)));
}

static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
{
  return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
{
  return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
{
  return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
{
  return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
{
  return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
{
  return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

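/* Usage sketch (an illustrative addition): vec_perm picks bytes from the
 * 32-byte concatenation of a and b; the low 5 bits of each control byte
 * select the source byte (0-15 from a, 16-31 from b):
 *
 *   vec_uchar16 x, y;   // any values
 *   vec_uchar16 rev = { 15, 14, 13, 12, 11, 10,  9,  8,
 *                        7,  6,  5,  4,  3,  2,  1,  0 };
 *   vec_uchar16 r = vec_perm(x, y, rev);   // bytes of x in reverse order
 */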

/* vec_re (vector reciprocal estimate)
 * ======
 */
#define vec_re(_a)	spu_re(_a)


/* vec_rl (vector rotate left)
 * ======
 */
static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 r1, r2;

  r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
  r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
  return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
}

static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_rl(a, (vec_short8)(b)));
}

static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
{
  return (spu_rl(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
{
  return (spu_rl(a, (vec_int4)(b)));
}

static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
{
  return (spu_rl(a, (vec_int4)(b)));
}


/* vec_round (vector round)
 * =========
 */
static inline vec_float4 vec_round(vec_float4 a)
{
  vec_float4 s_half, s_one, d;
  vec_uint4 odd;
  vec_uint4 msb = spu_splats((unsigned int)0x80000000);
  vec_float4 half = spu_splats(0.5f);
  vec_int4 exp;
  vec_uint4 mask;

  s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
  a = spu_add(a, s_half);
  s_one = spu_add(s_half, s_half);
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
  s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
  s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
				 (vec_float4)spu_cmpeq(odd, 1)));
  d = spu_andc(a, (vec_float4)(mask));
  d = spu_sub(d, s_one);
  return (d);
}

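/* Note (an illustrative addition): vec_round above adds a sign-matched
 * 0.5, clears the fraction bits selected by the exponent-derived mask
 * (the same trick vec_trunc uses below), and subtracts 1.0 on exact
 * ties whose truncated result is odd; the intent is round-to-nearest
 * with ties going to the even integer:
 *
 *   vec_round(spu_splats(2.5f));   // -> 2.0f in each element
 *   vec_round(spu_splats(3.5f));   // -> 4.0f in each element
 */
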
/* vec_rsqrte (vector reciprocal square root estimate)
 * ==========
 */
#define vec_rsqrte(_a)	spu_rsqrte(_a)


/* vec_sel (vector select)
 * =======
 */
#define vec_sel(_a, _b, _c)	spu_sel(_a, _b, _c)


/* vec_sl (vector shift left)
 * ======
 */
static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
  hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));

  return ((vec_uchar16)(spu_or(hi, lo)));
}

static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sl(a, spu_and(b, 15)));
}

static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
{
  return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
}

static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
{
  return (spu_sl(a, spu_and(b, 31)));
}

static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
{
  return (spu_sl(a, spu_and(b, 31)));
}


/* vec_sld (vector shift left double)
 * =======
 */
#define vec_sld(_a, _b, _c)	spu_shuffle(_a, _b, ((vec_uchar16){  0+(_c),  1+(_c),  2+(_c),  3+(_c),	\
								     4+(_c),  5+(_c),  6+(_c),  7+(_c),	\
								     8+(_c),  9+(_c), 10+(_c), 11+(_c),	\
								    12+(_c), 13+(_c), 14+(_c), 15+(_c)}))

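/* Usage sketch (an illustrative addition): vec_sld concatenates a and b
 * and extracts 16 bytes starting at byte offset _c, which must be a
 * compile-time constant in the range 0-15.  Given vec_uchar16 x, y:
 *
 *   vec_uchar16 r = vec_sld(x, y, 4);  // bytes 4-15 of x, then bytes 0-3 of y
 */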

/* vec_sll (vector shift left long)
 * =======
 */
#define vec_sll(_a, _b)		spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))


/* vec_slo (vector shift left by octet)
 * =======
 */
#define vec_slo(_a, _b)		spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)


/* vec_splat (vector splat)
 * =========
 */
#define vec_splat(_a, _b)	spu_splats(spu_extract(_a, _b))


/* vec_splat_s8 (vector splat signed byte)
 * ============
 */
#define vec_splat_s8(_a)	spu_splats((signed char)(_a))


/* vec_splat_s16 (vector splat signed half-word)
 * =============
 */
#define vec_splat_s16(_a)	spu_splats((signed short)(_a))


/* vec_splat_s32 (vector splat signed word)
 * =============
 */
#define vec_splat_s32(_a)	spu_splats((signed int)(_a))


/* vec_splat_u8 (vector splat unsigned byte)
 * ============
 */
#define vec_splat_u8(_a)	spu_splats((unsigned char)(_a))


/* vec_splat_u16 (vector splat unsigned half-word)
 * =============
 */
#define vec_splat_u16(_a)	spu_splats((unsigned short)(_a))


/* vec_splat_u32 (vector splat unsigned word)
 * =============
 */
#define vec_splat_u32(_a)	spu_splats((unsigned int)(_a))

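/* Usage sketch (an illustrative addition): vec_splat broadcasts one
 * element of a vector, while the vec_splat_* macros broadcast a literal:
 *
 *   vec_int4 v = { 10, 20, 30, 40 };
 *   vec_int4 s = vec_splat(v, 2);     // { 30, 30, 30, 30 }
 *   vec_int4 k = vec_splat_s32(-5);   // { -5, -5, -5, -5 }
 */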

/* vec_sr (vector shift right)
 * ======
 */
static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
  hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);

  return ((vec_uchar16)(spu_or(hi, lo)));
}

static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
}

static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
{
  return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
}

static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
{
  return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
}

static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
{
  return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
}


/* vec_sra (vector shift right algebraic)
 * =======
 */
static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
{
  vec_short8 hi, lo;

  lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
  hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);

  return ((vec_char16)(spu_or(hi, lo)));
}

static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
}

static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
{
  return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
}

static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
}

static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
{
  return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
}

static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
{
  return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
}


/* vec_srl (vector shift right long)
 * =======
 */
#define vec_srl(_a, _b)		spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))


/* vec_sro (vector shift right by octet)
 * =======
 */
#define vec_sro(_a, _b)		spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))

/* vec_st (vector store indexed)
 * ======
 */
static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
{
  *((vec_uchar16 *)(c+b)) = a;
}

static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
{
  *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_char16 a, int b, signed char *c)
{
  *((vec_char16 *)(c+b)) = a;
}

static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
{
  *((vec_char16 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bchar16 a, int b, signed char *c)
{
  *((vec_bchar16 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
{
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
{
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_short8 a, int b, signed short *c)
{
  *((vec_short8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
{
  *((vec_short8 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bshort8 a, int b, signed short *c)
{
  *((vec_bshort8 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
{
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
{
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_int4 a, int b, signed int *c)
{
  *((vec_int4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
{
  *((vec_int4 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bint4 a, int b, signed int *c)
{
  *((vec_bint4 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_float4 a, int b, float *c)
{
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
{
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
}

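/* Usage sketch (an illustrative addition): vec_st stores a whole
 * quadword at byte offset b from pointer c.  The effective address is
 * assumed to be 16-byte aligned; SPU quadword stores target an aligned
 * quadword regardless of the low address bits.
 *
 *   float buf[8] __attribute__ ((aligned (16)));
 *   vec_st(spu_splats(1.0f), 16, buf);   // writes buf[4] through buf[7]
 */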

/* vec_ste (vector store element indexed)
 * =======
 */
static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
{
  unsigned char *ptr;

  ptr = c + b;
  *ptr = spu_extract(a, (int)(ptr) & 15);
}

static inline void vec_ste(vec_char16 a, int b, signed char *c)
{
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
}

static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
{
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
}

static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
{
  unsigned short *ptr;

  ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
  *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
}

static inline void vec_ste(vec_short8 a, int b, signed short *c)
{
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
}

static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
{
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
}

static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
{
  unsigned int *ptr;

  ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
  *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
}

static inline void vec_ste(vec_int4 a, int b, signed int *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

static inline void vec_ste(vec_bint4 a, int b, signed int *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

static inline void vec_ste(vec_float4 a, int b, float *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

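/* Usage sketch (an illustrative addition): vec_ste stores the single
 * element of a whose index is derived from the element-aligned effective
 * address, mirroring the AltiVec definition:
 *
 *   vec_uint4 v = { 10, 20, 30, 40 };
 *   unsigned int buf[4] __attribute__ ((aligned (16)));
 *   vec_ste(v, 8, buf);   // buf[2] = 30; other slots are untouched
 */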

/* vec_stl (vector store indexed LRU)
 * =======
 */
#define vec_stl(_a, _b, _c)	vec_st(_a, _b, _c)


/* vec_sub (vector subtract)
 * =======
 */
static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
				spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
				spu_splats((unsigned short)0xFF00))));
}

static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sub(a, b));
}

static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
{
  return (spu_sub(a, b));
}

static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
{
  return (spu_sub((vec_short8)(a), b));
}

static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
{
  return (spu_sub(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
{
  return (spu_sub(a, b));
}

static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
{
  return (spu_sub(a, b));
}

static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
{
  return (spu_sub((vec_int4)(a), b));
}

static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
{
  return (spu_sub(a, (vec_int4)(b)));
}

static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
{
  return (spu_sub(a, b));
}


/* vec_subc (vector subtract carryout)
 * ========
 */
#define vec_subc(_a, _b)	spu_genb(_a, _b)


/* vec_subs (vector subtract saturate)
 * ========
 */
static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 s1, s2;
  vec_uchar16 s, d;

  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
							8, 24, 10, 26, 12, 28, 14, 30})));
  d  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
							9, 25, 11, 27, 13, 29, 15, 31})));
  return (spu_andc(d, s));
}

static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
{
  vec_ushort8 s1, s2;
  vec_uchar16 s, d;

  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
							9, 25, 11, 27, 13, 29, 15, 31})));
  d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
  d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));

  return ((vec_char16)(d));
}

static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
{
  return (vec_subs((vec_char16)(a), b));
}

static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
{
  return (vec_subs(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
}

static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
{
  vec_short8 s;
  vec_short8 d;

  s = spu_sub(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));

  return (d);
}

static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
{
  return ((vec_short8)(vec_subs((vec_short8)(a), b)));
}

static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
{
  return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
}

static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
{
  vec_int4 s;
  vec_int4 d;

  s = spu_sub(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));

  return (d);
}

static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
{
  return ((vec_int4)(vec_subs((vec_int4)(a), b)));
}

static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
{
  return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
}


/* vec_sum4s (vector sum across partial (1/4) saturated)
 * =========
 */
static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
{
  vec_uint4 a01_23, a0123;

  a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
			       spu_and((vec_ushort8)(a), 0xFF)));
  a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
  return (vec_adds(a0123, b));
}

static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
{
  vec_int4 a01_23, a0123;

  a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
			      spu_extend(a)));
  a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
  return (vec_adds(a0123, b));
}

static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
{
  vec_int4 a0123;

  a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
  return (vec_adds(a0123, b));
}

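/* Usage sketch (an illustrative addition): vec_sum4s adds each aligned
 * group of four bytes (or two halfwords) of a into the corresponding
 * word of b, with the final addition saturating:
 *
 *   vec_uchar16 a = spu_splats((unsigned char)1);
 *   vec_uint4 acc = spu_splats((unsigned int)100);
 *   vec_uint4 r = vec_sum4s(a, acc);   // { 104, 104, 104, 104 }
 */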

/* vec_sum2s (vector sum across partial (1/2) saturated)
 * =========
 */
static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
{
  vec_int4 c, d;
  vec_int4 sign1, sign2, sign3;
  vec_int4 carry, sum_l, sum_h, sat, sat_val;

  sign1 = spu_rlmaska(a, -31);
  sign2 = spu_rlmaska(b, -31);

  c = spu_rlqwbyte(a, -4);
  sign3 = spu_rlqwbyte(sign1, -4);

  carry = spu_genc(a, b);
  sum_l = spu_add(a, b);
  sum_h = spu_addx(sign1, sign2, carry);

  carry = spu_genc(sum_l, c);
  sum_l = spu_add(sum_l, c);
  sum_h = spu_addx(sum_h, sign3, carry);

  sign1 = spu_rlmaska(sum_l, -31);
  sign2 = spu_rlmaska(sum_h, -31);

  sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));

  sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));

  d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});

  return (d);
}


/* vec_sums (vector sum saturated)
 * ========
 */
static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
{
  vec_int4 a0, a1, a2, c0, c1, c2, d;
  vec_int4 sign_a, sign_b, sign_l, sign_h;
  vec_int4 sum_l, sum_h, sat, sat_val;

  sign_a = spu_rlmaska(a, -31);
  sign_b = spu_rlmaska(b, -31);

  a0 = spu_rlqwbyte(a, -12);
  a1 = spu_rlqwbyte(a, -8);
  a2 = spu_rlqwbyte(a, -4);

  sum_l = spu_add(a, b);
  sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));

  c2 = spu_genc(sum_l, a2);
  sum_l = spu_add(sum_l, a2);
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);

  c1 = spu_genc(sum_l, a1);
  sum_l = spu_add(sum_l, a1);
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);

  c0 = spu_genc(sum_l, a0);
  sum_l = spu_add(sum_l, a0);
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);

  sign_l = spu_rlmaska(sum_l, -31);
  sign_h = spu_rlmaska(sum_h, -31);

  sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));

  sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));

  d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));

  return (d);
}


/* vec_trunc (vector truncate)
 * =========
 */
static inline vec_float4 vec_trunc(vec_float4 a)
{
  vec_int4 exp;
  vec_uint4 mask;

  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
  return (spu_andc(a, (vec_float4)(mask)));
}

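/* Note (an illustrative addition): the exponent-derived mask above
 * covers exactly the fraction bits with weight below 1.0, so clearing
 * them truncates toward zero; inputs with magnitude below 1.0 become
 * 0.0f:
 *
 *   vec_trunc(spu_splats(2.9f));    // ->  2.0f in each element
 *   vec_trunc(spu_splats(-2.9f));   // -> -2.0f in each element
 */
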
/* vec_unpackh (vector unpack high element)
 * ===========
 */
static inline vec_short8 vec_unpackh(vec_char16 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
						      4, 4, 5, 5, 6, 6, 7, 7}))));
}

static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
{
  return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
}

static inline vec_int4 vec_unpackh(vec_short8 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
						      0, 0, 4, 5, 0, 0, 6, 7}))));
}

#ifdef SUPPORT_UNPACK_PIXEL
/* Due to type conflicts, unpacking of pixel types and boolean shorts
 * cannot simultaneously be supported. By default, the boolean short is
 * supported.
 */
static inline vec_uint4 vec_unpackh(vec_pixel8 a)
{
  vec_ushort8 p1, p2;

  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
		   spu_and((vec_ushort8)(a.p), 0x1F),
		   ((vec_uchar16){ 0, 128, 128, 17,  2, 128, 128, 19,
				   4, 128, 128, 21,  6, 128, 128, 23}));
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
		   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
		   ((vec_uchar16){ 128,  17, 1, 128, 128,  19, 3, 128,
				   128,  21, 5, 128, 128,  23, 7, 128}));
  return ((vec_uint4)(spu_or(p1, p2)));
}

#else

static inline vec_bint4 vec_unpackh(vec_bshort8 a)
{
  return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
}
#endif


/* vec_unpackl (vector unpack low element)
 * ===========
 */
static inline vec_short8 vec_unpackl(vec_char16 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){ 8,  8,  9,  9, 10, 10, 11, 11,
						      12, 12, 13, 13, 14, 14, 15, 15}))));
}

static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
{
  return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
}

static inline vec_int4 vec_unpackl(vec_short8 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0,  8,  9, 0, 0, 10, 11,
						      0, 0, 12, 13, 0, 0, 14, 15}))));
}

#ifdef SUPPORT_UNPACK_PIXEL
/* Due to type conflicts, unpacking of pixel types and boolean shorts
 * cannot simultaneously be supported. By default, the boolean short is
 * supported.
 */
static inline vec_uint4 vec_unpackl(vec_pixel8 a)
{
  vec_ushort8 p1, p2;

  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
		   spu_and((vec_ushort8)(a.p), 0x1F),
		   ((vec_uchar16){ 8, 128, 128, 25,  10, 128, 128, 27,
				  12, 128, 128, 29,  14, 128, 128, 31}));
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
		   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
		   ((vec_uchar16){ 128, 25,  9, 128, 128, 27, 11, 128,
				   128, 29, 13, 128, 128, 31, 15, 128}));
  return ((vec_uint4)(spu_or(p1, p2)));
}

#else

static inline vec_bint4 vec_unpackl(vec_bshort8 a)
{
  return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
}
#endif


/* vec_xor (vector logical xor)
 * =======
 */
static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_xor(a, b));
}

static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
{
  return (spu_xor(a, b));
}

static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
{
  return (spu_xor((vec_char16)(a), b));
}

static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
{
  return (spu_xor(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_xor(a, b));
}

static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
{
  return (spu_xor(a, b));
}

static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
{
  return (spu_xor((vec_short8)(a), b));
}

static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
{
  return (spu_xor(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
{
  return (spu_xor(a, b));
}

static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
{
  return (spu_xor(a, b));
}

static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
{
  return (spu_xor((vec_int4)(a), b));
}

static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
{
  return (spu_xor(a, (vec_int4)(b)));
}

static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
{
  return (spu_xor(a, b));
}

static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
{
  return (spu_xor((vec_float4)(a), b));
}

static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
{
  return (spu_xor(a, (vec_float4)(b)));
}

/************************************************************************
 *                        PREDICATES
 ************************************************************************/

/* vec_all_eq (all elements equal)
 * ==========
 */
static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
}

static inline int vec_all_eq(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
}

static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
}

static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
}

static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
}

static inline int vec_all_eq(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
}

static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
}

static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
}

static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
}

static inline int vec_all_eq(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
}

static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
}

static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
}

static inline int vec_all_eq(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
}

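/* Usage sketch (an illustrative addition): each predicate gathers one
 * bit per element with spu_gather, so "all" compares the gathered mask
 * against 0xFFFF, 0xFF or 0xF for 16-, 8- and 4-element vectors, while
 * "all greater-or-equal" below checks that no element compares less:
 *
 *   vec_int4 x = spu_splats((signed int)5);
 *   if (vec_all_eq(x, spu_splats((signed int)5)))
 *     ;   // taken: all four elements are equal
 */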
   2538   1.1  mrg 
   2539   1.1  mrg /* vec_all_ge (all elements greater than or equal)
   2540   1.1  mrg  * ==========
   2541   1.1  mrg  */
   2542   1.1  mrg static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
   2543   1.1  mrg {
   2544   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
   2545   1.1  mrg }
   2546   1.1  mrg 
   2547   1.1  mrg static inline int vec_all_ge(vec_char16 a, vec_char16 b)
   2548   1.1  mrg {
   2549   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
   2550   1.1  mrg }
   2551   1.1  mrg 
   2552   1.1  mrg static inline int vec_all_ge(vec_bchar16 a, vec_char16 b)
   2553   1.1  mrg {
   2554   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
   2555   1.1  mrg }
   2556   1.1  mrg 
   2557   1.1  mrg static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
   2558   1.1  mrg {
   2559   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
   2560   1.1  mrg }
   2561   1.1  mrg 
   2562   1.1  mrg static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
   2563   1.1  mrg {
   2564   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
   2565   1.1  mrg }
   2566   1.1  mrg 
   2567   1.1  mrg static inline int vec_all_ge(vec_short8 a, vec_short8 b)
   2568   1.1  mrg {
   2569   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
   2570   1.1  mrg }
   2571   1.1  mrg 
   2572   1.1  mrg static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
   2573   1.1  mrg {
   2574   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
   2575   1.1  mrg }
   2576   1.1  mrg 
   2577   1.1  mrg static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
   2578   1.1  mrg {
   2579   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
   2580   1.1  mrg }
   2581   1.1  mrg 
   2582   1.1  mrg static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
   2583   1.1  mrg {
   2584   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
   2585   1.1  mrg }
   2586   1.1  mrg 
   2587   1.1  mrg static inline int vec_all_ge(vec_int4 a, vec_int4 b)
   2588   1.1  mrg {
   2589   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
   2590   1.1  mrg }
   2591   1.1  mrg 
   2592   1.1  mrg static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
   2593   1.1  mrg {
   2594   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
   2595   1.1  mrg }
   2596   1.1  mrg 
   2597   1.1  mrg static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
   2598   1.1  mrg {
   2599   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
   2600   1.1  mrg }
   2601   1.1  mrg 
   2602   1.1  mrg static inline int vec_all_ge(vec_float4 a, vec_float4 b)
   2603   1.1  mrg {
   2604   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
   2605   1.1  mrg }
   2606   1.1  mrg 
   2607   1.1  mrg 
   2608   1.1  mrg /* vec_all_gt (all elements greater than)
   2609   1.1  mrg  * ==========
   2610   1.1  mrg  */
   2611   1.1  mrg static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
   2612   1.1  mrg {
   2613   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
   2614   1.1  mrg }
   2615   1.1  mrg 
   2616   1.1  mrg static inline int vec_all_gt(vec_char16 a, vec_char16 b)
   2617   1.1  mrg {
   2618   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
   2619   1.1  mrg }
   2620   1.1  mrg 
   2621   1.1  mrg static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
   2622   1.1  mrg {
   2623   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
   2624   1.1  mrg }
   2625   1.1  mrg 
   2626   1.1  mrg static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
   2627   1.1  mrg {
   2628   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
   2629   1.1  mrg }
   2630   1.1  mrg 
   2631   1.1  mrg static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
   2632   1.1  mrg {
   2633   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
   2634   1.1  mrg }
   2635   1.1  mrg 
   2636   1.1  mrg static inline int vec_all_gt(vec_short8 a, vec_short8 b)
   2637   1.1  mrg {
   2638   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
   2639   1.1  mrg }
   2640   1.1  mrg 
   2641   1.1  mrg static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
   2642   1.1  mrg {
   2643   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
   2644   1.1  mrg }
   2645   1.1  mrg 
   2646   1.1  mrg static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
   2647   1.1  mrg {
   2648   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
   2649   1.1  mrg }
   2650   1.1  mrg 
   2651   1.1  mrg static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
   2652   1.1  mrg {
   2653   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
   2654   1.1  mrg }
   2655   1.1  mrg 
   2656   1.1  mrg static inline int vec_all_gt(vec_int4 a, vec_int4 b)
   2657   1.1  mrg {
   2658   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
   2659   1.1  mrg }
   2660   1.1  mrg 
   2661   1.1  mrg static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
   2662   1.1  mrg {
   2663   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
   2664   1.1  mrg }
   2665   1.1  mrg 
   2666   1.1  mrg static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
   2667   1.1  mrg {
   2668   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
   2669   1.1  mrg }
   2670   1.1  mrg 
   2671   1.1  mrg static inline int vec_all_gt(vec_float4 a, vec_float4 b)
   2672   1.1  mrg {
   2673   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
   2674   1.1  mrg }
   2675   1.1  mrg 
   2676   1.1  mrg 
   2677   1.1  mrg /* vec_all_in (all elements in bounds)
   2678   1.1  mrg  * ==========
   2679   1.1  mrg  */
   2680   1.1  mrg static inline int vec_all_in(vec_float4 a, vec_float4 b)
   2681   1.1  mrg {
   2682   1.1  mrg   return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
   2683   1.1  mrg }
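
/* vec_all_in() is true only when every |a(i)| <= b(i) with b(i)
 * non-negative: spu_cmpabsgt() flags |a| > b, spu_rlmaska(b, -31)
 * smears b's sign bit across each word, and the NOR is all-ones
 * exactly for the in-bounds elements.  Sketch (hypothetical values;
 * assumes an SPU target):
 *
 *   vec_float4 v     = { 0.5f, -0.25f, 0.75f, -1.0f };
 *   vec_float4 bound = spu_splats(1.0f);
 *   // vec_all_in(v, bound) returns 1: every |element| <= 1.0f
 */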
   2684   1.1  mrg 
   2685   1.1  mrg 
   2686   1.1  mrg /* vec_all_le (all elements less than or equal)
   2687   1.1  mrg  * ==========
   2688   1.1  mrg  */
   2689   1.1  mrg static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
   2690   1.1  mrg {
   2691   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
   2692   1.1  mrg }
   2693   1.1  mrg 
   2694   1.1  mrg static inline int vec_all_le(vec_char16 a, vec_char16 b)
   2695   1.1  mrg {
   2696   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
   2697   1.1  mrg }
   2698   1.1  mrg 
   2699   1.1  mrg static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
   2700   1.1  mrg {
   2701   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
   2702   1.1  mrg }
   2703   1.1  mrg 
   2704   1.1  mrg static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
   2705   1.1  mrg {
   2706   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
   2707   1.1  mrg }
   2708   1.1  mrg 
   2709   1.1  mrg static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
   2710   1.1  mrg {
   2711   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
   2712   1.1  mrg }
   2713   1.1  mrg 
   2714   1.1  mrg static inline int vec_all_le(vec_short8 a, vec_short8 b)
   2715   1.1  mrg {
   2716   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
   2717   1.1  mrg }
   2718   1.1  mrg 
   2719   1.1  mrg static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
   2720   1.1  mrg {
   2721   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
   2722   1.1  mrg }
   2723   1.1  mrg 
   2724   1.1  mrg static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
   2725   1.1  mrg {
   2726   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
   2727   1.1  mrg }
   2728   1.1  mrg 
   2729   1.1  mrg static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
   2730   1.1  mrg {
   2731   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
   2732   1.1  mrg }
   2733   1.1  mrg 
   2734   1.1  mrg static inline int vec_all_le(vec_int4 a, vec_int4 b)
   2735   1.1  mrg {
   2736   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
   2737   1.1  mrg }
   2738   1.1  mrg 
   2739   1.1  mrg static inline int vec_all_le(vec_bint4 a, vec_int4 b)
   2740   1.1  mrg {
   2741   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
   2742   1.1  mrg }
   2743   1.1  mrg 
   2744   1.1  mrg static inline int vec_all_le(vec_int4 a, vec_bint4 b)
   2745   1.1  mrg {
   2746   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
   2747   1.1  mrg }
   2748   1.1  mrg 
   2749   1.1  mrg static inline int vec_all_le(vec_float4 a, vec_float4 b)
   2750   1.1  mrg {
   2751   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
   2752   1.1  mrg }
   2753   1.1  mrg 
   2754   1.1  mrg 
   2755   1.1  mrg /* vec_all_lt (all elements less than)
   2756   1.1  mrg  * ==========
   2757   1.1  mrg  */
   2758   1.1  mrg static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
   2759   1.1  mrg {
   2760   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
   2761   1.1  mrg }
   2762   1.1  mrg 
   2763   1.1  mrg static inline int vec_all_lt(vec_char16 a, vec_char16 b)
   2764   1.1  mrg {
   2765   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
   2766   1.1  mrg }
   2767   1.1  mrg 
   2768   1.1  mrg static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
   2769   1.1  mrg {
   2770   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
   2771   1.1  mrg }
   2772   1.1  mrg 
   2773   1.1  mrg static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
   2774   1.1  mrg {
   2775   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
   2776   1.1  mrg }
   2777   1.1  mrg 
   2778   1.1  mrg static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
   2779   1.1  mrg {
   2780   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
   2781   1.1  mrg }
   2782   1.1  mrg 
   2783   1.1  mrg static inline int vec_all_lt(vec_short8 a, vec_short8 b)
   2784   1.1  mrg {
   2785   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
   2786   1.1  mrg }
   2787   1.1  mrg 
   2788   1.1  mrg static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
   2789   1.1  mrg {
   2790   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
   2791   1.1  mrg }
   2792   1.1  mrg 
   2793   1.1  mrg static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
   2794   1.1  mrg {
   2795   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
   2796   1.1  mrg }
   2797   1.1  mrg 
   2798   1.1  mrg static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
   2799   1.1  mrg {
   2800   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
   2801   1.1  mrg }
   2802   1.1  mrg 
   2803   1.1  mrg static inline int vec_all_lt(vec_int4 a, vec_int4 b)
   2804   1.1  mrg {
   2805   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
   2806   1.1  mrg }
   2807   1.1  mrg 
   2808   1.1  mrg static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
   2809   1.1  mrg {
   2810   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
   2811   1.1  mrg }
   2812   1.1  mrg 
   2813   1.1  mrg static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
   2814   1.1  mrg {
   2815   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
   2816   1.1  mrg }
   2817   1.1  mrg 
   2818   1.1  mrg static inline int vec_all_lt(vec_float4 a, vec_float4 b)
   2819   1.1  mrg {
   2820   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
   2821   1.1  mrg }
   2822   1.1  mrg 
   2823   1.1  mrg 
   2824   1.1  mrg /* vec_all_nan (all elements not a number)
   2825   1.1  mrg  * ===========
   2826   1.1  mrg  */
   2827   1.1  mrg static inline int vec_all_nan(vec_float4 a)
   2828   1.1  mrg {
   2829   1.1  mrg   vec_uint4 exp, man;
   2830   1.1  mrg   vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
   2831   1.1  mrg 
   2832   1.1  mrg   exp = spu_and((vec_uint4)(a), exp_mask);
   2833   1.1  mrg   man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
   2834   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
   2835   1.1  mrg 						spu_cmpeq(man, 0))), 0) == 0xF));
   2836   1.1  mrg }
   2837   1.1  mrg 
   2838   1.1  mrg #define vec_all_nan(_a)		(0)
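
/* Note: the define above intentionally shadows the inline version,
 * presumably because SPU single-precision arithmetic has no NaN
 * semantics (exponent-255 bit patterns are treated as ordinary
 * numbers), so the predicate can fold to the constant 0.  */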
   2839   1.1  mrg 
   2840   1.1  mrg 
   2841   1.1  mrg /* vec_all_ne (all elements not equal)
   2842   1.1  mrg  * ==========
   2843   1.1  mrg  */
   2844   1.1  mrg static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
   2845   1.1  mrg {
   2846   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
   2847   1.1  mrg }
   2848   1.1  mrg 
   2849   1.1  mrg static inline int vec_all_ne(vec_char16 a, vec_char16 b)
   2850   1.1  mrg {
   2851   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
   2852   1.1  mrg }
   2853   1.1  mrg 
   2854   1.1  mrg static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
   2855   1.1  mrg {
   2856   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
   2857   1.1  mrg }
   2858   1.1  mrg 
   2859   1.1  mrg static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
   2860   1.1  mrg {
   2861   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
   2862   1.1  mrg }
   2863   1.1  mrg 
   2864   1.1  mrg static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
   2865   1.1  mrg {
   2866   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
   2867   1.1  mrg }
   2868   1.1  mrg 
   2869   1.1  mrg static inline int vec_all_ne(vec_short8 a, vec_short8 b)
   2870   1.1  mrg {
   2871   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
   2872   1.1  mrg }
   2873   1.1  mrg 
   2874   1.1  mrg static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
   2875   1.1  mrg {
   2876   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
   2877   1.1  mrg }
   2878   1.1  mrg 
   2879   1.1  mrg static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
   2880   1.1  mrg {
   2881   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
   2882   1.1  mrg }
   2883   1.1  mrg 
   2884   1.1  mrg static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
   2885   1.1  mrg {
   2886   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
   2887   1.1  mrg }
   2888   1.1  mrg 
   2889   1.1  mrg static inline int vec_all_ne(vec_int4 a, vec_int4 b)
   2890   1.1  mrg {
   2891   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
   2892   1.1  mrg }
   2893   1.1  mrg 
   2894   1.1  mrg static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
   2895   1.1  mrg {
   2896   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
   2897   1.1  mrg }
   2898   1.1  mrg 
   2899   1.1  mrg static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
   2900   1.1  mrg {
   2901   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
   2902   1.1  mrg }
   2903   1.1  mrg 
   2904   1.1  mrg static inline int vec_all_ne(vec_float4 a, vec_float4 b)
   2905   1.1  mrg {
   2906   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
   2907   1.1  mrg }
   2908   1.1  mrg 
   2909   1.1  mrg 
   2910   1.1  mrg /* vec_all_nge (all elements not greater than or equal)
   2911   1.1  mrg  * ===========
   2912   1.1  mrg  */
   2913   1.1  mrg static inline int vec_all_nge(vec_float4 a, vec_float4 b)
   2914   1.1  mrg {
   2915   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
   2916   1.1  mrg }
   2917   1.1  mrg 
   2918   1.1  mrg 
   2919   1.1  mrg /* vec_all_ngt (all elements not greater than)
   2920   1.1  mrg  * ===========
   2921   1.1  mrg  */
   2922   1.1  mrg static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
   2923   1.1  mrg {
   2924   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
   2925   1.1  mrg }
   2926   1.1  mrg 
   2927   1.1  mrg 
   2928   1.1  mrg /* vec_all_nle (all elements not less than or equal)
   2929   1.1  mrg  * ===========
   2930   1.1  mrg  */
   2931   1.1  mrg static inline int vec_all_nle(vec_float4 a, vec_float4 b)
   2932   1.1  mrg {
   2933   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
   2934   1.1  mrg }
   2935   1.1  mrg 
   2936   1.1  mrg 
   2937   1.1  mrg /* vec_all_nlt (all elements not less than)
   2938   1.1  mrg  * ===========
   2939   1.1  mrg  */
   2940   1.1  mrg static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
   2941   1.1  mrg {
   2942   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
   2943   1.1  mrg }
   2944   1.1  mrg 
   2945   1.1  mrg 
   2946   1.1  mrg /* vec_all_numeric (all elements numeric)
   2947   1.1  mrg  * ===============
   2948   1.1  mrg  */
   2949   1.1  mrg static inline int vec_all_numeric(vec_float4 a)
   2950   1.1  mrg {
   2951   1.1  mrg   vec_uint4 exp;
   2952   1.1  mrg 
   2953   1.1  mrg   exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
   2954   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
   2955   1.1  mrg }
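
/* Scalar equivalent of the test above (illustrative only):
 *
 *   unsigned int bits = ...;            // raw bits of one element
 *   unsigned int exp  = (bits >> 23) & 0xFF;
 *   int numeric = (exp != 255);         // 255 encodes Inf/NaN in IEEE
 *
 * vec_all_numeric() just checks that no element has exp == 255.
 */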
   2956   1.1  mrg 
   2957   1.1  mrg 
   2958   1.1  mrg 
   2959   1.1  mrg /* vec_any_eq (any elements equal)
   2960   1.1  mrg  * ==========
   2961   1.1  mrg  */
   2962   1.1  mrg static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
   2963   1.1  mrg {
   2964   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
   2965   1.1  mrg }
   2966   1.1  mrg 
   2967   1.1  mrg static inline int vec_any_eq(vec_char16 a, vec_char16 b)
   2968   1.1  mrg {
   2969   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
   2970   1.1  mrg }
   2971   1.1  mrg 
   2972   1.1  mrg static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
   2973   1.1  mrg {
   2974   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
   2975   1.1  mrg }
   2976   1.1  mrg 
   2977   1.1  mrg static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
   2978   1.1  mrg {
   2979   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
   2980   1.1  mrg }
   2981   1.1  mrg 
   2982   1.1  mrg static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
   2983   1.1  mrg {
   2984   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
   2985   1.1  mrg }
   2986   1.1  mrg 
   2987   1.1  mrg static inline int vec_any_eq(vec_short8 a, vec_short8 b)
   2988   1.1  mrg {
   2989   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
   2990   1.1  mrg }
   2991   1.1  mrg 
   2992   1.1  mrg static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
   2993   1.1  mrg {
   2994   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
   2995   1.1  mrg }
   2996   1.1  mrg 
   2997   1.1  mrg static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
   2998   1.1  mrg {
   2999   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
   3000   1.1  mrg }
   3001   1.1  mrg 
   3002   1.1  mrg static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
   3003   1.1  mrg {
   3004   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
   3005   1.1  mrg }
   3006   1.1  mrg 
   3007   1.1  mrg static inline int vec_any_eq(vec_int4 a, vec_int4 b)
   3008   1.1  mrg {
   3009   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
   3010   1.1  mrg }
   3011   1.1  mrg 
   3012   1.1  mrg static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
   3013   1.1  mrg {
   3014   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
   3015   1.1  mrg }
   3016   1.1  mrg 
   3017   1.1  mrg static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
   3018   1.1  mrg {
   3019   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
   3020   1.1  mrg }
   3021   1.1  mrg 
   3022   1.1  mrg static inline int vec_any_eq(vec_float4 a, vec_float4 b)
   3023   1.1  mrg {
   3024   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
   3025   1.1  mrg }
   3026   1.1  mrg 
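/* For the 4-element forms, vec_any_* switches from spu_gather() to
 * spu_orx(): spu_rlmask(mask, -31) reduces each word to 0 or 1,
 * spu_orx() ORs the four words into word 0, and spu_extract() then
 * yields the 0/1 answer directly, with no compare against 0xF needed.
 * Trace (hypothetical values):
 *
 *   // spu_cmpeq  -> { 0, -1, 0, 0 }
 *   // spu_rlmask -> { 0,  1, 0, 0 }
 *   // spu_orx    -> { 1,  0, 0, 0 };  spu_extract(.., 0) -> 1
 */
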
   3027   1.1  mrg /* vec_any_ge (any elements greater than or equal)
   3028   1.1  mrg  * ==========
   3029   1.1  mrg  */
   3030   1.1  mrg static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
   3031   1.1  mrg {
   3032   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
   3033   1.1  mrg }
   3034   1.1  mrg 
   3035   1.1  mrg static inline int vec_any_ge(vec_char16 a, vec_char16 b)
   3036   1.1  mrg {
   3037   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
   3038   1.1  mrg }
   3039   1.1  mrg 
   3040   1.1  mrg static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
   3041   1.1  mrg {
   3042   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
   3043   1.1  mrg }
   3044   1.1  mrg 
   3045   1.1  mrg static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
   3046   1.1  mrg {
   3047   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
   3048   1.1  mrg }
   3049   1.1  mrg 
   3050   1.1  mrg static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
   3051   1.1  mrg {
   3052   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
   3053   1.1  mrg }
   3054   1.1  mrg 
   3055   1.1  mrg static inline int vec_any_ge(vec_short8 a, vec_short8 b)
   3056   1.1  mrg {
   3057   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
   3058   1.1  mrg }
   3059   1.1  mrg 
   3060   1.1  mrg static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
   3061   1.1  mrg {
   3062   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
   3063   1.1  mrg }
   3064   1.1  mrg 
   3065   1.1  mrg static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
   3066   1.1  mrg {
   3067   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
   3068   1.1  mrg }
   3069   1.1  mrg 
   3070   1.1  mrg static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
   3071   1.1  mrg {
   3072   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
   3073   1.1  mrg }
   3074   1.1  mrg 
   3075   1.1  mrg static inline int vec_any_ge(vec_int4 a, vec_int4 b)
   3076   1.1  mrg {
   3077   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
   3078   1.1  mrg }
   3079   1.1  mrg 
   3080   1.1  mrg static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
   3081   1.1  mrg {
   3082   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
   3083   1.1  mrg }
   3084   1.1  mrg 
   3085   1.1  mrg static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
   3086   1.1  mrg {
   3087   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
   3088   1.1  mrg }
   3089   1.1  mrg 
   3090   1.1  mrg static inline int vec_any_ge(vec_float4 a, vec_float4 b)
   3091   1.1  mrg {
   3092   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
   3093   1.1  mrg }
   3094   1.1  mrg 
   3095   1.1  mrg 
   3096   1.1  mrg /* vec_any_gt (any elements greater than)
   3097   1.1  mrg  * ==========
   3098   1.1  mrg  */
   3099   1.1  mrg static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
   3100   1.1  mrg {
   3101   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
   3102   1.1  mrg }
   3103   1.1  mrg 
   3104   1.1  mrg static inline int vec_any_gt(vec_char16 a, vec_char16 b)
   3105   1.1  mrg {
   3106   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
   3107   1.1  mrg }
   3108   1.1  mrg 
   3109   1.1  mrg static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
   3110   1.1  mrg {
   3111   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
   3112   1.1  mrg }
   3113   1.1  mrg 
   3114   1.1  mrg static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
   3115   1.1  mrg {
   3116   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
   3117   1.1  mrg }
   3118   1.1  mrg 
   3119   1.1  mrg static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
   3120   1.1  mrg {
   3121   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
   3122   1.1  mrg }
   3123   1.1  mrg 
   3124   1.1  mrg static inline int vec_any_gt(vec_short8 a, vec_short8 b)
   3125   1.1  mrg {
   3126   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
   3127   1.1  mrg }
   3128   1.1  mrg 
   3129   1.1  mrg static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
   3130   1.1  mrg {
   3131   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
   3132   1.1  mrg }
   3133   1.1  mrg 
   3134   1.1  mrg static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
   3135   1.1  mrg {
   3136   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
   3137   1.1  mrg }
   3138   1.1  mrg 
   3139   1.1  mrg 
   3140   1.1  mrg static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
   3141   1.1  mrg {
   3142   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
   3143   1.1  mrg }
   3144   1.1  mrg 
   3145   1.1  mrg static inline int vec_any_gt(vec_int4 a, vec_int4 b)
   3146   1.1  mrg {
   3147   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
   3148   1.1  mrg }
   3149   1.1  mrg 
   3150   1.1  mrg static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
   3151   1.1  mrg {
   3152   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
   3153   1.1  mrg }
   3154   1.1  mrg 
   3155   1.1  mrg static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
   3156   1.1  mrg {
   3157   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
   3158   1.1  mrg }
   3159   1.1  mrg 
   3160   1.1  mrg static inline int vec_any_gt(vec_float4 a, vec_float4 b)
   3161   1.1  mrg {
   3162   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
   3163   1.1  mrg }
   3164   1.1  mrg 
   3165   1.1  mrg /* vec_any_le (any elements less than or equal)
   3166   1.1  mrg  * ==========
   3167   1.1  mrg  */
   3168   1.1  mrg static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
   3169   1.1  mrg {
   3170   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
   3171   1.1  mrg }
   3172   1.1  mrg 
   3173   1.1  mrg static inline int vec_any_le(vec_char16 a, vec_char16 b)
   3174   1.1  mrg {
   3175   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
   3176   1.1  mrg }
   3177   1.1  mrg 
   3178   1.1  mrg static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
   3179   1.1  mrg {
   3180   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
   3181   1.1  mrg }
   3182   1.1  mrg 
   3183   1.1  mrg static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
   3184   1.1  mrg {
   3185   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
   3186   1.1  mrg }
   3187   1.1  mrg 
   3188   1.1  mrg static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
   3189   1.1  mrg {
   3190   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
   3191   1.1  mrg }
   3192   1.1  mrg 
   3193   1.1  mrg static inline int vec_any_le(vec_short8 a, vec_short8 b)
   3194   1.1  mrg {
   3195   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
   3196   1.1  mrg }
   3197   1.1  mrg 
   3198   1.1  mrg static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
   3199   1.1  mrg {
   3200   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
   3201   1.1  mrg }
   3202   1.1  mrg 
   3203   1.1  mrg static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
   3204   1.1  mrg {
   3205   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
   3206   1.1  mrg }
   3207   1.1  mrg 
   3208   1.1  mrg static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
   3209   1.1  mrg {
   3210   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
   3211   1.1  mrg }
   3212   1.1  mrg 
   3213   1.1  mrg static inline int vec_any_le(vec_int4 a, vec_int4 b)
   3214   1.1  mrg {
   3215   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
   3216   1.1  mrg }
   3217   1.1  mrg 
   3218   1.1  mrg static inline int vec_any_le(vec_bint4 a, vec_int4 b)
   3219   1.1  mrg {
   3220   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
   3221   1.1  mrg }
   3222   1.1  mrg 
   3223   1.1  mrg static inline int vec_any_le(vec_int4 a, vec_bint4 b)
   3224   1.1  mrg {
   3225   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
   3226   1.1  mrg }
   3227   1.1  mrg 
   3228   1.1  mrg static inline int vec_any_le(vec_float4 a, vec_float4 b)
   3229   1.1  mrg {
   3230   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
   3231   1.1  mrg }
   3232   1.1  mrg 
   3233   1.1  mrg 
   3234   1.1  mrg /* vec_any_lt (any elements less than)
   3235   1.1  mrg  * ==========
   3236   1.1  mrg  */
   3237   1.1  mrg static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
   3238   1.1  mrg {
   3239   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
   3240   1.1  mrg }
   3241   1.1  mrg 
   3242   1.1  mrg static inline int vec_any_lt(vec_char16 a, vec_char16 b)
   3243   1.1  mrg {
   3244   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
   3245   1.1  mrg }
   3246   1.1  mrg 
   3247   1.1  mrg static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
   3248   1.1  mrg {
   3249   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
   3250   1.1  mrg }
   3251   1.1  mrg 
   3252   1.1  mrg static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
   3253   1.1  mrg {
   3254   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
   3255   1.1  mrg }
   3256   1.1  mrg 
   3257   1.1  mrg static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
   3258   1.1  mrg {
   3259   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
   3260   1.1  mrg }
   3261   1.1  mrg 
   3262   1.1  mrg static inline int vec_any_lt(vec_short8 a, vec_short8 b)
   3263   1.1  mrg {
   3264   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
   3265   1.1  mrg }
   3266   1.1  mrg 
   3267   1.1  mrg static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
   3268   1.1  mrg {
   3269   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
   3270   1.1  mrg }
   3271   1.1  mrg 
   3272   1.1  mrg static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
   3273   1.1  mrg {
   3274   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
   3275   1.1  mrg }
   3276   1.1  mrg 
   3277   1.1  mrg static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
   3278   1.1  mrg {
   3279   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
   3280   1.1  mrg }
   3281   1.1  mrg 
   3282   1.1  mrg static inline int vec_any_lt(vec_int4 a, vec_int4 b)
   3283   1.1  mrg {
   3284   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
   3285   1.1  mrg }
   3286   1.1  mrg 
   3287   1.1  mrg static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
   3288   1.1  mrg {
   3289   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
   3290   1.1  mrg }
   3291   1.1  mrg 
   3292   1.1  mrg static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
   3293   1.1  mrg {
   3294   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
   3295   1.1  mrg }
   3296   1.1  mrg 
   3297   1.1  mrg static inline int vec_any_lt(vec_float4 a, vec_float4 b)
   3298   1.1  mrg {
   3299   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
   3300   1.1  mrg }
   3301   1.1  mrg 
   3302   1.1  mrg /* vec_any_nan (any elements not a number)
   3303   1.1  mrg  * ===========
   3304   1.1  mrg  */
   3305   1.1  mrg static inline int vec_any_nan(vec_float4 a)
   3306   1.1  mrg {
   3307   1.1  mrg   vec_uint4 exp, man;
   3308   1.1  mrg   vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
   3309   1.1  mrg 
   3310   1.1  mrg   exp = spu_and((vec_uint4)(a), exp_mask);
   3311   1.1  mrg   man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
   3312   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
   3313   1.1  mrg 						spu_cmpeq(man, 0))), 0) != 0));
   3314   1.1  mrg }
   3315   1.1  mrg 
   3316   1.1  mrg 
   3317   1.1  mrg /* vec_any_ne (any elements not equal)
   3318   1.1  mrg  * ==========
   3319   1.1  mrg  */
   3320   1.1  mrg static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
   3321   1.1  mrg {
   3322   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
   3323   1.1  mrg }
   3324   1.1  mrg 
   3325   1.1  mrg static inline int vec_any_ne(vec_char16 a, vec_char16 b)
   3326   1.1  mrg {
   3327   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
   3328   1.1  mrg }
   3329   1.1  mrg 
   3330   1.1  mrg static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
   3331   1.1  mrg {
   3332   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
   3333   1.1  mrg }
   3334   1.1  mrg 
   3335   1.1  mrg static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
   3336   1.1  mrg {
   3337   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
   3338   1.1  mrg }
   3339   1.1  mrg 
   3340   1.1  mrg static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
   3341   1.1  mrg {
   3342   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
   3343   1.1  mrg }
   3344   1.1  mrg 
   3345   1.1  mrg static inline int vec_any_ne(vec_short8 a, vec_short8 b)
   3346   1.1  mrg {
   3347   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
   3348   1.1  mrg }
   3349   1.1  mrg 
   3350   1.1  mrg static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
   3351   1.1  mrg {
   3352   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
   3353   1.1  mrg }
   3354   1.1  mrg 
   3355   1.1  mrg static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
   3356   1.1  mrg {
   3357   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
   3358   1.1  mrg }
   3359   1.1  mrg 
   3360   1.1  mrg static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
   3361   1.1  mrg {
   3362   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
   3363   1.1  mrg }
   3364   1.1  mrg 
   3365   1.1  mrg static inline int vec_any_ne(vec_int4 a, vec_int4 b)
   3366   1.1  mrg {
   3367   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
   3368   1.1  mrg }
   3369   1.1  mrg 
   3370   1.1  mrg static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
   3371   1.1  mrg {
   3372   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
   3373   1.1  mrg }
   3374   1.1  mrg 
   3375   1.1  mrg static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
   3376   1.1  mrg {
   3377   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
   3378   1.1  mrg }
   3379   1.1  mrg 
   3380   1.1  mrg static inline int vec_any_ne(vec_float4 a, vec_float4 b)
   3381   1.1  mrg {
   3382   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
   3383   1.1  mrg }
   3384   1.1  mrg 
   3385   1.1  mrg 
   3386   1.1  mrg /* vec_any_nge (any elements not greater than or equal)
   3387   1.1  mrg  * ===========
   3388   1.1  mrg  */
   3389   1.1  mrg static inline int vec_any_nge(vec_float4 a, vec_float4 b)
   3390   1.1  mrg {
   3391   1.1  mrg   return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
   3392   1.1  mrg }
   3393   1.1  mrg 
   3394   1.1  mrg /* vec_any_ngt (any elements not greater than)
   3395   1.1  mrg  * ===========
   3396   1.1  mrg  */
   3397   1.1  mrg static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
   3398   1.1  mrg {
   3399   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
   3400   1.1  mrg }
   3401   1.1  mrg 
   3402   1.1  mrg 
   3403   1.1  mrg /* vec_any_nle (any elements not less than or equal)
   3404   1.1  mrg  * ===========
   3405   1.1  mrg  */
   3406   1.1  mrg static inline int vec_any_nle(vec_float4 a, vec_float4 b)
   3407   1.1  mrg {
   3408   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
   3409   1.1  mrg }
   3410   1.1  mrg 
   3411   1.1  mrg 
   3412   1.1  mrg /* vec_any_nlt (any elements not less than)
   3413   1.1  mrg  * ===========
   3414   1.1  mrg  */
   3415   1.1  mrg static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
   3416   1.1  mrg {
   3417   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
   3418   1.1  mrg }
   3419   1.1  mrg 
   3420   1.1  mrg 
   3421   1.1  mrg /* vec_any_numeric (any elements numeric)
   3422   1.1  mrg  * ===============
   3423   1.1  mrg  */
   3424   1.1  mrg static inline int vec_any_numeric(vec_float4 a)
   3425   1.1  mrg {
   3426   1.1  mrg   vec_uint4 exp;
   3427   1.1  mrg 
   3428   1.1  mrg   exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
   3429   1.1  mrg   return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
   3430   1.1  mrg }
   3431   1.1  mrg 
   3432   1.1  mrg 
   3433   1.1  mrg /* vec_any_out (any elements out of bounds)
   3434   1.1  mrg  * ===========
   3435   1.1  mrg  */
   3436   1.1  mrg static inline int vec_any_out(vec_float4 a, vec_float4 b)
   3437   1.1  mrg {
   3438   1.1  mrg   return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
   3439   1.1  mrg }
   3440   1.1  mrg 
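/* vec_any_out() is the exact complement of vec_all_in(): it reports
 * whether some |a(i)| > b(i) or some b(i) is negative, so
 * vec_any_out(a, b) == !vec_all_in(a, b) for all inputs.  */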
   3441   1.1  mrg 
   3442   1.1  mrg /* CBE Language Extension Intrinsics
   3443   1.1  mrg  */
   3444   1.1  mrg 
   3445   1.1  mrg /* vec_extract (extract element from vector)
   3446   1.1  mrg  * ===========
   3447   1.1  mrg  */
   3448   1.1  mrg #define vec_extract(_a, _element)	spu_extract(_a, _element)
   3449   1.1  mrg 
   3450   1.1  mrg 
   3451   1.1  mrg /* vec_insert (insert scalar into specified vector element)
   3452   1.1  mrg  * ==========
   3453   1.1  mrg  */
   3454   1.1  mrg #define vec_insert(_a, _b, _element)	spu_insert(_a, _b, _element)
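
/* Sketch (illustrative only; assumes an SPU target):
 *
 *   vec_int4 v = spu_splats(7);
 *   int e      = vec_extract(v, 2);     // e == 7
 *   v          = vec_insert(9, v, 0);   // element 0 becomes 9
 */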
   3455   1.1  mrg 
   3456   1.1  mrg /* vec_lvlx (load vector left indexed)
   3457   1.1  mrg  * ========
   3458   1.1  mrg  */
   3459   1.1  mrg static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
   3460   1.1  mrg {
   3461   1.1  mrg   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
   3462   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3463   1.1  mrg }
   3464   1.1  mrg 
   3465   1.1  mrg static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
   3466   1.1  mrg {
   3467   1.1  mrg   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
   3468   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3469   1.1  mrg }
   3470   1.1  mrg 
   3471   1.1  mrg static inline vec_char16 vec_lvlx(int a, signed char *b)
   3472   1.1  mrg {
   3473   1.1  mrg   vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
   3474   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3475   1.1  mrg }
   3476   1.1  mrg 
   3477   1.1  mrg static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
   3478   1.1  mrg {
   3479   1.1  mrg   vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
   3480   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3481   1.1  mrg }
   3482   1.1  mrg 
   3483   1.1  mrg static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
   3484   1.1  mrg {
   3485   1.1  mrg   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
   3486   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3487   1.1  mrg }
   3488   1.1  mrg 
   3489   1.1  mrg static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
   3490   1.1  mrg {
   3491   1.1  mrg   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
   3492   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3493   1.1  mrg }
   3494   1.1  mrg 
   3495   1.1  mrg static inline vec_short8 vec_lvlx(int a, signed short *b)
   3496   1.1  mrg {
   3497   1.1  mrg   vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
   3498   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3499   1.1  mrg }
   3500   1.1  mrg 
   3501   1.1  mrg static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
   3502   1.1  mrg {
   3503   1.1  mrg   vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
   3504   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3505   1.1  mrg }
   3506   1.1  mrg 
   3507   1.1  mrg static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
   3508   1.1  mrg {
   3509   1.1  mrg   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
   3510   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3511   1.1  mrg }
   3512   1.1  mrg 
   3513   1.1  mrg static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
   3514   1.1  mrg {
   3515   1.1  mrg   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
   3516   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3517   1.1  mrg }
   3518   1.1  mrg 
   3519   1.1  mrg static inline vec_int4 vec_lvlx(int a, signed int *b)
   3520   1.1  mrg {
   3521   1.1  mrg   vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
   3522   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3523   1.1  mrg }
   3524   1.1  mrg 
   3525   1.1  mrg static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
   3526   1.1  mrg {
   3527   1.1  mrg   vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
   3528   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3529   1.1  mrg }
   3530   1.1  mrg 
   3531   1.1  mrg static inline vec_float4 vec_lvlx(int a, float *b)
   3532   1.1  mrg {
   3533   1.1  mrg   vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
   3534   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3535   1.1  mrg }
   3536   1.1  mrg 
   3537   1.1  mrg static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
   3538   1.1  mrg {
   3539   1.1  mrg   vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
   3540   1.1  mrg   return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
   3541   1.1  mrg }
   3542   1.1  mrg 
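/* vec_lvlx() follows the VMX lvlx semantics: the quadword containing
 * the addressed byte is loaded and shifted left so that byte lands in
 * byte 0, with zero fill from the right.  Sketch (illustrative only;
 * assumes an SPU target and a quadword-aligned buffer):
 *
 *   unsigned char buf[32];
 *   vec_uchar16 left = vec_lvlx(5, buf);
 *   // bytes 0..10 of left are buf[5..15]; bytes 11..15 are zero
 */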
   3543   1.1  mrg 
   3544   1.1  mrg /* vec_lvlxl (load vector left indexed last)
   3545   1.1  mrg  * =========
   3546   1.1  mrg  */
   3547   1.1  mrg #define vec_lvlxl(_a, _b)	vec_lvlx(_a, _b)
   3548   1.1  mrg 
   3549   1.1  mrg 
   3550   1.1  mrg /* vec_lvrx (load vector right indexed)
   3551   1.1  mrg  * ========
   3552   1.1  mrg  */
   3553   1.1  mrg static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
   3554   1.1  mrg {
   3555   1.1  mrg   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
   3556   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3557   1.1  mrg }
   3558   1.1  mrg 
   3559   1.1  mrg static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
   3560   1.1  mrg {
   3561   1.1  mrg   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
   3562   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3563   1.1  mrg }
   3564   1.1  mrg 
   3565   1.1  mrg static inline vec_char16 vec_lvrx(int a, signed char *b)
   3566   1.1  mrg {
   3567   1.1  mrg   vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
   3568   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3569   1.1  mrg }
   3570   1.1  mrg 
   3571   1.1  mrg static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
   3572   1.1  mrg {
   3573   1.1  mrg   vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
   3574   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3575   1.1  mrg }
   3576   1.1  mrg 
   3577   1.1  mrg static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
   3578   1.1  mrg {
   3579   1.1  mrg   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
   3580   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3581   1.1  mrg }
   3582   1.1  mrg 
   3583   1.1  mrg static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
   3584   1.1  mrg {
   3585   1.1  mrg   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
   3586   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3587   1.1  mrg }
   3588   1.1  mrg 
   3589   1.1  mrg static inline vec_short8 vec_lvrx(int a, signed short *b)
   3590   1.1  mrg {
   3591   1.1  mrg   vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
   3592   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3593   1.1  mrg }
   3594   1.1  mrg 
   3595   1.1  mrg static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
   3596   1.1  mrg {
   3597   1.1  mrg   vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
   3598   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3599   1.1  mrg }
   3600   1.1  mrg 
   3601   1.1  mrg static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
   3602   1.1  mrg {
   3603   1.1  mrg   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
   3604   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3605   1.1  mrg }
   3606   1.1  mrg 
   3607   1.1  mrg static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
   3608   1.1  mrg {
   3609   1.1  mrg   vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
   3610   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3611   1.1  mrg }
   3612   1.1  mrg 
   3613   1.1  mrg static inline vec_int4 vec_lvrx(int a, signed int *b)
   3614   1.1  mrg {
   3615   1.1  mrg   vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
   3616   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3617   1.1  mrg }
   3618   1.1  mrg 
   3619   1.1  mrg static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
   3620   1.1  mrg {
   3621   1.1  mrg   vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
   3622   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3623   1.1  mrg }
   3624   1.1  mrg 
   3625   1.1  mrg static inline vec_float4 vec_lvrx(int a, float *b)
   3626   1.1  mrg {
   3627   1.1  mrg   vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
   3628   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3629   1.1  mrg }
   3630   1.1  mrg 
   3631   1.1  mrg static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
   3632   1.1  mrg {
   3633   1.1  mrg   vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
   3634   1.1  mrg   return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
   3635   1.1  mrg }
   3636   1.1  mrg 
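/* As with the VMX lvlx/lvrx pair, an unaligned load can be composed
 * from the two partial loads.  Sketch (illustrative only; assumes a
 * quadword-aligned buf):
 *
 *   vec_uchar16 lo = vec_lvlx(5, buf);       // buf[5..15], zero tail
 *   vec_uchar16 hi = vec_lvrx(5 + 16, buf);  // buf[16..20] in the tail
 *   vec_uchar16 v  = spu_or(lo, hi);         // buf[5..20] contiguously
 */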
   3637   1.1  mrg 
   3638   1.1  mrg 
   3639   1.1  mrg /* vec_lvrxl (load vector right indexed last)
   3640   1.1  mrg  * =========
   3641   1.1  mrg  */
   3642   1.1  mrg #define vec_lvrxl(_a, _b)	vec_lvrx(_a, _b)
   3643   1.1  mrg 
   3644   1.1  mrg 
   3645   1.1  mrg /* vec_promote (promote scalar to a vector)
   3646   1.1  mrg  * ===========
   3647   1.1  mrg  */
   3648   1.1  mrg #define vec_promote(_a, _element)	spu_promote(_a, _element)
   3649   1.1  mrg 
   3650   1.1  mrg 
   3651   1.1  mrg /* vec_splats (splat scalar to a vector)
   3652   1.1  mrg  * ==========
   3653   1.1  mrg  */
   3654   1.1  mrg #define vec_splats(_a)	spu_splats(_a)
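
/* Sketch (illustrative only; assumes an SPU target):
 *
 *   vec_float4 s = vec_splats(2.0f);    // { 2.0f, 2.0f, 2.0f, 2.0f }
 *   vec_int4   p = vec_promote(3, 0);   // element 0 is 3; the other
 *                                       // elements are undefined
 */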
   3655   1.1  mrg 
   3656   1.1  mrg 
   3657   1.1  mrg /* vec_stvlx (store vector left indexed)
   3658   1.1  mrg  * =========
   3659   1.1  mrg  */
   3660   1.1  mrg static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
   3661   1.1  mrg {
   3662   1.1  mrg   int shift;
   3663   1.1  mrg   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
   3664   1.1  mrg 
   3665   1.1  mrg   shift = -((int)p & 0xF);
   3666   1.1  mrg   *p = spu_sel(*p,
   3667   1.1  mrg 	       spu_rlmaskqwbyte(a, shift),
   3668   1.1  mrg 	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
   3669   1.1  mrg }
   3670   1.1  mrg 
   3671   1.1  mrg static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
   3672   1.1  mrg {
   3673   1.1  mrg   int shift;
   3674   1.1  mrg   vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);
   3675   1.1  mrg 
   3676   1.1  mrg   shift = -((int)p & 0xF);
   3677   1.1  mrg   *p = spu_sel(*p,
   3678   1.1  mrg 	       spu_rlmaskqwbyte(a, shift),
   3679   1.1  mrg 	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
   3680   1.1  mrg }
   3681   1.1  mrg 
   3682   1.1  mrg static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
   3683   1.1  mrg {
   3684   1.1  mrg   int shift;
   3685   1.1  mrg   vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
   3686   1.1  mrg 
   3687   1.1  mrg   shift = -((int)p & 0xF);
   3688   1.1  mrg   *p = spu_sel(*p,
   3689   1.1  mrg 	       spu_rlmaskqwbyte(a, shift),
   3690   1.1  mrg 	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
   3691   1.1  mrg }
   3692   1.1  mrg 
   3693   1.1  mrg static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
   3694   1.1  mrg {
   3695   1.1  mrg   int shift;
   3696   1.1  mrg   vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);
   3697   1.1  mrg 
   3698   1.1  mrg   shift = -((int)p & 0xF);
   3699   1.1  mrg   *p = spu_sel(*p,
   3700   1.1  mrg 	       spu_rlmaskqwbyte(a, shift),
   3701   1.1  mrg 	       spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
   3702   1.1  mrg }
   3703   1.1  mrg 
   3704   1.1  mrg static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
   3705   1.1  mrg {
   3706   1.1  mrg   int shift;
   3707   1.1  mrg   vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);
   3708   1.1  mrg 
   3709   1.1  mrg   shift = -((int)p & 0xF);
   3710   1.1  mrg   *p = spu_sel(*p,
   3711   1.1  mrg 	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_rlmaskqwbyte(a, shift),
	       spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}
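
/* Usage note (an illustrative sketch, not part of the original header;
 * 'buf' and 'v' are hypothetical names):  vec_stvlx stores the left
 * portion of a vector at a possibly unaligned address.  SPU quadword
 * stores ignore the low 4 address bits, so only the bytes from the
 * address to the end of its enclosing quadword are written; the
 * remaining bytes of a must be stored with vec_stvrx (see below).
 *
 *   unsigned char buf[32] __attribute__ ((aligned (16)));
 *   vec_uchar16 v = spu_splats((unsigned char)0xAB);
 *   vec_stvlx(v, 5, buf);   // writes bytes 0..10 of v to buf[5..15]
 */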

/* vec_stvlxl (store vector left indexed last)
 * ==========
 */
#define vec_stvlxl(_a, _b, _c)	vec_stvlx(_a, _b, _c)
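
/* Note: on VMX the "last" forms (vec_stvlxl, and vec_stvrxl below) are
 * cache-control hints that mark the touched cache line least recently
 * used.  The SPU local store has no cache, so each "last" form can
 * simply alias its plain counterpart.
 */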


/* vec_stvrx (store vector right indexed)
 * =========
 */
static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
	       spu_slqwbyte(a, shift),
	       spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}
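
/* Usage note (an illustrative sketch, not part of the original header;
 * 'buf' and 'v' are hypothetical names):  vec_stvrx is the complement
 * of vec_stvlx.  It stores the right portion of a vector, i.e. the
 * trailing bytes that vec_stvlx could not fit into the preceding
 * quadword.  The classic idiom for a full 16-byte store at an
 * unaligned address pairs the two, offset by 16:
 *
 *   unsigned char buf[32] __attribute__ ((aligned (16)));
 *   vec_uchar16 v = spu_splats((unsigned char)0xCD);
 *   vec_stvlx(v, 5, buf);       // bytes 0..10 of v -> buf[5..15]
 *   vec_stvrx(v, 5 + 16, buf);  // bytes 11..15 of v -> buf[16..20]
 *
 * When the address is already 16-byte aligned, vec_stvlx stores all
 * sixteen bytes and vec_stvrx stores none, so the pair remains correct.
 */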

/* vec_stvrxl (store vector right indexed last)
 * ==========
 */
#define vec_stvrxl(_a, _b, _c)	vec_stvrx(_a, _b, _c)


#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */