;
/*
 * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
 */


#ifdef USE_MMX_ASM
#include "assyntax.h"
#define MATH_ASM_PTR_SIZE 4
#include "math/m_vector_asm.h"

/* integer multiplication - alpha plus one
 *
 * makes the following approximation to the division (Sree)
 *
 *   rgb*a/255 ~= (rgb*(a+1)) >> 8
 *
 * which is the fastest method that satisfies the following OpenGL criteria
 *
 *   0*0 = 0 and 255*255 = 255
 *
 * note that MX1 is a register with the 0xffffffffffffffff constant which
 * can be easily obtained making
 *
 *   PCMPEQW ( MX1, MX1 )
 */
#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
    PSUBW ( MX1, MA1 )			/* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */	;\
    PMULLW ( MP1, MA1 )			/* t1 = p1*a1 */			;\
										;\
TWO(PSUBW ( MX1, MA2 ))			/* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */	;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = p2*a2 */			;\
										;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 ~= t1/255 */			;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 ~= t2/255 */


/* integer multiplication - geometric series
 *
 * takes the geometric series approximation to the division
 *
 *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 * in this case just the first two terms to fit in 16bit arithmetic
 *
 *   t/255 ~= (t + (t >> 8)) >> 8
 *
 * note that just by itself it doesn't satisfy the OpenGL criteria,
 * as 255*255 = 254, so the special case a = 255 must be accounted for,
 * or roundoff must be used
 */
#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
    PMULLW ( MP1, MA1 )			/* t1 = p1*a1 */			;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = p2*a2 */			;\
										;\
    MOVQ ( MA1, MP1 )								;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
										;\
TWO(MOVQ ( MA2, MP2 ))								;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
										;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
										;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* integer multiplication - geometric series plus rounding
 *
 * when using a geometric series division instead of truncating the result
 * use roundoff in the approximation (Jim Blinn)
 *
 *   t = rgb*a + 0x80
 *
 * achieving the exact results
 *
 * note that M80 is a register with the 0x0080008000800080 constant
 */
#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
    PMULLW ( MP1, MA1 )			/* t1 = p1*a1 */			;\
    PADDW ( M80, MA1 )			/* t1 += 0x80 */			;\
										;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = p2*a2 */			;\
TWO(PADDW ( M80, MA2 ))			/* t2 += 0x80 */			;\
										;\
    MOVQ ( MA1, MP1 )								;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
										;\
TWO(MOVQ ( MA2, MP2 ))								;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
										;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
										;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series
 */
#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
    PSUBW ( MQ1, MP1 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW ( CONST(8), MQ1 )		/* q1 << 8 */				;\
    PMULLW ( MP1, MA1 )			/* t1 = (q1 - p1)*pa1 */		;\
										;\
TWO(PSUBW ( MQ2, MP2 ))			/* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW ( CONST(8), MQ2 ))		/* q2 << 8 */				;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = (q2 - p2)*pa2 */		;\
										;\
    MOVQ ( MA1, MP1 )								;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
										;\
TWO(MOVQ ( MA2, MP2 ))								;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
										;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
										;\
    PADDW ( MQ1, MA1 )			/* (t1/255 + q1) << 8 */		;\
TWO(PADDW ( MQ2, MA2 ))			/* (t2/255 + q2) << 8 */		;\
										;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series with roundoff
 *
 * this is a generalization of Blinn's formula to signed arithmetic
 *
 * note that M80 is a register with the 0x0080008000800080 constant
 */
#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
    PSUBW ( MQ1, MP1 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW ( CONST(8), MQ1 )		/* q1 << 8 */				;\
    PMULLW ( MP1, MA1 )			/* t1 = (q1 - p1)*pa1 */		;\
										;\
TWO(PSUBW ( MQ2, MP2 ))			/* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW ( CONST(8), MQ2 ))		/* q2 << 8 */				;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = (q2 - p2)*pa2 */		;\
										;\
    PSRLW ( CONST(15), MP1 )		/* q1 > p1 ? 1 : 0 */			;\
TWO(PSRLW ( CONST(15), MP2 ))		/* q2 > p2 ? 1 : 0 */			;\
										;\
    PSLLW ( CONST(8), MP1 )		/* q1 > p1 ? 0x100 : 0 */		;\
TWO(PSLLW ( CONST(8), MP2 ))		/* q2 > p2 ? 0x100 : 0 */		;\
										;\
    PSUBW ( MP1, MA1 )			/* t1 -=? 0x100 */			;\
TWO(PSUBW ( MP2, MA2 ))			/* t2 -=? 0x100 */			;\
										;\
    PADDW ( M80, MA1 )			/* t1 += 0x80 */			;\
TWO(PADDW ( M80, MA2 ))			/* t2 += 0x80 */			;\
										;\
    MOVQ ( MA1, MP1 )								;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
										;\
TWO(MOVQ ( MA2, MP2 ))								;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
										;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
										;\
    PADDW ( MQ1, MA1 )			/* (t1/255 + q1) << 8 */		;\
TWO(PADDW ( MQ2, MA2 ))			/* (t2/255 + q2) << 8 */		;\
										;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series with correction
 *
 * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
 *
 *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
 *
 * note that although it is faster than rounding off it doesn't always give
 * the exact results
 */
#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
    PSUBW ( MQ1, MP1 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW ( CONST(8), MQ1 )		/* q1 << 8 */				;\
    PMULLW ( MP1, MA1 )			/* t1 = (q1 - p1)*pa1 */		;\
										;\
TWO(PSUBW ( MQ2, MP2 ))			/* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW ( CONST(8), MQ2 ))		/* q2 << 8 */				;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = (q2 - p2)*pa2 */		;\
										;\
    MOVQ ( MA1, MP1 )								;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
										;\
TWO(MOVQ ( MA2, MP2 ))								;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
										;\
    PADDW ( MA1, MP1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
    PSRLW ( CONST(7), MA1 )		/* t1 >> 15 */				;\
										;\
TWO(PADDW ( MA2, MP2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
TWO(PSRLW ( CONST(7), MA2 ))		/* t2 >> 15 */				;\
										;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ ;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ ;\
										;\
    PADDW ( MQ1, MA1 )			/* (t1/255 + q1) << 8 */		;\
TWO(PADDW ( MQ2, MA2 ))			/* (t2/255 + q2) << 8 */		;\
										;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* common blending setup code
 *
 * note that M00 is a register with the 0x0000000000000000 constant which
 * can be easily obtained making
 *
 *   PXOR ( M00, M00 )
 */
#define GMB_LOAD(rgba, dest, MPP, MQQ) \
ONE(MOVD ( REGIND(rgba), MPP ))		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */ ;\
ONE(MOVD ( REGIND(dest), MQQ ))		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */ ;\
										;\
TWO(MOVQ ( REGIND(rgba), MPP ))		/* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
TWO(MOVQ ( REGIND(dest), MQQ ))		/* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */

#define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) \
TWO(MOVQ ( MP1, MP2 ))								;\
TWO(MOVQ ( MQ1, MQ2 ))								;\
										;\
    PUNPCKLBW ( M00, MQ1 )		/* qa1 | qb1 | qg1 | qr1 */		;\
TWO(PUNPCKHBW ( M00, MQ2 ))		/* qa2 | qb2 | qg2 | qr2 */		;\
    PUNPCKLBW ( M00, MP1 )		/* pa1 | pb1 | pg1 | pr1 */		;\
TWO(PUNPCKHBW ( M00, MP2 ))		/* pa2 | pb2 | pg2 | pr2 */

#define GMB_ALPHA(MP1, MA1, MP2, MA2) \
    MOVQ ( MP1, MA1 )								;\
TWO(MOVQ ( MP2, MA2 ))								;\
										;\
    PUNPCKHWD ( MA1, MA1 )		/* pa1 | pa1 |     |     */		;\
TWO(PUNPCKHWD ( MA2, MA2 ))		/* pa2 | pa2 |     |     */		;\
    PUNPCKHDQ ( MA1, MA1 )		/* pa1 | pa1 | pa1 | pa1 */		;\
TWO(PUNPCKHDQ ( MA2, MA2 ))		/* pa2 | pa2 | pa2 | pa2 */

#define GMB_PACK( MS1, MS2 ) \
    PACKUSWB ( MS2, MS1 )		/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;

#define GMB_STORE(rgba, MSS ) \
ONE(MOVD ( MSS, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */ ;\
TWO(MOVQ ( MSS, REGIND(rgba) ))		/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */

/* Kevin F. Quinn <kevquinn@gentoo.org> 2 July 2006
 * Replace data segment constants with text-segment
 * constants (via pushl/movq)
    SEG_DATA

ALIGNDATA8
const_0080:
    D_LONG 0x00800080, 0x00800080

const_80:
    D_LONG 0x80808080, 0x80808080
*/
#define const_0080_l 0x00800080
#define const_0080_h 0x00800080
#define const_80_l 0x80808080
#define const_80_h 0x80808080

    SEG_TEXT


/* Blend transparency function
 */

#define TAG(x) CONCAT(x,_transparency)
#define LLTAG(x) LLBL2(x,_transparency)

#define INIT \
    PXOR ( MM0, MM0 )			/* 0x0000 | 0x0000 | 0x0000 | 0x0000 */

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )						;\
    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )					;\
    GMB_ALPHA( MM1, MM3, MM4, MM6 )						;\
    GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 )				;\
    GMB_PACK( MM3, MM6 )							;\
    GMB_STORE( rgba, MM3 )

#include "mmx_blendtmp.h"


/* Blend add function
 *
 * FIXME: Add some loop unrolling here...
 */

#define TAG(x) CONCAT(x,_add)
#define LLTAG(x) LLBL2(x,_add)

#define INIT

#define MAIN( rgba, dest ) \
ONE(MOVD ( REGIND(rgba), MM1 ))		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */ ;\
ONE(MOVD ( REGIND(dest), MM2 ))		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */ ;\
ONE(PADDUSB ( MM2, MM1 ))							;\
ONE(MOVD ( MM1, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */ ;\
										;\
TWO(MOVQ ( REGIND(rgba), MM1 ))		/* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
TWO(PADDUSB ( REGIND(dest), MM1 ))	/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
TWO(MOVQ ( MM1, REGIND(rgba) ))

#include "mmx_blendtmp.h"


/* Blend min function
 */

#define TAG(x) CONCAT(x,_min)
#define LLTAG(x) LLBL2(x,_min)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
#define INIT \
    MOVQ ( CONTENT(const_80), MM7 )
 */
#define INIT \
    PUSH_L ( CONST(const_80_h) )	/* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
    PUSH_L ( CONST(const_80_l) )						;\
    MOVQ ( REGIND(ESP), MM7 )							;\
    ADD_L ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )						;\
    MOVQ ( MM1, MM3 )								;\
    MOVQ ( MM2, MM4 )								;\
    PXOR ( MM7, MM3 )			/* unsigned -> signed */		;\
    PXOR ( MM7, MM4 )			/* unsigned -> signed */		;\
    PCMPGTB ( MM3, MM4 )		/* q > p ? 0xff : 0x00 */		;\
    PAND ( MM4, MM1 )			/* q > p ? p : 0 */			;\
    PANDN ( MM2, MM4 )			/* q > p ? 0 : q */			;\
    POR ( MM1, MM4 )			/* q > p ? p : q */			;\
    GMB_STORE( rgba, MM4 )

#include "mmx_blendtmp.h"


/* Blend max function
 */

#define TAG(x) CONCAT(x,_max)
#define LLTAG(x) LLBL2(x,_max)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
#define INIT \
    MOVQ ( CONTENT(const_80), MM7 )
 */
#define INIT \
    PUSH_L ( CONST(const_80_l) )	/* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
    PUSH_L ( CONST(const_80_h) )						;\
    MOVQ ( REGIND(ESP), MM7 )							;\
    ADD_L ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )						;\
    MOVQ ( MM1, MM3 )								;\
    MOVQ ( MM2, MM4 )								;\
    PXOR ( MM7, MM3 )			/* unsigned -> signed */		;\
    PXOR ( MM7, MM4 )			/* unsigned -> signed */		;\
    PCMPGTB ( MM3, MM4 )		/* q > p ? 0xff : 0x00 */		;\
    PAND ( MM4, MM2 )			/* q > p ? q : 0 */			;\
    PANDN ( MM1, MM4 )			/* q > p ? 0 : p */			;\
    POR ( MM2, MM4 )			/* q > p ? q : p */			;\
    GMB_STORE( rgba, MM4 )

#include "mmx_blendtmp.h"


/* Blend modulate function
 */

#define TAG(x) CONCAT(x,_modulate)
#define LLTAG(x) LLBL2(x,_modulate)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
#define INIT \
    MOVQ ( CONTENT(const_0080), MM7 )
 */
#define INIT \
    PXOR ( MM0, MM0 )			/* 0x0000 | 0x0000 | 0x0000 | 0x0000 */	;\
    PUSH_L ( CONST(const_0080_l) )	/* 0x0080 | 0x0080 | 0x0080 | 0x0080 */	;\
    PUSH_L ( CONST(const_0080_h) )						;\
    MOVQ ( REGIND(ESP), MM7 )							;\
    ADD_L ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )						;\
    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )					;\
    GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 )					;\
    GMB_PACK( MM2, MM5 )							;\
    GMB_STORE( rgba, MM2 )

#include "mmx_blendtmp.h"

#endif

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif