;
/*
 * MMX-accelerated OpenGL blending inner loops (macros consumed by
 * mmx_blendtmp.h, which is included once per blend mode below).
 *
 * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
 */


#ifdef USE_MMX_ASM
#include "assyntax.h"
#include "matypes.h"

/* integer multiplication - alpha plus one
 *
 * makes the following approximation to the division (Sree)
 *
 *   rgb*a/255 ~= (rgb*(a+1)) >> 8
 *
 * which is the fastest method that satisfies the following OpenGL criteria
 *
 *   0*0 = 0 and 255*255 = 255
 *
 * note that MX1 is a register with the 0xffffffffffffffff constant which
 * can be easily obtained making
 *
 *   PCMPEQW ( MX1, MX1 )
 */
#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
    PSUBW ( MX1, MA1 )			/* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */	;\
    PMULLW ( MP1, MA1 )			/* t1 = p1*a1 */			;\
									;\
TWO(PSUBW ( MX1, MA2 ))			/* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */	;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = p2*a2 */			;\
									;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 ~= t1/255 */			;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 ~= t2/255 */


/* integer multiplication - geometric series
 *
 * takes the geometric series approximation to the division
 *
 *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 * in this case just the first two terms to fit in 16bit arithmetic
 *
 *   t/255 ~= (t + (t >> 8)) >> 8
 *
 * note that just by itself it doesn't satisfy the OpenGL criteria, as
 * 255*255 = 254, so the special case a = 255 must be accounted for or
 * roundoff must be used
 */
#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
    PMULLW ( MP1, MA1 )			/* t1 = p1*a1 */			;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = p2*a2 */			;\
									;\
    MOVQ ( MA1, MP1 )							;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
									;\
TWO(MOVQ ( MA2, MP2 ))							;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
									;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
									;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* integer multiplication - geometric series plus rounding
 *
 * when using a geometric series division instead of truncating the result
 * use roundoff in the approximation (Jim Blinn)
 *
 *   t = rgb*a + 0x80
 *
 * achieving the exact results
 *
 * note that M80 is a register with the 0x0080008000800080 constant
 */
#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
    PMULLW ( MP1, MA1 )			/* t1 = p1*a1 */			;\
    PADDW ( M80, MA1 )			/* t1 += 0x80 */			;\
									;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = p2*a2 */			;\
TWO(PADDW ( M80, MA2 ))			/* t2 += 0x80 */			;\
									;\
    MOVQ ( MA1, MP1 )							;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
									;\
TWO(MOVQ ( MA2, MP2 ))							;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
									;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
									;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series
 */
#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
    PSUBW ( MQ1, MP1 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW ( CONST(8), MQ1 )		/* q1 << 8 */				;\
    PMULLW ( MP1, MA1 )			/* t1 = (q1 - p1)*pa1 */		;\
									;\
TWO(PSUBW ( MQ2, MP2 ))			/* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW ( CONST(8), MQ2 ))		/* q2 << 8 */				;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = (q2 - p2)*pa2 */		;\
									;\
    MOVQ ( MA1, MP1 )							;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
									;\
TWO(MOVQ ( MA2, MP2 ))							;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
									;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
									;\
    PADDW ( MQ1, MA1 )			/* (t1/255 + q1) << 8 */		;\
TWO(PADDW ( MQ2, MA2 ))			/* (t2/255 + q2) << 8 */		;\
									;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series with roundoff
 *
 * this is a generalization of Blinn's formula to signed arithmetic
 *
 * note that M80 is a register with the 0x0080008000800080 constant
 */
#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
    PSUBW ( MQ1, MP1 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW ( CONST(8), MQ1 )		/* q1 << 8 */				;\
    PMULLW ( MP1, MA1 )			/* t1 = (q1 - p1)*pa1 */		;\
									;\
TWO(PSUBW ( MQ2, MP2 ))			/* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW ( CONST(8), MQ2 ))		/* q2 << 8 */				;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = (q2 - p2)*pa2 */		;\
									;\
    PSRLW ( CONST(15), MP1 )		/* q1 > p1 ? 1 : 0 */			;\
TWO(PSRLW ( CONST(15), MP2 ))		/* q2 > p2 ? 1 : 0 */			;\
									;\
    PSLLW ( CONST(8), MP1 )		/* q1 > p1 ? 0x100 : 0 */		;\
TWO(PSLLW ( CONST(8), MP2 ))		/* q2 > p2 ? 0x100 : 0 */		;\
									;\
    PSUBW ( MP1, MA1 )			/* t1 -=? 0x100 */			;\
TWO(PSUBW ( MP2, MA2 ))			/* t2 -=? 0x100 */			;\
									;\
    PADDW ( M80, MA1 )			/* t1 += 0x80 */			;\
TWO(PADDW ( M80, MA2 ))			/* t2 += 0x80 */			;\
									;\
    MOVQ ( MA1, MP1 )							;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
									;\
TWO(MOVQ ( MA2, MP2 ))							;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
									;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
									;\
    PADDW ( MQ1, MA1 )			/* (t1/255 + q1) << 8 */		;\
TWO(PADDW ( MQ2, MA2 ))			/* (t2/255 + q2) << 8 */		;\
									;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series with correction
 *
 * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
 *
 *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
 *
 * note that although it is faster than rounding off it doesn't always give
 * the exact results
 */
#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
    PSUBW ( MQ1, MP1 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW ( CONST(8), MQ1 )		/* q1 << 8 */				;\
    PMULLW ( MP1, MA1 )			/* t1 = (q1 - p1)*pa1 */		;\
									;\
TWO(PSUBW ( MQ2, MP2 ))			/* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW ( CONST(8), MQ2 ))		/* q2 << 8 */				;\
TWO(PMULLW ( MP2, MA2 ))		/* t2 = (q2 - p2)*pa2 */		;\
									;\
    MOVQ ( MA1, MP1 )							;\
    PSRLW ( CONST(8), MA1 )		/* t1 >> 8 */				;\
									;\
TWO(MOVQ ( MA2, MP2 ))							;\
TWO(PSRLW ( CONST(8), MA2 ))		/* t2 >> 8 */				;\
									;\
    PADDW ( MA1, MP1 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */	;\
    PSRLW ( CONST(7), MA1 )		/* t1 >> 15 */				;\
									;\
TWO(PADDW ( MA2, MP2 ))			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */	;\
TWO(PSRLW ( CONST(7), MA2 ))		/* t2 >> 15 */				;\
									;\
    PADDW ( MP1, MA1 )			/* t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8 */ ;\
TWO(PADDW ( MP2, MA2 ))			/* t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8 */ ;\
									;\
    PADDW ( MQ1, MA1 )			/* (t1/255 + q1) << 8 */		;\
TWO(PADDW ( MQ2, MA2 ))			/* (t2/255 + q2) << 8 */		;\
									;\
    PSRLW ( CONST(8), MA1 )		/* sa1 | sb1 | sg1 | sr1 */		;\
TWO(PSRLW ( CONST(8), MA2 ))		/* sa2 | sb2 | sg2 | sr2 */


/* common blending setup code
 *
 * note that M00 is a register with the 0x0000000000000000 constant which
 * can be easily obtained making
 *
 *   PXOR ( M00, M00 )
 */
#define GMB_LOAD(rgba, dest, MPP, MQQ) \
ONE(MOVD ( REGIND(rgba), MPP ))		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */ ;\
ONE(MOVD ( REGIND(dest), MQQ ))		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */ ;\
									;\
TWO(MOVQ ( REGIND(rgba), MPP ))		/* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
TWO(MOVQ ( REGIND(dest), MQQ ))		/* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */

#define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) \
TWO(MOVQ ( MP1, MP2 ))							;\
TWO(MOVQ ( MQ1, MQ2 ))							;\
									;\
    PUNPCKLBW ( M00, MQ1 )		/* qa1 | qb1 | qg1 | qr1 */		;\
TWO(PUNPCKHBW ( M00, MQ2 ))		/* qa2 | qb2 | qg2 | qr2 */		;\
    PUNPCKLBW ( M00, MP1 )		/* pa1 | pb1 | pg1 | pr1 */		;\
TWO(PUNPCKHBW ( M00, MP2 ))		/* pa2 | pb2 | pg2 | pr2 */

#define GMB_ALPHA(MP1, MA1, MP2, MA2) \
    MOVQ ( MP1, MA1 )							;\
TWO(MOVQ ( MP2, MA2 ))							;\
									;\
    PUNPCKHWD ( MA1, MA1 )		/* pa1 | pa1 |     |     */		;\
TWO(PUNPCKHWD ( MA2, MA2 ))		/* pa2 | pa2 |     |     */		;\
    PUNPCKHDQ ( MA1, MA1 )		/* pa1 | pa1 | pa1 | pa1 */		;\
TWO(PUNPCKHDQ ( MA2, MA2 ))		/* pa2 | pa2 | pa2 | pa2 */

#define GMB_PACK( MS1, MS2 ) \
    PACKUSWB ( MS2, MS1 )		/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;

#define GMB_STORE(rgba, MSS ) \
ONE(MOVD ( MSS, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */ ;\
TWO(MOVQ ( MSS, REGIND(rgba) ))		/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */

/* Kevin F. Quinn <kevquinn@gentoo.org> 2 July 2006
 * Replace data segment constants with text-segment
 * constants (via pushl/movq)

    SEG_DATA

ALIGNDATA8
const_0080:
    D_LONG 0x00800080, 0x00800080

const_80:
    D_LONG 0x80808080, 0x80808080
*/
#define const_0080_l 0x00800080
#define const_0080_h 0x00800080
#define const_80_l 0x80808080
#define const_80_h 0x80808080

    SEG_TEXT


/* Blend transparency function
 */

#define TAG(x) CONCAT(x,_transparency)
#define LLTAG(x) LLBL2(x,_transparency)

#define INIT \
    PXOR ( MM0, MM0 )			/* 0x0000 | 0x0000 | 0x0000 | 0x0000 */

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )					;\
    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )				;\
    GMB_ALPHA( MM1, MM3, MM4, MM6 )					;\
    GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 )			;\
    GMB_PACK( MM3, MM6 )						;\
    GMB_STORE( rgba, MM3 )

#include "mmx_blendtmp.h"


/* Blend add function
 *
 * FIXME: Add some loop unrolling here...
 */

#define TAG(x) CONCAT(x,_add)
#define LLTAG(x) LLBL2(x,_add)

#define INIT

#define MAIN( rgba, dest ) \
ONE(MOVD ( REGIND(rgba), MM1 ))		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */ ;\
ONE(MOVD ( REGIND(dest), MM2 ))		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */ ;\
ONE(PADDUSB ( MM2, MM1 ))						;\
ONE(MOVD ( MM1, REGIND(rgba) ))		/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */ ;\
									;\
TWO(MOVQ ( REGIND(rgba), MM1 ))		/* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */ ;\
TWO(PADDUSB ( REGIND(dest), MM1 ))	/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
TWO(MOVQ ( MM1, REGIND(rgba) ))

#include "mmx_blendtmp.h"


/* Blend min function
 */

#define TAG(x) CONCAT(x,_min)
#define LLTAG(x) LLBL2(x,_min)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
#define INIT \
    MOVQ ( CONTENT(const_80), MM7 )
 */
#define INIT \
    PUSH_L ( CONST(const_80_h) )	/* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
    PUSH_L ( CONST(const_80_l) )					;\
    MOVQ ( REGIND(ESP), MM7 )						;\
    ADD_L ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )					;\
    MOVQ ( MM1, MM3 )							;\
    MOVQ ( MM2, MM4 )							;\
    PXOR ( MM7, MM3 )			/* unsigned -> signed */		;\
    PXOR ( MM7, MM4 )			/* unsigned -> signed */		;\
    PCMPGTB ( MM3, MM4 )		/* q > p ? 0xff : 0x00 */		;\
    PAND ( MM4, MM1 )			/* q > p ? p : 0 */			;\
    PANDN ( MM2, MM4 )			/* q > p ? 0 : q */			;\
    POR ( MM1, MM4 )			/* q > p ? p : q */			;\
    GMB_STORE( rgba, MM4 )

#include "mmx_blendtmp.h"


/* Blend max function
 */

#define TAG(x) CONCAT(x,_max)
#define LLTAG(x) LLBL2(x,_max)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
#define INIT \
    MOVQ ( CONTENT(const_80), MM7 )
 */
#define INIT \
    PUSH_L ( CONST(const_80_l) )	/* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
    PUSH_L ( CONST(const_80_h) )					;\
    MOVQ ( REGIND(ESP), MM7 )						;\
    ADD_L ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )					;\
    MOVQ ( MM1, MM3 )							;\
    MOVQ ( MM2, MM4 )							;\
    PXOR ( MM7, MM3 )			/* unsigned -> signed */		;\
    PXOR ( MM7, MM4 )			/* unsigned -> signed */		;\
    PCMPGTB ( MM3, MM4 )		/* q > p ? 0xff : 0x00 */		;\
    PAND ( MM4, MM2 )			/* q > p ? q : 0 */			;\
    PANDN ( MM1, MM4 )			/* q > p ? 0 : p */			;\
    POR ( MM2, MM4 )			/* q > p ? q : p */			;\
    GMB_STORE( rgba, MM4 )

#include "mmx_blendtmp.h"


/* Blend modulate function
 */

#define TAG(x) CONCAT(x,_modulate)
#define LLTAG(x) LLBL2(x,_modulate)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
#define INIT \
    MOVQ ( CONTENT(const_0080), MM7 )
 */
#define INIT \
    PXOR ( MM0, MM0 )			/* 0x0000 | 0x0000 | 0x0000 | 0x0000 */	;\
    PUSH_L ( CONST(const_0080_l) )	/* 0x0080 | 0x0080 | 0x0080 | 0x0080 */	;\
    PUSH_L ( CONST(const_0080_h) )					;\
    MOVQ ( REGIND(ESP), MM7 )						;\
    ADD_L ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )					;\
    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )				;\
    GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 )				;\
    GMB_PACK( MM2, MM5 )						;\
    GMB_STORE( rgba, MM2 )

#include "mmx_blendtmp.h"

#endif

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif