/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/** TODO:
  * - insert PREFETCH instructions to avoid cache misses!
  * - some more optimizations are possible...
  * - for 40-50% more performance in the SSE functions, the data
  *   (transformation matrix, src_vert, dst_vert) needs to be 16-byte aligned!
  */

#ifdef USE_SSE_ASM
#include "assyntax.h"
#define MATH_ASM_PTR_SIZE 4
#include "math/m_vector_asm.h"
#include "xform_args.h"

   SEG_TEXT

/*
 * Operand shorthands used by every routine in this file.  Each one names
 * the i-th 4-byte element relative to a base register:
 *
 *     S(i)  -> element i of the vertex addressed by ESI (source)
 *     D(i)  -> element i of the vertex addressed by EDI (destination)
 *     M(i)  -> element i of the matrix addressed by EDX
 *
 * The argument is pasted into "i * 4" without parentheses, so only pass
 * plain integer constants.
 */
#define S(i) 	REGOFF(i * 4, ESI)
#define D(i) 	REGOFF(i * 4, EDI)
#define M(i) 	REGOFF(i * 4, EDX)

/*
 * _mesa_sse_transform_points3_general
 *
 * Multiplies each 3-component source vertex (x, y, z; w taken as 1) by the
 * full 4x4 matrix and writes a 4-component result:
 *
 *     dst[j] = x*m[j] + y*m[4+j] + z*m[8+j] + m[12+j]        j = 0..3
 *
 * Arguments are read from the stack: the destination GLvector4f, the
 * matrix and the source GLvector4f (slot offsets come from xform_args.h).
 * The destination's count is set to the source count, its size to 4, and
 * VEC_SIZE_4 is OR-ed into its flags.  Nothing is touched when the source
 * count is zero.
 *
 * Source vertices are walked with the source stride; destination vertices
 * are written 16 bytes apart.  MOVAPS is used for the matrix loads and the
 * result store, so the matrix and the destination data must both be
 * 16-byte aligned.
 *
 * Register use inside the loop:
 *     ESI = current source vertex      EAX = source stride in bytes
 *     EDI = current dest vertex        ECX = one past the last dest vertex
 *     XMM0..XMM3 = the matrix, four consecutive floats per register
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM6 are
 * clobbered.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_general)
HIDDEN(_mesa_sse_transform_points3_general)
GLNAME( _mesa_sse_transform_points3_general ):
    _CET_ENDBR
/* Two registers (8 bytes) are pushed before the stack arguments are read. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L     ( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */
    MOV_L     ( ARG_MATRIX, EDX )			/* ptr to matrix */

    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTPGR_finish) )		/* count was zero; go to finish */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */

    /* Publish the shape of the result in the destination header. */
    OR_L      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, EDI) )

    /* Replace the struct pointers by data pointers and turn the count
     * into an end-of-destination pointer: ECX = dest start + count*16. */
    SHL_L     ( CONST(4), ECX )
    MOV_L     ( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
    MOV_L     ( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L     ( EDI, ECX )


ALIGNTEXT32
    MOVAPS    ( REGOFF(0, EDX), XMM0 )	/* m3  | m2  | m1  | m0  */
    MOVAPS    ( REGOFF(16, EDX), XMM1 )	/* m7  | m6  | m5  | m4  */
    MOVAPS    ( REGOFF(32, EDX), XMM2 )	/* m11 | m10 | m9  | m8  */
    MOVAPS    ( REGOFF(48, EDX), XMM3 )	/* m15 | m14 | m13 | m12 */


ALIGNTEXT32
LLBL(K_GTPGR_top):
    /* Broadcast each source component across a whole register. */
    MOVSS     ( REGOFF(0, ESI), XMM4 )		/*    |    |    | ox */
    SHUFPS    ( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox | ox */
    MOVSS     ( REGOFF(4, ESI), XMM5 )		/*    |    |    | oy */
    SHUFPS    ( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy | oy */
    MOVSS     ( REGOFF(8, ESI), XMM6 )		/*    |    |    | oz */
    SHUFPS    ( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz | oz */

    MULPS     ( XMM0, XMM4 )		/* m3*ox  | m2*ox  | m1*ox | m0*ox */
    MULPS     ( XMM1, XMM5 )		/* m7*oy  | m6*oy  | m5*oy | m4*oy */
    MULPS     ( XMM2, XMM6 )		/* m11*oz | m10*oz | m9*oz | m8*oz */

    ADDPS     ( XMM5, XMM4 )		/* ox terms + oy terms */
    ADDPS     ( XMM6, XMM4 )		/* + oz terms */
    ADDPS     ( XMM3, XMM4 )		/* + m15 | m14 | m13 | m12 */

    MOVAPS    ( XMM4, REGOFF(0, EDI) )	/* store all four results */

    ADD_L     ( CONST(16), EDI )		/* next dest vertex */
    ADD_L     ( EAX, ESI )			/* next source vertex */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTPGR_top) )

LLBL(K_GTPGR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET


/*
 * _mesa_sse_transform_points3_identity
 *
 * Identity transform: copies x, y and z of every source vertex to the
 * destination.  No matrix is read.
 *
 * Arguments are read from the stack: the destination and the source
 * GLvector4f (slot offsets come from xform_args.h).  The destination's
 * count is set to the source count, its size to 3, and VEC_SIZE_3 is OR-ed
 * into its flags.  Nothing is touched when the source count is zero.  When
 * the source and destination data pointers are equal the header is still
 * updated but the copy loop is skipped.
 *
 * Source vertices are walked with the source stride; destination vertices
 * are written 16 bytes apart.
 *
 * Register use inside the loop:
 *     ESI = current source vertex      EAX = source stride in bytes
 *     EDI = current dest vertex        ECX = one past the last dest vertex
 * ESI and EDI are saved and restored; EAX, ECX and XMM0 are clobbered.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_identity)
HIDDEN(_mesa_sse_transform_points3_identity)
GLNAME( _mesa_sse_transform_points3_identity ):
    _CET_ENDBR
/* Two registers (8 bytes) are pushed before the stack arguments are read. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L     ( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */

    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTPIR_finish) )		/* count was zero; go to finish */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */

    /* Publish the shape of the result in the destination header. */
    OR_L      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    /* Replace the struct pointers by data pointers and turn the count
     * into an end-of-destination pointer: ECX = dest start + count*16. */
    SHL_L     ( CONST(4), ECX )
    MOV_L     ( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
    MOV_L     ( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L     ( EDI, ECX )

    CMP_L     ( ESI, EDI )			/* same data pointer? */
    JE        ( LLBL(K_GTPIR_finish) )		/* yes -> nothing to copy */


ALIGNTEXT32
LLBL(K_GTPIR_top):
    MOVLPS    ( S(0), XMM0 )			/* oy | ox */
    MOVLPS    ( XMM0, D(0) )			/* -> D(1) | -> D(0) */
    MOVSS     ( S(2), XMM0 )			/* oz */
    MOVSS     ( XMM0, D(2) )			/* -> D(2) */

    ADD_L     ( CONST(16), EDI )		/* next dest vertex */
    ADD_L     ( EAX, ESI )			/* next source vertex */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTPIR_top) )

LLBL(K_GTPIR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET




/*
 * _mesa_sse_transform_points3_3d_no_rot
 *
 * Per-axis scale plus translation.  Only matrix elements 0, 5, 10, 12, 13
 * and 14 are read:
 *
 *     dst.x = x*m0  + m12
 *     dst.y = y*m5  + m13
 *     dst.z = z*m10 + m14
 *
 * Arguments are read from the stack: the destination GLvector4f, the
 * matrix and the source GLvector4f (slot offsets come from xform_args.h).
 * The destination's count is set to the source count, its size to 3, and
 * VEC_SIZE_3 is OR-ed into its flags.  Nothing is touched when the source
 * count is zero.
 *
 * Source vertices are walked with the source stride; destination vertices
 * are written 16 bytes apart.
 *
 * Register use inside the loop:
 *     ESI = current source vertex      EAX = source stride in bytes
 *     EDI = current dest vertex        ECX = one past the last dest vertex
 *     XMM1 = m5 | m0     XMM2 = m13 | m12     XMM3 = m10     XMM4 = m14
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM4 are
 * clobbered.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
HIDDEN(_mesa_sse_transform_points3_3d_no_rot)
GLNAME(_mesa_sse_transform_points3_3d_no_rot):
    _CET_ENDBR
/* Two registers (8 bytes) are pushed before the stack arguments are read. */
#define FRAME_OFFSET 8
    PUSH_L   ( ESI )
    PUSH_L   ( EDI )

    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */
    MOV_L    ( ARG_MATRIX, EDX )			/* ptr to matrix */

    MOV_L    ( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
    TEST_L   ( ECX, ECX )
    JZ       ( LLBL(K_GTP3DNRR_finish) )	/* count was zero; go to finish */

    MOV_L    ( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */

    /* Publish the shape of the result in the destination header. */
    OR_L     ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L    ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L    ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    /* Replace the struct pointers by data pointers and turn the count
     * into an end-of-destination pointer: ECX = dest start + count*16. */
    SHL_L    ( CONST(4), ECX )
    MOV_L    ( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
    MOV_L    ( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L    ( EDI, ECX )

    /* MOVLPS below only writes the low half of XMM0; clear the register
     * so the packed arithmetic never sees stale upper lanes. */
    XORPS    ( XMM0, XMM0 )

ALIGNTEXT32
    MOVSS    ( M(0), XMM1 )			/* - | - |  -  | m0  */
    MOVSS    ( M(5), XMM2 )			/* - | - |  -  | m5  */
    UNPCKLPS ( XMM2, XMM1 )			/* - | - | m5  | m0  */
    MOVLPS   ( M(12), XMM2 )			/* - | - | m13 | m12 */
    MOVSS    ( M(10), XMM3 )			/* - | - |  -  | m10 */
    MOVSS    ( M(14), XMM4 )			/* - | - |  -  | m14 */

ALIGNTEXT32
LLBL(K_GTP3DNRR_top):
    /* x and y together in the low two lanes */
    MOVLPS   ( S(0), XMM0 )			/* - | - |  s1   | s0 */
    MULPS    ( XMM1, XMM0 )			/* - | - | s1*m5 | s0*m0 */
    ADDPS    ( XMM2, XMM0 )			/* - | - | +m13  | +m12 */
    MOVLPS   ( XMM0, D(0) )			/* -> D(1) | -> D(0) */

    /* z as a scalar */
    MOVSS    ( S(2), XMM0 )			/* sz */
    MULSS    ( XMM3, XMM0 )			/* sz*m10 */
    ADDSS    ( XMM4, XMM0 )			/* +m14 */
    MOVSS    ( XMM0, D(2) )			/* -> D(2) */

    ADD_L    ( CONST(16), EDI )			/* next dest vertex */
    ADD_L    ( EAX, ESI )			/* next source vertex */
    CMP_L    ( ECX, EDI )
    JNE      ( LLBL(K_GTP3DNRR_top) )

LLBL(K_GTP3DNRR_finish):
    POP_L    ( EDI )
    POP_L    ( ESI )
    RET
#undef FRAME_OFFSET



/*
 * _mesa_sse_transform_points3_perspective
 *
 * Transform by a matrix of which only elements 0, 5, 8, 9, 10 and 14 are
 * read, producing a 4-component result:
 *
 *     dst.x = x*m0  + z*m8
 *     dst.y = y*m5  + z*m9
 *     dst.z = z*m10 + m14
 *     dst.w = -z
 *
 * Arguments are read from the stack: the destination GLvector4f, the
 * matrix and the source GLvector4f (slot offsets come from xform_args.h).
 * The destination's count is set to the source count, its size to 4, and
 * VEC_SIZE_4 is OR-ed into its flags.  Nothing is touched when the source
 * count is zero.
 *
 * Source vertices are walked with the source stride; destination vertices
 * are written 16 bytes apart.
 *
 * Register use inside the loop:
 *     ESI = current source vertex      EAX = source stride in bytes
 *     EDI = current dest vertex        ECX = one past the last dest vertex
 *     XMM1 = m5 | m0    XMM2 = m9 | m8    XMM3 = m10    XMM4 = m14
 *     XMM6 = 0.0
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM6 are
 * clobbered.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
HIDDEN(_mesa_sse_transform_points3_perspective)
GLNAME(_mesa_sse_transform_points3_perspective):
    _CET_ENDBR
/* Two registers (8 bytes) are pushed before the stack arguments are read. */
#define FRAME_OFFSET 8
    PUSH_L   ( ESI )
    PUSH_L   ( EDI )

    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */
    MOV_L    ( ARG_MATRIX, EDX )			/* ptr to matrix */

    MOV_L    ( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
    TEST_L   ( ECX, ECX )
    JZ       ( LLBL(K_GTP3PR_finish) )		/* count was zero; go to finish */

    MOV_L    ( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */

    /* Publish the shape of the result in the destination header. */
    OR_L     ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
    MOV_L    ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L    ( CONST(4), REGOFF(V4F_SIZE, EDI) )

    /* Replace the struct pointers by data pointers and turn the count
     * into an end-of-destination pointer: ECX = dest start + count*16. */
    SHL_L    ( CONST(4), ECX )
    MOV_L    ( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
    MOV_L    ( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L    ( EDI, ECX )

    /* MOVLPS below only writes the low half of XMM0; clear the register
     * so the first packed multiply never sees stale upper lanes. */
    XORPS    ( XMM0, XMM0 )

ALIGNTEXT32
    MOVSS    ( M(0), XMM1 )			/* -  | -  |  -  | m0  */
    MOVSS    ( M(5), XMM2 )			/* -  | -  |  -  | m5  */
    UNPCKLPS ( XMM2, XMM1 )			/* -  | -  | m5  | m0  */
    MOVLPS   ( M(8), XMM2 )			/* -  | -  | m9  | m8  */
    MOVSS    ( M(10), XMM3 )			/* m10 */
    MOVSS    ( M(14), XMM4 )			/* m14 */
    XORPS    ( XMM6, XMM6 )			/* 0 */

ALIGNTEXT32
LLBL(K_GTP3PR_top):
    /* x and y together in the low two lanes */
    MOVLPS   ( S(0), XMM0 )			/* oy | ox */
    MULPS    ( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
    MOVSS    ( S(2), XMM5 )			/* oz */
    SHUFPS   ( CONST(0x0), XMM5, XMM5 )		/* oz | oz */
    MULPS    ( XMM2, XMM5 )			/* oz*m9 | oz*m8 */
    ADDPS    ( XMM5, XMM0 )			/* +oy*m5 | +ox*m0 */
    MOVLPS   ( XMM0, D(0) )			/* ->D(1) | ->D(0) */

    /* z as a scalar */
    MOVSS    ( S(2), XMM0 )			/* oz */
    MULSS    ( XMM3, XMM0 )			/* oz*m10 */
    ADDSS    ( XMM4, XMM0 )			/* +m14 */
    MOVSS    ( XMM0, D(2) )			/* ->D(2) */

    /* w = 0 - oz; only the low lane is stored, so a scalar subtract is
     * all that is needed. */
    MOVSS    ( S(2), XMM0 )			/* oz */
    MOVSS    ( XMM6, XMM5 )			/* 0 */
    SUBSS    ( XMM0, XMM5 )			/* -oz */
    MOVSS    ( XMM5, D(3) )			/* ->D(3) */

    ADD_L    ( CONST(16), EDI )			/* next dest vertex */
    ADD_L    ( EAX, ESI )			/* next source vertex */
    CMP_L    ( ECX, EDI )
    JNE      ( LLBL(K_GTP3PR_top) )

LLBL(K_GTP3PR_finish):
    POP_L    ( EDI )
    POP_L    ( ESI )
    RET
#undef FRAME_OFFSET



/*
 * _mesa_sse_transform_points3_2d
 *
 * 2D transform of x and y; z is copied through unchanged.  Only matrix
 * elements 0, 1, 4, 5, 12 and 13 are read:
 *
 *     dst.x = x*m0 + y*m4 + m12
 *     dst.y = x*m1 + y*m5 + m13
 *     dst.z = z
 *
 * Arguments are read from the stack: the destination GLvector4f, the
 * matrix and the source GLvector4f (slot offsets come from xform_args.h).
 * The destination's count is set to the source count, its size to 3, and
 * VEC_SIZE_3 is OR-ed into its flags.  Nothing is touched when the source
 * count is zero.
 *
 * Source vertices are walked with the source stride; destination vertices
 * are written 16 bytes apart.
 *
 * Register use inside the loop:
 *     ESI = current source vertex      EAX = source stride in bytes
 *     EDI = current dest vertex        ECX = one past the last dest vertex
 *     XMM0 = m1 | m0     XMM1 = m5 | m4     XMM2 = m13 | m12
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM4 are
 * clobbered.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_2d)
HIDDEN(_mesa_sse_transform_points3_2d)
GLNAME(_mesa_sse_transform_points3_2d):
    _CET_ENDBR
/* Two registers (8 bytes) are pushed before the stack arguments are read. */
#define FRAME_OFFSET 8
    PUSH_L   ( ESI )
    PUSH_L   ( EDI )

    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */
    MOV_L    ( ARG_MATRIX, EDX )			/* ptr to matrix */

    MOV_L    ( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
    TEST_L   ( ECX, ECX )
    JZ       ( LLBL(K_GTP3P2DR_finish) )	/* count was zero; go to finish */

    MOV_L    ( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */

    /* Publish the shape of the result in the destination header. */
    OR_L     ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L    ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L    ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    /* Replace the struct pointers by data pointers and turn the count
     * into an end-of-destination pointer: ECX = dest start + count*16. */
    SHL_L    ( CONST(4), ECX )
    MOV_L    ( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
    MOV_L    ( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L    ( EDI, ECX )

    /* MOVLPS only writes the low half of a register.  Clear the matrix
     * registers first so the packed multiplies in the loop never operate
     * on stale upper lanes. */
    XORPS    ( XMM0, XMM0 )
    XORPS    ( XMM1, XMM1 )
    XORPS    ( XMM2, XMM2 )

ALIGNTEXT32
    MOVLPS   ( M(0), XMM0 )			/* m1  | m0 */
    MOVLPS   ( M(4), XMM1 )			/* m5  | m4 */
    MOVLPS   ( M(12), XMM2 )			/* m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3P2DR_top):
    MOVSS    ( S(0), XMM3 )			/* ox */
    SHUFPS   ( CONST(0x0), XMM3, XMM3 )		/* ox | ox */
    MULPS    ( XMM0, XMM3 )			/* ox*m1 | ox*m0 */
    MOVSS    ( S(1), XMM4 )			/* oy */
    SHUFPS   ( CONST(0x0), XMM4, XMM4 )		/* oy | oy */
    MULPS    ( XMM1, XMM4 )			/* oy*m5 | oy*m4 */

    ADDPS    ( XMM4, XMM3 )			/* ox terms + oy terms */
    ADDPS    ( XMM2, XMM3 )			/* +m13 | +m12 */
    MOVLPS   ( XMM3, D(0) )			/* ->D(1) | ->D(0) */

    MOVSS    ( S(2), XMM3 )			/* oz, copied unchanged */
    MOVSS    ( XMM3, D(2) )			/* ->D(2) */

    ADD_L    ( CONST(16), EDI )			/* next dest vertex */
    ADD_L    ( EAX, ESI )			/* next source vertex */
    CMP_L    ( ECX, EDI )
    JNE      ( LLBL(K_GTP3P2DR_top) )

LLBL(K_GTP3P2DR_finish):
    POP_L    ( EDI )
    POP_L    ( ESI )
    RET
#undef FRAME_OFFSET



/*
 * _mesa_sse_transform_points3_2d_no_rot
 *
 * Scale plus translation of x and y; z is copied through unchanged.  Only
 * matrix elements 0, 5, 12 and 13 are read:
 *
 *     dst.x = x*m0 + m12
 *     dst.y = y*m5 + m13
 *     dst.z = z
 *
 * Arguments are read from the stack: the destination GLvector4f, the
 * matrix and the source GLvector4f (slot offsets come from xform_args.h).
 * The destination's count is set to the source count, its size to 3, and
 * VEC_SIZE_3 is OR-ed into its flags.  Nothing is touched when the source
 * count is zero.
 *
 * Source vertices are walked with the source stride; destination vertices
 * are written 16 bytes apart.
 *
 * Register use inside the loop:
 *     ESI = current source vertex      EAX = source stride in bytes
 *     EDI = current dest vertex        ECX = one past the last dest vertex
 *     XMM1 = m5 | m0     XMM2 = m13 | m12
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
 * clobbered.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
HIDDEN(_mesa_sse_transform_points3_2d_no_rot)
GLNAME(_mesa_sse_transform_points3_2d_no_rot):
    _CET_ENDBR
/* Two registers (8 bytes) are pushed before the stack arguments are read. */
#define FRAME_OFFSET 8
    PUSH_L   ( ESI )
    PUSH_L   ( EDI )

    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */
    MOV_L    ( ARG_MATRIX, EDX )			/* ptr to matrix */

    MOV_L    ( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
    TEST_L   ( ECX, ECX )
    JZ       ( LLBL(K_GTP3P2DNRR_finish) )	/* count was zero; go to finish */

    MOV_L    ( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */

    /* Publish the shape of the result in the destination header. */
    OR_L     ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L    ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L    ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    /* Replace the struct pointers by data pointers and turn the count
     * into an end-of-destination pointer: ECX = dest start + count*16. */
    SHL_L    ( CONST(4), ECX )
    MOV_L    ( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
    MOV_L    ( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L    ( EDI, ECX )

    /* MOVLPS below only writes the low half of XMM0; clear the register
     * so the first packed multiply never sees stale upper lanes. */
    XORPS    ( XMM0, XMM0 )

ALIGNTEXT32
    MOVSS    ( M(0), XMM1 )			/* m0 */
    MOVSS    ( M(5), XMM2 )			/* m5 */
    UNPCKLPS ( XMM2, XMM1 )			/* m5 | m0 */
    MOVLPS   ( M(12), XMM2 )			/* m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3P2DNRR_top):
    MOVLPS   ( S(0), XMM0 )			/* oy | ox */
    MULPS    ( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
    ADDPS    ( XMM2, XMM0 )			/* +m13 | +m12 */
    MOVLPS   ( XMM0, D(0) )			/* ->D(1) | ->D(0) */

    MOVSS    ( S(2), XMM0 )			/* oz, copied unchanged */
    MOVSS    ( XMM0, D(2) )			/* ->D(2) */

    ADD_L    ( CONST(16), EDI )			/* next dest vertex */
    ADD_L    ( EAX, ESI )			/* next source vertex */
    CMP_L    ( ECX, EDI )
    JNE      ( LLBL(K_GTP3P2DNRR_top) )

LLBL(K_GTP3P2DNRR_finish):
    POP_L    ( EDI )
    POP_L    ( ESI )
    RET
#undef FRAME_OFFSET




/*
 * _mesa_sse_transform_points3_3d
 *
 * 3D transform producing a 3-component result.  All four 4-float groups of
 * the matrix are loaded, but only the low three lanes of the sum are
 * stored:
 *
 *     dst[j] = x*m[j] + y*m[4+j] + z*m[8+j] + m[12+j]        j = 0..2
 *
 * Arguments are read from the stack: the destination GLvector4f, the
 * matrix and the source GLvector4f (slot offsets come from xform_args.h).
 * The destination's count is set to the source count, its size to 3, and
 * VEC_SIZE_3 is OR-ed into its flags.  Nothing is touched when the source
 * count is zero.
 *
 * Source vertices are walked with the source stride; destination vertices
 * are written 16 bytes apart.  The matrix is loaded with MOVAPS and must
 * therefore be 16-byte aligned.
 *
 * Register use inside the loop:
 *     ESI = current source vertex      EAX = source stride in bytes
 *     EDI = current dest vertex        ECX = one past the last dest vertex
 *     XMM0..XMM3 = the matrix, four consecutive floats per register
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM6 are
 * clobbered.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_3d)
HIDDEN(_mesa_sse_transform_points3_3d)
GLNAME(_mesa_sse_transform_points3_3d):
    _CET_ENDBR
/* Two registers (8 bytes) are pushed before the stack arguments are read. */
#define FRAME_OFFSET 8
    PUSH_L   ( ESI )
    PUSH_L   ( EDI )

    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L    ( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */
    MOV_L    ( ARG_MATRIX, EDX )			/* ptr to matrix */

    MOV_L    ( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */
    TEST_L   ( ECX, ECX )
    JZ       ( LLBL(K_GTP3P3DR_finish) )	/* count was zero; go to finish */

    MOV_L    ( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */

    /* Publish the shape of the result in the destination header. */
    OR_L     ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L    ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L    ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    /* Replace the struct pointers by data pointers and turn the count
     * into an end-of-destination pointer: ECX = dest start + count*16. */
    SHL_L    ( CONST(4), ECX )
    MOV_L    ( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
    MOV_L    ( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L    ( EDI, ECX )


ALIGNTEXT32
    MOVAPS   ( M(0), XMM0 )			/* m2  | m1  | m0 */
    MOVAPS   ( M(4), XMM1 )			/* m6  | m5  | m4 */
    MOVAPS   ( M(8), XMM2 )			/* m10 | m9  | m8 */
    MOVAPS   ( M(12), XMM3 )			/* m14 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3P3DR_top):
    MOVSS    ( S(0), XMM4 )
    SHUFPS   ( CONST(0x0), XMM4, XMM4 )		/* ox | ox | ox */
    MULPS    ( XMM0, XMM4 )			/* ox*m2 | ox*m1 | ox*m0 */

    MOVSS    ( S(1), XMM5 )
    SHUFPS   ( CONST(0x0), XMM5, XMM5 )		/* oy | oy | oy */
    MULPS    ( XMM1, XMM5 )			/* oy*m6 | oy*m5 | oy*m4 */

    MOVSS    ( S(2), XMM6 )
    SHUFPS   ( CONST(0x0), XMM6, XMM6 )		/* oz | oz | oz */
    MULPS    ( XMM2, XMM6 )			/* oz*m10 | oz*m9 | oz*m8 */

    ADDPS    ( XMM5, XMM4 )			/* ox terms + oy terms */
    ADDPS    ( XMM6, XMM4 )			/* + oz terms */
    ADDPS    ( XMM3, XMM4 )			/* + m14 | m13 | m12 */

    MOVLPS   ( XMM4, D(0) )			/* => D(1) | => D(0) */
    UNPCKHPS ( XMM4, XMM4 )			/* move lane 2 down to lane 0 */
    MOVSS    ( XMM4, D(2) )			/* => D(2) */

    ADD_L    ( CONST(16), EDI )			/* next dest vertex */
    ADD_L    ( EAX, ESI )			/* next source vertex */
    CMP_L    ( ECX, EDI )
    JNE      ( LLBL(K_GTP3P3DR_top) )

LLBL(K_GTP3P3DR_finish):
    POP_L    ( EDI )
    POP_L    ( ESI )
    RET
#undef FRAME_OFFSET
#endif /* USE_SSE_ASM */

/*
 * Emit an empty .note.GNU-stack section on Linux/ELF so the linker does
 * not mark the stack executable because of this object.
 */
#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif