
/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifdef USE_SSE_ASM
#include "assyntax.h"
/* Defined ahead of m_vector_asm.h; presumably selects the 4-byte-pointer
 * layout for the V4F_* offsets used below -- TODO confirm in that header. */
#define MATH_ASM_PTR_SIZE 4
#include "math/m_vector_asm.h"
#include "xform_args.h"

	SEG_TEXT

/* Every routine in this file pushes two dwords (ESI and EDI) before it
 * reads its arguments.  Presumably consumed by the ARG_* macros from
 * xform_args.h -- TODO confirm. */
#define FRAME_OFFSET	8

/* Address the i-th 4-byte element relative to the current source vertex
 * (ESI), the current destination vertex (EDI) and the matrix (EDX). */
#define SRC(i)		REGOFF(i * 4, ESI)
#define DST(i)		REGOFF(i * 4, EDI)
#define MAT(i)		REGOFF(i * 4, EDX)

/* Combines four selectors into one immediate, each weighted by a power of
 * four with r0 in the most significant position.  Not referenced by any
 * routine visible in this file. */
#define SELECT(r0, r1, r2, r3)	CONST( r0 * 64 + r1 * 16 + r2 * 4 + r3 )


/*
 * _mesa_sse_transform_points4_general
 *
 * Transforms every 4-component vertex of the source vector by a full
 * 16-element matrix and writes the result to the destination vector:
 *
 *	D(j) = ox*m[j] + oy*m[4+j] + oz*m[8+j] + ow*m[12+j]	(j = 0..3)
 *
 * Arguments are fetched through ARG_SOURCE, ARG_DEST and ARG_MATRIX,
 * which are not defined in this file (presumably xform_args.h).
 *
 * Register roles:
 *	ESI	source vector header, then current source vertex
 *	EDI	dest vector header, then current dest vertex
 *	EDX	matrix
 *	ECX	vertices remaining
 *	EAX	source stride in bytes (added to ESI per vertex)
 *	XMM4-7	matrix elements 0-3, 4-7, 8-11, 12-15
 *	XMM0-3	per-vertex temporaries
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM7 and the
 * flags are overwritten.  When the source count is zero the routine
 * returns without writing anything to the destination.
 *
 * The matrix loads and the destination stores use MOVAPS, so both
 * addresses must be 16-byte aligned.  Source components are read one
 * float at a time with MOVSS and carry no such requirement.
 */
ALIGNTEXT16
GLOBL GLNAME( _mesa_sse_transform_points4_general )
HIDDEN(_mesa_sse_transform_points4_general)
GLNAME( _mesa_sse_transform_points4_general ):
	_CET_ENDBR
	PUSH_L( ESI )
	PUSH_L( EDI )

	MOV_L( ARG_SOURCE, ESI )		/* ptr to source vector header */
	MOV_L( ARG_DEST, EDI )			/* ptr to dest vector header */

	MOV_L( ARG_MATRIX, EDX )		/* ptr to matrix */
	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */

	TEST_L( ECX, ECX )			/* verify non-zero count */
	JE( LLBL( sse_general_done ) )

	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* stride */
	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */

	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */

	/* From here on ESI/EDI point at vertex data, not at the headers. */
	MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
	MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */

	PREFETCHT0( REGIND(ESI) )

	/* The matrix is loop-invariant: keep all 16 elements in registers. */
	MOVAPS( MAT(0), XMM4 )			/* m3  | m2  | m1  | m0  */
	MOVAPS( MAT(4), XMM5 )			/* m7  | m6  | m5  | m4  */
	MOVAPS( MAT(8), XMM6 )			/* m11 | m10 | m9  | m8  */
	MOVAPS( MAT(12), XMM7 )			/* m15 | m14 | m13 | m12 */

ALIGNTEXT16
LLBL( sse_general_loop ):

	/* Broadcast each source component across a register and scale the
	 * matching group of four matrix elements by it. */
	MOVSS( SRC(0), XMM0 )			/* ox */
	SHUFPS( CONST(0x0), XMM0, XMM0 )	/* ox | ox | ox | ox */
	MULPS( XMM4, XMM0 )			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */

	MOVSS( SRC(1), XMM1 )			/* oy */
	SHUFPS( CONST(0x0), XMM1, XMM1 )	/* oy | oy | oy | oy */
	MULPS( XMM5, XMM1 )			/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */

	MOVSS( SRC(2), XMM2 )			/* oz */
	SHUFPS( CONST(0x0), XMM2, XMM2 )	/* oz | oz | oz | oz */
	MULPS( XMM6, XMM2 )			/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */

	MOVSS( SRC(3), XMM3 )			/* ow */
	SHUFPS( CONST(0x0), XMM3, XMM3 )	/* ow | ow | ow | ow */
	MULPS( XMM7, XMM3 )			/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */

	/* Sum the four partial products lane by lane and store the vertex. */
	ADDPS( XMM1, XMM0 )			/* ox*m3+oy*m7 | ... */
	ADDPS( XMM2, XMM0 )			/* ox*m3+oy*m7+oz*m11 | ... */
	ADDPS( XMM3, XMM0 )			/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
	MOVAPS( XMM0, DST(0) )			/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */

	ADD_L( CONST(16), EDI )			/* dest vertices are packed, 16 bytes each */
	ADD_L( EAX, ESI )			/* source advances by its own stride */

	DEC_L( ECX )
	JNZ( LLBL( sse_general_loop ) )

LLBL( sse_general_done ):

	POP_L( EDI )
	POP_L( ESI )
	RET




/*
 * _mesa_sse_transform_points4_3d
 *
 * For every source vertex, computes the same four-lane product as the
 * general routine,
 *
 *	D(j) = ox*m[j] + oy*m[4+j] + oz*m[8+j] + ow*m[12+j]	(j = 0..3)
 *
 * stores it, and then overwrites D(3) with the source w component, so
 * the destination w is a plain copy of the source w.
 *
 * Arguments are fetched through ARG_SOURCE, ARG_DEST and ARG_MATRIX,
 * which are not defined in this file (presumably xform_args.h).
 *
 * Register roles:
 *	ESI	source vector header, then current source vertex
 *	EDI	dest vector header, then current dest vertex
 *	EDX	matrix
 *	ECX	source count, then one-past-the-end dest pointer
 *	EAX	source stride in bytes (added to ESI per vertex)
 *	XMM0-3	matrix elements 0-3, 4-7, 8-11, 12-15
 *	XMM4-7	per-vertex temporaries
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM7 and the
 * flags are overwritten.  When the source count is zero the routine
 * returns without writing anything to the destination.
 *
 * The matrix loads and the destination stores use MOVAPS, so both
 * addresses must be 16-byte aligned.
 *
 * NOTE(review): the destination is tagged VEC_SIZE_3 / size 3 although
 * all four floats of each vertex are written -- confirm this is the
 * intended contract.
 */
ALIGNTEXT4
GLOBL GLNAME( _mesa_sse_transform_points4_3d )
HIDDEN(_mesa_sse_transform_points4_3d)
GLNAME( _mesa_sse_transform_points4_3d ):
	_CET_ENDBR
	PUSH_L( ESI )
	PUSH_L( EDI )

	MOV_L( ARG_SOURCE, ESI )		/* ptr to source GLvector4f */
	MOV_L( ARG_DEST, EDI )			/* ptr to dest GLvector4f */

	MOV_L( ARG_MATRIX, EDX )		/* ptr to matrix */
	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */

	TEST_L( ECX, ECX)
	JZ( LLBL(K_GTP43P3DR_finish) )		/* count was zero; go to finish */

	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* stride */
	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */

	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )/* set dest size */

	/* Turn the count into an end pointer: ECX = dest start + count * 16.
	 * The loop below terminates when EDI reaches it. */
	SHL_L( CONST(4), ECX )			/* count *= 16 */
	MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */

	MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
	ADD_L( EDI, ECX )			/* count += dest ptr */

	/* The matrix is loop-invariant: keep all 16 elements in registers. */
	MOVAPS( MAT(0), XMM0 )			/* m3  | m2  | m1  |  m0 */
	MOVAPS( MAT(4), XMM1 )			/* m7  | m6  | m5  |  m4 */
	MOVAPS( MAT(8), XMM2 )			/* m11 | m10 | m9  |  m8 */
	MOVAPS( MAT(12), XMM3 )			/* m15 | m14 | m13 | m12 */

ALIGNTEXT32
LLBL( K_GTP43P3DR_top ):
	/* Broadcast each source component across a register and scale the
	 * matching group of four matrix elements by it. */
	MOVSS( SRC(0), XMM4 )			/* ox */
	SHUFPS( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox | ox */
	MULPS( XMM0, XMM4 )			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */

	MOVSS( SRC(1), XMM5 )			/* oy */
	SHUFPS( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy | oy */
	MULPS( XMM1, XMM5 )			/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */

	MOVSS( SRC(2), XMM6 )			/* oz */
	SHUFPS( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz | oz */
	MULPS( XMM2, XMM6 )			/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */

	MOVSS( SRC(3), XMM7 )			/* ow */
	SHUFPS( CONST(0x0), XMM7, XMM7 )	/* ow | ow | ow | ow */
	MULPS( XMM3, XMM7 )			/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */

	/* Sum the four partial products lane by lane and store the vertex. */
	ADDPS( XMM5, XMM4 )			/* ox*m3+oy*m7 | ... */
	ADDPS( XMM6, XMM4 )			/* ox*m3+oy*m7+oz*m11 | ... */
	ADDPS( XMM7, XMM4 )			/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
	MOVAPS( XMM4, DST(0) )			/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */

	/* Replace the computed D(3) with the source w.  The broadcast copy
	 * in XMM7 was consumed by the multiply, hence the reload.
	 * NOTE(review): the reload comes after the store above, so it would
	 * pick up the freshly written value if source and dest overlapped --
	 * confirm callers never pass aliasing vectors. */
	MOVSS( SRC(3), XMM4 )			/* ow */
	MOVSS( XMM4, DST(3) )			/* ->D(3) */

	/* This label is not the target of any jump in this routine. */
LLBL( K_GTP43P3DR_skip ):
	ADD_L( CONST(16), EDI )			/* dest vertices are packed, 16 bytes each */
	ADD_L( EAX, ESI )			/* source advances by its own stride */
	CMP_L( ECX, EDI )			/* reached the end pointer? */
	JNE( LLBL(K_GTP43P3DR_top) )

LLBL( K_GTP43P3DR_finish ):
	POP_L( EDI )
	POP_L( ESI )
	RET


/*
 * _mesa_sse_transform_points4_identity
 *
 * Copies every 4-component vertex of the source vector to the
 * destination vector unchanged, 16 bytes at a time.  The matrix
 * argument is not read.
 *
 * Arguments are fetched through ARG_SOURCE and ARG_DEST, which are not
 * defined in this file (presumably xform_args.h).
 *
 * Register roles:
 *	ESI	source vector header, then current source vertex
 *	EDI	dest vector header, then current dest vertex
 *	ECX	vertices remaining
 *	EAX	source stride in bytes (added to ESI per vertex)
 *	XMM0	vertex in flight
 *
 * ESI and EDI are saved and restored; EAX, ECX, XMM0 and the flags are
 * overwritten.  When the source count is zero the routine returns
 * without writing anything to the destination.
 *
 * Both the load and the store use MOVAPS, so every source and
 * destination vertex address must be 16-byte aligned.
 * NOTE(review): the other routines in this file read the source with
 * MOVSS and tolerate any source alignment; this one requires the source
 * start and stride to preserve 16-byte alignment -- confirm callers
 * guarantee that.
 */
ALIGNTEXT16
GLOBL GLNAME( _mesa_sse_transform_points4_identity )
HIDDEN(_mesa_sse_transform_points4_identity)
GLNAME( _mesa_sse_transform_points4_identity ):
	_CET_ENDBR
	PUSH_L( ESI )
	PUSH_L( EDI )

	MOV_L( ARG_SOURCE, ESI )		/* ptr to source vector header */
	MOV_L( ARG_DEST, EDI )			/* ptr to dest vector header */

	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */

	TEST_L( ECX, ECX )			/* verify non-zero count */
	JE( LLBL( sse_identity_done ) )

	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* stride */
	OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */

	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
	MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */

	/* From here on ESI/EDI point at vertex data, not at the headers. */
	MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */
	MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */

ALIGNTEXT16
LLBL( sse_identity_loop ):

	PREFETCHNTA( REGOFF(32, ESI) )		/* hint: fetch 32 bytes past the current vertex */

	MOVAPS( REGIND(ESI), XMM0 )		/* load one whole vertex */
	ADD_L( EAX, ESI )			/* source advances by its own stride */

	MOVAPS( XMM0, REGIND(EDI) )		/* store it unchanged */
	ADD_L( CONST(16), EDI )			/* dest vertices are packed, 16 bytes each */

	DEC_L( ECX )
	JNZ( LLBL( sse_identity_loop ) )

LLBL( sse_identity_done ):

	POP_L( EDI )
	POP_L( ESI )
	RET
#endif /* USE_SSE_ASM */

/* A .note.GNU-stack section with empty flags marks this object as not
 * needing an executable stack. */
#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif