17117f1b4Smrg
27117f1b4Smrg/*
37117f1b4Smrg * Mesa 3-D graphics library
47117f1b4Smrg *
57117f1b4Smrg * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
67117f1b4Smrg *
77117f1b4Smrg * Permission is hereby granted, free of charge, to any person obtaining a
87117f1b4Smrg * copy of this software and associated documentation files (the "Software"),
97117f1b4Smrg * to deal in the Software without restriction, including without limitation
107117f1b4Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
117117f1b4Smrg * and/or sell copies of the Software, and to permit persons to whom the
127117f1b4Smrg * Software is furnished to do so, subject to the following conditions:
137117f1b4Smrg *
147117f1b4Smrg * The above copyright notice and this permission notice shall be included
157117f1b4Smrg * in all copies or substantial portions of the Software.
167117f1b4Smrg *
177117f1b4Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
187117f1b4Smrg * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
197117f1b4Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20af69d88dSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21af69d88dSmrg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22af69d88dSmrg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23af69d88dSmrg * OTHER DEALINGS IN THE SOFTWARE.
247117f1b4Smrg */
257117f1b4Smrg
267117f1b4Smrg/** TODO:
277117f1b4Smrg  * - insert PREFETCH instructions to avoid cache misses
287117f1b4Smrg  * - some more optimizations are possible
297117f1b4Smrg  * - for 40-50% more performance in the SSE functions, the
307117f1b4Smrg  *   data (trans-matrix, src_vert, dst_vert) need to be 16-byte aligned
317117f1b4Smrg  */
327117f1b4Smrg
337117f1b4Smrg#ifdef USE_SSE_ASM
34c1f859d4Smrg#include "assyntax.h"
357ec681f3Smrg#define MATH_ASM_PTR_SIZE 4
367ec681f3Smrg#include "math/m_vector_asm.h"
377117f1b4Smrg#include "norm_args.h"
387117f1b4Smrg
397117f1b4Smrg   SEG_TEXT
407117f1b4Smrg
/*
 * Memory-operand shorthands used by every routine in this file.
 * Each index i is scaled by 4, i.e. it counts 4-byte elements.
 *
 *   M(i)    element i relative to EDX (the routines point EDX at
 *           matrix + MATRIX_INV before using it)
 *   S(i)    element i relative to ESI (current source element)
 *   D(i)    element i relative to EDI (current dest element)
 *   STRIDE  the 32-bit value at byte offset 12 from ESI.  It is only
 *           meaningful while ESI still holds the source vector struct,
 *           i.e. before ESI is reloaded with the V4F_START pointer.
 *
 * REGOFF comes from assyntax.h; presumably it builds a
 * displacement(register) operand - confirm there.
 */
417117f1b4Smrg#define M(i)    REGOFF(i * 4, EDX)
427117f1b4Smrg#define S(i)	REGOFF(i * 4, ESI)
437117f1b4Smrg#define D(i)	REGOFF(i * 4, EDI)
447117f1b4Smrg#define STRIDE  REGOFF(12, ESI)
457117f1b4Smrg
467117f1b4Smrg
/*
 * _mesa_sse_transform_rescale_normals_no_rot
 *
 * For every source normal (ux, uy, uz) stores
 *     ( ux*m0*scale, uy*m5*scale, uz*m10*scale )
 * into the destination, where m0, m5 and m10 are float elements 0, 5 and
 * 10 of the array at matrix + MATRIX_INV and scale is ARG_SCALE.
 *
 * Arguments are reached through ARG_IN, ARG_DEST, ARG_MAT and ARG_SCALE
 * (norm_args.h).  FRAME_OFFSET 8 matches the two register pushes below;
 * presumably the ARG_* macros use it to skip them - confirm in
 * norm_args.h.
 *
 * Register roles:
 *   ESI   source vector struct, then pointer to the current source element
 *   EDI   dest vector struct, then pointer to the current dest element
 *   EDX   matrix + MATRIX_INV
 *   EAX   source stride, added to ESI once per element
 *   ECX   element count, then end-of-dest pointer (dest start + count*16)
 *   XMM1  m5*scale | m0*scale in the low two lanes
 *   XMM0  m10*scale in the low lane
 *   XMM2  scratch
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
 * overwritten.
 *
 * Destination elements are written 16 bytes apart and only the first
 * three floats of each element are stored.
 *
 * NOTE(review): when the source count is zero the routine jumps straight
 * to the epilogue, so the destination count field is left unwritten.
 */
477117f1b4SmrgALIGNTEXT16
487117f1b4SmrgGLOBL GLNAME(_mesa_sse_transform_rescale_normals_no_rot)
497117f1b4SmrgHIDDEN(_mesa_sse_transform_rescale_normals_no_rot)
507117f1b4SmrgGLNAME(_mesa_sse_transform_rescale_normals_no_rot):
517ec681f3Smrg	_CET_ENDBR
527117f1b4Smrg#define FRAME_OFFSET 8
537117f1b4Smrg	PUSH_L  ( ESI )
547117f1b4Smrg	PUSH_L  ( EDI )
557117f1b4Smrg
567117f1b4Smrg	MOV_L	( ARG_IN, ESI )				/* ptr to source GLvector3f */
577117f1b4Smrg	MOV_L	( ARG_DEST, EDI )			/* ptr to dest GLvector3f */
587117f1b4Smrg
597117f1b4Smrg	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
607ec681f3Smrg	ADD_L	( CONST(MATRIX_INV), EDX )		/* matrix->inv */
617117f1b4Smrg
627117f1b4Smrg	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
637117f1b4Smrg
647117f1b4Smrg	TEST_L	( ECX, ECX )
657117f1b4Smrg	JZ( LLBL(K_G3TRNNRR_finish) )			/* count was zero; go to finish */
667117f1b4Smrg
677117f1b4Smrg	MOV_L	( STRIDE, EAX )				/* stride; read before ESI is repointed */
687117f1b4Smrg	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest-count */
697117f1b4Smrg
707117f1b4Smrg	IMUL_L( CONST(16), ECX )			/* count *= 16 (bytes per dest element) */
717117f1b4Smrg	MOV_L( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source vertex */
727117f1b4Smrg
737117f1b4Smrg	MOV_L( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest vertex */
747117f1b4Smrg	ADD_L( EDI, ECX ) 				/* ECX = dest ptr + count*16 = loop end */
757117f1b4Smrg
/* Loop-invariant setup: fold the scale factor into the three matrix terms. */
767117f1b4SmrgALIGNTEXT32
777117f1b4Smrg	MOVSS	( M(0), XMM1 )				/* m0 */
787117f1b4Smrg	MOVSS	( M(5), XMM2 )				/* m5 */
797117f1b4Smrg	UNPCKLPS( XMM2, XMM1 )				/* m5 | m0 */
807117f1b4Smrg	MOVSS	( ARG_SCALE, XMM0 )			/* scale */
817117f1b4Smrg	SHUFPS	( CONST(0x0), XMM0, XMM0 )		/* scale | scale */
827117f1b4Smrg	MULPS	( XMM0, XMM1 )				/* m5*scale | m0*scale */
837117f1b4Smrg	MULSS	( M(10), XMM0 )				/* m10*scale */
847117f1b4Smrg
/* Per-element loop: x and y are handled as a pair, z as a scalar. */
857117f1b4SmrgALIGNTEXT32
867117f1b4SmrgLLBL(K_G3TRNNRR_top):
877117f1b4Smrg	MOVLPS	( S(0), XMM2 )				/* uy | ux */
887117f1b4Smrg	MULPS	( XMM1, XMM2 )				/* uy*m5*scale | ux*m0*scale */
897117f1b4Smrg	MOVLPS	( XMM2, D(0) )				/* ->D(1) | D(0) */
907117f1b4Smrg
917117f1b4Smrg	MOVSS	( S(2), XMM2 )				/* uz */
927117f1b4Smrg	MULSS	( XMM0, XMM2 )				/* uz*m10*scale */
937117f1b4Smrg	MOVSS	( XMM2, D(2) )				/* ->D(2) */
947117f1b4Smrg
/* The _skip label is not referenced by any jump visible in this file. */
957117f1b4SmrgLLBL(K_G3TRNNRR_skip):
967117f1b4Smrg	ADD_L	( CONST(16), EDI )			/* next dest element */
977117f1b4Smrg	ADD_L	( EAX, ESI )				/* next source element (+= stride) */
987117f1b4Smrg	CMP_L	( ECX, EDI )				/* reached end-of-dest pointer? */
997117f1b4Smrg	JNE	( LLBL(K_G3TRNNRR_top) )
1007117f1b4Smrg
1017117f1b4SmrgLLBL(K_G3TRNNRR_finish):
1027117f1b4Smrg	POP_L	( EDI )
1037117f1b4Smrg	POP_L	( ESI )
1047117f1b4Smrg	RET
1057117f1b4Smrg#undef FRAME_OFFSET
1067117f1b4Smrg
1077117f1b4Smrg
1087117f1b4Smrg
/*
 * _mesa_sse_transform_rescale_normals
 *
 * For every source normal (ux, uy, uz) stores
 *     x = (ux*m0 + uy*m1 + uz*m2 ) * scale
 *     y = (ux*m4 + uy*m5 + uz*m6 ) * scale
 *     z = (ux*m8 + uy*m9 + uz*m10) * scale
 * into the destination, where mN is float element N of the array at
 * matrix + MATRIX_INV and scale is ARG_SCALE.  The scale is folded into
 * the matrix terms before they are multiplied with the normal.
 *
 * Arguments are reached through ARG_IN, ARG_DEST, ARG_MAT and ARG_SCALE
 * (norm_args.h).  FRAME_OFFSET 8 matches the two register pushes below;
 * presumably the ARG_* macros use it to skip them - confirm in
 * norm_args.h.
 *
 * Register roles:
 *   ESI   source vector struct, then pointer to the current source element
 *   EDI   dest vector struct, then pointer to the current dest element
 *   EDX   matrix + MATRIX_INV
 *   EAX   source stride, added to ESI once per element
 *   ECX   element count, then end-of-dest pointer (dest start + count*16)
 *   XMM0  m4*scale | m0*scale
 *   XMM1  m5*scale | m1*scale
 *   XMM2  m6*scale | m2*scale
 *   XMM6  m8*scale
 *   XMM7  m9*scale
 *   XMM3, XMM4, XMM5  per-element scratch (XMM4 also holds the broadcast
 *                     scale during setup)
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM7 are
 * overwritten.
 *
 * m10*scale is recomputed from memory on every iteration; all eight
 * XMM registers XMM0-XMM7 are already in use inside the loop.
 *
 * Destination elements are written 16 bytes apart and only the first
 * three floats of each element are stored.
 *
 * NOTE(review): when the source count is zero the routine jumps straight
 * to the epilogue, so the destination count field is left unwritten.
 */
1097117f1b4SmrgALIGNTEXT16
1107117f1b4SmrgGLOBL GLNAME(_mesa_sse_transform_rescale_normals)
1117117f1b4SmrgHIDDEN(_mesa_sse_transform_rescale_normals)
1127117f1b4SmrgGLNAME(_mesa_sse_transform_rescale_normals):
1137ec681f3Smrg	_CET_ENDBR
1147117f1b4Smrg#define FRAME_OFFSET 8
1157117f1b4Smrg	PUSH_L  ( ESI )
1167117f1b4Smrg	PUSH_L  ( EDI )
1177117f1b4Smrg
1187117f1b4Smrg	MOV_L	( ARG_IN, ESI )				/* ptr to source GLvector3f */
1197117f1b4Smrg	MOV_L	( ARG_DEST, EDI )			/* ptr to dest GLvector3f */
1207117f1b4Smrg
1217117f1b4Smrg	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
1227ec681f3Smrg	ADD_L	( CONST(MATRIX_INV), EDX )		/* matrix->inv */
1237117f1b4Smrg
1247117f1b4Smrg	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
1257117f1b4Smrg
1267117f1b4Smrg	TEST_L	( ECX, ECX )
1277117f1b4Smrg	JZ( LLBL(K_G3TRNR_finish) )			/* count was zero; go to finish */
1287117f1b4Smrg
1297117f1b4Smrg	MOV_L	( STRIDE, EAX )				/* stride; read before ESI is repointed */
1307117f1b4Smrg	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest-count */
1317117f1b4Smrg
1327117f1b4Smrg	IMUL_L( CONST(16), ECX )			/* count *= 16 (bytes per dest element) */
1337117f1b4Smrg	MOV_L( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source vertex */
1347117f1b4Smrg
1357117f1b4Smrg	MOV_L( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest vertex */
1367117f1b4Smrg	ADD_L( EDI, ECX ) 				/* ECX = dest ptr + count*16 = loop end */
1377117f1b4Smrg
/* Loop-invariant setup: build the scaled matrix terms for x and y as pairs. */
1387117f1b4SmrgALIGNTEXT32
1397117f1b4Smrg	MOVSS	( M(0), XMM0 )				/* m0 */
1407117f1b4Smrg	MOVSS	( M(4), XMM1 )				/* m4 */
1417117f1b4Smrg	UNPCKLPS( XMM1, XMM0 )				/* m4 | m0 */
1427117f1b4Smrg
1437117f1b4Smrg	MOVSS	( ARG_SCALE, XMM4 )			/* scale */
1447117f1b4Smrg	SHUFPS	( CONST(0x0), XMM4, XMM4 )		/* scale | scale */
1457117f1b4Smrg
1467117f1b4Smrg	MULPS	( XMM4, XMM0 )				/* m4*scale | m0*scale */
1477117f1b4Smrg	MOVSS	( M(1), XMM1 )				/* m1 */
1487117f1b4Smrg	MOVSS	( M(5), XMM2 )				/* m5 */
1497117f1b4Smrg	UNPCKLPS( XMM2, XMM1 )				/* m5 | m1 */
1507117f1b4Smrg	MULPS	( XMM4, XMM1 )				/* m5*scale | m1*scale */
1517117f1b4Smrg	MOVSS	( M(2), XMM2 )				/* m2 */
1527117f1b4Smrg	MOVSS	( M(6), XMM3 )				/* m6 */
1537117f1b4Smrg	UNPCKLPS( XMM3, XMM2 )				/* m6 | m2 */
1547117f1b4Smrg	MULPS	( XMM4, XMM2 )				/* m6*scale | m2*scale */
1557117f1b4Smrg
/* Scaled terms for the z row that fit in the remaining registers. */
1567117f1b4Smrg	MOVSS	( M(8), XMM6 )				/* m8 */
1577117f1b4Smrg	MULSS	( ARG_SCALE, XMM6 )			/* m8*scale */
1587117f1b4Smrg	MOVSS	( M(9), XMM7 )				/* m9 */
1597117f1b4Smrg	MULSS	( ARG_SCALE, XMM7 )			/* m9*scale */
1607117f1b4Smrg
/* Per-element loop: x and y are computed as a pair, then z as a scalar. */
1617117f1b4SmrgALIGNTEXT32
1627117f1b4SmrgLLBL(K_G3TRNR_top):
1637117f1b4Smrg	MOVSS	( S(0), XMM3 )				/* ux */
1647117f1b4Smrg	SHUFPS	( CONST(0x0), XMM3, XMM3 )		/* ux | ux */
1657117f1b4Smrg	MULPS	( XMM0, XMM3 )				/* ux*m4*scale | ux*m0*scale */
1667117f1b4Smrg	MOVSS	( S(1), XMM4 )				/* uy */
1677117f1b4Smrg	SHUFPS	( CONST(0x0), XMM4, XMM4 )		/* uy | uy */
1687117f1b4Smrg	MULPS	( XMM1, XMM4 )				/* uy*m5*scale | uy*m1*scale */
1697117f1b4Smrg	MOVSS	( S(2), XMM5 )				/* uz */
1707117f1b4Smrg	SHUFPS	( CONST(0x0), XMM5, XMM5 )		/* uz | uz */
1717117f1b4Smrg	MULPS	( XMM2, XMM5 )				/* uz*m6*scale | uz*m2*scale */
1727117f1b4Smrg
1737117f1b4Smrg	ADDPS	( XMM4, XMM3 )				/* sum the ux and uy terms */
1747117f1b4Smrg	ADDPS	( XMM5, XMM3 )				/* add the uz terms: y | x */
1757117f1b4Smrg	MOVLPS	( XMM3, D(0) )				/* ->D(1) | D(0) */
1767117f1b4Smrg
1777117f1b4Smrg	MOVSS	( M(10), XMM3 )				/* m10 (reloaded every iteration) */
1787117f1b4Smrg	MULSS	( ARG_SCALE, XMM3 )			/* m10*scale */
1797117f1b4Smrg	MULSS	( S(2), XMM3 )				/* m10*scale*uz */
1807117f1b4Smrg	MOVSS	( S(1), XMM4 )				/* uy */
1817117f1b4Smrg	MULSS	( XMM7, XMM4 )				/* uy*m9*scale */
1827117f1b4Smrg	MOVSS	( S(0), XMM5 )				/* ux */
1837117f1b4Smrg	MULSS	( XMM6, XMM5 )				/* ux*m8*scale */
1847117f1b4Smrg
1857117f1b4Smrg	ADDSS	( XMM4, XMM3 )				/* uz term + uy term */
1867117f1b4Smrg	ADDSS	( XMM5, XMM3 )				/* + ux term = z */
1877117f1b4Smrg	MOVSS	( XMM3, D(2) )				/* ->D(2) */
1887117f1b4Smrg
/* The _skip label is not referenced by any jump visible in this file. */
1897117f1b4SmrgLLBL(K_G3TRNR_skip):
1907117f1b4Smrg	ADD_L	( CONST(16), EDI )			/* next dest element */
1917117f1b4Smrg	ADD_L	( EAX, ESI )				/* next source element (+= stride) */
1927117f1b4Smrg	CMP_L	( ECX, EDI )				/* reached end-of-dest pointer? */
1937117f1b4Smrg	JNE	( LLBL(K_G3TRNR_top) )
1947117f1b4Smrg
1957117f1b4SmrgLLBL(K_G3TRNR_finish):
1967117f1b4Smrg	POP_L	( EDI )
1977117f1b4Smrg	POP_L	( ESI )
1987117f1b4Smrg	RET
1997117f1b4Smrg#undef FRAME_OFFSET
2007117f1b4Smrg
2017117f1b4Smrg
/*
 * _mesa_sse_transform_normals_no_rot
 *
 * For every source normal (ux, uy, uz) stores
 *     ( ux*m0, uy*m5, uz*m10 )
 * into the destination, where m0, m5 and m10 are float elements 0, 5 and
 * 10 of the array at matrix + MATRIX_INV.  ARG_SCALE is not read by this
 * routine.
 *
 * Arguments are reached through ARG_IN, ARG_DEST and ARG_MAT
 * (norm_args.h).  FRAME_OFFSET 8 matches the two register pushes below;
 * presumably the ARG_* macros use it to skip them - confirm in
 * norm_args.h.
 *
 * Register roles:
 *   ESI   source vector struct, then pointer to the current source element
 *   EDI   dest vector struct, then pointer to the current dest element
 *   EDX   matrix + MATRIX_INV
 *   EAX   source stride, added to ESI once per element
 *   ECX   element count, then end-of-dest pointer (dest start + count*16)
 *   XMM0  m5 | m0 in the low two lanes
 *   XMM1  m10 in the low lane
 *   XMM2  scratch
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
 * overwritten.
 *
 * Destination elements are written 16 bytes apart and only the first
 * three floats of each element are stored.
 *
 * NOTE(review): when the source count is zero the routine jumps straight
 * to the epilogue, so the destination count field is left unwritten.
 */
2027117f1b4SmrgALIGNTEXT16
2037117f1b4SmrgGLOBL GLNAME(_mesa_sse_transform_normals_no_rot)
2047117f1b4SmrgHIDDEN(_mesa_sse_transform_normals_no_rot)
2057117f1b4SmrgGLNAME(_mesa_sse_transform_normals_no_rot):
2067ec681f3Smrg	_CET_ENDBR
2077117f1b4Smrg#define FRAME_OFFSET 8
2087117f1b4Smrg	PUSH_L  ( ESI )
2097117f1b4Smrg	PUSH_L  ( EDI )
2107117f1b4Smrg
2117117f1b4Smrg	MOV_L	( ARG_IN, ESI )				/* ptr to source GLvector3f */
2127117f1b4Smrg	MOV_L	( ARG_DEST, EDI )			/* ptr to dest GLvector3f */
2137117f1b4Smrg
2147117f1b4Smrg	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
2157ec681f3Smrg	ADD_L	( CONST(MATRIX_INV), EDX )		/* matrix->inv */
2167117f1b4Smrg
2177117f1b4Smrg	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
2187117f1b4Smrg
2197117f1b4Smrg	TEST_L	( ECX, ECX )
2207117f1b4Smrg	JZ( LLBL(K_G3TNNRR_finish) )			/* count was zero; go to finish */
2217117f1b4Smrg
2227117f1b4Smrg	MOV_L	( STRIDE, EAX )				/* stride; read before ESI is repointed */
2237117f1b4Smrg	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest-count */
2247117f1b4Smrg
2257117f1b4Smrg	IMUL_L( CONST(16), ECX )			/* count *= 16 (bytes per dest element) */
2267117f1b4Smrg	MOV_L( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source vertex */
2277117f1b4Smrg
2287117f1b4Smrg	MOV_L( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest vertex */
2297117f1b4Smrg	ADD_L( EDI, ECX ) 				/* ECX = dest ptr + count*16 = loop end */
2307117f1b4Smrg
/* Loop-invariant setup: load the three matrix terms that are used. */
2317117f1b4SmrgALIGNTEXT32
2327117f1b4Smrg	MOVSS( M(0), XMM0 )				/* m0 */
2337117f1b4Smrg	MOVSS( M(5), XMM1 )				/* m5 */
2347117f1b4Smrg	UNPCKLPS( XMM1, XMM0 )				/* m5 | m0 */
2357117f1b4Smrg	MOVSS( M(10), XMM1 )				/* m10 */
2367117f1b4Smrg
/* Per-element loop: x and y are handled as a pair, z as a scalar. */
2377117f1b4SmrgALIGNTEXT32
2387117f1b4SmrgLLBL(K_G3TNNRR_top):
2397117f1b4Smrg	MOVLPS( S(0), XMM2 )				/* uy | ux */
2407117f1b4Smrg	MULPS( XMM0, XMM2 )				/* uy*m5 | ux*m0 */
2417117f1b4Smrg	MOVLPS( XMM2, D(0) )				/* ->D(1) | D(0) */
2427117f1b4Smrg
2437117f1b4Smrg	MOVSS( S(2), XMM2 )				/* uz */
2447117f1b4Smrg	MULSS( XMM1, XMM2 )				/* uz*m10 */
2457117f1b4Smrg	MOVSS( XMM2, D(2) )				/* ->D(2) */
2467117f1b4Smrg
/* The _skip label is not referenced by any jump visible in this file. */
2477117f1b4SmrgLLBL(K_G3TNNRR_skip):
2487117f1b4Smrg	ADD_L	( CONST(16), EDI )			/* next dest element */
2497117f1b4Smrg	ADD_L	( EAX, ESI )				/* next source element (+= stride) */
2507117f1b4Smrg	CMP_L	( ECX, EDI )				/* reached end-of-dest pointer? */
2517117f1b4Smrg	JNE	( LLBL(K_G3TNNRR_top) )
2527117f1b4Smrg
2537117f1b4SmrgLLBL(K_G3TNNRR_finish):
2547117f1b4Smrg	POP_L	( EDI )
2557117f1b4Smrg	POP_L	( ESI )
2567117f1b4Smrg	RET
2577117f1b4Smrg#undef FRAME_OFFSET
2587117f1b4Smrg#endif
2597117f1b4Smrg
/*
 * On ELF/Linux builds, emit an empty .note.GNU-stack section.  This sits
 * outside the USE_SSE_ASM guard, so it is emitted even when the routines
 * above are compiled out.  By GNU toolchain convention this marks the
 * object as not needing an executable stack - confirm against the
 * linker documentation.
 */
2607117f1b4Smrg#if defined (__ELF__) && defined (__linux__)
2617117f1b4Smrg	.section .note.GNU-stack,"",%progbits
2627117f1b4Smrg#endif
263