/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/** TODO:
 * - insert PREFETCH instructions to avoid cache misses
 * - some more optimizations are possible
 * - for 40-50% more performance in the SSE functions, the data
 *   (transformation matrix, source vertices, destination vertices)
 *   needs to be 16-byte aligned
 */

/*
 * SSE normal-transformation routines for 32-bit x86.
 *
 * Syntax: Mesa assyntax.h macro layer (operand order is source, destination).
 * Target: 32-bit x86 with SSE; every routine here uses only XMM0-XMM7.
 *
 * Conventions shared by all three routines:
 *   - ESI and EDI are saved on entry and restored on exit.  EAX, ECX, EDX,
 *     the XMM registers and the flags are clobbered.
 *   - FRAME_OFFSET is the number of bytes pushed by the prologue (two
 *     4-byte pushes).  It is presumably what the ARG_* macros from
 *     norm_args.h use to locate the stack arguments - confirm there.
 *   - The destination is written with a fixed pitch of 16 bytes per normal.
 *     Only the first three floats (x, y, z) of each slot are stored.
 *   - The source is walked with the byte stride read from the source vector.
 *   - The element count of the source vector is copied to the destination
 *     vector before the empty-input check, so the destination count is
 *     correct even when there is nothing to transform.
 */

#ifdef USE_SSE_ASM
#include "assyntax.h"
#define MATH_ASM_PTR_SIZE 4
#include "math/m_vector_asm.h"
#include "norm_args.h"

	SEG_TEXT

/* Float element i of the matrix (EDX), source normal (ESI), dest normal (EDI). */
#define M(i)	REGOFF(i * 4, EDX)
#define S(i)	REGOFF(i * 4, ESI)
#define D(i)	REGOFF(i * 4, EDI)
/*
 * Stride field of the source vector.  Only valid while ESI still points at
 * the vector structure, i.e. before ESI is reloaded with the data pointer.
 * NOTE(review): offset 12 presumably equals V4F_STRIDE for 4-byte pointers -
 * confirm against m_vector_asm.h.
 */
#define STRIDE	REGOFF(12, ESI)


/*
 * _mesa_sse_transform_rescale_normals_no_rot
 *
 * For every source normal u, using m = the array at mat + MATRIX_INV:
 *     dest.x = ux * m[0]  * scale
 *     dest.y = uy * m[5]  * scale
 *     dest.z = uz * m[10] * scale
 *
 * In:    ARG_MAT (matrix), ARG_SCALE (float), ARG_IN (source vector),
 *        ARG_DEST (destination vector) - stack arguments, see norm_args.h
 * Loop:  ESI = current source normal, EDI = current dest normal,
 *        EAX = source stride in bytes, ECX = one-past-the-end dest pointer,
 *        XMM1 = m5*scale | m0*scale, XMM0 (low lane) = m10*scale
 */
ALIGNTEXT16
GLOBL GLNAME(_mesa_sse_transform_rescale_normals_no_rot)
HIDDEN(_mesa_sse_transform_rescale_normals_no_rot)
GLNAME(_mesa_sse_transform_rescale_normals_no_rot):
	_CET_ENDBR			/* landing pad for indirect calls (CET builds, presumably) */
#define FRAME_OFFSET 8
	PUSH_L	( ESI )
	PUSH_L	( EDI )

	MOV_L	( ARG_IN, ESI )				/* ptr to source vector */
	MOV_L	( ARG_DEST, EDI )			/* ptr to dest vector */

	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
	ADD_L	( CONST(MATRIX_INV), EDX )		/* EDX = address of matrix->inv */

	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX )		/* source count */
	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest count, also when it is zero */

	TEST_L	( ECX, ECX )
	JZ	( LLBL(K_G3TRNNRR_finish) )		/* count was zero; nothing to transform */

	MOV_L	( STRIDE, EAX )				/* stride; must be read before ESI is reloaded */

	IMUL_L	( CONST(16), ECX )			/* count *= 16 (dest pitch in bytes) */
	MOV_L	( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source normal */

	MOV_L	( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest normal */
	ADD_L	( EDI, ECX )				/* ECX = end-of-dest sentinel */

ALIGNTEXT32
	MOVSS	( M(0), XMM1 )				/* m0 */
	MOVSS	( M(5), XMM2 )				/* m5 */
	UNPCKLPS( XMM2, XMM1 )				/* m5 | m0 */
	MOVSS	( ARG_SCALE, XMM0 )			/* scale */
	SHUFPS	( CONST(0x0), XMM0, XMM0 )		/* scale broadcast to all lanes */
	MULPS	( XMM0, XMM1 )				/* m5*scale | m0*scale */
	MULSS	( M(10), XMM0 )				/* low lane: m10*scale */

ALIGNTEXT32
LLBL(K_G3TRNNRR_top):
	MOVLPS	( S(0), XMM2 )				/* uy | ux */
	MULPS	( XMM1, XMM2 )				/* uy*m5*scale | ux*m0*scale */
	MOVLPS	( XMM2, D(0) )				/* ->D(1) | D(0) */

	MOVSS	( S(2), XMM2 )				/* uz */
	MULSS	( XMM0, XMM2 )				/* uz*m10*scale */
	MOVSS	( XMM2, D(2) )				/* ->D(2) */

	ADD_L	( CONST(16), EDI )			/* next dest slot */
	ADD_L	( EAX, ESI )				/* next source normal */
	CMP_L	( ECX, EDI )
	JNE	( LLBL(K_G3TRNNRR_top) )		/* loop until the sentinel is reached */

LLBL(K_G3TRNNRR_finish):
	POP_L	( EDI )
	POP_L	( ESI )
	RET
#undef FRAME_OFFSET



/*
 * _mesa_sse_transform_rescale_normals
 *
 * For every source normal u, using m = the array at mat + MATRIX_INV:
 *     dest.x = (ux * m[0] + uy * m[1] + uz * m[2])  * scale
 *     dest.y = (ux * m[4] + uy * m[5] + uz * m[6])  * scale
 *     dest.z = (ux * m[8] + uy * m[9] + uz * m[10]) * scale
 * (the scale is folded into the matrix terms before the multiply-adds).
 *
 * In:    ARG_MAT (matrix), ARG_SCALE (float), ARG_IN (source vector),
 *        ARG_DEST (destination vector) - stack arguments, see norm_args.h
 * Loop:  ESI = current source normal, EDI = current dest normal,
 *        EAX = source stride in bytes, ECX = one-past-the-end dest pointer,
 *        XMM0 = m4*scale | m0*scale, XMM1 = m5*scale | m1*scale,
 *        XMM2 = m6*scale | m2*scale, XMM6 = m8*scale, XMM7 = m9*scale,
 *        XMM3-XMM5 = per-iteration temporaries
 */
ALIGNTEXT16
GLOBL GLNAME(_mesa_sse_transform_rescale_normals)
HIDDEN(_mesa_sse_transform_rescale_normals)
GLNAME(_mesa_sse_transform_rescale_normals):
	_CET_ENDBR			/* landing pad for indirect calls (CET builds, presumably) */
#define FRAME_OFFSET 8
	PUSH_L	( ESI )
	PUSH_L	( EDI )

	MOV_L	( ARG_IN, ESI )				/* ptr to source vector */
	MOV_L	( ARG_DEST, EDI )			/* ptr to dest vector */

	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
	ADD_L	( CONST(MATRIX_INV), EDX )		/* EDX = address of matrix->inv */

	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX )		/* source count */
	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest count, also when it is zero */

	TEST_L	( ECX, ECX )
	JZ	( LLBL(K_G3TRNR_finish) )		/* count was zero; nothing to transform */

	MOV_L	( STRIDE, EAX )				/* stride; must be read before ESI is reloaded */

	IMUL_L	( CONST(16), ECX )			/* count *= 16 (dest pitch in bytes) */
	MOV_L	( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source normal */

	MOV_L	( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest normal */
	ADD_L	( EDI, ECX )				/* ECX = end-of-dest sentinel */

ALIGNTEXT32
	MOVSS	( M(0), XMM0 )				/* m0 */
	MOVSS	( M(4), XMM1 )				/* m4 */
	UNPCKLPS( XMM1, XMM0 )				/* m4 | m0 */

	MOVSS	( ARG_SCALE, XMM4 )			/* scale */
	SHUFPS	( CONST(0x0), XMM4, XMM4 )		/* scale broadcast to all lanes */

	MULPS	( XMM4, XMM0 )				/* m4*scale | m0*scale */
	MOVSS	( M(1), XMM1 )				/* m1 */
	MOVSS	( M(5), XMM2 )				/* m5 */
	UNPCKLPS( XMM2, XMM1 )				/* m5 | m1 */
	MULPS	( XMM4, XMM1 )				/* m5*scale | m1*scale */
	MOVSS	( M(2), XMM2 )				/* m2 */
	MOVSS	( M(6), XMM3 )				/* m6 */
	UNPCKLPS( XMM3, XMM2 )				/* m6 | m2 */
	MULPS	( XMM4, XMM2 )				/* m6*scale | m2*scale */

	MOVSS	( M(8), XMM6 )				/* m8 */
	MULSS	( ARG_SCALE, XMM6 )			/* m8*scale */
	MOVSS	( M(9), XMM7 )				/* m9 */
	MULSS	( ARG_SCALE, XMM7 )			/* m9*scale */

ALIGNTEXT32
LLBL(K_G3TRNR_top):
	MOVSS	( S(0), XMM3 )				/* ux */
	SHUFPS	( CONST(0x0), XMM3, XMM3 )		/* ux | ux */
	MULPS	( XMM0, XMM3 )				/* ux*m4*scale | ux*m0*scale */
	MOVSS	( S(1), XMM4 )				/* uy */
	SHUFPS	( CONST(0x0), XMM4, XMM4 )		/* uy | uy */
	MULPS	( XMM1, XMM4 )				/* uy*m5*scale | uy*m1*scale */
	MOVSS	( S(2), XMM5 )				/* uz */
	SHUFPS	( CONST(0x0), XMM5, XMM5 )		/* uz | uz */
	MULPS	( XMM2, XMM5 )				/* uz*m6*scale | uz*m2*scale */

	ADDPS	( XMM4, XMM3 )				/* sum the x and y dot products ... */
	ADDPS	( XMM5, XMM3 )
	MOVLPS	( XMM3, D(0) )				/* ->D(1) | D(0) */

	/*
	 * m10*scale is rebuilt on every iteration: XMM0-XMM2 and XMM6-XMM7
	 * hold loop constants and XMM3-XMM5 are the temporaries, so none of
	 * the eight XMM registers is free to keep it in.
	 */
	MOVSS	( M(10), XMM3 )				/* m10 */
	MULSS	( ARG_SCALE, XMM3 )			/* m10*scale */
	MULSS	( S(2), XMM3 )				/* m10*scale*uz */
	MOVSS	( S(1), XMM4 )				/* uy */
	MULSS	( XMM7, XMM4 )				/* uy*m9*scale */
	MOVSS	( S(0), XMM5 )				/* ux */
	MULSS	( XMM6, XMM5 )				/* ux*m8*scale */

	ADDSS	( XMM4, XMM3 )				/* sum the z dot product ... */
	ADDSS	( XMM5, XMM3 )
	MOVSS	( XMM3, D(2) )				/* ->D(2) */

	ADD_L	( CONST(16), EDI )			/* next dest slot */
	ADD_L	( EAX, ESI )				/* next source normal */
	CMP_L	( ECX, EDI )
	JNE	( LLBL(K_G3TRNR_top) )			/* loop until the sentinel is reached */

LLBL(K_G3TRNR_finish):
	POP_L	( EDI )
	POP_L	( ESI )
	RET
#undef FRAME_OFFSET


/*
 * _mesa_sse_transform_normals_no_rot
 *
 * For every source normal u, using m = the array at mat + MATRIX_INV:
 *     dest.x = ux * m[0]
 *     dest.y = uy * m[5]
 *     dest.z = uz * m[10]
 *
 * In:    ARG_MAT (matrix), ARG_IN (source vector), ARG_DEST (destination
 *        vector) - stack arguments, see norm_args.h.  ARG_SCALE is not read.
 * Loop:  ESI = current source normal, EDI = current dest normal,
 *        EAX = source stride in bytes, ECX = one-past-the-end dest pointer,
 *        XMM0 = m5 | m0, XMM1 (low lane) = m10
 */
ALIGNTEXT16
GLOBL GLNAME(_mesa_sse_transform_normals_no_rot)
HIDDEN(_mesa_sse_transform_normals_no_rot)
GLNAME(_mesa_sse_transform_normals_no_rot):
	_CET_ENDBR			/* landing pad for indirect calls (CET builds, presumably) */
#define FRAME_OFFSET 8
	PUSH_L	( ESI )
	PUSH_L	( EDI )

	MOV_L	( ARG_IN, ESI )				/* ptr to source vector */
	MOV_L	( ARG_DEST, EDI )			/* ptr to dest vector */

	MOV_L	( ARG_MAT, EDX )			/* ptr to matrix */
	ADD_L	( CONST(MATRIX_INV), EDX )		/* EDX = address of matrix->inv */

	MOV_L	( REGOFF(V4F_COUNT, ESI), ECX )		/* source count */
	MOV_L	( ECX, REGOFF(V4F_COUNT, EDI) )		/* set dest count, also when it is zero */

	TEST_L	( ECX, ECX )
	JZ	( LLBL(K_G3TNNRR_finish) )		/* count was zero; nothing to transform */

	MOV_L	( STRIDE, EAX )				/* stride; must be read before ESI is reloaded */

	IMUL_L	( CONST(16), ECX )			/* count *= 16 (dest pitch in bytes) */
	MOV_L	( REGOFF(V4F_START, ESI), ESI )		/* ptr to first source normal */

	MOV_L	( REGOFF(V4F_START, EDI), EDI )		/* ptr to first dest normal */
	ADD_L	( EDI, ECX )				/* ECX = end-of-dest sentinel */

ALIGNTEXT32
	MOVSS	( M(0), XMM0 )				/* m0 */
	MOVSS	( M(5), XMM1 )				/* m5 */
	UNPCKLPS( XMM1, XMM0 )				/* m5 | m0 */
	MOVSS	( M(10), XMM1 )				/* m10 */

ALIGNTEXT32
LLBL(K_G3TNNRR_top):
	MOVLPS	( S(0), XMM2 )				/* uy | ux */
	MULPS	( XMM0, XMM2 )				/* uy*m5 | ux*m0 */
	MOVLPS	( XMM2, D(0) )				/* ->D(1) | D(0) */

	MOVSS	( S(2), XMM2 )				/* uz */
	MULSS	( XMM1, XMM2 )				/* uz*m10 */
	MOVSS	( XMM2, D(2) )				/* ->D(2) */

	ADD_L	( CONST(16), EDI )			/* next dest slot */
	ADD_L	( EAX, ESI )				/* next source normal */
	CMP_L	( ECX, EDI )
	JNE	( LLBL(K_G3TNNRR_top) )			/* loop until the sentinel is reached */

LLBL(K_G3TNNRR_finish):
	POP_L	( EDI )
	POP_L	( ESI )
	RET
#undef FRAME_OFFSET
#endif

/* Mark the stack as non-executable on Linux/ELF. */
#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif