/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * SSE transforms of 4-component points (32-bit x86, assyntax.h macros).
 *
 * Every routine in this file follows the same pattern:
 *   - ESI and EDI are pushed on entry and popped before RET.
 *   - ARG_SOURCE / ARG_DEST / ARG_MATRIX are read after those two pushes.
 *     NOTE(review): FRAME_OFFSET (8) presumably tells xform_args.h about the
 *     8 bytes pushed here -- confirm against that header.
 *   - The source is walked with the byte stride read from V4F_STRIDE; the
 *     destination is always advanced by 16 bytes (4 floats) per vertex.
 *   - A source count of zero returns without touching the destination.
 */

#ifdef USE_SSE_ASM
#include "assyntax.h"
#define MATH_ASM_PTR_SIZE 4
#include "math/m_vector_asm.h"
#include "xform_args.h"

    SEG_TEXT

#define FRAME_OFFSET 8

/* i-th float of the current source vertex, dest vertex, and of the matrix. */
#define SRC(i) REGOFF(i * 4, ESI)
#define DST(i) REGOFF(i * 4, EDI)
#define MAT(i) REGOFF(i * 4, EDX)

/* Packs four 2-bit lane selectors into a SHUFPS immediate. */
#define SELECT(r0, r1, r2, r3) CONST( r0 * 64 + r1 * 16 + r2 * 4 + r3 )


/*
 * _mesa_sse_transform_points4_general
 *
 * For every source vertex (ox, oy, oz, ow) stores
 *     D = ox * M[0..3] + oy * M[4..7] + oz * M[8..11] + ow * M[12..15]
 * and tags the destination as a 4-component vector.
 *
 * Registers inside the loop:
 *     ESI  current source vertex      EDI  current dest vertex
 *     EAX  source stride in bytes     ECX  vertices remaining
 *     XMM4..XMM7  the four 4-float groups of the matrix
 *
 * The matrix and the destination are accessed with MOVAPS and therefore
 * must be 16-byte aligned; the source is read one float at a time.
 */
ALIGNTEXT16
GLOBL GLNAME( _mesa_sse_transform_points4_general )
HIDDEN(_mesa_sse_transform_points4_general)
GLNAME( _mesa_sse_transform_points4_general ):
    _CET_ENDBR
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( ARG_SOURCE, ESI )                            /* ptr to source GLvector4f */
    MOV_L( ARG_DEST, EDI )                              /* ptr to dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )                            /* ptr to matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )                /* source count */

    TEST_L( ECX, ECX )                                  /* verify non-zero count */
    JE( LLBL( sse_general_done ) )

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )               /* stride */
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )   /* set dest flags */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )                /* set dest count */
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )            /* set dest size */

    MOV_L( REGOFF(V4F_START, ESI), ESI )                /* ptr to first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )                /* ptr to first dest vertex */

    PREFETCHT0( REGIND(ESI) )

    /* The matrix is loop-invariant: keep it in XMM4..XMM7. */
    MOVAPS( MAT(0), XMM4 )                              /* m3  | m2  | m1  | m0  */
    MOVAPS( MAT(4), XMM5 )                              /* m7  | m6  | m5  | m4  */
    MOVAPS( MAT(8), XMM6 )                              /* m11 | m10 | m9  | m8  */
    MOVAPS( MAT(12), XMM7 )                             /* m15 | m14 | m13 | m12 */

ALIGNTEXT16
LLBL( sse_general_loop ):

    MOVSS( SRC(0), XMM0 )                               /* ox */
    SHUFPS( CONST(0x0), XMM0, XMM0 )                    /* ox | ox | ox | ox */
    MULPS( XMM4, XMM0 )                                 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */

    MOVSS( SRC(1), XMM1 )                               /* oy */
    SHUFPS( CONST(0x0), XMM1, XMM1 )                    /* oy | oy | oy | oy */
    MULPS( XMM5, XMM1 )                                 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */

    MOVSS( SRC(2), XMM2 )                               /* oz */
    SHUFPS( CONST(0x0), XMM2, XMM2 )                    /* oz | oz | oz | oz */
    MULPS( XMM6, XMM2 )                                 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */

    MOVSS( SRC(3), XMM3 )                               /* ow */
    SHUFPS( CONST(0x0), XMM3, XMM3 )                    /* ow | ow | ow | ow */
    MULPS( XMM7, XMM3 )                                 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */

    ADDPS( XMM1, XMM0 )                                 /* ox*m3+oy*m7 | ... */
    ADDPS( XMM2, XMM0 )                                 /* ox*m3+oy*m7+oz*m11 | ... */
    ADDPS( XMM3, XMM0 )                                 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
    MOVAPS( XMM0, DST(0) )                              /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */

    ADD_L( CONST(16), EDI )                             /* next dest vertex */
    ADD_L( EAX, ESI )                                   /* next source vertex */

    DEC_L( ECX )
    JNZ( LLBL( sse_general_loop ) )

LLBL( sse_general_done ):

    POP_L( EDI )
    POP_L( ESI )
    RET




/*
 * _mesa_sse_transform_points4_3d
 *
 * Same multiply-accumulate as the general routine, after which the w lane
 * of the stored result is overwritten with the untouched source w.
 *
 * Registers inside the loop:
 *     ESI  current source vertex      EDI  current dest vertex
 *     EAX  source stride in bytes     ECX  one-past-the-end dest pointer
 *     XMM0..XMM3  the four 4-float groups of the matrix
 *
 * The matrix and the destination are accessed with MOVAPS and therefore
 * must be 16-byte aligned; the source is read one float at a time.
 */
ALIGNTEXT16
GLOBL GLNAME( _mesa_sse_transform_points4_3d )
HIDDEN(_mesa_sse_transform_points4_3d)
GLNAME( _mesa_sse_transform_points4_3d ):
    _CET_ENDBR
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( ARG_SOURCE, ESI )                            /* ptr to source GLvector4f */
    MOV_L( ARG_DEST, EDI )                              /* ptr to dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )                            /* ptr to matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )                /* source count */

    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP43P3DR_finish) )                      /* count was zero; go to finish */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )               /* stride */

    /*
     * Four floats are stored per vertex and w is copied through from the
     * source, so the destination is a 4-component vector.  Tagging it with
     * size 3 would make consumers ignore the w written below.
     */
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )   /* set dest flags */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )                /* set dest count */
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )            /* set dest size */

    SHL_L( CONST(4), ECX )                              /* count *= 16 */
    MOV_L( REGOFF(V4F_START, ESI), ESI )                /* ptr to first source vertex */

    MOV_L( REGOFF(V4F_START, EDI), EDI )                /* ptr to first dest vertex */
    ADD_L( EDI, ECX )                                   /* count += dest ptr (end of dest) */

    /* The matrix is loop-invariant: keep it in XMM0..XMM3. */
    MOVAPS( MAT(0), XMM0 )                              /* m3  | m2  | m1  | m0  */
    MOVAPS( MAT(4), XMM1 )                              /* m7  | m6  | m5  | m4  */
    MOVAPS( MAT(8), XMM2 )                              /* m11 | m10 | m9  | m8  */
    MOVAPS( MAT(12), XMM3 )                             /* m15 | m14 | m13 | m12 */

ALIGNTEXT32
LLBL( K_GTP43P3DR_top ):
    MOVSS( SRC(0), XMM4 )                               /* ox */
    SHUFPS( CONST(0x0), XMM4, XMM4 )                    /* ox | ox | ox | ox */
    MULPS( XMM0, XMM4 )                                 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */

    MOVSS( SRC(1), XMM5 )                               /* oy */
    SHUFPS( CONST(0x0), XMM5, XMM5 )                    /* oy | oy | oy | oy */
    MULPS( XMM1, XMM5 )                                 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */

    MOVSS( SRC(2), XMM6 )                               /* oz */
    SHUFPS( CONST(0x0), XMM6, XMM6 )                    /* oz | oz | oz | oz */
    MULPS( XMM2, XMM6 )                                 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */

    MOVSS( SRC(3), XMM7 )                               /* ow */
    SHUFPS( CONST(0x0), XMM7, XMM7 )                    /* ow | ow | ow | ow */
    MULPS( XMM3, XMM7 )                                 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */

    ADDPS( XMM5, XMM4 )                                 /* ox*m3+oy*m7 | ... */
    ADDPS( XMM6, XMM4 )                                 /* ox*m3+oy*m7+oz*m11 | ... */
    ADDPS( XMM7, XMM4 )                                 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
    MOVAPS( XMM4, DST(0) )                              /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */

    /* Replace the computed w lane with the source w, bit for bit. */
    MOVSS( SRC(3), XMM4 )                               /* ow */
    MOVSS( XMM4, DST(3) )                               /* ->D(3) */

    ADD_L( CONST(16), EDI )                             /* next dest vertex */
    ADD_L( EAX, ESI )                                   /* next source vertex */
    CMP_L( ECX, EDI )                                   /* reached end of dest? */
    JNE( LLBL(K_GTP43P3DR_top) )

LLBL( K_GTP43P3DR_finish ):
    POP_L( EDI )
    POP_L( ESI )
    RET


/*
 * _mesa_sse_transform_points4_identity
 *
 * Copies every 4-float source vertex to the destination unchanged and tags
 * the destination as a 4-component vector.  The matrix argument is not read.
 *
 * Registers inside the loop:
 *     ESI  current source vertex      EDI  current dest vertex
 *     EAX  source stride in bytes     ECX  vertices remaining
 *
 * MOVAPS is used for both the load and the store, so the source pointer
 * (at every stride step) and the destination pointer must be 16-byte
 * aligned.
 */
ALIGNTEXT16
GLOBL GLNAME( _mesa_sse_transform_points4_identity )
HIDDEN(_mesa_sse_transform_points4_identity)
GLNAME( _mesa_sse_transform_points4_identity ):
    _CET_ENDBR
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( ARG_SOURCE, ESI )                            /* ptr to source GLvector4f */
    MOV_L( ARG_DEST, EDI )                              /* ptr to dest GLvector4f */

    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )                /* source count */

    TEST_L( ECX, ECX )                                  /* verify non-zero count */
    JE( LLBL( sse_identity_done ) )

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )               /* stride */
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )   /* set dest flags */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )                /* set dest count */
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )            /* set dest size */

    MOV_L( REGOFF(V4F_START, ESI), ESI )                /* ptr to first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )                /* ptr to first dest vertex */

ALIGNTEXT16
LLBL( sse_identity_loop ):

    PREFETCHNTA( REGOFF(32, ESI) )                      /* hint: 32 bytes past current vertex */

    MOVAPS( REGIND(ESI), XMM0 )                         /* load source vertex */
    ADD_L( EAX, ESI )                                   /* next source vertex */

    MOVAPS( XMM0, REGIND(EDI) )                         /* store it unchanged */
    ADD_L( CONST(16), EDI )                             /* next dest vertex */

    DEC_L( ECX )
    JNZ( LLBL( sse_identity_loop ) )

LLBL( sse_identity_done ):

    POP_L( EDI )
    POP_L( ESI )
    RET
#endif

#if defined (__ELF__) && defined (__linux__)
    .section .note.GNU-stack,"",%progbits
#endif