
/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/** TODO:
  * - insert PREFETCH instructions to avoid cache-misses !
  * - some more optimizations are possible...
  * - for 40-50% more performance in the SSE-functions, the
  *   data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
  */

/*
 * Conventions shared by the three routines in this file:
 *
 *  - Arguments are read from the stack through the ARG_* macros.
 *    NOTE(review): these presumably come from norm_args.h and use
 *    FRAME_OFFSET to skip the saved registers - confirm against that header.
 *    FRAME_OFFSET is 8 because each prologue pushes ESI and EDI (2 * 4 bytes).
 *  - ESI and EDI are saved/restored; EAX, ECX, EDX and the XMM registers
 *    are used as scratch and are not preserved.
 *  - Register roles inside the loops:
 *        ESI = current source normal     EAX = source stride in bytes
 *        EDI = current dest normal       ECX = one-past-the-end dest pointer
 *        EDX = the matrix's inverse (array of floats)
 *  - The destination is always advanced by 16 bytes per normal; the source
 *    is advanced by the stride read from the source vector.
 *  - A source count of zero returns immediately without touching the
 *    destination (its count field is left unmodified in that case).
 */

#ifdef USE_SSE_ASM
#include "assyntax.h"
#include "matypes.h"
#include "norm_args.h"

    SEG_TEXT

/* Float element i of the matrix / current source normal / current dest normal */
#define M(i)    REGOFF(i * 4, EDX)
#define S(i)    REGOFF(i * 4, ESI)
#define D(i)    REGOFF(i * 4, EDI)
/* Stride field of the source vector; only valid while ESI still points at
 * the vector struct, i.e. before ESI is replaced by the data pointer. */
#define STRIDE  REGOFF(12, ESI)


/*
 * _mesa_sse_transform_rescale_normals_no_rot
 *
 * For every source normal u, using only the diagonal of the inverse matrix m:
 *     dest.x = u.x * (m[0]  * scale)
 *     dest.y = u.y * (m[5]  * scale)
 *     dest.z = u.z * (m[10] * scale)
 *
 * Loop-invariant registers:
 *     XMM1 = m5*scale | m0*scale      XMM0 (low float) = m10*scale
 */
ALIGNTEXT16
GLOBL GLNAME(_mesa_sse_transform_rescale_normals_no_rot)
HIDDEN(_mesa_sse_transform_rescale_normals_no_rot)
GLNAME(_mesa_sse_transform_rescale_normals_no_rot):

#define FRAME_OFFSET 8
    PUSH_L  ( ESI )
    PUSH_L  ( EDI )

    MOV_L   ( ARG_IN, ESI )                     /* ptr to source GLvector3f */
    MOV_L   ( ARG_DEST, EDI )                   /* ptr to dest GLvector3f */

    MOV_L   ( ARG_MAT, EDX )                    /* ptr to matrix */
    MOV_L   ( REGOFF(MATRIX_INV, EDX), EDX )    /* matrix->inv */

    MOV_L   ( REGOFF(V4F_COUNT, ESI), ECX )     /* source count */

    TEST_L  ( ECX, ECX )
    JZ      ( LLBL(K_G3TRNNRR_finish) )         /* nothing to do for count == 0 */

    MOV_L   ( STRIDE, EAX )                     /* source stride (read before ESI is repointed) */
    MOV_L   ( ECX, REGOFF(V4F_COUNT, EDI) )     /* dest count = source count */

    IMUL_L  ( CONST(16), ECX )                  /* count * 16 = dest bytes to produce */
    MOV_L   ( REGOFF(V4F_START, ESI), ESI )     /* ptr to first source normal */

    MOV_L   ( REGOFF(V4F_START, EDI), EDI )     /* ptr to first dest normal */
    ADD_L   ( EDI, ECX )                        /* ECX = end-of-dest sentinel */

ALIGNTEXT32
    MOVSS   ( M(0), XMM1 )                      /* m0 */
    MOVSS   ( M(5), XMM2 )                      /* m5 */
    UNPCKLPS( XMM2, XMM1 )                      /* m5 | m0 */
    MOVSS   ( ARG_SCALE, XMM0 )                 /* scale */
    SHUFPS  ( CONST(0x0), XMM0, XMM0 )          /* broadcast scale to every lane */
    MULPS   ( XMM0, XMM1 )                      /* m5*scale | m0*scale */
    MULSS   ( M(10), XMM0 )                     /* m10*scale */

ALIGNTEXT32
LLBL(K_G3TRNNRR_top):
    MOVLPS  ( S(0), XMM2 )                      /* uy | ux */
    MULPS   ( XMM1, XMM2 )                      /* uy*m5*scale | ux*m0*scale */
    MOVLPS  ( XMM2, D(0) )                      /* -> D(1) | D(0) */

    MOVSS   ( S(2), XMM2 )                      /* uz */
    MULSS   ( XMM0, XMM2 )                      /* uz*m10*scale */
    MOVSS   ( XMM2, D(2) )                      /* -> D(2) */

    ADD_L   ( CONST(16), EDI )                  /* next dest normal (fixed 16 byte step) */
    ADD_L   ( EAX, ESI )                        /* next source normal */
    CMP_L   ( ECX, EDI )
    JNE     ( LLBL(K_G3TRNNRR_top) )

LLBL(K_G3TRNNRR_finish):
    POP_L   ( EDI )
    POP_L   ( ESI )
    RET
#undef FRAME_OFFSET



/*
 * _mesa_sse_transform_rescale_normals
 *
 * For every source normal u, with m = the matrix's inverse and s = scale:
 *     dest.x = u.x*m[0]*s + u.y*m[1]*s + u.z*m[2]*s
 *     dest.y = u.x*m[4]*s + u.y*m[5]*s + u.z*m[6]*s
 *     dest.z = u.x*m[8]*s + u.y*m[9]*s + u.z*m[10]*s
 *
 * Loop-invariant registers (all pre-multiplied by scale):
 *     XMM0 = m4 | m0     XMM1 = m5 | m1     XMM2 = m6 | m2
 *     XMM5 = m10         XMM6 = m8          XMM7 = m9      (low float only)
 * XMM3 is the accumulator and XMM4 the per-term temporary.
 *
 * The source normal is read again after dest x/y have been stored, so the
 * source and destination storage must not overlap.
 */
ALIGNTEXT16
GLOBL GLNAME(_mesa_sse_transform_rescale_normals)
HIDDEN(_mesa_sse_transform_rescale_normals)
GLNAME(_mesa_sse_transform_rescale_normals):

#define FRAME_OFFSET 8
    PUSH_L  ( ESI )
    PUSH_L  ( EDI )

    MOV_L   ( ARG_IN, ESI )                     /* ptr to source GLvector3f */
    MOV_L   ( ARG_DEST, EDI )                   /* ptr to dest GLvector3f */

    MOV_L   ( ARG_MAT, EDX )                    /* ptr to matrix */
    MOV_L   ( REGOFF(MATRIX_INV, EDX), EDX )    /* matrix->inv */

    MOV_L   ( REGOFF(V4F_COUNT, ESI), ECX )     /* source count */

    TEST_L  ( ECX, ECX )
    JZ      ( LLBL(K_G3TRNR_finish) )           /* nothing to do for count == 0 */

    MOV_L   ( STRIDE, EAX )                     /* source stride (read before ESI is repointed) */
    MOV_L   ( ECX, REGOFF(V4F_COUNT, EDI) )     /* dest count = source count */

    IMUL_L  ( CONST(16), ECX )                  /* count * 16 = dest bytes to produce */
    MOV_L   ( REGOFF(V4F_START, ESI), ESI )     /* ptr to first source normal */

    MOV_L   ( REGOFF(V4F_START, EDI), EDI )     /* ptr to first dest normal */
    ADD_L   ( EDI, ECX )                        /* ECX = end-of-dest sentinel */

ALIGNTEXT32
    MOVSS   ( M(0), XMM0 )                      /* m0 */
    MOVSS   ( M(4), XMM1 )                      /* m4 */
    UNPCKLPS( XMM1, XMM0 )                      /* m4 | m0 */

    MOVSS   ( ARG_SCALE, XMM4 )                 /* scale */
    SHUFPS  ( CONST(0x0), XMM4, XMM4 )          /* broadcast scale to every lane */

    MULPS   ( XMM4, XMM0 )                      /* m4*scale | m0*scale */
    MOVSS   ( M(1), XMM1 )                      /* m1 */
    MOVSS   ( M(5), XMM2 )                      /* m5 */
    UNPCKLPS( XMM2, XMM1 )                      /* m5 | m1 */
    MULPS   ( XMM4, XMM1 )                      /* m5*scale | m1*scale */
    MOVSS   ( M(2), XMM2 )                      /* m2 */
    MOVSS   ( M(6), XMM3 )                      /* m6 */
    UNPCKLPS( XMM3, XMM2 )                      /* m6 | m2 */
    MULPS   ( XMM4, XMM2 )                      /* m6*scale | m2*scale */

    MOVSS   ( M(8), XMM6 )                      /* m8 */
    MULSS   ( ARG_SCALE, XMM6 )                 /* m8*scale */
    MOVSS   ( M(9), XMM7 )                      /* m9 */
    MULSS   ( ARG_SCALE, XMM7 )                 /* m9*scale */
    /* m10*scale is loop-invariant just like m8/m9, so compute it once here.
     * XMM5 is free for this because the loop only needs XMM3/XMM4 as
     * temporaries. */
    MOVSS   ( M(10), XMM5 )                     /* m10 */
    MULSS   ( ARG_SCALE, XMM5 )                 /* m10*scale */

ALIGNTEXT32
LLBL(K_G3TRNR_top):
    /* dest.y | dest.x, accumulated as (ux-term + uy-term) + uz-term */
    MOVSS   ( S(0), XMM3 )                      /* ux */
    SHUFPS  ( CONST(0x0), XMM3, XMM3 )          /* ux | ux */
    MULPS   ( XMM0, XMM3 )                      /* ux*m4*scale | ux*m0*scale */
    MOVSS   ( S(1), XMM4 )                      /* uy */
    SHUFPS  ( CONST(0x0), XMM4, XMM4 )          /* uy | uy */
    MULPS   ( XMM1, XMM4 )                      /* uy*m5*scale | uy*m1*scale */
    ADDPS   ( XMM4, XMM3 )
    MOVSS   ( S(2), XMM4 )                      /* uz */
    SHUFPS  ( CONST(0x0), XMM4, XMM4 )          /* uz | uz */
    MULPS   ( XMM2, XMM4 )                      /* uz*m6*scale | uz*m2*scale */
    ADDPS   ( XMM4, XMM3 )
    MOVLPS  ( XMM3, D(0) )                      /* -> D(1) | D(0) */

    /* dest.z, accumulated as (uz-term + uy-term) + ux-term */
    MOVSS   ( S(2), XMM3 )                      /* uz */
    MULSS   ( XMM5, XMM3 )                      /* uz*m10*scale */
    MOVSS   ( S(1), XMM4 )                      /* uy */
    MULSS   ( XMM7, XMM4 )                      /* uy*m9*scale */
    ADDSS   ( XMM4, XMM3 )
    MOVSS   ( S(0), XMM4 )                      /* ux */
    MULSS   ( XMM6, XMM4 )                      /* ux*m8*scale */
    ADDSS   ( XMM4, XMM3 )
    MOVSS   ( XMM3, D(2) )                      /* -> D(2) */

    ADD_L   ( CONST(16), EDI )                  /* next dest normal (fixed 16 byte step) */
    ADD_L   ( EAX, ESI )                        /* next source normal */
    CMP_L   ( ECX, EDI )
    JNE     ( LLBL(K_G3TRNR_top) )

LLBL(K_G3TRNR_finish):
    POP_L   ( EDI )
    POP_L   ( ESI )
    RET
#undef FRAME_OFFSET


/*
 * _mesa_sse_transform_normals_no_rot
 *
 * For every source normal u, using only the diagonal of the inverse matrix m
 * (no scale factor is applied by this routine):
 *     dest.x = u.x * m[0]
 *     dest.y = u.y * m[5]
 *     dest.z = u.z * m[10]
 *
 * Loop-invariant registers:
 *     XMM0 = m5 | m0      XMM1 (low float) = m10
 */
ALIGNTEXT16
GLOBL GLNAME(_mesa_sse_transform_normals_no_rot)
HIDDEN(_mesa_sse_transform_normals_no_rot)
GLNAME(_mesa_sse_transform_normals_no_rot):

#define FRAME_OFFSET 8
    PUSH_L  ( ESI )
    PUSH_L  ( EDI )

    MOV_L   ( ARG_IN, ESI )                     /* ptr to source GLvector3f */
    MOV_L   ( ARG_DEST, EDI )                   /* ptr to dest GLvector3f */

    MOV_L   ( ARG_MAT, EDX )                    /* ptr to matrix */
    MOV_L   ( REGOFF(MATRIX_INV, EDX), EDX )    /* matrix->inv */

    MOV_L   ( REGOFF(V4F_COUNT, ESI), ECX )     /* source count */

    TEST_L  ( ECX, ECX )
    JZ      ( LLBL(K_G3TNNRR_finish) )          /* nothing to do for count == 0 */

    MOV_L   ( STRIDE, EAX )                     /* source stride (read before ESI is repointed) */
    MOV_L   ( ECX, REGOFF(V4F_COUNT, EDI) )     /* dest count = source count */

    IMUL_L  ( CONST(16), ECX )                  /* count * 16 = dest bytes to produce */
    MOV_L   ( REGOFF(V4F_START, ESI), ESI )     /* ptr to first source normal */

    MOV_L   ( REGOFF(V4F_START, EDI), EDI )     /* ptr to first dest normal */
    ADD_L   ( EDI, ECX )                        /* ECX = end-of-dest sentinel */

ALIGNTEXT32
    MOVSS   ( M(0), XMM0 )                      /* m0 */
    MOVSS   ( M(5), XMM1 )                      /* m5 */
    UNPCKLPS( XMM1, XMM0 )                      /* m5 | m0 */
    MOVSS   ( M(10), XMM1 )                     /* m10 */

ALIGNTEXT32
LLBL(K_G3TNNRR_top):
    MOVLPS  ( S(0), XMM2 )                      /* uy | ux */
    MULPS   ( XMM0, XMM2 )                      /* uy*m5 | ux*m0 */
    MOVLPS  ( XMM2, D(0) )                      /* -> D(1) | D(0) */

    MOVSS   ( S(2), XMM2 )                      /* uz */
    MULSS   ( XMM1, XMM2 )                      /* uz*m10 */
    MOVSS   ( XMM2, D(2) )                      /* -> D(2) */

    ADD_L   ( CONST(16), EDI )                  /* next dest normal (fixed 16 byte step) */
    ADD_L   ( EAX, ESI )                        /* next source normal */
    CMP_L   ( ECX, EDI )
    JNE     ( LLBL(K_G3TNNRR_top) )

LLBL(K_G3TNNRR_finish):
    POP_L   ( EDI )
    POP_L   ( ESI )
    RET
#undef FRAME_OFFSET
#endif

#if defined (__ELF__) && defined (__linux__)
    .section .note.GNU-stack,"",%progbits
#endif