1848b8605Smrg
2848b8605Smrg/*
3848b8605Smrg * Mesa 3-D graphics library
4848b8605Smrg *
5848b8605Smrg * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
6848b8605Smrg *
7848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
8848b8605Smrg * copy of this software and associated documentation files (the "Software"),
9848b8605Smrg * to deal in the Software without restriction, including without limitation
10848b8605Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11848b8605Smrg * and/or sell copies of the Software, and to permit persons to whom the
12848b8605Smrg * Software is furnished to do so, subject to the following conditions:
13848b8605Smrg *
14848b8605Smrg * The above copyright notice and this permission notice shall be included
15848b8605Smrg * in all copies or substantial portions of the Software.
16848b8605Smrg *
17848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18848b8605Smrg * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20848b8605Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21848b8605Smrg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22848b8605Smrg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23848b8605Smrg * OTHER DEALINGS IN THE SOFTWARE.
24848b8605Smrg */
25848b8605Smrg
/** TODO:
  * - insert PREFETCH instructions to avoid cache misses
  * - some more optimizations are possible...
  * - for 40-50% more performance in the SSE functions, the
  *   data (trans-matrix, src_vert, dst_vert) needs to be 16-byte aligned
  */
32848b8605Smrg
33848b8605Smrg#ifdef USE_SSE_ASM
34848b8605Smrg#include "assyntax.h"
35848b8605Smrg#include "matypes.h"
36848b8605Smrg#include "xform_args.h"
37848b8605Smrg
38848b8605Smrg   SEG_TEXT
39848b8605Smrg
40848b8605Smrg#define S(i) 	REGOFF(i * 4, ESI)
41848b8605Smrg#define D(i) 	REGOFF(i * 4, EDI)
42848b8605Smrg#define M(i) 	REGOFF(i * 4, EDX)
43848b8605Smrg
44848b8605Smrg
/*
 * _mesa_sse_transform_points1_general
 *
 * Only the first float (x) of every source vertex is read; a full
 * 4-component result is written per vertex:
 *
 *     dst[i] = x * m[i] + m[12 + i]		for i = 0..3
 *
 * The source vector, dest vector and matrix pointers are taken from the
 * caller's arguments.  Dest vertices are written 16 bytes apart, source
 * vertices are stepped by the source vector's stride (in bytes).  The dest
 * vector gets the source count, size 4 and the VEC_SIZE_4 flag.  A zero
 * count returns without touching the dest vector.
 *
 * Registers: ESI = source cursor, EDI = dest cursor, EAX = source stride,
 * ECX = end of dest data (dest start + count * 16), EDX = matrix.
 * ESI/EDI are saved and restored; EAX, ECX, EDX and XMM0-XMM2 are
 * overwritten.
 *
 * The two matrix rows are fetched with MOVUPS rather than MOVAPS so that a
 * matrix pointer that is not 16-byte aligned does not fault; the loads are
 * done once, outside the loop.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points1_general)
HIDDEN( _mesa_sse_transform_points1_general )
GLNAME( _mesa_sse_transform_points1_general ):

#define FRAME_OFFSET 8				/* bytes pushed below (ESI + EDI) */
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )			/* ptr to matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */

    TEST_L( ECX, ECX )				/* count == 0 ? */
    JZ( LLBL(K_GTP1GR_finish) )			/* yes -> nothing to do */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )	/* set dest flags */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )	/* set dest size */

    SHL_L( CONST(4), ECX )			/* count *= 16 (bytes per dest vertex) */
    MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */

    MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L( EDI, ECX )				/* ECX = one past the last dest vertex */


ALIGNTEXT32
    MOVUPS( M(0), XMM0 )			/* m3  | m2  | m1  | m0  */
    MOVUPS( M(12), XMM1 )			/* m15 | m14 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP1GR_top):
    MOVSS( S(0), XMM2 )				/* ox */
    SHUFPS( CONST(0x0), XMM2, XMM2 )		/* ox | ox | ox | ox */
    MULPS( XMM0, XMM2 )				/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
    ADDPS( XMM1, XMM2 )				/* +m15  | +m14  | +m13  | +m12  */
    MOVUPS( XMM2, D(0) )			/* store all four components */

    ADD_L( CONST(16), EDI )			/* next dest vertex */
    ADD_L( EAX, ESI )				/* next source vertex */
    CMP_L( ECX, EDI )				/* reached end of dest data ? */
    JNE( LLBL(K_GTP1GR_top) )

LLBL(K_GTP1GR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
99848b8605Smrg
100848b8605Smrg
101848b8605Smrg
/*
 * _mesa_sse_transform_points1_identity
 *
 * Copies the first 4-byte element of every source vertex to the matching
 * dest vertex.  No matrix is read and no SSE instruction is needed.
 *
 * The dest vector gets the source count, size 1 and the VEC_SIZE_1 flag.
 * Nothing is done for a zero count, and the copy loop is skipped when the
 * source and dest data pointers are identical.
 *
 * Registers: ESI = source cursor (advanced by the source stride in EAX),
 * EDI = dest cursor (advanced by 16 bytes), ECX = end of dest data,
 * EDX = scratch for the element being copied.  ESI/EDI are preserved.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points1_identity)
HIDDEN(_mesa_sse_transform_points1_identity)
GLNAME( _mesa_sse_transform_points1_identity ):

#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    /* Fetch the vector arguments; FRAME_OFFSET skips the two pushes above. */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* source GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* dest GLvector4f */

    /* Empty input: leave the dest vector untouched. */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP1IR_finish) )

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* bytes between source vertices */

    /* Describe the result in the dest vector header. */
    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L( CONST(1), REGOFF(V4F_SIZE, EDI) )
    OR_L( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, EDI) )

    /* Switch from vector headers to the vertex data itself. */
    MOV_L( REGOFF(V4F_START, ESI), ESI )
    MOV_L( REGOFF(V4F_START, EDI), EDI )

    /* ECX = dest start + count * 16 = loop end marker. */
    SHL_L( CONST(4), ECX )
    ADD_L( EDI, ECX )

    /* Same data pointer on both sides: the copy would be a no-op. */
    CMP_L( ESI, EDI )
    JE( LLBL(K_GTP1IR_finish) )


ALIGNTEXT32
LLBL(K_GTP1IR_top):
    MOV_L( S(0), EDX )				/* x, copied as raw 32 bits */
    MOV_L( EDX, D(0) )

    ADD_L( EAX, ESI )				/* next source vertex */
    ADD_L( CONST(16), EDI )			/* next dest vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP1IR_top) )

LLBL(K_GTP1IR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
151848b8605Smrg
152848b8605Smrg
153848b8605Smrg
/*
 * _mesa_sse_transform_points1_3d_no_rot
 *
 * Per source vertex (only x is read):
 *
 *     dst[0] = x * m[0] + m[12]
 *     dst[1] = m[13]
 *     dst[2] = m[14]
 *
 * dst[3] is not written.  The dest vector gets the source count, size 3
 * and the VEC_SIZE_3 flag.  A zero count returns immediately.
 *
 * Registers: ESI = source cursor (advanced by the source stride in EAX),
 * EDI = dest cursor (advanced by 16 bytes), ECX = end of dest data,
 * EDX = matrix.  XMM0-XMM3 hold the matrix constants, XMM4 is the
 * per-vertex scratch.  ESI/EDI are preserved.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points1_3d_no_rot)
HIDDEN(_mesa_sse_transform_points1_3d_no_rot)
GLNAME(_mesa_sse_transform_points1_3d_no_rot):

#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    /* Fetch the arguments; FRAME_OFFSET skips the two pushes above. */
    MOV_L( ARG_MATRIX, EDX )			/* matrix */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* source GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* dest GLvector4f */

    /* Empty input: leave the dest vector untouched. */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP13DNRR_finish) )

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* bytes between source vertices */

    /* Describe the result in the dest vector header. */
    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )

    /* Switch from vector headers to the vertex data itself. */
    MOV_L( REGOFF(V4F_START, ESI), ESI )
    MOV_L( REGOFF(V4F_START, EDI), EDI )

    /* ECX = dest start + count * 16 = loop end marker. */
    SHL_L( CONST(4), ECX )
    ADD_L( EDI, ECX )


ALIGNTEXT32
    /* Loop-invariant matrix elements. */
    MOVSS( M(0), XMM0 )				/* scale:     m0  */
    MOVSS( M(12), XMM1 )			/* translate: m12 */
    MOVSS( M(13), XMM2 )			/* constant y: m13 */
    MOVSS( M(14), XMM3 )			/* constant z: m14 */

ALIGNTEXT32
LLBL(K_GTP13DNRR_top):
    MOVSS( S(0), XMM4 )				/* x */
    MULSS( XMM0, XMM4 )				/* x*m0 */
    ADDSS( XMM1, XMM4 )				/* x*m0 + m12 */

    MOVSS( XMM4, D(0) )
    MOVSS( XMM2, D(1) )
    MOVSS( XMM3, D(2) )

    ADD_L( EAX, ESI )				/* next source vertex */
    ADD_L( CONST(16), EDI )			/* next dest vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP13DNRR_top) )

LLBL(K_GTP13DNRR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
212848b8605Smrg
213848b8605Smrg
214848b8605Smrg
/*
 * _mesa_sse_transform_points1_perspective
 *
 * Per source vertex (only x is read):
 *
 *     dst[0] = x * m[0]
 *     dst[1] = 0
 *     dst[2] = m[14]
 *     dst[3] = 0
 *
 * The dest vector gets the source count, size 4 and the VEC_SIZE_4 flag.
 * A zero count returns immediately.
 *
 * Registers: ESI = source cursor (advanced by the source stride in EAX),
 * EDI = dest cursor (advanced by 16 bytes), ECX = end of dest data,
 * EDX = matrix.  XMM0 = zero, XMM1 = m0, XMM2 = m14, XMM3 = per-vertex
 * scratch.  ESI/EDI are preserved.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points1_perspective)
HIDDEN(_mesa_sse_transform_points1_perspective)
GLNAME(_mesa_sse_transform_points1_perspective):

#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    /* Fetch the arguments; FRAME_OFFSET skips the two pushes above. */
    MOV_L( ARG_MATRIX, EDX )			/* matrix */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* source GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* dest GLvector4f */

    /* Empty input: leave the dest vector untouched. */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP13PR_finish) )

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* bytes between source vertices */

    /* Describe the result in the dest vector header. */
    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )

    /* Switch from vector headers to the vertex data itself. */
    MOV_L( REGOFF(V4F_START, ESI), ESI )
    MOV_L( REGOFF(V4F_START, EDI), EDI )

    /* ECX = dest start + count * 16 = loop end marker. */
    SHL_L( CONST(4), ECX )
    ADD_L( EDI, ECX )


ALIGNTEXT32
    /* Loop-invariant values. */
    XORPS( XMM0, XMM0 )				/* 0 | 0 | 0 | 0 */
    MOVSS( M(0), XMM1 )				/* m0 */
    MOVSS( M(14), XMM2 )			/* m14 */

ALIGNTEXT32
LLBL(K_GTP13PR_top):
    MOVSS( S(0), XMM3 )				/* x */
    MULSS( XMM1, XMM3 )				/* x*m0 */

    MOVSS( XMM3, D(0) )				/* dst[0] = x*m0 */
    MOVSS( XMM0, D(1) )				/* dst[1] = 0 */
    MOVSS( XMM2, D(2) )				/* dst[2] = m14 */
    MOVSS( XMM0, D(3) )				/* dst[3] = 0 */

    ADD_L( EAX, ESI )				/* next source vertex */
    ADD_L( CONST(16), EDI )			/* next dest vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP13PR_top) )

LLBL(K_GTP13PR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
272848b8605Smrg
273848b8605Smrg
/*
 * _mesa_sse_transform_points1_2d
 *
 * Per source vertex (only x is read):
 *
 *     dst[0] = x * m[0] + m[12]
 *     dst[1] = x * m[1] + m[13]
 *
 * dst[2] and dst[3] are not written.  The dest vector gets the source
 * count, size 2 and the VEC_SIZE_2 flag.  A zero count returns immediately.
 *
 * Registers: ESI = source cursor (advanced by the source stride in EAX),
 * EDI = dest cursor (advanced by 16 bytes), ECX = end of dest data,
 * EDX = matrix.  ESI/EDI are saved and restored; EAX, ECX, EDX and
 * XMM0-XMM2 are overwritten.
 *
 * MOVLPS only writes the low two lanes of its destination, so XMM0 and
 * XMM1 are zeroed first.  Otherwise the packed multiply/add in the loop
 * would operate on whatever the caller left in the upper lanes, which can
 * hit NaN/denormal slow paths and raise spurious SSE exception flags even
 * though those lanes are never stored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points1_2d)
HIDDEN(_mesa_sse_transform_points1_2d)
GLNAME(_mesa_sse_transform_points1_2d):

#define FRAME_OFFSET 8				/* bytes pushed below (ESI + EDI) */
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX )			/* ptr to matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */

    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP13P2DR_finish) )		/* count was zero; go to finish */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */
    OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )	/* set dest flags */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
    MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )	/* set dest size */

    SHL_L( CONST(4), ECX )			/* count *= 16 (bytes per dest vertex) */
    MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */

    MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
    ADD_L( EDI, ECX )				/* ECX = one past the last dest vertex */

ALIGNTEXT32
    XORPS( XMM0, XMM0 )				/* clear stale upper lanes */
    XORPS( XMM1, XMM1 )
    MOVLPS( M(0), XMM0 )			/* 0 | 0 | m1  | m0  */
    MOVLPS( M(12), XMM1 )			/* 0 | 0 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP13P2DR_top):
    MOVSS( S(0), XMM2 )				/* ox */
    SHUFPS( CONST(0x0), XMM2, XMM2 )		/* ox | ox | ox | ox */
    MULPS( XMM0, XMM2 )				/* ox*0 | ox*0 | ox*m1 | ox*m0 */
    ADDPS( XMM1, XMM2 )				/* - | - | ox*m1+m13 | ox*m0+m12 */
    MOVLPS( XMM2, D(0) )			/* store the low two lanes only */

    ADD_L( CONST(16), EDI )			/* next dest vertex */
    ADD_L( EAX, ESI )				/* next source vertex */
    CMP_L( ECX, EDI )				/* reached end of dest data ? */
    JNE( LLBL(K_GTP13P2DR_top) )

LLBL(K_GTP13P2DR_finish):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
327848b8605Smrg
328848b8605Smrg
/*
 * _mesa_sse_transform_points1_2d_no_rot
 *
 * Per source vertex (only x is read):
 *
 *     dst[0] = x * m[0] + m[12]
 *     dst[1] = m[13]
 *
 * dst[2] and dst[3] are not written.  The dest vector gets the source
 * count, size 2 and the VEC_SIZE_2 flag.  A zero count returns immediately.
 *
 * Registers: ESI = source cursor (advanced by the source stride in EAX),
 * EDI = dest cursor (advanced by 16 bytes), ECX = end of dest data,
 * EDX = matrix.  XMM0 = m0, XMM1 = m12, XMM2 = m13, XMM3 = per-vertex
 * scratch.  ESI/EDI are preserved.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points1_2d_no_rot)
HIDDEN(_mesa_sse_transform_points1_2d_no_rot)
GLNAME(_mesa_sse_transform_points1_2d_no_rot):

#define FRAME_OFFSET 8
	PUSH_L( ESI )
	PUSH_L( EDI )

	/* Fetch the arguments; FRAME_OFFSET skips the two pushes above. */
	MOV_L( ARG_MATRIX, EDX )		/* matrix */
	MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* source GLvector4f */
	MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* dest GLvector4f */

	/* Empty input: leave the dest vector untouched. */
	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
	TEST_L( ECX, ECX )
	JZ( LLBL(K_GTP13P2DNRR_finish) )

	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* bytes between source vertices */

	/* Describe the result in the dest vector header. */
	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
	MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
	OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )

	/* Switch from vector headers to the vertex data itself. */
	MOV_L( REGOFF(V4F_START, ESI), ESI )
	MOV_L( REGOFF(V4F_START, EDI), EDI )

	/* ECX = dest start + count * 16 = loop end marker. */
	SHL_L( CONST(4), ECX )
	ADD_L( EDI, ECX )

ALIGNTEXT32
	/* Loop-invariant matrix elements. */
	MOVSS( M(0), XMM0 )			/* scale:     m0  */
	MOVSS( M(12), XMM1 )			/* translate: m12 */
	MOVSS( M(13), XMM2 )			/* constant y: m13 */

ALIGNTEXT32
LLBL(K_GTP13P2DNRR_top):
	MOVSS( S(0), XMM3 )			/* x */
	MULSS( XMM0, XMM3 )			/* x*m0 */
	ADDSS( XMM1, XMM3 )			/* x*m0 + m12 */

	MOVSS( XMM3, D(0) )
	MOVSS( XMM2, D(1) )

	ADD_L( EAX, ESI )			/* next source vertex */
	ADD_L( CONST(16), EDI )			/* next dest vertex */
	CMP_L( ECX, EDI )
	JNE( LLBL(K_GTP13P2DNRR_top) )

LLBL(K_GTP13P2DNRR_finish):
	POP_L( EDI )
	POP_L( ESI )
	RET
#undef FRAME_OFFSET
383848b8605Smrg
384848b8605Smrg
385848b8605Smrg
/*
 * _mesa_sse_transform_points1_3d
 *
 * Per source vertex (only x is read):
 *
 *     dst[i] = x * m[i] + m[12 + i]		for i = 0..2
 *
 * All four lanes are computed but dst[3] is not written.  The dest vector
 * gets the source count, size 3 and the VEC_SIZE_3 flag.  A zero count
 * returns immediately.
 *
 * Registers: ESI = source cursor (advanced by the source stride in EAX),
 * EDI = dest cursor (advanced by 16 bytes), ECX = end of dest data,
 * EDX = matrix.  ESI/EDI are saved and restored; EAX, ECX, EDX and
 * XMM0-XMM2 are overwritten.
 *
 * The two matrix rows are fetched with MOVUPS rather than MOVAPS so that a
 * matrix pointer that is not 16-byte aligned does not fault; the loads are
 * done once, outside the loop.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points1_3d)
HIDDEN(_mesa_sse_transform_points1_3d)
GLNAME(_mesa_sse_transform_points1_3d):

#define FRAME_OFFSET 8				/* bytes pushed below (ESI + EDI) */
	PUSH_L( ESI )
	PUSH_L( EDI )

	MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
	MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* ptr to dest GLvector4f */

	MOV_L( ARG_MATRIX, EDX )		/* ptr to matrix */
	MOV_L( REGOFF(V4F_COUNT, ESI), ECX )	/* source count */

	TEST_L( ECX, ECX )
	JZ( LLBL(K_GTP13P3DR_finish) )		/* count was zero; go to finish */

	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )	/* source stride */
	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )	/* set dest flags */

	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )	/* set dest count */
	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )	/* set dest size */

	SHL_L( CONST(4), ECX )			/* count *= 16 (bytes per dest vertex) */
	MOV_L( REGOFF(V4F_START, ESI), ESI )	/* ptr to first source vertex */

	MOV_L( REGOFF(V4F_START, EDI), EDI )	/* ptr to first dest vertex */
	ADD_L( EDI, ECX )			/* ECX = one past the last dest vertex */


ALIGNTEXT32
	MOVUPS( M(0), XMM0 )			/* m3  | m2  | m1  |  m0 */
	MOVUPS( M(12), XMM1 )			/* m15 | m14 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP13P3DR_top):
	MOVSS( S(0), XMM2 )			/* ox */
	SHUFPS( CONST(0x0), XMM2, XMM2 )	/* ox | ox | ox | ox */
	MULPS( XMM0, XMM2 )			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	ADDPS( XMM1, XMM2 )			/* +m15  | +m14  | +m13  | +m12  */
	MOVLPS( XMM2, D(0) )			/*   -   |   -   | ->D(1)| ->D(0)*/
	UNPCKHPS( XMM2, XMM2 )			/* ox*m3+m15 | ox*m3+m15 | ox*m2+m14 | ox*m2+m14 */
	MOVSS( XMM2, D(2) )			/* low lane now holds ox*m2+m14 */

	ADD_L( CONST(16), EDI )			/* next dest vertex */
	ADD_L( EAX, ESI )			/* next source vertex */
	CMP_L( ECX, EDI )			/* reached end of dest data ? */
	JNE( LLBL(K_GTP13P3DR_top) )

LLBL(K_GTP13P3DR_finish):
	POP_L( EDI )
	POP_L( ESI )
	RET
#undef FRAME_OFFSET
442848b8605Smrg#endif
443848b8605Smrg
/*
 * On Linux/ELF, emit an empty .note.GNU-stack section so that the GNU
 * toolchain does not treat this object as requiring an executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
	.section .note.GNU-stack,"",%progbits
#endif
447