1848b8605Smrg
2848b8605Smrg/*
3848b8605Smrg * Mesa 3-D graphics library
4848b8605Smrg *
5848b8605Smrg * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
6848b8605Smrg *
7848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
8848b8605Smrg * copy of this software and associated documentation files (the "Software"),
9848b8605Smrg * to deal in the Software without restriction, including without limitation
10848b8605Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11848b8605Smrg * and/or sell copies of the Software, and to permit persons to whom the
12848b8605Smrg * Software is furnished to do so, subject to the following conditions:
13848b8605Smrg *
14848b8605Smrg * The above copyright notice and this permission notice shall be included
15848b8605Smrg * in all copies or substantial portions of the Software.
16848b8605Smrg *
17848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18848b8605Smrg * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20848b8605Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21848b8605Smrg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22848b8605Smrg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23848b8605Smrg * OTHER DEALINGS IN THE SOFTWARE.
24848b8605Smrg */
25848b8605Smrg
26848b8605Smrg/** TODO:
27848b8605Smrg  * - insert PREFETCH instructions to avoid cache-misses !
28848b8605Smrg  * - some more optimizations are possible...
29848b8605Smrg  * - for 40-50% more performance in the SSE-functions, the
30848b8605Smrg  *   data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
31848b8605Smrg  */
32848b8605Smrg
33848b8605Smrg#ifdef USE_SSE_ASM
34848b8605Smrg#include "assyntax.h"
35848b8605Smrg#include "matypes.h"
36848b8605Smrg#include "xform_args.h"
37848b8605Smrg
38848b8605Smrg   SEG_TEXT
39848b8605Smrg
/*
 * Operand shorthands for the i-th 32-bit element (byte offset i * 4) of
 * the memory addressed by ESI (S), EDI (D) and EDX (M).
 * The argument is parenthesized so that an expression argument such as
 * S(n + 1) is scaled as a whole instead of only its last term.
 */
#define S(i) 	REGOFF((i) * 4, ESI)
#define D(i) 	REGOFF((i) * 4, EDI)
#define M(i) 	REGOFF((i) * 4, EDX)
43848b8605Smrg
44848b8605Smrg
/*
 * _mesa_sse_transform_points2_general
 *
 * For every source vertex (ox, oy) stores the four-component result
 *     dst[i] = ox * m[i] + oy * m[4 + i] + m[12 + i],    i = 0..3
 * The dest vector receives the source count, size 4 and the VEC_SIZE_4
 * flag.  Nothing is written when the source count is zero.
 *
 * Stack arguments: source and dest GLvector4f pointers (OFFSET_SOURCE,
 * OFFSET_DEST) and the matrix pointer (ARG_MATRIX).
 *
 * Register roles inside the loop:
 *     ESI = current source vertex     EAX = source stride in bytes
 *     EDI = current dest vertex       ECX = dest start + count * 16
 *     EDX = matrix
 * ESI/EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM4 and the flags
 * are overwritten.
 *
 * The matrix loads and the dest store use MOVAPS, so both addresses have
 * to be 16-byte aligned.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points2_general)
HIDDEN(_mesa_sse_transform_points2_general)
GLNAME(_mesa_sse_transform_points2_general):

/* two registers are pushed before the stack arguments are read */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( ARG_MATRIX, EDX )				/* EDX = matrix */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* EDI = dest GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ESI = source GLvector4f */

    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )		/* ECX = number of vertices */
    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP2GR_done) )				/* nothing to transform */

    /* publish size, count and flags on the dest vector */
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )

    /* switch ESI/EDI from the vector headers to the vertex data */
    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )		/* EAX = source stride */
    MOV_L( REGOFF(V4F_START, ESI), ESI )		/* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )		/* first dest vertex */

    /* ECX = address one past the last dest vertex (16 bytes each) */
    SHL_L( CONST(4), ECX )
    ADD_L( EDI, ECX )

ALIGNTEXT32
    /* matrix rows used by the loop stay in registers */
    MOVAPS( M(0), XMM0 )				/* m3  | m2  | m1  | m0  */
    MOVAPS( M(4), XMM1 )				/* m7  | m6  | m5  | m4  */
    MOVAPS( M(12), XMM2 )				/* m15 | m14 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP2GR_loop):
    MOVSS( S(0), XMM3 )					/* ox */
    MOVSS( S(1), XMM4 )					/* oy */
    SHUFPS( CONST(0x0), XMM3, XMM3 )			/* ox in all four lanes */
    SHUFPS( CONST(0x0), XMM4, XMM4 )			/* oy in all four lanes */
    MULPS( XMM0, XMM3 )					/* ox * m[0..3] */
    MULPS( XMM1, XMM4 )					/* oy * m[4..7] */

    ADDPS( XMM4, XMM3 )					/* ox*m[0..3] + oy*m[4..7] */
    ADDPS( XMM2, XMM3 )					/* ... + m[12..15] */
    MOVAPS( XMM3, D(0) )				/* store x, y, z, w */

    ADD_L( EAX, ESI )					/* next source vertex */
    ADD_L( CONST(16), EDI )				/* next dest vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP2GR_loop) )

LLBL(K_GTP2GR_done):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
104848b8605Smrg
105848b8605Smrg
/*
 * _mesa_sse_transform_points2_identity
 *
 * Copies the first two 32-bit components of every source vertex to the
 * matching dest vertex.  The dest vector receives the source count,
 * size 2 and the VEC_SIZE_2 flag.  Nothing is written when the source
 * count is zero; the copy loop is skipped when source and dest vertex
 * data start at the same address (the dest header is still updated).
 *
 * Stack arguments: source and dest GLvector4f pointers (OFFSET_SOURCE,
 * OFFSET_DEST).
 *
 * Register roles inside the loop:
 *     ESI = current source vertex     EAX = source stride in bytes
 *     EDI = current dest vertex       ECX = dest start + count * 16
 *     EDX = scratch for the copied value
 * ESI/EDI are saved and restored; EAX, ECX, EDX and the flags are
 * overwritten.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points2_identity)
HIDDEN(_mesa_sse_transform_points2_identity)
GLNAME(_mesa_sse_transform_points2_identity):

/* two registers are pushed before the stack arguments are read */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* EDI = dest GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ESI = source GLvector4f */

    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )		/* ECX = number of vertices */
    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP2IR_done) )				/* nothing to copy */

    /* publish size, count and flags on the dest vector */
    MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
    OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )

    /* switch ESI/EDI from the vector headers to the vertex data */
    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )		/* EAX = source stride */
    MOV_L( REGOFF(V4F_START, ESI), ESI )		/* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )		/* first dest vertex */

    /* ECX = address one past the last dest vertex (16 bytes each) */
    SHL_L( CONST(4), ECX )
    ADD_L( EDI, ECX )

    /* same data on both sides: the copy would be a no-op */
    CMP_L( EDI, ESI )
    JE( LLBL(K_GTP2IR_done) )

ALIGNTEXT32
LLBL(K_GTP2IR_loop):
    MOV_L( S(0), EDX )					/* copy x */
    MOV_L( EDX, D(0) )
    MOV_L( S(1), EDX )					/* copy y */
    MOV_L( EDX, D(1) )

    ADD_L( EAX, ESI )					/* next source vertex */
    ADD_L( CONST(16), EDI )				/* next dest vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP2IR_loop) )

LLBL(K_GTP2IR_done):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
157848b8605Smrg
158848b8605Smrg
/*
 * _mesa_sse_transform_points2_3d_no_rot
 *
 * For every source vertex (ox, oy) stores
 *     dst[0] = ox * m0 + m12
 *     dst[1] = oy * m5 + m13
 *     dst[2] = m14
 * The dest vector receives the source count, size 3 and the VEC_SIZE_3
 * flag.  Nothing is written when the source count is zero.
 *
 * Stack arguments: source and dest GLvector4f pointers (OFFSET_SOURCE,
 * OFFSET_DEST) and the matrix pointer (ARG_MATRIX).
 *
 * Register roles inside the loop:
 *     ESI = current source vertex     EAX = source stride in bytes
 *     EDI = current dest vertex       ECX = dest start + count * 16
 *     XMM1 = 0 | 0 | m5 | m0          XMM2 = 0 | 0 | m13 | m12
 *     XMM3 = m14                      XMM0 = working register
 * ESI/EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM3 and the flags
 * are overwritten.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points2_3d_no_rot)
HIDDEN(_mesa_sse_transform_points2_3d_no_rot)
GLNAME(_mesa_sse_transform_points2_3d_no_rot):

/* two registers are pushed before the stack arguments are read */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( ARG_MATRIX, EDX )				/* EDX = matrix */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* EDI = dest GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ESI = source GLvector4f */

    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )		/* ECX = number of vertices */
    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP23DNRR_done) )			/* nothing to transform */

    /* publish size, count and flags on the dest vector */
    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )

    /* switch ESI/EDI from the vector headers to the vertex data */
    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )		/* EAX = source stride */
    MOV_L( REGOFF(V4F_START, ESI), ESI )		/* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )		/* first dest vertex */

    /* ECX = address one past the last dest vertex (16 bytes each) */
    SHL_L( CONST(4), ECX )
    ADD_L( EDI, ECX )

    /* MOVLPS in the loop only fills the low half, so zero the rest once */
    XORPS( XMM0, XMM0 )

ALIGNTEXT32
    MOVSS( M(5), XMM2 )					/* 0 | 0 | 0   | m5  */
    MOVSS( M(0), XMM1 )					/* 0 | 0 | 0   | m0  */
    UNPCKLPS( XMM2, XMM1 )				/* 0 | 0 | m5  | m0  */
    MOVSS( M(14), XMM3 )				/* 0 | 0 | 0   | m14 */
    MOVLPS( M(12), XMM2 )				/* 0 | 0 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP23DNRR_loop):
    MOVLPS( S(0), XMM0 )				/* oy | ox */
    MULPS( XMM1, XMM0 )					/* oy*m5 | ox*m0 */
    ADDPS( XMM2, XMM0 )					/* + m13 | + m12 */
    MOVSS( XMM3, D(2) )					/* z = m14 */
    MOVLPS( XMM0, D(0) )				/* store y | x */

    ADD_L( EAX, ESI )					/* next source vertex */
    ADD_L( CONST(16), EDI )				/* next dest vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP23DNRR_loop) )

LLBL(K_GTP23DNRR_done):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
218848b8605Smrg
219848b8605Smrg
/*
 * _mesa_sse_transform_points2_perspective
 *
 * For every source vertex (ox, oy) stores
 *     dst[0] = ox * m0
 *     dst[1] = oy * m5
 *     dst[2] = m14
 *     dst[3] = 0
 * The dest vector receives the source count, size 4 and the VEC_SIZE_4
 * flag.  Nothing is written when the source count is zero.
 *
 * Stack arguments: source and dest GLvector4f pointers (OFFSET_SOURCE,
 * OFFSET_DEST) and the matrix pointer (ARG_MATRIX).
 *
 * Register roles inside the loop:
 *     ESI = current source vertex     EAX = source stride in bytes
 *     EDI = current dest vertex       ECX = dest start + count * 16
 *     XMM1 = 0 | 0 | m5 | m0          XMM3 = m14
 *     XMM0 = 0                        XMM4 = working register
 * ESI/EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM4 and the flags
 * are overwritten.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points2_perspective)
HIDDEN(_mesa_sse_transform_points2_perspective)
GLNAME(_mesa_sse_transform_points2_perspective):

/* two registers are pushed before the stack arguments are read */
#define FRAME_OFFSET 8
    PUSH_L   ( ESI )
    PUSH_L   ( EDI )

    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ptr to source GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI ) 	/* ptr to dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */

    TEST_L( ECX, ECX)
    JZ( LLBL(K_GTP23PR_finish) )		/* count was zero; go to finish */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */

    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */

    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
    ADD_L( EDI, ECX ) 				/* count += dest ptr */

ALIGNTEXT32
    MOVSS    ( M(0), XMM1 )			/* 0  | 0  |  0  | m0  */
    MOVSS    ( M(5), XMM2 )			/* 0  | 0  |  0  | m5  */
    UNPCKLPS ( XMM2, XMM1 )			/* 0  | 0  | m5  | m0  */
    MOVSS    ( M(14), XMM3 )			/* m14 */
    XORPS    ( XMM0, XMM0 )			/* 0 | 0 | 0 | 0 */
    /*
     * MOVLPS in the loop only writes the low two lanes of XMM4.  Clear the
     * register once so MULPS never operates on whatever the caller left in
     * the upper lanes (same precaution as the 3d_no_rot variant).
     */
    XORPS    ( XMM4, XMM4 )			/* clean the working register */

ALIGNTEXT32
LLBL(K_GTP23PR_top):
    MOVLPS( S(0), XMM4 )			/* 0 | 0 | oy | ox */
    MULPS( XMM1, XMM4 )				/* 0 | 0 | oy*m5 | ox*m0 */
    MOVLPS( XMM4, D(0) )			/* ->D(1) | ->D(0) */
    MOVSS( XMM3, D(2) )				/* ->D(2) */
    MOVSS( XMM0, D(3) )				/* ->D(3) */

LLBL(K_GTP23PR_skip):
    ADD_L( CONST(16), EDI )			/* next dest vertex */
    ADD_L( EAX, ESI )				/* next source vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP23PR_top) )

LLBL(K_GTP23PR_finish):
    POP_L    ( EDI )
    POP_L    ( ESI )
    RET
#undef FRAME_OFFSET
276848b8605Smrg
277848b8605Smrg
278848b8605Smrg
/*
 * _mesa_sse_transform_points2_2d
 *
 * For every source vertex (ox, oy) stores
 *     dst[0] = ox * m0 + oy * m4 + m12
 *     dst[1] = ox * m1 + oy * m5 + m13
 * The dest vector receives the source count, size 2 and the VEC_SIZE_2
 * flag.  Nothing is written when the source count is zero.
 *
 * Stack arguments: source and dest GLvector4f pointers (OFFSET_SOURCE,
 * OFFSET_DEST) and the matrix pointer (ARG_MATRIX).
 *
 * Register roles inside the loop:
 *     ESI = current source vertex     EAX = source stride in bytes
 *     EDI = current dest vertex       ECX = dest start + count * 16
 *     XMM0 = 0 | 0 | m1  | m0         XMM1 = 0 | 0 | m5 | m4
 *     XMM2 = 0 | 0 | m13 | m12        XMM3, XMM4 = working registers
 * ESI/EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM4 and the flags
 * are overwritten.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points2_2d)
HIDDEN(_mesa_sse_transform_points2_2d)
GLNAME(_mesa_sse_transform_points2_2d):

/* two registers are pushed before the stack arguments are read */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI ) 	/* ptr to source GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI ) 	/* ptr to dest GLvector4f */

    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */

    TEST_L( ECX, ECX)
    JZ( LLBL(K_GTP23P2DR_finish) ) 		/* count was zero; go to finish */

    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
    OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */

    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
    MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */

    SHL_L( CONST(4), ECX ) 			/* count *= 16 */
    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */

    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
    ADD_L( EDI, ECX ) 				/* count += dest ptr */

    /*
     * MOVLPS only writes the low two lanes.  Clear the matrix registers
     * first so the packed multiplies/adds in the loop never operate on
     * whatever the caller left in the upper lanes.
     */
    XORPS( XMM0, XMM0 )
    XORPS( XMM1, XMM1 )
    XORPS( XMM2, XMM2 )

ALIGNTEXT32
    MOVLPS( M(0), XMM0 )			/* 0 | 0 | m1  | m0  */
    MOVLPS( M(4), XMM1 )			/* 0 | 0 | m5  | m4  */
    MOVLPS( M(12), XMM2 )			/* 0 | 0 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP23P2DR_top):
    MOVSS( S(0), XMM3 )				/* ox */
    SHUFPS( CONST(0x0), XMM3, XMM3 )		/* ox in all lanes */
    MULPS( XMM0, XMM3 )				/* ox*m1 | ox*m0 */

    MOVSS( S(1), XMM4 )				/* oy */
    SHUFPS( CONST(0x0), XMM4, XMM4 )		/* oy in all lanes */
    MULPS( XMM1, XMM4 )				/* oy*m5 | oy*m4 */

    ADDPS( XMM4, XMM3 )				/* ox*m[0..1] + oy*m[4..5] */
    ADDPS( XMM2, XMM3 )				/* ... + m[12..13] */
    MOVLPS( XMM3, D(0) )			/* ->D(1) | ->D(0) */

LLBL(K_GTP23P2DR_skip):
    ADD_L    ( CONST(16), EDI )			/* next dest vertex */
    ADD_L    ( EAX, ESI )			/* next source vertex */
    CMP_L    ( ECX, EDI )
    JNE      ( LLBL(K_GTP23P2DR_top) )

LLBL(K_GTP23P2DR_finish):
    POP_L    ( EDI )
    POP_L    ( ESI )
    RET
#undef FRAME_OFFSET
339848b8605Smrg
340848b8605Smrg
341848b8605Smrg
/*
 * _mesa_sse_transform_points2_2d_no_rot
 *
 * For every source vertex (ox, oy) stores
 *     dst[0] = ox * m0 + m12
 *     dst[1] = oy * m5 + m13
 * The dest vector receives the source count, size 2 and the VEC_SIZE_2
 * flag.  Nothing is written when the source count is zero.
 *
 * Stack arguments: source and dest GLvector4f pointers (OFFSET_SOURCE,
 * OFFSET_DEST) and the matrix pointer (ARG_MATRIX).
 *
 * Register roles inside the loop:
 *     ESI = current source vertex     EAX = source stride in bytes
 *     EDI = current dest vertex       ECX = dest start + count * 16
 *     XMM1 = 0 | 0 | m5 | m0          XMM2 = 0 | 0 | m13 | m12
 *     XMM0 = working register
 * ESI/EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM2 and the flags
 * are overwritten.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points2_2d_no_rot)
HIDDEN(_mesa_sse_transform_points2_2d_no_rot)
GLNAME(_mesa_sse_transform_points2_2d_no_rot):

/* two registers are pushed before the stack arguments are read */
#define FRAME_OFFSET 8
	PUSH_L( ESI )
	PUSH_L( EDI )

	MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI ) 	/* ptr to source GLvector4f */
	MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI ) 	/* ptr to dest GLvector4f */

	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */

	TEST_L( ECX, ECX)
	JZ( LLBL(K_GTP23P2DNRR_finish) ) 	/* count was zero; go to finish */

	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
	OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */

	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
	MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */

	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */

	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
	ADD_L( EDI, ECX ) 			/* count += dest ptr */

	/*
	 * MOVLPS in the loop only writes the low two lanes of XMM0.  Clear
	 * the register once so MULPS/ADDPS never operate on whatever the
	 * caller left in the upper lanes (same precaution as the 3d_no_rot
	 * variant).
	 */
	XORPS( XMM0, XMM0 )			/* clean the working register */

ALIGNTEXT32
	MOVSS    ( M(0), XMM1 )			/* 0 | 0 | 0  | m0 */
	MOVSS    ( M(5), XMM2 )			/* 0 | 0 | 0  | m5 */
	UNPCKLPS ( XMM2, XMM1 )			/* 0 | 0 | m5 | m0 */
	MOVLPS   ( M(12), XMM2 )		/* 0 | 0 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP23P2DNRR_top):
	MOVLPS( S(0), XMM0 )			/* oy | ox */
	MULPS( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
	ADDPS( XMM2, XMM0 )			/* +m13 | +m12 */
	MOVLPS( XMM0, D(0) )			/* ->D(1) | ->D(0) */

LLBL(K_GTP23P2DNRR_skip):
	ADD_L( CONST(16), EDI )			/* next dest vertex */
	ADD_L( EAX, ESI )			/* next source vertex */
	CMP_L( ECX, EDI )
	JNE( LLBL(K_GTP23P2DNRR_top) )

LLBL(K_GTP23P2DNRR_finish):
	POP_L( EDI )
	POP_L( ESI )
	RET
#undef FRAME_OFFSET
396848b8605Smrg
397848b8605Smrg
398848b8605Smrg
/*
 * _mesa_sse_transform_points2_3d
 *
 * For every source vertex (ox, oy) stores
 *     dst[i] = ox * m[i] + oy * m[4 + i] + m[12 + i],    i = 0..2
 * The dest vector receives the source count, size 3 and the VEC_SIZE_3
 * flag.  Nothing is written when the source count is zero.
 *
 * Stack arguments: source and dest GLvector4f pointers (OFFSET_SOURCE,
 * OFFSET_DEST) and the matrix pointer (ARG_MATRIX).
 *
 * Register roles inside the loop:
 *     ESI = current source vertex     EAX = source stride in bytes
 *     EDI = current dest vertex       ECX = dest start + count * 16
 *     XMM0, XMM1, XMM2 = matrix elements 0-3, 4-7 and 12-15
 *     XMM3, XMM4 = working registers
 * ESI/EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM4 and the flags
 * are overwritten.
 *
 * The matrix loads use MOVAPS, so the matrix has to be 16-byte aligned.
 * The fourth lane is computed but never stored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points2_3d)
HIDDEN(_mesa_sse_transform_points2_3d)
GLNAME(_mesa_sse_transform_points2_3d):

/* two registers are pushed before the stack arguments are read */
#define FRAME_OFFSET 8
    PUSH_L( ESI )
    PUSH_L( EDI )

    MOV_L( ARG_MATRIX, EDX )				/* EDX = matrix */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_DEST, ESP), EDI )	/* EDI = dest GLvector4f */
    MOV_L( REGOFF(FRAME_OFFSET+OFFSET_SOURCE, ESP), ESI )	/* ESI = source GLvector4f */

    MOV_L( REGOFF(V4F_COUNT, ESI), ECX )		/* ECX = number of vertices */
    TEST_L( ECX, ECX )
    JZ( LLBL(K_GTP23P3DR_done) )			/* nothing to transform */

    /* publish size, count and flags on the dest vector */
    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )

    /* switch ESI/EDI from the vector headers to the vertex data */
    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )		/* EAX = source stride */
    MOV_L( REGOFF(V4F_START, ESI), ESI )		/* first source vertex */
    MOV_L( REGOFF(V4F_START, EDI), EDI )		/* first dest vertex */

    /* ECX = address one past the last dest vertex (16 bytes each) */
    SHL_L( CONST(4), ECX )
    ADD_L( EDI, ECX )

ALIGNTEXT32
    /* matrix rows used by the loop stay in registers */
    MOVAPS( M(0), XMM0 )				/* m3  | m2  | m1  | m0  */
    MOVAPS( M(4), XMM1 )				/* m7  | m6  | m5  | m4  */
    MOVAPS( M(12), XMM2 )				/* m15 | m14 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP23P3DR_loop):
    MOVSS( S(0), XMM3 )					/* ox */
    MOVSS( S(1), XMM4 )					/* oy */
    SHUFPS( CONST(0x0), XMM3, XMM3 )			/* ox in all four lanes */
    SHUFPS( CONST(0x0), XMM4, XMM4 )			/* oy in all four lanes */
    MULPS( XMM0, XMM3 )					/* ox * m[0..3] */
    MULPS( XMM1, XMM4 )					/* oy * m[4..7] */

    ADDPS( XMM4, XMM3 )					/* ox*m[0..3] + oy*m[4..7] */
    ADDPS( XMM2, XMM3 )					/* ... + m[12..15] */

    MOVLPS( XMM3, D(0) )				/* store y | x */
    UNPCKHPS( XMM3, XMM3 )				/* bring z down to lane 0 */
    MOVSS( XMM3, D(2) )					/* store z */

    ADD_L( EAX, ESI )					/* next source vertex */
    ADD_L( CONST(16), EDI )				/* next dest vertex */
    CMP_L( ECX, EDI )
    JNE( LLBL(K_GTP23P3DR_loop) )

LLBL(K_GTP23P3DR_done):
    POP_L( EDI )
    POP_L( ESI )
    RET
#undef FRAME_OFFSET
462848b8605Smrg#endif
463848b8605Smrg
/*
 * On Linux/ELF, emit an empty .note.GNU-stack section: the GNU toolchain's
 * marker that this object does not need an executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
	.section .note.GNU-stack,"",%progbits
#endif
467