1848b8605Smrg
2848b8605Smrg/*
3848b8605Smrg * Mesa 3-D graphics library
4848b8605Smrg *
5848b8605Smrg * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
6848b8605Smrg *
7848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
8848b8605Smrg * copy of this software and associated documentation files (the "Software"),
9848b8605Smrg * to deal in the Software without restriction, including without limitation
10848b8605Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11848b8605Smrg * and/or sell copies of the Software, and to permit persons to whom the
12848b8605Smrg * Software is furnished to do so, subject to the following conditions:
13848b8605Smrg *
14848b8605Smrg * The above copyright notice and this permission notice shall be included
15848b8605Smrg * in all copies or substantial portions of the Software.
16848b8605Smrg *
17848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18848b8605Smrg * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20848b8605Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21848b8605Smrg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22848b8605Smrg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23848b8605Smrg * OTHER DEALINGS IN THE SOFTWARE.
24848b8605Smrg */
25848b8605Smrg
26848b8605Smrg#ifdef USE_3DNOW_ASM
27848b8605Smrg#include "assyntax.h"
28848b8605Smrg#include "matypes.h"
29848b8605Smrg#include "xform_args.h"
30848b8605Smrg
31848b8605Smrg    SEG_TEXT
32848b8605Smrg
/*
 * Each transform routine here pushes one 32-bit register (ESI) before it
 * reads its arguments through the ARG_DEST / ARG_MATRIX / ARG_SOURCE macros.
 * NOTE(review): FRAME_OFFSET presumably tells those macros (xform_args.h)
 * about these 4 extra bytes on the stack -- confirm against xform_args.h.
 */
33848b8605Smrg#define FRAME_OFFSET	4
34848b8605Smrg
35848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_general
 *
 * Multiplies every 4-float source vertex (x0, x1, x2, x3) by the full
 * 16-float matrix using 3DNow! packed-float instructions.  With m[k] being
 * the float at byte offset 4*k from the matrix pointer, the result is
 *
 *     r[j] = x0*m[j] + x1*m[4+j] + x2*m[8+j] + x3*m[12+j]     (j = 0..3)
 *
 * Before the loop the destination vector gets size 4, the VEC_SIZE_4 flag
 * bits and the source's count.  Source vertices are read V4F_STRIDE bytes
 * apart; results are written 16 bytes apart.
 *
 * Loop registers:
 *     EAX = current source vertex     EDX = current destination vertex
 *     ECX = matrix                    ESI = vertices remaining
 *     EDI = source stride in bytes
 *
 * Operands are written (source, destination).  Register comments show
 * "high dword | low dword".
 */
36848b8605SmrgALIGNTEXT16
37848b8605SmrgGLOBL GLNAME( _mesa_3dnow_transform_points4_general )
38848b8605SmrgHIDDEN(_mesa_3dnow_transform_points4_general)
39848b8605SmrgGLNAME( _mesa_3dnow_transform_points4_general ):
40848b8605Smrg
41848b8605Smrg    PUSH_L    ( ESI )			/* save ESI (restored before RET)    */
42848b8605Smrg
43848b8605Smrg    MOV_L     ( ARG_DEST, ECX )		/* ECX = destination vector          */
44848b8605Smrg    MOV_L     ( ARG_MATRIX, ESI )		/* ESI = matrix (moved to ECX below) */
45848b8605Smrg    MOV_L     ( ARG_SOURCE, EAX )		/* EAX = source vector               */
46848b8605Smrg    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )	/* dest size field = 4 */
47848b8605Smrg    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )	/* set VEC_SIZE_4 bits in dest flags */
48848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )	/* EDX = source count */
49848b8605Smrg    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )	/* dest count = source count */
50848b8605Smrg
51848b8605Smrg    PUSH_L    ( EDI )			/* save EDI (restored before RET)    */
52848b8605Smrg
53848b8605Smrg    MOV_L     ( REGOFF(V4F_START, ECX), EDX )	/* EDX = dest start (output pointer) */
54848b8605Smrg    MOV_L     ( ESI, ECX )		/* ECX = matrix                      */
55848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )	/* ESI = number of vertices */
56848b8605Smrg    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )	/* EDI = source stride */
57848b8605Smrg    MOV_L     ( REGOFF(V4F_START, EAX), EAX )	/* EAX = source start (input pointer) */
58848b8605Smrg
59848b8605Smrg    TEST_L    ( ESI, ESI )
60848b8605Smrg    JZ        ( LLBL( G3TPGR_2 ) )	/* count == 0: skip the loop         */
61848b8605Smrg
62848b8605Smrg    PREFETCHW ( REGIND(EDX) )		/* prefetch first output for writing */
63848b8605Smrg
64848b8605SmrgALIGNTEXT16
65848b8605SmrgLLBL( G3TPGR_1 ):
66848b8605Smrg
67848b8605Smrg    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
68848b8605Smrg
69848b8605Smrg    MOVQ      ( REGIND(EAX), MM0 )	/* x1            | x0                */
70848b8605Smrg    MOVQ      ( REGOFF(8, EAX), MM4 )	/* x3            | x2                */
71848b8605Smrg
72848b8605Smrg    ADD_L     ( EDI, EAX )		/* next vertex                       */
73848b8605Smrg    PREFETCH  ( REGIND(EAX) )		/* prefetch the next source vertex   */
74848b8605Smrg
75848b8605Smrg    MOVQ      ( MM0, MM2 )		/* x1              | x0              */
76848b8605Smrg    MOVQ      ( MM4, MM6 )		/* x3              | x2              */
77848b8605Smrg
78848b8605Smrg    PUNPCKLDQ ( MM0, MM0 )		/* x0              | x0              */
79848b8605Smrg    PUNPCKHDQ ( MM2, MM2 )		/* x1              | x1              */
80848b8605Smrg
81848b8605Smrg    MOVQ      ( MM0, MM1 )		/* x0              | x0              */
82848b8605Smrg    ADD_L     ( CONST(16), EDX )	/* next r                            */
83848b8605Smrg
84848b8605Smrg    PFMUL     ( REGIND(ECX), MM0 )	/* x0*m1           | x0*m0           */
85848b8605Smrg    MOVQ      ( MM2, MM3 )		/* x1              | x1              */
86848b8605Smrg
87848b8605Smrg    PFMUL     ( REGOFF(8, ECX), MM1 )	/* x0*m3           | x0*m2           */
88848b8605Smrg    PUNPCKLDQ ( MM4, MM4 )		/* x2              | x2              */
89848b8605Smrg
90848b8605Smrg    PFMUL     ( REGOFF(16, ECX), MM2 )	/* x1*m5           | x1*m4           */
91848b8605Smrg    MOVQ      ( MM4, MM5 )		/* x2              | x2              */
92848b8605Smrg
93848b8605Smrg    PFMUL     ( REGOFF(24, ECX), MM3 )	/* x1*m7           | x1*m6           */
94848b8605Smrg    PUNPCKHDQ ( MM6, MM6 )		/* x3              | x3              */
95848b8605Smrg
96848b8605Smrg    PFMUL     ( REGOFF(32, ECX), MM4 )	/* x2*m9           | x2*m8           */
97848b8605Smrg    MOVQ      ( MM6, MM7 )		/* x3              | x3              */
98848b8605Smrg
99848b8605Smrg    PFMUL     ( REGOFF(40, ECX), MM5 )	/* x2*m11          | x2*m10          */
100848b8605Smrg    PFADD     ( MM0, MM2 )		/* x0*m1+x1*m5     | x0*m0+x1*m4     */
101848b8605Smrg
102848b8605Smrg    PFMUL     ( REGOFF(48, ECX), MM6 )	/* x3*m13          | x3*m12          */
103848b8605Smrg    PFADD     ( MM1, MM3 )		/* x0*m3+x1*m7     | x0*m2+x1*m6     */
104848b8605Smrg
105848b8605Smrg    PFMUL     ( REGOFF(56, ECX), MM7 )	/* x3*m15          | x3*m14          */
106848b8605Smrg    PFADD     ( MM4, MM6 )		/* x2*m9+x3*m13    | x2*m8+x3*m12    */
107848b8605Smrg
108848b8605Smrg    PFADD     ( MM5, MM7 )		/* x2*m11+x3*m15   | x2*m10+x3*m14   */
109848b8605Smrg    PFADD     ( MM2, MM6 )		/* r1              | r0              */
110848b8605Smrg
111848b8605Smrg    PFADD     ( MM3, MM7 )		/* r3              | r2              */
112848b8605Smrg    MOVQ      ( MM6, REGOFF(-16, EDX) )	/* write r0, r1 (EDX already advanced) */
113848b8605Smrg
114848b8605Smrg    MOVQ      ( MM7, REGOFF(-8, EDX) )	/* write r2, r3                      */
115848b8605Smrg
116848b8605Smrg    DEC_L     ( ESI )			/* decrement vertex counter          */
117848b8605Smrg    JNZ       ( LLBL( G3TPGR_1 ) )	/* cnt > 0 ? -> process next vertex  */
118848b8605Smrg
119848b8605SmrgLLBL( G3TPGR_2 ):
120848b8605Smrg
121848b8605Smrg    FEMMS				/* leave the MMX/3DNow! register state */
122848b8605Smrg    POP_L     ( EDI )			/* restore registers saved on entry  */
123848b8605Smrg    POP_L     ( ESI )
124848b8605Smrg    RET
125848b8605Smrg
126848b8605Smrg
127848b8605Smrg
128848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * Transforms every 4-float source vertex (x0, x1, x2, x3) using only six
 * matrix entries.  In the comments mRC is the float at byte offset
 * 4*(4*R+C) from the matrix pointer.  The result is
 *
 *     r0 = x0*m00 + x2*m20
 *     r1 = x1*m11 + x2*m21
 *     r2 = x2*m22 + x3*m32
 *     r3 = -x2
 *
 * The matrix entries are loaded into MM0-MM2 once, before the loop, and MM7
 * is kept at zero so that PFSUBR can negate x2.
 *
 * Before the loop the destination vector gets size 4, the VEC_SIZE_4 flag
 * bits and the source's count.  Source vertices are read V4F_STRIDE bytes
 * apart; results are written 16 bytes apart.
 *
 * Loop registers:
 *     EAX = current source vertex     EDX = current destination vertex
 *     ESI = vertices remaining        EDI = source stride in bytes
 *
 * Operands are written (source, destination).  Register comments show
 * "high dword | low dword".
 */
129848b8605SmrgALIGNTEXT16
130848b8605SmrgGLOBL GLNAME( _mesa_3dnow_transform_points4_perspective )
131848b8605SmrgHIDDEN(_mesa_3dnow_transform_points4_perspective)
132848b8605SmrgGLNAME( _mesa_3dnow_transform_points4_perspective ):
133848b8605Smrg
134848b8605Smrg    PUSH_L    ( ESI )			/* save ESI (restored before RET)    */
135848b8605Smrg
136848b8605Smrg    MOV_L     ( ARG_DEST, ECX )		/* ECX = destination vector          */
137848b8605Smrg    MOV_L     ( ARG_MATRIX, ESI )		/* ESI = matrix (moved to ECX below) */
138848b8605Smrg    MOV_L     ( ARG_SOURCE, EAX )		/* EAX = source vector               */
139848b8605Smrg    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )	/* dest size field = 4 */
140848b8605Smrg    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )	/* set VEC_SIZE_4 bits in dest flags */
141848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )	/* EDX = source count */
142848b8605Smrg    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )	/* dest count = source count */
143848b8605Smrg
144848b8605Smrg    PUSH_L    ( EDI )			/* save EDI (restored before RET)    */
145848b8605Smrg
146848b8605Smrg    MOV_L     ( REGOFF(V4F_START, ECX), EDX )	/* EDX = dest start (output pointer) */
147848b8605Smrg    MOV_L     ( ESI, ECX )		/* ECX = matrix                      */
148848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )	/* ESI = number of vertices */
149848b8605Smrg    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )	/* EDI = source stride */
150848b8605Smrg    MOV_L     ( REGOFF(V4F_START, EAX), EAX )	/* EAX = source start (input pointer) */
151848b8605Smrg
152848b8605Smrg    TEST_L    ( ESI, ESI )
153848b8605Smrg    JZ        ( LLBL( G3TPPR_2 ) )	/* count == 0: skip the loop         */
154848b8605Smrg
155848b8605Smrg    PREFETCH  ( REGIND(EAX) )		/* prefetch first source vertex      */
156848b8605Smrg    PREFETCHW ( REGIND(EDX) )		/* prefetch first output for writing */
157848b8605Smrg
158848b8605Smrg    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
159848b8605Smrg    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
160848b8605Smrg
161848b8605Smrg    MOVD      ( REGOFF(40, ECX), MM1 )	/*                 | m22             */
162848b8605Smrg    PUNPCKLDQ ( REGOFF(56, ECX), MM1 )	/* m32             | m22             */
163848b8605Smrg
164848b8605Smrg    MOVQ      ( REGOFF(32, ECX), MM2 )	/* m21             | m20             */
165848b8605Smrg    PXOR      ( MM7, MM7 )		/* 0               | 0               */
166848b8605Smrg
167848b8605SmrgALIGNTEXT16
168848b8605SmrgLLBL( G3TPPR_1 ):
169848b8605Smrg
170848b8605Smrg    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
171848b8605Smrg
172848b8605Smrg    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
173848b8605Smrg    MOVQ      ( REGOFF(8, EAX), MM5 )	/* x3              | x2              */
174848b8605Smrg    MOVD      ( REGOFF(8, EAX), MM3 )	/*                 | x2              */
175848b8605Smrg
176848b8605Smrg    ADD_L     ( EDI, EAX )		/* next vertex                       */
177848b8605Smrg    PREFETCH  ( REGOFF(32, EAX) )	/* prefetch 32 bytes past next vertex */
178848b8605Smrg
179848b8605Smrg    MOVQ      ( MM5, MM6 )		/* x3              | x2              */
180848b8605Smrg    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
181848b8605Smrg
182848b8605Smrg    PUNPCKLDQ ( MM5, MM5 )		/* x2              | x2              */
183848b8605Smrg    ADD_L     ( CONST(16), EDX )	/* next r                            */
184848b8605Smrg
185848b8605Smrg    PFMUL     ( MM2, MM5 )		/* x2*m21          | x2*m20          */
186848b8605Smrg    PFSUBR    ( MM7, MM3 )		/*                 | -x2   (0 - x2)  */
187848b8605Smrg
188848b8605Smrg    PFMUL     ( MM1, MM6 )		/* x3*m32          | x2*m22          */
189848b8605Smrg    PFADD     ( MM4, MM5 )		/* x1*m11+x2*m21   | x0*m00+x2*m20   */
190848b8605Smrg
191848b8605Smrg    PFACC     ( MM3, MM6 )		/* -x2             | x2*m22+x3*m32   */
192848b8605Smrg    MOVQ      ( MM5, REGOFF(-16, EDX) )	/* write r0, r1                      */
193848b8605Smrg
194848b8605Smrg    MOVQ      ( MM6, REGOFF(-8, EDX) )	/* write r2, r3                      */
195848b8605Smrg    DEC_L     ( ESI )			/* decrement vertex counter          */
196848b8605Smrg
197848b8605Smrg    JNZ       ( LLBL( G3TPPR_1 ) )	/* cnt > 0 ? -> process next vertex  */
198848b8605Smrg
199848b8605SmrgLLBL( G3TPPR_2 ):
200848b8605Smrg
201848b8605Smrg    FEMMS				/* leave the MMX/3DNow! register state */
202848b8605Smrg    POP_L     ( EDI )			/* restore registers saved on entry  */
203848b8605Smrg    POP_L     ( ESI )
204848b8605Smrg    RET
205848b8605Smrg
206848b8605Smrg
207848b8605Smrg
208848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_3d
 *
 * Transforms every 4-float source vertex (x0, x1, x2, x3) by the first three
 * columns' worth of matrix entries and passes x3 through.  With m[k] being
 * the float at byte offset 4*k from the matrix pointer, the result is
 *
 *     r0 = x0*m0 + x1*m4 + x2*m8  + x3*m12
 *     r1 = x0*m1 + x1*m5 + x2*m9  + x3*m13
 *     r2 = x0*m2 + x1*m6 + x2*m10 + x3*m14
 *     r3 = x3
 *
 * m2/m6 and m10/m14 are packed into MM6 and MM7 once, before the loop; the
 * remaining entries are read from memory on every iteration.
 *
 * Before the loop the destination vector gets size 4, the VEC_SIZE_4 flag
 * bits and the source's count.  Source vertices are read V4F_STRIDE bytes
 * apart; results are written 16 bytes apart.
 *
 * Loop registers:
 *     EAX = current source vertex     EDX = current destination vertex
 *     ECX = matrix                    ESI = vertices remaining
 *     EDI = source stride in bytes
 *
 * Operands are written (source, destination).  Register comments show
 * "high dword | low dword".
 */
209848b8605SmrgALIGNTEXT16
210848b8605SmrgGLOBL GLNAME( _mesa_3dnow_transform_points4_3d )
211848b8605SmrgHIDDEN(_mesa_3dnow_transform_points4_3d)
212848b8605SmrgGLNAME( _mesa_3dnow_transform_points4_3d ):
213848b8605Smrg
214848b8605Smrg    PUSH_L    ( ESI )			/* save ESI (restored before RET)    */
215848b8605Smrg
216848b8605Smrg    MOV_L     ( ARG_DEST, ECX )		/* ECX = destination vector          */
217848b8605Smrg    MOV_L     ( ARG_MATRIX, ESI )		/* ESI = matrix (moved to ECX below) */
218848b8605Smrg    MOV_L     ( ARG_SOURCE, EAX )		/* EAX = source vector               */
219848b8605Smrg    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )	/* dest size field = 4 */
220848b8605Smrg    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )	/* set VEC_SIZE_4 bits in dest flags */
221848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )	/* EDX = source count */
222848b8605Smrg    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )	/* dest count = source count */
223848b8605Smrg
224848b8605Smrg    PUSH_L    ( EDI )			/* save EDI (restored before RET)    */
225848b8605Smrg
226848b8605Smrg    MOV_L     ( REGOFF(V4F_START, ECX), EDX )	/* EDX = dest start (output pointer) */
227848b8605Smrg    MOV_L     ( ESI, ECX )		/* ECX = matrix                      */
228848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )	/* ESI = number of vertices */
229848b8605Smrg    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )	/* EDI = source stride */
230848b8605Smrg    MOV_L     ( REGOFF(V4F_START, EAX), EAX )	/* EAX = source start (input pointer) */
231848b8605Smrg
232848b8605Smrg    TEST_L    ( ESI, ESI )
233848b8605Smrg    JZ        ( LLBL( G3TP3R_2 ) )	/* count == 0: skip the loop         */
234848b8605Smrg
235848b8605Smrg    MOVD      ( REGOFF(8, ECX), MM6 )	/*                 | m2              */
236848b8605Smrg    PUNPCKLDQ ( REGOFF(24, ECX), MM6 )	/* m6              | m2              */
237848b8605Smrg
238848b8605Smrg    MOVD      ( REGOFF(40, ECX), MM7 )	/*                 | m10             */
239848b8605Smrg    PUNPCKLDQ ( REGOFF(56, ECX), MM7 )	/* m14             | m10             */
240848b8605Smrg
241848b8605SmrgALIGNTEXT16
242848b8605SmrgLLBL( G3TP3R_1 ):
243848b8605Smrg
244848b8605Smrg    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
245848b8605Smrg    PREFETCH  ( REGOFF(32, EAX) )	/* hopefully array is tightly packed */
246848b8605Smrg
247848b8605Smrg    MOVQ      ( REGIND(EAX), MM2 )	/* x1              | x0              */
248848b8605Smrg    MOVQ      ( REGOFF(8, EAX), MM3 )	/* x3              | x2              */
249848b8605Smrg
250848b8605Smrg    MOVQ      ( MM2, MM0 )		/* x1              | x0              */
251848b8605Smrg    MOVQ      ( MM3, MM4 )		/* x3              | x2              */
252848b8605Smrg
253848b8605Smrg    MOVQ      ( MM0, MM1 )		/* x1              | x0              */
254848b8605Smrg    MOVQ      ( MM4, MM5 )		/* x3              | x2              */
255848b8605Smrg
256848b8605Smrg    PUNPCKLDQ ( MM0, MM0 )		/* x0              | x0              */
257848b8605Smrg    PUNPCKHDQ ( MM1, MM1 )		/* x1              | x1              */
258848b8605Smrg
259848b8605Smrg    PFMUL     ( REGIND(ECX), MM0 )	/* x0*m1           | x0*m0           */
260848b8605Smrg    PUNPCKLDQ ( MM3, MM3 )		/* x2              | x2              */
261848b8605Smrg
262848b8605Smrg    PFMUL     ( REGOFF(16, ECX), MM1 )	/* x1*m5           | x1*m4           */
263848b8605Smrg    PUNPCKHDQ ( MM4, MM4 )		/* x3              | x3              */
264848b8605Smrg
265848b8605Smrg    PFMUL     ( MM6, MM2 )		/* x1*m6           | x0*m2           */
266848b8605Smrg    PFADD     ( MM0, MM1 )		/* x0*m1+x1*m5     | x0*m0+x1*m4     */
267848b8605Smrg
268848b8605Smrg    PFMUL     ( REGOFF(32, ECX), MM3 )	/* x2*m9           | x2*m8           */
269848b8605Smrg    ADD_L     ( CONST(16), EDX )	/* next r                            */
270848b8605Smrg
271848b8605Smrg    PFMUL     ( REGOFF(48, ECX), MM4 )	/* x3*m13          | x3*m12          */
272848b8605Smrg    PFADD     ( MM1, MM3 )		/* x0*m1+..+x2*m9  | x0*m0+...+x2*m8 */
273848b8605Smrg
274848b8605Smrg    PFMUL     ( MM7, MM5 )		/* x3*m14          | x2*m10          */
275848b8605Smrg    PFADD     ( MM3, MM4 )		/* r1              | r0              */
276848b8605Smrg
277848b8605Smrg    PFACC     ( MM2, MM5 )		/* x0*m2+x1*m6     | x2*m10+x3*m14   */
278848b8605Smrg    MOVD      ( REGOFF(12, EAX), MM0 )	/*                 | x3              */
279848b8605Smrg
280848b8605Smrg    ADD_L     ( EDI, EAX )		/* next vertex                       */
281848b8605Smrg    PFACC     ( MM0, MM5 )		/* r3 (= x3)       | r2              */
282848b8605Smrg
283848b8605Smrg    MOVQ      ( MM4, REGOFF(-16, EDX) )	/* write r0, r1                      */
284848b8605Smrg    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3                      */
285848b8605Smrg
286848b8605Smrg    DEC_L     ( ESI )			/* decrement vertex counter          */
287848b8605Smrg    JNZ       ( LLBL( G3TP3R_1 ) )	/* cnt > 0 ? -> process next vertex  */
288848b8605Smrg
289848b8605SmrgLLBL( G3TP3R_2 ):
290848b8605Smrg
291848b8605Smrg    FEMMS				/* leave the MMX/3DNow! register state */
292848b8605Smrg    POP_L     ( EDI )			/* restore registers saved on entry  */
293848b8605Smrg    POP_L     ( ESI )
294848b8605Smrg    RET
295848b8605Smrg
296848b8605Smrg
297848b8605Smrg
298848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * Transforms every 4-float source vertex (x0, x1, x2, x3) using only six
 * matrix entries.  In the comments mRC is the float at byte offset
 * 4*(4*R+C) from the matrix pointer.  The result is
 *
 *     r0 = x0*m00 + x3*m30
 *     r1 = x1*m11 + x3*m31
 *     r2 = x2*m22 + x3*m32
 *     r3 = x3
 *
 * All six entries are loaded into MM0-MM2 once, before the loop.
 *
 * Before the loop the destination vector gets size 4, the VEC_SIZE_4 flag
 * bits and the source's count.  Source vertices are read V4F_STRIDE bytes
 * apart; results are written 16 bytes apart.
 *
 * Loop registers:
 *     EAX = current source vertex     EDX = current destination vertex
 *     ESI = vertices remaining        EDI = source stride in bytes
 *
 * Operands are written (source, destination).  Register comments show
 * "high dword | low dword".
 */
299848b8605SmrgALIGNTEXT16
300848b8605SmrgGLOBL GLNAME( _mesa_3dnow_transform_points4_3d_no_rot )
301848b8605SmrgHIDDEN(_mesa_3dnow_transform_points4_3d_no_rot)
302848b8605SmrgGLNAME( _mesa_3dnow_transform_points4_3d_no_rot ):
303848b8605Smrg
304848b8605Smrg    PUSH_L    ( ESI )			/* save ESI (restored before RET)    */
305848b8605Smrg    MOV_L     ( ARG_DEST, ECX )		/* ECX = destination vector          */
306848b8605Smrg    MOV_L     ( ARG_MATRIX, ESI )		/* ESI = matrix (moved to ECX below) */
307848b8605Smrg    MOV_L     ( ARG_SOURCE, EAX )		/* EAX = source vector               */
308848b8605Smrg    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )	/* dest size field = 4 */
309848b8605Smrg    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )	/* set VEC_SIZE_4 bits in dest flags */
310848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )	/* EDX = source count */
311848b8605Smrg    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )	/* dest count = source count */
312848b8605Smrg
313848b8605Smrg    PUSH_L    ( EDI )			/* save EDI (restored before RET)    */
314848b8605Smrg
315848b8605Smrg    MOV_L     ( REGOFF(V4F_START, ECX), EDX )	/* EDX = dest start (output pointer) */
316848b8605Smrg    MOV_L     ( ESI, ECX )		/* ECX = matrix                      */
317848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )	/* ESI = number of vertices */
318848b8605Smrg    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )	/* EDI = source stride */
319848b8605Smrg    MOV_L     ( REGOFF(V4F_START, EAX), EAX )	/* EAX = source start (input pointer) */
320848b8605Smrg
321848b8605Smrg    TEST_L    ( ESI, ESI )
322848b8605Smrg    JZ        ( LLBL( G3TP3NRR_2 ) )	/* count == 0: skip the loop         */
323848b8605Smrg
324848b8605Smrg    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
325848b8605Smrg    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
326848b8605Smrg
327848b8605Smrg    MOVD      ( REGOFF(40, ECX), MM2 )	/*                 | m22             */
328848b8605Smrg    PUNPCKLDQ ( REGOFF(56, ECX), MM2 )	/* m32             | m22             */
329848b8605Smrg
330848b8605Smrg    MOVQ      ( REGOFF(48, ECX), MM1 )	/* m31             | m30             */
331848b8605Smrg
332848b8605SmrgALIGNTEXT16
333848b8605SmrgLLBL( G3TP3NRR_1 ):
334848b8605Smrg
335848b8605Smrg    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
336848b8605Smrg
337848b8605Smrg    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
338848b8605Smrg    MOVQ      ( REGOFF(8, EAX), MM5 )	/* x3              | x2              */
339848b8605Smrg    MOVD      ( REGOFF(12, EAX), MM7 )	/*                 | x3              */
340848b8605Smrg
341848b8605Smrg    ADD_L     ( EDI, EAX )		/* next vertex                       */
342848b8605Smrg    PREFETCH  ( REGOFF(32, EAX) )	/* prefetch 32 bytes past next vertex */
343848b8605Smrg
344848b8605Smrg    MOVQ      ( MM5, MM6 )		/* x3              | x2              */
345848b8605Smrg    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
346848b8605Smrg
347848b8605Smrg    PUNPCKHDQ ( MM6, MM6 )		/* x3              | x3              */
348848b8605Smrg    PFMUL     ( MM2, MM5 )		/* x3*m32          | x2*m22          */
349848b8605Smrg
350848b8605Smrg    PFMUL     ( MM1, MM6 )		/* x3*m31          | x3*m30          */
351848b8605Smrg    PFACC     ( MM7, MM5 )		/* x3              | x2*m22+x3*m32   */
352848b8605Smrg
353848b8605Smrg    PFADD     ( MM6, MM4 )		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
354848b8605Smrg    ADD_L     ( CONST(16), EDX )	/* next r                            */
355848b8605Smrg
356848b8605Smrg    MOVQ      ( MM4, REGOFF(-16, EDX) )	/* write r0, r1                      */
357848b8605Smrg    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3                      */
358848b8605Smrg
359848b8605Smrg    DEC_L     ( ESI )			/* decrement vertex counter          */
360848b8605Smrg    JNZ       ( LLBL( G3TP3NRR_1 ) )	/* cnt > 0 ? -> process next vertex  */
361848b8605Smrg
362848b8605SmrgLLBL( G3TP3NRR_2 ):
363848b8605Smrg
364848b8605Smrg    FEMMS				/* leave the MMX/3DNow! register state */
365848b8605Smrg    POP_L     ( EDI )			/* restore registers saved on entry  */
366848b8605Smrg    POP_L     ( ESI )
367848b8605Smrg    RET
368848b8605Smrg
369848b8605Smrg
370848b8605Smrg
371848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_2d
 *
 * Transforms the first two components of every 4-float source vertex
 * (x0, x1, x2, x3) and copies the last two unchanged.  In the comments mRC
 * is the float at byte offset 4*(4*R+C) from the matrix pointer.  The
 * result is
 *
 *     r0 = x0*m00 + x1*m10 + x3*m30
 *     r1 = x0*m01 + x1*m11 + x3*m31
 *     r2 = x2
 *     r3 = x3
 *
 * All six entries are loaded into MM0-MM2 once, before the loop.
 *
 * Before the loop the destination vector gets size 4, the VEC_SIZE_4 flag
 * bits and the source's count.  Source vertices are read V4F_STRIDE bytes
 * apart; results are written 16 bytes apart.
 *
 * Loop registers:
 *     EAX = current source vertex     EDX = current destination vertex
 *     ESI = vertices remaining        EDI = source stride in bytes
 *
 * Operands are written (source, destination).  Register comments show
 * "high dword | low dword".
 */
372848b8605SmrgALIGNTEXT16
373848b8605SmrgGLOBL GLNAME( _mesa_3dnow_transform_points4_2d )
374848b8605SmrgHIDDEN(_mesa_3dnow_transform_points4_2d)
375848b8605SmrgGLNAME( _mesa_3dnow_transform_points4_2d ):
376848b8605Smrg
377848b8605Smrg    PUSH_L    ( ESI )			/* save ESI (restored before RET)    */
378848b8605Smrg
379848b8605Smrg    MOV_L     ( ARG_DEST, ECX )		/* ECX = destination vector          */
380848b8605Smrg    MOV_L     ( ARG_MATRIX, ESI )		/* ESI = matrix (moved to ECX below) */
381848b8605Smrg    MOV_L     ( ARG_SOURCE, EAX )		/* EAX = source vector               */
382848b8605Smrg    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )	/* dest size field = 4 */
383848b8605Smrg    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )	/* set VEC_SIZE_4 bits in dest flags */
384848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )	/* EDX = source count */
385848b8605Smrg    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )	/* dest count = source count */
386848b8605Smrg
387848b8605Smrg    PUSH_L    ( EDI )			/* save EDI (restored before RET)    */
388848b8605Smrg
389848b8605Smrg    MOV_L     ( REGOFF(V4F_START, ECX), EDX )	/* EDX = dest start (output pointer) */
390848b8605Smrg    MOV_L     ( ESI, ECX )		/* ECX = matrix                      */
391848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )	/* ESI = number of vertices */
392848b8605Smrg    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )	/* EDI = source stride */
393848b8605Smrg    MOV_L     ( REGOFF(V4F_START, EAX), EAX )	/* EAX = source start (input pointer) */
394848b8605Smrg
395848b8605Smrg    TEST_L    ( ESI, ESI )
396848b8605Smrg    JZ        ( LLBL( G3TP2R_2 ) )	/* count == 0: skip the loop         */
397848b8605Smrg
398848b8605Smrg    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
399848b8605Smrg    PUNPCKLDQ ( REGOFF(16, ECX), MM0 )	/* m10             | m00             */
400848b8605Smrg
401848b8605Smrg    MOVD      ( REGOFF(4, ECX), MM1 )	/*                 | m01             */
402848b8605Smrg    PUNPCKLDQ ( REGOFF(20, ECX), MM1 )	/* m11             | m01             */
403848b8605Smrg
404848b8605Smrg    MOVQ      ( REGOFF(48, ECX), MM2 )	/* m31             | m30             */
405848b8605Smrg
406848b8605SmrgALIGNTEXT16
407848b8605SmrgLLBL( G3TP2R_1 ):
408848b8605Smrg
409848b8605Smrg    PREFETCHW ( REGOFF(32, EDX) )       /* prefetch 2 vertices ahead         */
410848b8605Smrg
411848b8605Smrg    MOVQ      ( REGIND(EAX), MM3 )	/* x1              | x0              */
412848b8605Smrg    MOVQ      ( REGOFF(8, EAX), MM5 )	/* x3              | x2              */
413848b8605Smrg
414848b8605Smrg    ADD_L     ( EDI, EAX )		/* next vertex                       */
415848b8605Smrg    PREFETCH  ( REGIND(EAX) )		/* prefetch the next source vertex   */
416848b8605Smrg
417848b8605Smrg    MOVQ      ( MM3, MM4 )		/* x1              | x0              */
418848b8605Smrg    MOVQ      ( MM5, MM6 )		/* x3              | x2              */
419848b8605Smrg
420848b8605Smrg    PFMUL     ( MM1, MM4 )		/* x1*m11          | x0*m01          */
421848b8605Smrg    PUNPCKHDQ ( MM6, MM6 )		/* x3              | x3              */
422848b8605Smrg
423848b8605Smrg    PFMUL     ( MM0, MM3 )		/* x1*m10          | x0*m00          */
424848b8605Smrg    ADD_L     ( CONST(16), EDX )	/* next r                            */
425848b8605Smrg
426848b8605Smrg    PFACC     ( MM4, MM3 )		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
427848b8605Smrg    PFMUL     ( MM2, MM6 )		/* x3*m31          | x3*m30          */
428848b8605Smrg
429848b8605Smrg    PFADD     ( MM6, MM3 )		/* r1              | r0              */
430848b8605Smrg    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3 (x2, x3 unchanged)   */
431848b8605Smrg
432848b8605Smrg    MOVQ      ( MM3, REGOFF(-16, EDX) )	/* write r0, r1                      */
433848b8605Smrg
434848b8605Smrg    DEC_L     ( ESI )			/* decrement vertex counter          */
435848b8605Smrg    JNZ       ( LLBL( G3TP2R_1 ) )	/* cnt > 0 ? -> process next vertex  */
436848b8605Smrg
437848b8605SmrgLLBL( G3TP2R_2 ):
438848b8605Smrg
439848b8605Smrg    FEMMS				/* leave the MMX/3DNow! register state */
440848b8605Smrg    POP_L     ( EDI )			/* restore registers saved on entry  */
441848b8605Smrg    POP_L     ( ESI )
442848b8605Smrg    RET
443848b8605Smrg
444848b8605Smrg
445848b8605Smrg
446848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * Scales and offsets the first two components of every 4-float source
 * vertex (x0, x1, x2, x3) and copies the last two unchanged.  In the
 * comments mRC is the float at byte offset 4*(4*R+C) from the matrix
 * pointer.  The result is
 *
 *     r0 = x0*m00 + x3*m30
 *     r1 = x1*m11 + x3*m31
 *     r2 = x2
 *     r3 = x3
 *
 * The four entries are loaded into MM0 and MM1 once, before the loop.
 *
 * Before the loop the destination vector gets size 4, the VEC_SIZE_4 flag
 * bits and the source's count.  Source vertices are read V4F_STRIDE bytes
 * apart; results are written 16 bytes apart.
 *
 * Loop registers:
 *     EAX = current source vertex     EDX = current destination vertex
 *     ESI = vertices remaining        EDI = source stride in bytes
 *
 * Operands are written (source, destination).  Register comments show
 * "high dword | low dword".
 */
447848b8605SmrgALIGNTEXT16
448848b8605SmrgGLOBL GLNAME( _mesa_3dnow_transform_points4_2d_no_rot )
449848b8605SmrgHIDDEN(_mesa_3dnow_transform_points4_2d_no_rot)
450848b8605SmrgGLNAME( _mesa_3dnow_transform_points4_2d_no_rot ):
451848b8605Smrg
452848b8605Smrg    PUSH_L    ( ESI )			/* save ESI (restored before RET)    */
453848b8605Smrg
454848b8605Smrg    MOV_L     ( ARG_DEST, ECX )		/* ECX = destination vector          */
455848b8605Smrg    MOV_L     ( ARG_MATRIX, ESI )		/* ESI = matrix (moved to ECX below) */
456848b8605Smrg    MOV_L     ( ARG_SOURCE, EAX )		/* EAX = source vector               */
457848b8605Smrg    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )	/* dest size field = 4 */
458848b8605Smrg    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )	/* set VEC_SIZE_4 bits in dest flags */
459848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )	/* EDX = source count */
460848b8605Smrg    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )	/* dest count = source count */
461848b8605Smrg
462848b8605Smrg    PUSH_L    ( EDI )			/* save EDI (restored before RET)    */
463848b8605Smrg
464848b8605Smrg    MOV_L     ( REGOFF(V4F_START, ECX), EDX )	/* EDX = dest start (output pointer) */
465848b8605Smrg    MOV_L     ( ESI, ECX )		/* ECX = matrix                      */
466848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )	/* ESI = number of vertices */
467848b8605Smrg    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )	/* EDI = source stride */
468848b8605Smrg    MOV_L     ( REGOFF(V4F_START, EAX), EAX )	/* EAX = source start (input pointer) */
469848b8605Smrg
470848b8605Smrg    TEST_L    ( ESI, ESI )
471848b8605Smrg    JZ        ( LLBL( G3TP2NRR_3 ) )	/* count == 0: skip the loop         */
472848b8605Smrg
473848b8605Smrg    MOVD      ( REGIND(ECX), MM0 )	/*                 | m00             */
474848b8605Smrg    PUNPCKLDQ ( REGOFF(20, ECX), MM0 )	/* m11             | m00             */
475848b8605Smrg
476848b8605Smrg    MOVQ      ( REGOFF(48, ECX), MM1 )	/* m31             | m30             */
477848b8605Smrg
478848b8605SmrgALIGNTEXT16
479848b8605SmrgLLBL( G3TP2NRR_2 ):
480848b8605Smrg
481848b8605Smrg    PREFETCHW ( REGOFF(32, EDX) )	/* prefetch 2 vertices ahead         */
482848b8605Smrg
483848b8605Smrg    MOVQ      ( REGIND(EAX), MM4 )	/* x1              | x0              */
484848b8605Smrg    MOVQ      ( REGOFF(8, EAX), MM5 )	/* x3              | x2              */
485848b8605Smrg
486848b8605Smrg    ADD_L     ( EDI, EAX )		/* next vertex                       */
487848b8605Smrg    PREFETCH  ( REGIND(EAX) )		/* prefetch the next source vertex   */
488848b8605Smrg
489848b8605Smrg    PFMUL     ( MM0, MM4 )		/* x1*m11          | x0*m00          */
490848b8605Smrg    MOVQ      ( MM5, MM6 )		/* x3              | x2              */
491848b8605Smrg
492848b8605Smrg    ADD_L     ( CONST(16), EDX )	/* next r                            */
493848b8605Smrg    PUNPCKHDQ ( MM6, MM6 )		/* x3              | x3              */
494848b8605Smrg
495848b8605Smrg    PFMUL     ( MM1, MM6 )		/* x3*m31          | x3*m30          */
496848b8605Smrg    PFADD     ( MM4, MM6 )		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
497848b8605Smrg
498848b8605Smrg    MOVQ      ( MM6, REGOFF(-16, EDX) )	/* write r0, r1                      */
499848b8605Smrg    MOVQ      ( MM5, REGOFF(-8, EDX) )	/* write r2, r3 (x2, x3 unchanged)   */
500848b8605Smrg
501848b8605Smrg    DEC_L     ( ESI )			/* decrement vertex counter          */
502848b8605Smrg
503848b8605Smrg    JNZ       ( LLBL( G3TP2NRR_2 ) )	/* cnt > 0 ? -> process next vertex  */
504848b8605Smrg
505848b8605SmrgLLBL( G3TP2NRR_3 ):
506848b8605Smrg
507848b8605Smrg    FEMMS				/* leave the MMX/3DNow! register state */
508848b8605Smrg    POP_L     ( EDI )			/* restore registers saved on entry  */
509848b8605Smrg    POP_L     ( ESI )
510848b8605Smrg    RET
511848b8605Smrg
512848b8605Smrg
513848b8605Smrg
514848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_identity
 *
 * Copies every 4-float source vertex to the destination unchanged
 * (r0..r3 = x0..x3).  The matrix pointer is loaded into ECX like in the
 * other routines but is never dereferenced.
 *
 * Before the loop the destination vector gets size 4, the VEC_SIZE_4 flag
 * bits and the source's count.  Source vertices are read V4F_STRIDE bytes
 * apart; results are written 16 bytes apart.
 *
 * Loop registers:
 *     EAX = current source vertex     EDX = current destination vertex
 *     ESI = vertices remaining        EDI = source stride in bytes
 *
 * Operands are written (source, destination).  Register comments show
 * "high dword | low dword".
 */
515848b8605SmrgALIGNTEXT16
516848b8605SmrgGLOBL GLNAME( _mesa_3dnow_transform_points4_identity )
517848b8605SmrgHIDDEN(_mesa_3dnow_transform_points4_identity)
518848b8605SmrgGLNAME( _mesa_3dnow_transform_points4_identity ):
519848b8605Smrg
520848b8605Smrg    PUSH_L    ( ESI )			/* save ESI (restored before RET)    */
521848b8605Smrg
522848b8605Smrg    MOV_L     ( ARG_DEST, ECX )		/* ECX = destination vector          */
523848b8605Smrg    MOV_L     ( ARG_MATRIX, ESI )		/* ESI = matrix (moved to ECX below) */
524848b8605Smrg    MOV_L     ( ARG_SOURCE, EAX )		/* EAX = source vector               */
525848b8605Smrg    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, ECX) )	/* dest size field = 4 */
526848b8605Smrg    OR_B      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )	/* set VEC_SIZE_4 bits in dest flags */
527848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), EDX )	/* EDX = source count */
528848b8605Smrg    MOV_L     ( EDX, REGOFF(V4F_COUNT, ECX) )	/* dest count = source count */
529848b8605Smrg
530848b8605Smrg    PUSH_L    ( EDI )			/* save EDI (restored before RET)    */
531848b8605Smrg
532848b8605Smrg    MOV_L     ( REGOFF(V4F_START, ECX), EDX )	/* EDX = dest start (output pointer) */
533848b8605Smrg    MOV_L     ( ESI, ECX )		/* ECX = matrix (unused in the loop) */
534848b8605Smrg    MOV_L     ( REGOFF(V4F_COUNT, EAX), ESI )	/* ESI = number of vertices */
535848b8605Smrg    MOV_L     ( REGOFF(V4F_STRIDE, EAX), EDI )	/* EDI = source stride */
536848b8605Smrg    MOV_L     ( REGOFF(V4F_START, EAX), EAX )	/* EAX = source start (input pointer) */
537848b8605Smrg
538848b8605Smrg    TEST_L    ( ESI, ESI )
539848b8605Smrg    JZ        ( LLBL( G3TPIR_2 ) )	/* count == 0: skip the loop         */
540848b8605Smrg
541848b8605SmrgALIGNTEXT16
542848b8605SmrgLLBL( G3TPIR_1 ):
543848b8605Smrg
544848b8605Smrg    PREFETCHW ( REGOFF(32, EDX) )       /* prefetch 2 vertices ahead         */
545848b8605Smrg
546848b8605Smrg    MOVQ      ( REGIND(EAX), MM0 )	/* x1              | x0              */
547848b8605Smrg    MOVQ      ( REGOFF(8, EAX), MM1 )	/* x3              | x2              */
548848b8605Smrg
549848b8605Smrg    ADD_L     ( EDI, EAX )		/* next vertex                       */
550848b8605Smrg    PREFETCH  ( REGIND(EAX) )		/* prefetch the next source vertex   */
551848b8605Smrg
552848b8605Smrg    ADD_L     ( CONST(16), EDX )	/* next r                            */
553848b8605Smrg    MOVQ      ( MM0, REGOFF(-16, EDX) )	/* r1              | r0              */
554848b8605Smrg
555848b8605Smrg    MOVQ      ( MM1, REGOFF(-8, EDX) )	/* r3              | r2              */
556848b8605Smrg
557848b8605Smrg    DEC_L     ( ESI )			/* decrement vertex counter          */
558848b8605Smrg    JNZ       ( LLBL( G3TPIR_1 ) )	/* cnt > 0 ? -> process next vertex  */
559848b8605Smrg
560848b8605SmrgLLBL( G3TPIR_2 ):
561848b8605Smrg
562848b8605Smrg    FEMMS				/* leave the MMX/3DNow! register state */
563848b8605Smrg    POP_L     ( EDI )			/* restore registers saved on entry  */
564848b8605Smrg    POP_L     ( ESI )
565848b8605Smrg    RET
566848b8605Smrg#endif
567848b8605Smrg
568848b8605Smrg#if defined (__ELF__) && defined (__linux__)
569848b8605Smrg	.section .note.GNU-stack,"",%progbits	/* empty flags: object does not request an executable stack */
570848b8605Smrg#endif
571