sse_xform3.S revision c1f859d4
1
2/*
3 * Mesa 3-D graphics library
4 * Version:  3.5
5 *
6 * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included
16 * in all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
21 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
22 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
/** TODO:
 * - insert PREFETCH instructions to avoid cache misses!
 * - some more optimizations are possible...
 * - for 40-50% more performance in the SSE functions, the
 *   data (transformation matrix, src_vert, dst_vert) needs to be
 *   16-byte aligned!
 */
32
33#ifdef USE_SSE_ASM
34#include "assyntax.h"
35#include "matypes.h"
36#include "xform_args.h"
37
38   SEG_TEXT
39
/*
 * Memory operands addressing 4-byte element `n` relative to ESI (S),
 * EDI (D) and EDX (M).  The routines in this file keep the current source
 * vertex in ESI, the current destination vertex in EDI and the matrix
 * pointer in EDX.
 */
#define S(n)    REGOFF(n * 4, ESI)
#define D(n)    REGOFF(n * 4, EDI)
#define M(n)    REGOFF(n * 4, EDX)
43
44
/*
 * _mesa_sse_transform_points3_general
 *
 * Transforms 3-component source points by all 16 matrix elements and
 * writes 4-component results:
 *
 *   D[0..3] = ox * m[0..3] + oy * m[4..7] + oz * m[8..11] + m[12..15]
 *
 * In:    stack arguments: source GLvector4f pointer, destination
 *        GLvector4f pointer, matrix pointer (ARG_MATRIX, xform_args.h).
 * Out:   destination vertices written at a fixed 16-byte pitch; the
 *        destination's flags are OR'ed with VEC_SIZE_4, its count is set
 *        to the source count and its size to 4.
 * Clobb: EAX, ECX, EDX, XMM0-XMM6, flags.  ESI and EDI are saved and
 *        restored.
 * Align: MOVAPS faults on operands that are not 16-byte aligned, so the
 *        matrix and every destination vertex must be 16-byte aligned.
 *        Source vertices are read with MOVSS and may have any alignment
 *        and stride.
 *
 * Lane comments list the register from the highest lane down to the lowest.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_general)
HIDDEN(_mesa_sse_transform_points3_general)
GLNAME( _mesa_sse_transform_points3_general ):

/* Two 4-byte pushes sit between ESP and the stack arguments. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )  /* ptr to source GLvector4f */
    MOV_L     ( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )    /* ptr to dest GLvector4f */

    MOV_L     ( ARG_MATRIX, EDX )                   /* ptr to matrix */
    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )       /* ECX = source count */

    /* Same zero-count test as the other routines in this file. */
    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTPGR_finish) )              /* count was zero; nothing to do */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )      /* EAX = source stride in bytes */
    OR_L      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )  /* set dest flags */

    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )       /* set dest count */
    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, EDI) )   /* set dest size */

    SHL_L     ( CONST(4), ECX )                     /* count *= 16 (bytes per dest vertex) */
    MOV_L     ( REGOFF(V4F_START, ESI), ESI )       /* ESI = first source vertex */

    MOV_L     ( REGOFF(V4F_START, EDI), EDI )       /* EDI = first dest vertex */
    ADD_L     ( EDI, ECX )                          /* ECX = one past the last dest vertex */


ALIGNTEXT32
    /* The matrix stays in XMM0-XMM3 for the whole loop. */
    MOVAPS    ( REGOFF(0, EDX), XMM0 )              /* m3  | m2  | m1  | m0  */
    MOVAPS    ( REGOFF(16, EDX), XMM1 )             /* m7  | m6  | m5  | m4  */
    MOVAPS    ( REGOFF(32, EDX), XMM2 )             /* m11 | m10 | m9  | m8  */
    MOVAPS    ( REGOFF(48, EDX), XMM3 )             /* m15 | m14 | m13 | m12 */


ALIGNTEXT32
LLBL(K_GTPGR_top):
    MOVSS     ( REGOFF(0, ESI), XMM4 )              /* 0  | 0  | 0  | ox */
    SHUFPS    ( CONST(0x0), XMM4, XMM4 )            /* ox | ox | ox | ox */
    MOVSS     ( REGOFF(4, ESI), XMM5 )              /* 0  | 0  | 0  | oy */
    SHUFPS    ( CONST(0x0), XMM5, XMM5 )            /* oy | oy | oy | oy */
    MOVSS     ( REGOFF(8, ESI), XMM6 )              /* 0  | 0  | 0  | oz */
    SHUFPS    ( CONST(0x0), XMM6, XMM6 )            /* oz | oz | oz | oz */

    MULPS     ( XMM0, XMM4 )                        /* m3*ox  | m2*ox  | m1*ox | m0*ox */
    MULPS     ( XMM1, XMM5 )                        /* m7*oy  | m6*oy  | m5*oy | m4*oy */
    MULPS     ( XMM2, XMM6 )                        /* m11*oz | m10*oz | m9*oz | m8*oz */

    ADDPS     ( XMM5, XMM4 )                        /* ox terms + oy terms */
    ADDPS     ( XMM6, XMM4 )                        /* + oz terms */
    ADDPS     ( XMM3, XMM4 )                        /* + m15 | m14 | m13 | m12 */

    MOVAPS    ( XMM4, REGOFF(0, EDI) )              /* aligned store of all 4 components */

LLBL(K_GTPGR_skip):
    ADD_L     ( CONST(16), EDI )                    /* next dest vertex */
    ADD_L     ( EAX, ESI )                          /* next source vertex */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTPGR_top) )                 /* loop until EDI reaches the end */

LLBL(K_GTPGR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET
113
114
/*
 * _mesa_sse_transform_points3_identity
 *
 * Copies the first three components of every source vertex to the
 * destination unchanged.  No matrix is read.
 *
 * In:    stack arguments: source and destination GLvector4f pointers.
 * Out:   destination flags OR'ed with VEC_SIZE_3, count set to the source
 *        count, size set to 3; D[0..2] = S[0..2] per vertex.  When the
 *        first source and destination vertices share one address only the
 *        header fields are updated and the copy loop is skipped.
 * Clobb: EAX, ECX, XMM0, flags.  ESI and EDI are saved and restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_identity)
HIDDEN(_mesa_sse_transform_points3_identity)
GLNAME( _mesa_sse_transform_points3_identity ):

/* Bytes pushed below (ESI + EDI) that sit in front of the stack arguments. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )    /* EDI = dest GLvector4f */
    MOV_L     ( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )  /* ESI = source GLvector4f */

    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )       /* ECX = number of vertices */
    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTPIR_finish) )              /* empty input */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )      /* EAX = source stride in bytes */

    /* Publish the result header: 3 components, same count as the source. */
    OR_L      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    MOV_L     ( REGOFF(V4F_START, ESI), ESI )       /* ESI = first source vertex */
    MOV_L     ( REGOFF(V4F_START, EDI), EDI )       /* EDI = first dest vertex */

    SHL_L     ( CONST(4), ECX )                     /* 16 bytes per dest vertex */
    ADD_L     ( EDI, ECX )                          /* ECX = end-of-dest sentinel */

    CMP_L     ( ESI, EDI )
    JE        ( LLBL(K_GTPIR_finish) )              /* in place: data is already there */


ALIGNTEXT32
LLBL(K_GTPIR_top):
    MOVLPS    ( S(0), XMM0 )                        /* load  S(1) | S(0) */
    MOVLPS    ( XMM0, D(0) )                        /* store D(1) | D(0) */
    MOVSS     ( S(2), XMM0 )                        /* load  S(2) */
    MOVSS     ( XMM0, D(2) )                        /* store D(2) */

LLBL(K_GTPIR_skip):
    ADD_L     ( CONST(16), EDI )                    /* advance dest */
    ADD_L     ( EAX, ESI )                          /* advance source by its stride */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTPIR_top) )

LLBL(K_GTPIR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET
166
167
168
169
/*
 * _mesa_sse_transform_points3_3d_no_rot
 *
 * Per-axis scale plus translation; only m0, m5, m10, m12, m13 and m14 are
 * read from the matrix:
 *
 *   D[0] = ox * m0  + m12
 *   D[1] = oy * m5  + m13
 *   D[2] = oz * m10 + m14
 *
 * In:    stack arguments: source and destination GLvector4f pointers and
 *        the matrix pointer (ARG_MATRIX, xform_args.h).
 * Out:   destination flags OR'ed with VEC_SIZE_3, count set to the source
 *        count, size set to 3; vertices written at a 16-byte pitch.
 * Clobb: EAX, ECX, EDX, XMM0-XMM4, flags.  ESI and EDI are saved and
 *        restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
HIDDEN(_mesa_sse_transform_points3_3d_no_rot)
GLNAME(_mesa_sse_transform_points3_3d_no_rot):

/* Bytes pushed below (ESI + EDI) that sit in front of the stack arguments. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )    /* EDI = dest GLvector4f */
    MOV_L     ( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )  /* ESI = source GLvector4f */
    MOV_L     ( ARG_MATRIX, EDX )                   /* EDX = matrix */

    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )       /* ECX = number of vertices */
    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTP3DNRR_finish) )           /* empty input */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )      /* EAX = source stride in bytes */

    /* Publish the result header: 3 components, same count as the source. */
    OR_L      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    MOV_L     ( REGOFF(V4F_START, ESI), ESI )       /* ESI = first source vertex */
    MOV_L     ( REGOFF(V4F_START, EDI), EDI )       /* EDI = first dest vertex */

    SHL_L     ( CONST(4), ECX )                     /* 16 bytes per dest vertex */
    ADD_L     ( EDI, ECX )                          /* ECX = end-of-dest sentinel */


ALIGNTEXT32
    /* Loop constants (lanes listed high to low). */
    MOVSS     ( M(10), XMM3 )                       /* 0 | 0 | 0   | m10 */
    MOVSS     ( M(14), XMM4 )                       /* 0 | 0 | 0   | m14 */
    MOVSS     ( M(0), XMM1 )                        /* 0 | 0 | 0   | m0  */
    MOVSS     ( M(5), XMM2 )                        /* 0 | 0 | 0   | m5  */
    UNPCKLPS  ( XMM2, XMM1 )                        /* 0 | 0 | m5  | m0  */
    MOVLPS    ( M(12), XMM2 )                       /* 0 | 0 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3DNRR_top):

    /* x and y together in the low two lanes; upper lanes are never stored. */
    MOVLPS    ( S(0), XMM0 )                        /* oy | ox */
    MULPS     ( XMM1, XMM0 )                        /* oy*m5 | ox*m0 */
    ADDPS     ( XMM2, XMM0 )                        /* +m13  | +m12 */
    MOVLPS    ( XMM0, D(0) )                        /* D(1)  | D(0) */

    /* z on its own as a scalar. */
    MOVSS     ( S(2), XMM0 )                        /* oz */
    MULSS     ( XMM3, XMM0 )                        /* oz*m10 */
    ADDSS     ( XMM4, XMM0 )                        /* +m14 */
    MOVSS     ( XMM0, D(2) )                        /* D(2) */

LLBL(K_GTP3DNRR_skip):
    ADD_L     ( CONST(16), EDI )                    /* advance dest */
    ADD_L     ( EAX, ESI )                          /* advance source by its stride */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTP3DNRR_top) )

LLBL(K_GTP3DNRR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET
234
235
236
/*
 * _mesa_sse_transform_points3_perspective
 *
 * Transforms 3-component points into 4-component results, reading only
 * m0, m5, m8, m9, m10 and m14 from the matrix:
 *
 *   D[0] = ox * m0 + oz * m8
 *   D[1] = oy * m5 + oz * m9
 *   D[2] = oz * m10 + m14
 *   D[3] = 0 - oz
 *
 * In:    stack arguments: source GLvector4f pointer, destination
 *        GLvector4f pointer, matrix pointer (ARG_MATRIX, xform_args.h).
 * Out:   destination flags OR'ed with VEC_SIZE_4, count set to the source
 *        count, size set to 4; vertices written at a 16-byte pitch.
 * Clobb: EAX, ECX, EDX, XMM0-XMM6, flags.  ESI and EDI are saved and
 *        restored.
 *
 * Lane comments list the register from the highest lane down to the lowest.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
HIDDEN(_mesa_sse_transform_points3_perspective)
GLNAME(_mesa_sse_transform_points3_perspective):

/* Two 4-byte pushes sit between ESP and the stack arguments. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )  /* ptr to source GLvector4f */
    MOV_L     ( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )    /* ptr to dest GLvector4f */

    MOV_L     ( ARG_MATRIX, EDX )                   /* ptr to matrix */
    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )       /* ECX = source count */

    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTP3PR_finish) )             /* count was zero; nothing to do */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )      /* EAX = source stride in bytes */
    OR_L      ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )  /* set dest flags */

    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )       /* set dest count */
    MOV_L     ( CONST(4), REGOFF(V4F_SIZE, EDI) )   /* set dest size */

    SHL_L     ( CONST(4), ECX )                     /* count *= 16 (bytes per dest vertex) */
    MOV_L     ( REGOFF(V4F_START, ESI), ESI )       /* ESI = first source vertex */

    MOV_L     ( REGOFF(V4F_START, EDI), EDI )       /* EDI = first dest vertex */
    ADD_L     ( EDI, ECX )                          /* ECX = one past the last dest vertex */

ALIGNTEXT32
    /* Loop constants. */
    MOVSS     ( M(0), XMM1 )                        /* 0 | 0 | 0  | m0  */
    MOVSS     ( M(5), XMM2 )                        /* 0 | 0 | 0  | m5  */
    UNPCKLPS  ( XMM2, XMM1 )                        /* 0 | 0 | m5 | m0  */
    MOVLPS    ( M(8), XMM2 )                        /* 0 | 0 | m9 | m8  */
    MOVSS     ( M(10), XMM3 )                       /* m10 */
    MOVSS     ( M(14), XMM4 )                       /* m14 */
    XORPS     ( XMM6, XMM6 )                        /* 0.0 in every lane */

ALIGNTEXT32
LLBL(K_GTP3PR_top):
    MOVLPS    ( S(0), XMM0 )                        /* oy | ox */
    MULPS     ( XMM1, XMM0 )                        /* oy*m5 | ox*m0 */
    MOVSS     ( S(2), XMM5 )                        /* oz */
    SHUFPS    ( CONST(0x0), XMM5, XMM5 )            /* oz | oz | oz | oz */
    MULPS     ( XMM2, XMM5 )                        /* oz*m9 | oz*m8 */
    ADDPS     ( XMM5, XMM0 )                        /* +oy*m5 | +ox*m0 */
    MOVLPS    ( XMM0, D(0) )                        /* ->D(1) | ->D(0) */

    MOVSS     ( S(2), XMM0 )                        /* oz */
    MULSS     ( XMM3, XMM0 )                        /* oz*m10 */
    ADDSS     ( XMM4, XMM0 )                        /* +m14 */
    MOVSS     ( XMM0, D(2) )                        /* ->D(2) */

    MOVSS     ( S(2), XMM0 )                        /* oz */
    MOVSS     ( XMM6, XMM5 )                        /* low lane = 0; upper lanes keep old products */
    /* Only the low lane is stored, so subtract as a scalar instead of
     * running a packed subtract over the stale upper lanes of XMM5. */
    SUBSS     ( XMM0, XMM5 )                        /* 0 - oz */
    MOVSS     ( XMM5, D(3) )                        /* ->D(3) */

LLBL(K_GTP3PR_skip):
    ADD_L     ( CONST(16), EDI )                    /* next dest vertex */
    ADD_L     ( EAX, ESI )                          /* next source vertex */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTP3PR_top) )                /* loop until EDI reaches the end */

LLBL(K_GTP3PR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET
307
308
309
/*
 * _mesa_sse_transform_points3_2d
 *
 * Transforms x and y with the 2D part of the matrix and passes z through:
 *
 *   D[0] = ox * m0 + oy * m4 + m12
 *   D[1] = ox * m1 + oy * m5 + m13
 *   D[2] = oz
 *
 * In:    stack arguments: source and destination GLvector4f pointers and
 *        the matrix pointer (ARG_MATRIX, xform_args.h).
 * Out:   destination flags OR'ed with VEC_SIZE_3, count set to the source
 *        count, size set to 3; vertices written at a 16-byte pitch.
 * Clobb: EAX, ECX, EDX, XMM0-XMM4, flags.  ESI and EDI are saved and
 *        restored.
 *
 * MOVLPS replaces only the low two lanes of its destination; the upper
 * lanes of XMM0-XMM2 keep whatever they held on entry.  Those lanes take
 * part in the packed arithmetic but are never stored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_2d)
HIDDEN(_mesa_sse_transform_points3_2d)
GLNAME(_mesa_sse_transform_points3_2d):

/* Bytes pushed below (ESI + EDI) that sit in front of the stack arguments. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )    /* EDI = dest GLvector4f */
    MOV_L     ( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )  /* ESI = source GLvector4f */
    MOV_L     ( ARG_MATRIX, EDX )                   /* EDX = matrix */

    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )       /* ECX = number of vertices */
    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTP3P2DR_finish) )           /* empty input */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )      /* EAX = source stride in bytes */

    /* Publish the result header: 3 components, same count as the source. */
    OR_L      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    MOV_L     ( REGOFF(V4F_START, ESI), ESI )       /* ESI = first source vertex */
    MOV_L     ( REGOFF(V4F_START, EDI), EDI )       /* EDI = first dest vertex */

    SHL_L     ( CONST(4), ECX )                     /* 16 bytes per dest vertex */
    ADD_L     ( EDI, ECX )                          /* ECX = end-of-dest sentinel */

ALIGNTEXT32
    /* Loop constants (low two lanes, high | low). */
    MOVLPS    ( M(12), XMM2 )                       /* m13 | m12 */
    MOVLPS    ( M(4), XMM1 )                        /* m5  | m4  */
    MOVLPS    ( M(0), XMM0 )                        /* m1  | m0  */

ALIGNTEXT32
LLBL(K_GTP3P2DR_top):
    MOVSS     ( S(0), XMM3 )                        /* ox */
    MOVSS     ( S(1), XMM4 )                        /* oy */
    SHUFPS    ( CONST(0x0), XMM3, XMM3 )            /* ox | ox */
    SHUFPS    ( CONST(0x0), XMM4, XMM4 )            /* oy | oy */
    MULPS     ( XMM0, XMM3 )                        /* ox*m1 | ox*m0 */
    MULPS     ( XMM1, XMM4 )                        /* oy*m5 | oy*m4 */

    ADDPS     ( XMM4, XMM3 )                        /* ox terms + oy terms */
    ADDPS     ( XMM2, XMM3 )                        /* +m13 | +m12 */
    MOVLPS    ( XMM3, D(0) )                        /* D(1) | D(0) */

    MOVSS     ( S(2), XMM3 )                        /* z is copied untouched */
    MOVSS     ( XMM3, D(2) )

LLBL(K_GTP3P2DR_skip):
    ADD_L     ( CONST(16), EDI )                    /* advance dest */
    ADD_L     ( EAX, ESI )                          /* advance source by its stride */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTP3P2DR_top) )

LLBL(K_GTP3P2DR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET
372
373
374
/*
 * _mesa_sse_transform_points3_2d_no_rot
 *
 * Scales and translates x and y, passes z through; only m0, m5, m12 and
 * m13 are read from the matrix:
 *
 *   D[0] = ox * m0 + m12
 *   D[1] = oy * m5 + m13
 *   D[2] = oz
 *
 * In:    stack arguments: source and destination GLvector4f pointers and
 *        the matrix pointer (ARG_MATRIX, xform_args.h).
 * Out:   destination flags OR'ed with VEC_SIZE_3, count set to the source
 *        count, size set to 3; vertices written at a 16-byte pitch.
 * Clobb: EAX, ECX, EDX, XMM0-XMM2, flags.  ESI and EDI are saved and
 *        restored.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
HIDDEN(_mesa_sse_transform_points3_2d_no_rot)
GLNAME(_mesa_sse_transform_points3_2d_no_rot):

/* Bytes pushed below (ESI + EDI) that sit in front of the stack arguments. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )    /* EDI = dest GLvector4f */
    MOV_L     ( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )  /* ESI = source GLvector4f */
    MOV_L     ( ARG_MATRIX, EDX )                   /* EDX = matrix */

    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )       /* ECX = number of vertices */
    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTP3P2DNRR_finish) )         /* empty input */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )      /* EAX = source stride in bytes */

    /* Publish the result header: 3 components, same count as the source. */
    OR_L      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    MOV_L     ( REGOFF(V4F_START, ESI), ESI )       /* ESI = first source vertex */
    MOV_L     ( REGOFF(V4F_START, EDI), EDI )       /* EDI = first dest vertex */

    SHL_L     ( CONST(4), ECX )                     /* 16 bytes per dest vertex */
    ADD_L     ( EDI, ECX )                          /* ECX = end-of-dest sentinel */

ALIGNTEXT32
    /* Loop constants (lanes listed high to low). */
    MOVSS     ( M(0), XMM1 )                        /* 0 | 0 | 0   | m0  */
    MOVSS     ( M(5), XMM2 )                        /* 0 | 0 | 0   | m5  */
    UNPCKLPS  ( XMM2, XMM1 )                        /* 0 | 0 | m5  | m0  */
    MOVLPS    ( M(12), XMM2 )                       /* 0 | 0 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3P2DNRR_top):
    MOVLPS    ( S(0), XMM0 )                        /* oy | ox */
    MULPS     ( XMM1, XMM0 )                        /* oy*m5 | ox*m0 */
    ADDPS     ( XMM2, XMM0 )                        /* +m13  | +m12 */
    MOVLPS    ( XMM0, D(0) )                        /* D(1)  | D(0) */

    MOVSS     ( S(2), XMM0 )                        /* z is copied untouched */
    MOVSS     ( XMM0, D(2) )

LLBL(K_GTP3P2DNRR_skip):
    ADD_L     ( CONST(16), EDI )                    /* advance dest */
    ADD_L     ( EAX, ESI )                          /* advance source by its stride */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTP3P2DNRR_top) )

LLBL(K_GTP3P2DNRR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET
432
433
434
435
/*
 * _mesa_sse_transform_points3_3d
 *
 * Transforms 3-component points into 3-component results:
 *
 *   D[0..2] = ox * m[0..2] + oy * m[4..6] + oz * m[8..10] + m[12..14]
 *
 * The arithmetic runs on whole 4-lane rows, so lane 3 (m3, m7, m11, m15)
 * is computed as well but never stored.
 *
 * In:    stack arguments: source and destination GLvector4f pointers and
 *        the matrix pointer (ARG_MATRIX, xform_args.h).
 * Out:   destination flags OR'ed with VEC_SIZE_3, count set to the source
 *        count, size set to 3; vertices written at a 16-byte pitch.
 * Clobb: EAX, ECX, EDX, XMM0-XMM6, flags.  ESI and EDI are saved and
 *        restored.
 * Align: the matrix rows are loaded with MOVAPS, which faults on operands
 *        that are not 16-byte aligned.
 */
ALIGNTEXT4
GLOBL GLNAME(_mesa_sse_transform_points3_3d)
HIDDEN(_mesa_sse_transform_points3_3d)
GLNAME(_mesa_sse_transform_points3_3d):

/* Bytes pushed below (ESI + EDI) that sit in front of the stack arguments. */
#define FRAME_OFFSET 8
    PUSH_L    ( ESI )
    PUSH_L    ( EDI )

    MOV_L     ( REGOFF(OFFSET_DEST+FRAME_OFFSET, ESP), EDI )    /* EDI = dest GLvector4f */
    MOV_L     ( REGOFF(OFFSET_SOURCE+FRAME_OFFSET, ESP), ESI )  /* ESI = source GLvector4f */
    MOV_L     ( ARG_MATRIX, EDX )                   /* EDX = matrix */

    MOV_L     ( REGOFF(V4F_COUNT, ESI), ECX )       /* ECX = number of vertices */
    TEST_L    ( ECX, ECX )
    JZ        ( LLBL(K_GTP3P3DR_finish) )           /* empty input */

    MOV_L     ( REGOFF(V4F_STRIDE, ESI), EAX )      /* EAX = source stride in bytes */

    /* Publish the result header: 3 components, same count as the source. */
    OR_L      ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
    MOV_L     ( ECX, REGOFF(V4F_COUNT, EDI) )
    MOV_L     ( CONST(3), REGOFF(V4F_SIZE, EDI) )

    MOV_L     ( REGOFF(V4F_START, ESI), ESI )       /* ESI = first source vertex */
    MOV_L     ( REGOFF(V4F_START, EDI), EDI )       /* EDI = first dest vertex */

    SHL_L     ( CONST(4), ECX )                     /* 16 bytes per dest vertex */
    ADD_L     ( EDI, ECX )                          /* ECX = end-of-dest sentinel */


ALIGNTEXT32
    /* Matrix rows stay in XMM0-XMM3 (lanes listed high to low). */
    MOVAPS    ( M(0), XMM0 )                        /* m3  | m2  | m1  | m0  */
    MOVAPS    ( M(4), XMM1 )                        /* m7  | m6  | m5  | m4  */
    MOVAPS    ( M(8), XMM2 )                        /* m11 | m10 | m9  | m8  */
    MOVAPS    ( M(12), XMM3 )                       /* m15 | m14 | m13 | m12 */

ALIGNTEXT32
LLBL(K_GTP3P3DR_top):
    /* Load each coordinate and broadcast it to all four lanes. */
    MOVSS     ( S(0), XMM4 )                        /* ox */
    MOVSS     ( S(1), XMM5 )                        /* oy */
    MOVSS     ( S(2), XMM6 )                        /* oz */
    SHUFPS    ( CONST(0x0), XMM4, XMM4 )            /* ox | ox | ox | ox */
    SHUFPS    ( CONST(0x0), XMM5, XMM5 )            /* oy | oy | oy | oy */
    SHUFPS    ( CONST(0x0), XMM6, XMM6 )            /* oz | oz | oz | oz */

    MULPS     ( XMM0, XMM4 )                        /* ox*m2  | ox*m1 | ox*m0 (low 3 lanes) */
    MULPS     ( XMM1, XMM5 )                        /* oy*m6  | oy*m5 | oy*m4 */
    MULPS     ( XMM2, XMM6 )                        /* oz*m10 | oz*m9 | oz*m8 */

    ADDPS     ( XMM5, XMM4 )                        /* ox terms + oy terms */
    ADDPS     ( XMM6, XMM4 )                        /* + oz terms */
    ADDPS     ( XMM3, XMM4 )                        /* + m14 | m13 | m12 */

    MOVLPS    ( XMM4, D(0) )                        /* D(1) | D(0) */
    UNPCKHPS  ( XMM4, XMM4 )                        /* bring lane 2 down to lane 0 */
    MOVSS     ( XMM4, D(2) )                        /* D(2) */

LLBL(K_GTP3P3DR_skip):
    ADD_L     ( CONST(16), EDI )                    /* advance dest */
    ADD_L     ( EAX, ESI )                          /* advance source by its stride */
    CMP_L     ( ECX, EDI )
    JNE       ( LLBL(K_GTP3P3DR_top) )

LLBL(K_GTP3P3DR_finish):
    POP_L     ( EDI )
    POP_L     ( ESI )
    RET
#undef FRAME_OFFSET
507#endif
508
/*
 * On Linux/ELF, emit an empty .note.GNU-stack section (no flags) so the
 * GNU toolchain does not treat this object as needing an executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
    .section .note.GNU-stack,"",%progbits
#endif
512