/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
24848b8605Smrg
/*
 * x86-64 code in GNU as (AT&T) syntax, run through the C preprocessor.
 * Everything up to the matching #endif is assembled only when
 * USE_X86_64_ASM is defined.
 */
#ifdef USE_X86_64_ASM

/*
 * The V4F_* offsets and VEC_SIZE_4 used below are not defined in this file;
 * presumably they come from matypes.h -- confirm against that header.
 */
#include "matypes.h"

.text
30848b8605Smrg
/*
 * _mesa_x86_64_cpuid
 *
 * In:   rdi = pointer to four consecutive 32-bit words.
 *             Word 0 (offset 0) is loaded into eax and word 2 (offset 8)
 *             into ecx before CPUID executes.
 * Out:  the four words are overwritten with eax, ebx, ecx, edx
 *       (offsets 0, 4, 8, 12) as left by CPUID.
 * Regs: writes rax, rcx, rdx; rbx is saved on the stack and restored
 *       because CPUID overwrites ebx.
 */
.align 16
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	pushq	%rbx			/* CPUID overwrites ebx; keep caller's rbx */
	movl	(%rdi), %eax		/* eax input taken from word 0 */
	movl	8(%rdi), %ecx		/* ecx input taken from word 2 */

	cpuid

	movl	%ebx, 4(%rdi)
	movl	%eax, (%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx
	ret
47848b8605Smrg
/*
 * _mesa_x86_64_transform_points4_general
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (16 floats; loaded with movaps, so it must be 16-byte
 *	              aligned)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * Copies the source count to dest, sets dest size to 4 and ORs VEC_SIZE_4
 * into dest flags.  Then, for each of `count` source vertices
 * (ox, oy, oz, ow), stores
 *
 *	ox * M[0..3] + oy * M[4..7] + oz * M[8..11] + ow * M[12..15]
 *
 * as four floats into the dest data.  The source pointer advances by
 * `stride` bytes per vertex and is read with an unaligned load; the dest
 * pointer advances by 16 bytes and is written with movaps, so the dest
 * data must be 16-byte aligned.
 *
 * NOTE(review): the stride is fetched with movzbl, i.e. only one byte of
 * the field is read -- confirm against the field layout in matypes.h.
 *
 * Writes rax, rcx, rdx, rdi, xmm0-xmm8 and the flags.
 * Lane comments below list the highest lane first.
 */
.align 16
.globl _mesa_x86_64_transform_points4_general
.hidden _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 4 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* does not modify flags; jz still sees testl */
	jz p4_general_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance source by stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetcht1 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi

	decl %ecx
	jnz p4_general_loop

p4_general_done:
	.byte 0xf3			/* REP prefix: together with ret forms "rep ret" */
	ret
108848b8605Smrg
.section .rodata

/*
 * Two 16-byte lane constants read by _mesa_x86_64_transform_points4_3d:
 *
 *   p4_constants +  0: lanes 0-2 = 0xffffffff, lane 3 = 0
 *                      (AND mask that clears lane 3)
 *   p4_constants + 16: lanes 0-2 = 0, lane 3 = 1.0f
 *                      (OR value that sets a cleared lane 3 to 1.0)
 *
 * Aligned to 16 bytes because both are loaded with movaps.
 */
.align 16
p4_constants:
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0x00, 0x00, 0x00, 0x00

.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.float 1.0
122848b8605Smrg
.text

/*
 * _mesa_x86_64_transform_points4_3d
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (16 floats, 16-byte aligned: loaded with movaps)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * Same computation as _mesa_x86_64_transform_points4_general, except that
 * the loaded copy of the matrix is first patched with p4_constants:
 * lane 3 of each 16-byte group is forced to 0 for the first three groups
 * (m3, m7, m11) and to 1.0 for the fourth (m15).  The matrix in memory is
 * not modified.  This costs extra setup work compared with the general
 * routine.
 *
 * Dest data must be 16-byte aligned (movaps store); source vertices are
 * read unaligned and advanced by `stride` bytes.
 *
 * NOTE(review): the stride is fetched with movzbl, i.e. only one byte of
 * the field is read -- confirm against the field layout in matypes.h.
 *
 * Writes rax, rcx, rdx, rdi, xmm0-xmm10 and the flags.
 * Lane comments below list the highest lane first.
 */
.align 16
.globl _mesa_x86_64_transform_points4_3d
.hidden _mesa_x86_64_transform_points4_3d
_mesa_x86_64_transform_points4_3d:

	leaq p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps (%rax), %xmm9		/* 0 | ~0 | ~0 | ~0   (AND mask) */
	movaps 16(%rax), %xmm10		/* 1.0 | 0 | 0 | 0    (OR value) */

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (rax no longer the constants ptr) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_3d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps  %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance source by stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*0 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* sum of the ox and oy terms */
	mulps %xmm7, %xmm3		/* ow*1 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ... plus the oz terms */
	prefetcht1 16(%rdx)
	addps %xmm3, %xmm0		/* ... plus the ow terms */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi

	dec %ecx
	jnz p4_3d_loop

p4_3d_done:
	.byte 0xf3			/* REP prefix: together with ret forms "rep ret" */
	ret
194848b8605Smrg
195848b8605Smrg
/*
 * _mesa_x86_64_transform_points4_identity
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (not read; the register is reused as the source pointer)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * Copies the source count to dest, sets dest size to 4, ORs VEC_SIZE_4
 * into dest flags, then copies `count` 16-byte vertices from the source
 * data to the dest data unchanged.
 *
 * The source is walked with its stride, like the other transform routines
 * in this file.  When the stride is exactly 16 the vertices are contiguous
 * and a single `rep movsq` of 2*count quadwords does the copy; otherwise
 * each vertex is moved individually and the source pointer advances by
 * `stride` bytes.  (Previously the stride was loaded but ignored, so a
 * source with stride != 16 was copied from the wrong addresses.)
 *
 * No alignment is required of either buffer (movups / movsq).
 *
 * NOTE(review): the stride is fetched with movzbl, i.e. only one byte of
 * the field is read -- confirm against the field layout in matypes.h.
 *
 * Writes rax, rcx, rsi, rdi, xmm0 and the flags.
 */
.align 16
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	jz p4_identity_done

	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetcht1 64(%rsi)
	prefetcht1 64(%rdi)

	cmpl $16, %eax			/* source vertices contiguous? */
	jne p4_identity_strided_loop

	add %ecx, %ecx			/* 16 bytes per vertex = 2 quadwords */

	rep movsq

	.byte 0xf3			/* REP prefix: together with ret forms "rep ret" */
	ret

p4_identity_strided_loop:

	movups (%rsi), %xmm0		/* one source vertex */
	addq %rax, %rsi			/* advance source by stride */
	movups %xmm0, (%rdi)
	addq $16, %rdi

	decl %ecx
	jnz p4_identity_strided_loop

p4_identity_done:
	.byte 0xf3			/* REP prefix: together with ret forms "rep ret" */
	ret
223848b8605Smrg
224848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (floats read at byte offsets 0, 20, 40, 48, 52 and 56,
 *	              named m00, m11, m22, m30, m31, m32 in the comments)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * Copies the source count to dest, sets dest size to 4, ORs VEC_SIZE_4
 * into dest flags, then for each source vertex (x0, x1, x2, x3) writes
 *
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2*m22 + x3*m32
 *	r3 = x3
 *
 * Source advances by `stride` bytes per vertex, dest by 16 bytes.
 * Uses the MMX registers with 3DNow! instructions (pfmul/pfacc/pfadd) and
 * executes femms before returning, including on the count == 0 path.
 *
 * NOTE(review): the stride is fetched with movzbl, i.e. only one byte of
 * the field is read -- confirm against the field layout in matypes.h.
 *
 * Writes rax, rcx, rdx, rdi, mm0-mm7 (as used below) and the flags.
 * Lane comments list the high dword first.
 */
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
.hidden _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_3d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 (%rdx)

	movd (%rsi), %mm0		/*                 | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movd 40(%rsi), %mm2		/*                 | m22             */
	movq 48(%rsi), %mm1		/* m31             | m30             */

	punpckldq 56(%rsi), %mm2	/* m32             | m22             */

p4_3d_no_rot_loop:

	prefetcht1 32(%rdi)

	movq  (%rdx), %mm4		/* x1              | x0              */
	movq  8(%rdx), %mm5		/* x3              | x2              */
	movd  12(%rdx), %mm7		/*                 | x3              */

	movq  %mm5, %mm6		/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckhdq %mm6, %mm6		/* x3              | x3              */
	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */

	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */

	pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	addq %rax, %rdx			/* advance source by stride */
	movq %mm4, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetcht1 32(%rdx)		/* does not modify flags; jnz sees decl */
	jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms
	ret
288848b8605Smrg
289848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (floats read at byte offsets 0, 20, 32, 36, 40 and 56,
 *	              named m00, m11, m20, m21, m22, m32 in the comments)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * Copies the source count to dest, sets dest size to 4, ORs VEC_SIZE_4
 * into dest flags, then for each source vertex (x0, x1, x2, x3) writes
 *
 *	r0 = x0*m00 + x2*m20
 *	r1 = x1*m11 + x2*m21
 *	r2 = x2*m22 + x3*m32
 *	r3 = -x2
 *
 * Source advances by `stride` bytes per vertex, dest by 16 bytes.
 * Uses the MMX registers with 3DNow! instructions and executes femms
 * before returning, including on the count == 0 path.
 *
 * NOTE(review): the stride is fetched with movzbl, i.e. only one byte of
 * the field is read -- confirm against the field layout in matypes.h.
 *
 * Writes rax, rcx, rdx, rdi, mm0-mm7 and the flags.
 * Lane comments list the high dword first.
 */
.align 16
.globl _mesa_3dnow_transform_points4_perspective
.hidden _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_perspective_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	pxor %mm7, %mm7			/* 0               | 0               */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 32(%rsi), %mm2		/* m21             | m20             */
	prefetcht1 (%rdx)

	movd 40(%rsi), %mm1		/*                 | m22             */

	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32             | m22             */


p4_perspective_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead in dest */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 8(%rdx), %mm3		/*                 | x2              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckldq %mm5, %mm5		/* x2              | x2              */

	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
	pfsubr %mm7, %mm3		/* 0               | 0 - x2 = -x2    */

	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */

	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */

	movq %mm5, (%rdi)		/* write r0, r1                      */
	addq %rax, %rdx			/* advance source by stride */
	movq %mm6, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetcht1 32(%rdx)		/* prefetch 32 bytes past the next source vertex; flags untouched */
	jnz p4_perspective_loop

p4_perspective_done:
	femms
	ret
356848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (floats read at byte offsets 0, 20, 48 and 52,
 *	              named m00, m11, m30, m31 in the comments)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * Copies the source count to dest, sets dest size to 4, ORs VEC_SIZE_4
 * into dest flags, then for each source vertex (x0, x1, x2, x3) writes
 *
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 *
 * Source advances by `stride` bytes per vertex, dest by 16 bytes.
 * Uses the MMX registers with 3DNow! instructions and executes femms
 * before returning, including on the count == 0 path.
 *
 * NOTE(review): the stride is fetched with movzbl, i.e. only one byte of
 * the field is read -- confirm against the field layout in matypes.h.
 *
 * Writes rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6 and the flags.
 * Lane comments list the high dword first.
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
.hidden _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x90			/* manual align += 1 */
	jz p4_2d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	prefetcht1 (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 48(%rsi), %mm1		/* m31             | m30             */

p4_2d_no_rot_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead in dest */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
	movq %mm5, %mm6			/* x3              | x2              */

	punpckhdq %mm6, %mm6		/* x3              | x3              */

	addq %rax, %rdx			/* advance source by stride */
	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */

	prefetcht1 32(%rdx)		/* prefetch 32 bytes past the next source vertex */
	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	movq %mm6, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms
	ret
411848b8605Smrg
412848b8605Smrg
/*
 * _mesa_3dnow_transform_points4_2d
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (floats read at byte offsets 0, 4, 16, 20, 48 and 52,
 *	              named m00, m01, m10, m11, m30, m31 in the comments)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * Copies the source count to dest, sets dest size to 4, ORs VEC_SIZE_4
 * into dest flags, then for each source vertex (x0, x1, x2, x3) writes
 *
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 *
 * Source advances by `stride` bytes per vertex, dest by 16 bytes.
 * Uses the MMX registers with 3DNow! instructions and executes femms
 * before returning, including on the count == 0 path.
 *
 * NOTE(review): the stride is fetched with movzbl, i.e. only one byte of
 * the field is read -- confirm against the field layout in matypes.h.
 *
 * Writes rax, rcx, rdx, rdi, mm0-mm6 and the flags.
 * Lane comments list the high dword first.
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d
.hidden _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_2d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	movd 4(%rsi), %mm1		/*                 | m01             */

	prefetcht1 (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11             | m01             */

	movq 48(%rsi), %mm2		/* m31             | m30             */

p4_2d_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead in dest */

	movq (%rdx), %mm3		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	movq %mm3, %mm4			/* x1              | x0              */
	movq %mm5, %mm6			/* x3              | x2              */

	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
	punpckhdq %mm6, %mm6		/* x3              | x3              */

	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */

	addq %rax, %rdx			/* advance source by stride */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */

	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
	prefetcht1 32(%rdx)		/* prefetch 32 bytes past the next source vertex */

	pfadd %mm6, %mm3		/* r1              | r0              */

	movq %mm3, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms
	ret
478848b8605Smrg
#endif	/* closes the conditional opened earlier in this file */

/* On ELF/Linux builds, emit an empty (flag-less) .note.GNU-stack section. */
#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif
484