xform4.S revision c1f859d4
1/*
2 * Mesa 3-D graphics library
3 * Version:  7.1
4 *
5 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25#ifdef USE_X86_64_ASM
26
27#include "matypes.h"
28
29.text
30
.align 16
.globl _mesa_x86_64_cpuid
/*
 * Run CPUID on a four-dword register block.
 *
 *	rdi = pointer to four consecutive 32-bit words: eax, ebx, ecx, edx
 *
 * The words at offsets 0 and 8 are loaded into eax and ecx before the
 * instruction is issued; afterwards all four words are overwritten with
 * the values CPUID left in eax, ebx, ecx and edx.
 */
_mesa_x86_64_cpuid:
	pushq	%rbx			/* rbx is callee-saved and CPUID writes ebx */
	movl	8(%rdi), %ecx		/* ecx input */
	movl	0(%rdi), %eax		/* eax input */

	cpuid

	movl	%eax, 0(%rdi)
	movl	%ebx, 4(%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx			/* restore the caller's rbx */
	ret
46
.align 16
.globl _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
/*
 * Transform 4-component vertices by a full 4x4 matrix (SSE).
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * For every source vertex (x, y, z, w) the routine stores
 *	x * m[0..3] + y * m[4..7] + z * m[8..11] + w * m[12..15]
 * as one 16-byte vertex in the destination.  Source vertices are read
 * "stride" bytes apart; destination vertices are written 16 bytes apart
 * with movaps, so the destination storage must be 16-byte aligned.
 * The matrix is loaded with movaps and must be 16-byte aligned as well.
 *
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm8, flags.
 */
	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-to-long form: the unsuffixed "movzx" left the size
	 * of the memory operand for the assembler to guess.
	 * NOTE(review): only the low byte of the stride field is read --
	 * confirm that strides never exceed 255.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 4 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* does not touch flags */
	jz p4_general_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance to next src vertex */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi			/* dest vertices are packed */

	decl %ecx
	jnz p4_general_loop

p4_general_done:
	.byte 0xf3			/* "rep ret" */
	ret
106
.section .rodata

/*
 * Two 16-byte constants loaded by _mesa_x86_64_transform_points4_3d:
 *	p4_constants +  0: bit mask that keeps dwords 0-2 and clears dword 3
 *	p4_constants + 16: the float vector 0.0, 0.0, 0.0, 1.0
 */
.align 16
p4_constants:
.long  0xffffffff, 0xffffffff, 0xffffffff, 0x00000000

.long  0x00000000, 0x00000000, 0x00000000
.float 1.0
120
.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
/*
 * Transform 4-component vertices by a 4x4 matrix whose elements
 * m3, m7 and m11 are forced to 0.0 and whose element m15 is forced
 * to 1.0 before use (SSE).
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * Apart from the masking of the matrix this is the same loop as
 * _mesa_x86_64_transform_points4_general; the masking is extra set-up
 * work, so this entry point is slower than the general one.
 *
 * Source vertices are read "stride" bytes apart; destination vertices
 * are written 16 bytes apart with movaps (16-byte aligned destination).
 *
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm10, flags.
 */
_mesa_x86_64_transform_points4_3d:

	leaq p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps (%rax), %xmm9		/* mask: clear dword 3, keep dwords 0-2 */
	movaps 16(%rax), %xmm10		/* 1.0 | 0.0 | 0.0 | 0.0 */

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-to-long form: the unsuffixed "movzx" left the size
	 * of the memory operand for the assembler to guess.
	 * NOTE(review): only the low byte of the stride field is read --
	 * confirm that strides never exceed 255.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_3d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps  %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance to next src vertex */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* 0.0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* 0.0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* 0.0 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* sum of the x and y terms */
	mulps %xmm7, %xmm3		/* ow | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ... plus the z terms */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ... plus the w terms */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi			/* dest vertices are packed */

	decl %ecx
	jnz p4_3d_loop

p4_3d_done:
	.byte 0xf3			/* "rep ret" */
	ret
191
192
.align 16
.globl _mesa_x86_64_transform_points4_identity
/*
 * Identity transform: copy 4-component vertices from source to dest.
 *
 *	rdi = dest
 *	rsi = matrix (not read; rsi is reused as the copy source pointer)
 *	rdx = source
 *
 * Destination vertices are always written 16 bytes apart.  When the
 * source stride is 16 the data is contiguous and is moved with a single
 * "rep movsq".  Any other stride is honoured by a per-vertex loop;
 * previously the stride was loaded but ignored, so such sources were
 * copied as if they were packed.
 *
 * Neither path requires 16-byte alignment of source or destination.
 *
 * Clobbers: rax, rcx, rsi, rdi, xmm0, flags.
 */
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-to-long form: the unsuffixed "movzx" left the size
	 * of the memory operand for the assembler to guess.
	 * NOTE(review): only the low byte of the stride field is read --
	 * confirm that strides never exceed 255.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_identity_done

	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl $16, %eax			/* packed source? */
	jne p4_identity_strided_loop

	addl %ecx, %ecx			/* two quadwords per vertex */

	rep movsq			/* relies on DF = 0 at function entry */
	ret

p4_identity_strided_loop:

	movups (%rsi), %xmm0		/* one source vertex */
	addq %rax, %rsi			/* advance by the source stride */
	movups %xmm0, (%rdi)
	addq $16, %rdi			/* dest vertices are packed */

	decl %ecx
	jnz p4_identity_strided_loop

p4_identity_done:
	.byte 0xf3			/* "rep ret" */
	ret
219
220
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
/*
 * Transform 4-component vertices by a matrix of which only the
 * elements m00, m11, m22, m30, m31 and m32 are read (3DNow!).
 *
 *	rdi = dest
 *	rsi = matrix (floats at byte offsets 0, 20, 40, 48, 52 and 56)
 *	rdx = source
 *
 * Per vertex (x0, x1, x2, x3) the result is
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2*m22 + x3*m32
 *	r3 = x3
 * Source vertices are read "stride" bytes apart, destination vertices
 * are written 16 bytes apart.  MMX state is released with femms on
 * every exit path.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm2, mm4-mm7, flags.
 */
_mesa_3dnow_transform_points4_3d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-to-long form: the unsuffixed "movzx" left the size
	 * of the memory operand for the assembler to guess.
	 * NOTE(review): only the low byte of the stride field is read --
	 * confirm that strides never exceed 255.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx			/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_3d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch (%rdx)

	movd (%rsi), %mm0		/*                 | m00             */
	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movd 40(%rsi), %mm2		/*                 | m22             */
	movq 48(%rsi), %mm1		/* m31             | m30             */

	punpckldq 56(%rsi), %mm2	/* m32             | m22             */

p4_3d_no_rot_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq  (%rdx), %mm4		/* x1              | x0              */
	movq  8(%rdx), %mm5		/* x3              | x2              */
	movd  12(%rdx), %mm7		/*                 | x3              */

	movq  %mm5, %mm6		/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckhdq %mm6, %mm6		/* x3              | x3              */
	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */

	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */

	pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	addq %rax, %rdx			/* advance to next src vertex        */
	movq %mm4, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetch 32(%rdx)		/* leaves the flags from decl intact */
	jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms
	ret
283
284
.align 16
.globl _mesa_3dnow_transform_points4_perspective
/*
 * Transform 4-component vertices by a matrix of which only the
 * elements m00, m11, m20, m21, m22 and m32 are read (3DNow!).
 *
 *	rdi = dest
 *	rsi = matrix (floats at byte offsets 0, 20, 32, 36, 40 and 56)
 *	rdx = source
 *
 * Per vertex (x0, x1, x2, x3) the result is
 *	r0 = x0*m00 + x2*m20
 *	r1 = x1*m11 + x2*m21
 *	r2 = x2*m22 + x3*m32
 *	r3 = -x2
 * Source vertices are read "stride" bytes apart, destination vertices
 * are written 16 bytes apart.  MMX state is released with femms on
 * every exit path.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm7, flags.
 */
_mesa_3dnow_transform_points4_perspective:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-to-long form: the unsuffixed "movzx" left the size
	 * of the memory operand for the assembler to guess.
	 * NOTE(review): only the low byte of the stride field is read --
	 * confirm that strides never exceed 255.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx			/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_perspective_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	pxor %mm7, %mm7			/* 0               | 0               */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 32(%rsi), %mm2		/* m21             | m20             */
	prefetch (%rdx)

	movd 40(%rsi), %mm1		/*                 | m22             */

	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32             | m22             */


p4_perspective_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 8(%rdx), %mm3		/*                 | x2              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckldq %mm5, %mm5		/* x2              | x2              */

	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
	pfsubr %mm7, %mm3		/* 0               | 0 - x2          */

	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */

	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */

	movq %mm5, (%rdi)		/* write r0, r1                      */
	addq %rax, %rdx			/* advance to next src vertex        */
	movq %mm6, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetch 32(%rdx)		/* hopefully stride is zero          */
	jnz p4_perspective_loop

p4_perspective_done:
	femms
	ret
350
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
/*
 * Transform 4-component vertices by a matrix of which only the
 * elements m00, m11, m30 and m31 are read (3DNow!).
 *
 *	rdi = dest
 *	rsi = matrix (floats at byte offsets 0, 20, 48 and 52)
 *	rdx = source
 *
 * Per vertex (x0, x1, x2, x3) the result is
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * Source vertices are read "stride" bytes apart, destination vertices
 * are written 16 bytes apart.  MMX state is released with femms on
 * every exit path.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6, flags.
 */
_mesa_3dnow_transform_points4_2d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-to-long form: the unsuffixed "movzx" left the size
	 * of the memory operand for the assembler to guess.
	 * NOTE(review): only the low byte of the stride field is read --
	 * confirm that strides never exceed 255.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx			/* verify non-zero count */
	.byte 0x90			/* manual align += 1 */
	jz p4_2d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 48(%rsi), %mm1		/* m31             | m30             */

p4_2d_no_rot_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
	movq %mm5, %mm6			/* x3              | x2              */

	punpckhdq %mm6, %mm6		/* x3              | x3              */

	addq %rax, %rdx			/* advance to next src vertex        */
	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */

	prefetch 32(%rdx)		/* hopefully stride is zero          */
	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	movq %mm6, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms
	ret
404
405
.align 16
.globl _mesa_3dnow_transform_points4_2d
/*
 * Transform 4-component vertices by a matrix of which only the
 * elements m00, m01, m10, m11, m30 and m31 are read (3DNow!).
 *
 *	rdi = dest
 *	rsi = matrix (floats at byte offsets 0, 4, 16, 20, 48 and 52)
 *	rdx = source
 *
 * Per vertex (x0, x1, x2, x3) the result is
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * Source vertices are read "stride" bytes apart, destination vertices
 * are written 16 bytes apart.  MMX state is released with femms on
 * every exit path.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm6, flags.
 */
_mesa_3dnow_transform_points4_2d:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-to-long form: the unsuffixed "movzx" left the size
	 * of the memory operand for the assembler to guess.
	 * NOTE(review): only the low byte of the stride field is read --
	 * confirm that strides never exceed 255.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx			/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_2d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	movd 4(%rsi), %mm1		/*                 | m01             */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11             | m01             */

	movq 48(%rsi), %mm2		/* m31             | m30             */

p4_2d_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm3		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	movq %mm3, %mm4			/* x1              | x0              */
	movq %mm5, %mm6			/* x3              | x2              */

	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
	punpckhdq %mm6, %mm6		/* x3              | x3              */

	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */

	addq %rax, %rdx			/* advance to next src vertex        */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */

	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
	prefetch 32(%rdx)		/* hopefully stride is zero          */

	pfadd %mm6, %mm3		/* r1              | r0              */

	movq %mm3, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms
	ret
470
471#endif
472
473#if defined (__ELF__) && defined (__linux__)
474	.section .note.GNU-stack,"",%progbits
475#endif
476