xform4.S revision 7117f1b4
1/*
2 * Mesa 3-D graphics library
3 * Version:  7.0.1
4 *
5 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25#ifdef USE_X86_64_ASM
26
27#include "matypes.h"
28
29.text
30
31.align 16
32
.globl _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
/*
 * Transform 4-component points by a full 4x4 matrix using SSE.
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (16 floats, read with movaps => must be 16-byte aligned)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * For every source vertex (ox, oy, oz, ow) the routine stores
 *	D = ox*(m0..m3) + oy*(m4..m7) + oz*(m8..m11) + ow*(m12..m15)
 * to the destination, which is written with movaps (16-byte aligned,
 * 16 bytes per vertex).  The source is read with movups and advanced
 * by the source stride.
 *
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm8, flags.
 */
	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-sized, zero-extending load: a suffix-less "movzx"
	 * with a memory source does not tell the assembler the source size.
	 * NOTE(review): only the low byte of the stride field is read, so
	 * strides above 255 are not representable here - confirm the field
	 * width against matypes.h.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 4 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* does not modify flags */
	jz p4_general_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	/* Keep the whole matrix resident in xmm4-xmm7 for the loop. */
	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:
	/* Invariant: ecx = vertices left (> 0), rdx = src, rdi = dest. */

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* src += stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi			/* dest += one packed vertex */

	decl %ecx
	jnz p4_general_loop

p4_general_done:
	.byte 0xf3			/* "rep" prefix: two-byte "rep ret" */
	ret
91
.section .rodata

.align 16
/*
 * Two 16-byte SSE constants used by _mesa_x86_64_transform_points4_3d:
 *
 *	p4_constants +  0: bit mask, all ones in lanes 0-2 and zero in
 *			   lane 3 (applied with andps)
 *	p4_constants + 16: the vector (0, 0, 0, 1.0f) (applied with orps)
 *
 * The two constants must stay contiguous: the second one is addressed
 * as 16 bytes past the label.
 */
p4_constants:
.long  0xffffffff			/* lane 0: keep  */
.long  0xffffffff			/* lane 1: keep  */
.long  0xffffffff			/* lane 2: keep  */
.long  0x00000000			/* lane 3: clear */

.long  0x00000000			/* lane 0: 0     */
.long  0x00000000			/* lane 1: 0     */
.long  0x00000000			/* lane 2: 0     */
.float 0f+1.0				/* lane 3: 1.0f  */
105
106.text
107.align 16
.globl _mesa_x86_64_transform_points4_3d
/*
 * Transform 4-component points by a "3d" matrix using SSE.
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (16 floats, read with movaps => must be 16-byte aligned)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * Same inner loop as _mesa_x86_64_transform_points4_general, but the
 * matrix copy held in registers is first patched so that
 * m3 = m7 = m11 = 0 and m15 = 1.  The fourth output component is
 * ox*m3 + oy*m7 + oz*m11 + ow*m15, so this makes the output w equal
 * to the input w whatever the matrix in memory contains.  The extra
 * setup makes this routine slower than the general one.
 *
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm10, flags.
 */
_mesa_x86_64_transform_points4_3d:

	leaq p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps (%rax), %xmm9		/* mask: lane 3 cleared, lanes 0-2 kept */
	movaps 16(%rax), %xmm10		/* 1.0 | 0.0 | 0.0 | 0.0 */

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-sized, zero-extending load: a suffix-less "movzx"
	 * with a memory source does not tell the assembler the source size.
	 * NOTE(review): only the low byte of the stride field is read, so
	 * strides above 255 are not representable here - confirm the field
	 * width against matypes.h.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_3d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps  %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:
	/* Invariant: ecx = vertices left (> 0), rdx = src, rdi = dest. */

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* src += stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* 0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* 0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* 0 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* sum of the ox and oy terms */
	mulps %xmm7, %xmm3		/* ow | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ... plus the oz terms */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ... plus the ow terms */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi			/* dest += one packed vertex */

	decl %ecx
	jnz p4_3d_loop

p4_3d_done:
	.byte 0xf3			/* "rep" prefix: two-byte "rep ret" */
	ret
176
177
178.align 16
.globl _mesa_x86_64_transform_points4_identity
/*
 * Identity "transform": copy 4-component points from source to dest.
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (unused; the register is reused as the copy source)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * The destination is always written packed, 16 bytes per vertex.  When
 * the source stride is 16 as well, the whole array is moved with a
 * single "rep movsq".  For any other stride the vertices are copied one
 * by one, advancing the source by the stride exactly as the other
 * transform routines in this file do; a flat block copy would pick up
 * the wrong bytes in that case.
 *
 * Neither path requires 16-byte alignment of source or destination.
 * Clobbers: rax, rcx, rsi, rdi, xmm0, flags.
 */
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count (upper half of rcx = 0) */
	/*
	 * Explicit byte-sized, zero-extending load: a suffix-less "movzx"
	 * with a memory source does not tell the assembler the source size.
	 * NOTE(review): only the low byte of the stride field is read, so
	 * strides above 255 are not representable here - confirm the field
	 * width against matypes.h.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_identity_done

	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl $16, %eax			/* source packed like the dest? */
	jne p4_identity_strided_loop

	/*
	 * Packed source: 2 quadwords per vertex.  The doubling is done in
	 * 64 bits so that it cannot wrap for very large counts.
	 */
	addq %rcx, %rcx
	rep movsq

p4_identity_done:
	.byte 0xf3			/* "rep" prefix: two-byte "rep ret" */
	ret

p4_identity_strided_loop:
	/* Invariant: ecx = vertices left (> 0), rsi = src, rdi = dest. */
	movups (%rsi), %xmm0		/* load one vertex (4 floats) */
	addq %rax, %rsi			/* src += stride */
	movups %xmm0, (%rdi)		/* store it packed */
	addq $16, %rdi			/* dest += one packed vertex */

	decl %ecx
	jnz p4_identity_strided_loop

	.byte 0xf3			/* "rep" prefix: two-byte "rep ret" */
	ret
204
205
206.align 16
.globl _mesa_x86_64_transform_points4_3d_no_rot
/*
 * Transform 4-component points by a "3d, no rotation" matrix using
 * 3DNow! instructions.
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (only the floats at byte offsets 0, 20, 40, 48, 52
 *		      and 56 are read)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * In the m<i><j> naming used by the comments below (float index 4*i+j),
 * each vertex (x0, x1, x2, x3) produces
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2*m22 + x3*m32
 *	r3 = x3
 * Register comments read "high dword | low dword".
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm2, mm4-mm7, flags.  The MMX state
 * is released with femms before returning.
 */
_mesa_x86_64_transform_points4_3d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-sized, zero-extending load: a suffix-less "movzx"
	 * with a memory source does not tell the assembler the source size.
	 * NOTE(review): only the low byte of the stride field is read, so
	 * strides above 255 are not representable here - confirm the field
	 * width against matypes.h.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_3d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch (%rdx)

	movd (%rsi), %mm0		/*                 | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movd 40(%rsi), %mm2		/*                 | m22             */
	movq 48(%rsi), %mm1		/* m31             | m30             */

	punpckldq 56(%rsi), %mm2	/* m32             | m22             */

p4_3d_no_rot_loop:
	/* Invariant: ecx = vertices left (> 0), rdx = src, rdi = dest. */

	prefetchw 32(%rdi)

	movq  (%rdx), %mm4		/* x1              | x0              */
	movq  8(%rdx), %mm5		/* x3              | x2              */
	movd  12(%rdx), %mm7		/* 0               | x3              */

	movq  %mm5, %mm6		/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckhdq %mm6, %mm6		/* x3              | x3              */
	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */

	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */

	pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	addq %rax, %rdx			/* src += stride */
	movq %mm4, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi			/* dest += one packed vertex */

	decl %ecx
	prefetch 32(%rdx)		/* does not modify flags */
	jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms				/* leave MMX/3DNow! state */
	ret
268
269
270.align 16
.globl _mesa_x86_64_transform_points4_perspective
/*
 * Transform 4-component points by a perspective-form matrix using
 * 3DNow! instructions.
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (only the floats at byte offsets 0, 20, 32, 36, 40
 *		      and 56 are read)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * In the m<i><j> naming used by the comments below (float index 4*i+j),
 * each vertex (x0, x1, x2, x3) produces
 *	r0 = x0*m00 + x2*m20
 *	r1 = x1*m11 + x2*m21
 *	r2 = x2*m22 + x3*m32
 *	r3 = -x2
 * Register comments read "high dword | low dword".
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm7, flags.  The MMX state is
 * released with femms before returning.
 */
_mesa_x86_64_transform_points4_perspective:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-sized, zero-extending load: a suffix-less "movzx"
	 * with a memory source does not tell the assembler the source size.
	 * NOTE(review): only the low byte of the stride field is read, so
	 * strides above 255 are not representable here - confirm the field
	 * width against matypes.h.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_perspective_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	pxor %mm7, %mm7			/* 0               | 0               */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 32(%rsi), %mm2		/* m21             | m20             */
	prefetch (%rdx)

	movd 40(%rsi), %mm1		/*                 | m22             */

	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32             | m22             */


p4_perspective_loop:
	/* Invariant: ecx = vertices left (> 0), rdx = src, rdi = dest. */

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 8(%rdx), %mm3		/* 0               | x2              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckldq %mm5, %mm5		/* x2              | x2              */

	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
	pfsubr %mm7, %mm3		/* 0               | -x2  (0 - mm3)  */

	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */

	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */

	movq %mm5, (%rdi)		/* write r0, r1                      */
	addq %rax, %rdx			/* src += stride */
	movq %mm6, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi			/* dest += one packed vertex */

	decl %ecx
	prefetch 32(%rdx)		/* does not modify flags */
	jnz p4_perspective_loop

p4_perspective_done:
	femms				/* leave MMX/3DNow! state */
	ret
335
336.align 16
.globl _mesa_x86_64_transform_points4_2d_no_rot
/*
 * Transform 4-component points by a "2d, no rotation" matrix using
 * 3DNow! instructions.
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (only the floats at byte offsets 0, 20, 48 and 52
 *		      are read)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * In the m<i><j> naming used by the comments below (float index 4*i+j),
 * each vertex (x0, x1, x2, x3) produces
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * Register comments read "high dword | low dword".
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6, flags.  The MMX
 * state is released with femms before returning.
 */
_mesa_x86_64_transform_points4_2d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-sized, zero-extending load: a suffix-less "movzx"
	 * with a memory source does not tell the assembler the source size.
	 * NOTE(review): only the low byte of the stride field is read, so
	 * strides above 255 are not representable here - confirm the field
	 * width against matypes.h.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x90			/* manual align += 1 */
	jz p4_2d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 48(%rsi), %mm1		/* m31             | m30             */

p4_2d_no_rot_loop:
	/* Invariant: ecx = vertices left (> 0), rdx = src, rdi = dest. */

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
	movq %mm5, %mm6			/* x3              | x2              */

	punpckhdq %mm6, %mm6		/* x3              | x3              */

	addq %rax, %rdx			/* src += stride */
	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */

	prefetch 32(%rdx)
	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	movq %mm6, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */

	addq $16, %rdi			/* dest += one packed vertex */

	decl %ecx
	jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms				/* leave MMX/3DNow! state */
	ret
389
390
391.align 16
.globl _mesa_x86_64_transform_points4_2d
/*
 * Transform 4-component points by a "2d" matrix using 3DNow!
 * instructions.
 *
 *	rdi = dest   (vector descriptor, accessed through the V4F_* offsets)
 *	rsi = matrix (only the floats at byte offsets 0, 4, 16, 20, 48
 *		      and 52 are read)
 *	rdx = source (vector descriptor, accessed through the V4F_* offsets)
 *
 * In the m<i><j> naming used by the comments below (float index 4*i+j),
 * each vertex (x0, x1, x2, x3) produces
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * Register comments read "high dword | low dword".
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm6, flags.  The MMX state is
 * released with femms before returning.
 */
_mesa_x86_64_transform_points4_2d:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	/*
	 * Explicit byte-sized, zero-extending load: a suffix-less "movzx"
	 * with a memory source does not tell the assembler the source size.
	 * NOTE(review): only the low byte of the stride field is read, so
	 * strides above 255 are not representable here - confirm the field
	 * width against matypes.h.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_2d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	movd 4(%rsi), %mm1		/*                 | m01             */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11             | m01             */

	movq 48(%rsi), %mm2		/* m31             | m30             */

p4_2d_loop:
	/* Invariant: ecx = vertices left (> 0), rdx = src, rdi = dest. */

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm3		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	movq %mm3, %mm4			/* x1              | x0              */
	movq %mm5, %mm6			/* x3              | x2              */

	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
	punpckhdq %mm6, %mm6		/* x3              | x3              */

	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */

	addq %rax, %rdx			/* src += stride */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */

	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
	prefetch 32(%rdx)

	pfadd %mm6, %mm3		/* r1              | r0              */

	movq %mm3, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */

	addq $16, %rdi			/* dest += one packed vertex */

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms				/* leave MMX/3DNow! state */
	ret
455
456#endif
457
458#if defined (__ELF__) && defined (__linux__)
459	.section .note.GNU-stack,"",%progbits
460#endif
461