/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), run through the
 * C preprocessor.  Arguments arrive in rdi, rsi, rdx.
 *
 * V4F_COUNT, V4F_STRIDE, V4F_SIZE, V4F_FLAGS, V4F_START and VEC_SIZE_4 are
 * not defined in this file; presumably they come from matypes.h.
 */

#ifdef USE_X86_64_ASM

#include "matypes.h"

.text

/*
 * _mesa_x86_64_cpuid
 *
 * In:    rdi = pointer to four consecutive 32-bit words.
 *              word 0 is loaded into eax and word 2 into ecx before CPUID.
 * Out:   the four words are overwritten with eax, ebx, ecx, edx (in that
 *        order) as left by CPUID.
 * Clobb: rax, rcx, rdx.  rbx is saved and restored because CPUID
 *        overwrites ebx.
 */
.align 16
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	pushq	%rbx
	movl	(%rdi), %eax
	movl	8(%rdi), %ecx

	cpuid

	movl	%ebx, 4(%rdi)
	movl	%eax, (%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx
	ret

/*
 * _mesa_x86_64_transform_points4_general
 *
 * For every source vertex (ox, oy, oz, ow) stores
 *     ox * M[0..3] + oy * M[4..7] + oz * M[8..11] + ow * M[12..15]
 * as four floats into the next 16-byte slot of the destination.
 *
 * In:    rdi = dest vector, rsi = matrix (16 floats), rdx = source vector.
 * Out:   dest count = source count, dest size = 4, VEC_SIZE_4 or-ed into
 *        the dest flags; nothing is returned in rax.
 * Clobb: rax, rcx, rdx, rdi, xmm0-xmm8, flags.
 * Align: the matrix is read and the destination written with movaps, so
 *        both must be 16-byte aligned.  Source vertices are read with
 *        movups and may be unaligned.
 *
 * NOTE(review): the stride is fetched with movzbl, so only its low byte is
 * used (0..255) -- confirm this matches the width of the stride field.
 */
.align 16
.globl _mesa_x86_64_transform_points4_general
.hidden _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
/*
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride, in bytes (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	.byte	0x66, 0x66, 0x66, 0x90	/* manual align += 4 (4-byte padded nop) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* does not modify flags; jz still tests ecx */
	jz	p4_general_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps	0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	movaps	32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps	48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movups	(%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq	%rax, %rdx		/* advance source by its stride */
	pshufd	$0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps	%xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetcht1 16(%rdx)
	addps	%xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps	%xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq	$16, %rdi		/* destination is always packed */

	decl	%ecx
	jnz	p4_general_loop

p4_general_done:
	.byte	0xf3			/* with the ret below: "rep ret" */
	ret

.section .rodata

/*
 * Two 16-byte constants used by _mesa_x86_64_transform_points4_3d:
 *   +0  : AND mask keeping lanes 0..2 and clearing lane 3
 *   +16 : 0.0 | 0.0 | 0.0 in lanes 0..2 and 1.0 in lane 3
 */
.align 16
p4_constants:
.byte	0xff, 0xff, 0xff, 0xff
.byte	0xff, 0xff, 0xff, 0xff
.byte	0xff, 0xff, 0xff, 0xff
.byte	0x00, 0x00, 0x00, 0x00

.byte	0x00, 0x00, 0x00, 0x00
.byte	0x00, 0x00, 0x00, 0x00
.byte	0x00, 0x00, 0x00, 0x00
.float	1.0

.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
.hidden _mesa_x86_64_transform_points4_3d
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same computation and register contract as
 * _mesa_x86_64_transform_points4_general, except that the matrix copy held
 * in registers is patched first: elements 3, 7 and 11 are forced to 0.0 and
 * element 15 is forced to 1.0.  The extra setup is why this is slower than
 * the general routine.
 *
 * In:    rdi = dest vector, rsi = matrix (16 floats), rdx = source vector.
 * Clobb: rax, rcx, rdx, rdi, xmm0-xmm10, flags.
 * Align: matrix, p4_constants and destination are accessed with movaps.
 */
_mesa_x86_64_transform_points4_3d:

	leaq	p4_constants(%rip), %rax	/* rax = constants; reused for stride below */

	prefetchnta 64(%rsi)

	movaps	(%rax), %xmm9		/* lane mask: clears lane 3 */
	movaps	16(%rax), %xmm10	/* 1.0 | 0.0 | 0.0 | 0.0 */

	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride, in bytes (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	jz	p4_3d_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps	0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps	%xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps	32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps	%xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps	48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps	%xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps	%xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	orps	%xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:
	/* below, m3 = m7 = m11 = 0.0 and m15 = 1.0 after the patching above */

	movups	(%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq	%rax, %rdx		/* advance source by its stride */
	pshufd	$0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps	%xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetcht1 16(%rdx)
	addps	%xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps	%xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq	$16, %rdi

	dec	%ecx
	jnz	p4_3d_loop

p4_3d_done:
	.byte	0xf3			/* with the ret below: "rep ret" */
	ret


/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Copies count four-float vertices from the source vector to the
 * destination vector; the matrix argument is ignored.
 *
 * In:    rdi = dest vector, rdx = source vector (rsi = matrix, unused and
 *        overwritten with the source data pointer).
 * Out:   dest count = source count, dest size = 4, VEC_SIZE_4 or-ed into
 *        the dest flags.
 * Clobb: rax, rcx, rsi, rdi, xmm0, flags.
 *
 * The destination is always written as packed 16-byte vertices.  A source
 * stride of exactly 16 takes the block-copy path; any other stride is
 * honoured by the per-vertex loop, matching how the other transform
 * routines in this file step through the source.  (Previously the stride
 * was loaded but never used, so a non-16 stride copied the wrong data.)
 */
.align 16
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride, in bytes (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	test	%ecx, %ecx
	jz	p4_identity_done

	movq	V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetcht1 64(%rsi)
	prefetcht1 64(%rdi)

	cmpl	$16, %eax		/* is the source tightly packed? */
	jne	p4_identity_strided_loop

	add	%ecx, %ecx		/* two quadwords per vertex */

	rep movsq			/* copy rcx quadwords; relies on DF = 0 */

p4_identity_done:
	.byte	0xf3			/* with the ret below: "rep ret" */
	ret

p4_identity_strided_loop:
	movups	(%rsi), %xmm0		/* unaligned-safe 16-byte load */
	addq	%rax, %rsi		/* advance source by its stride */
	movups	%xmm0, (%rdi)		/* movups: adds no alignment requirement */
	addq	$16, %rdi

	decl	%ecx
	jnz	p4_identity_strided_loop

	.byte	0xf3			/* with the ret below: "rep ret" */
	ret

/*
 * The four routines below share one register contract:
 *
 * In:    rdi = dest vector, rsi = matrix (array of floats, addressed by
 *        byte offset), rdx = source vector.
 * Out:   dest count = source count, dest size = 4, VEC_SIZE_4 or-ed into
 *        the dest flags; one 16-byte result is written per source vertex.
 * Clobb: rax, rcx, rdx, rdi, MMX registers, flags.  Each routine executes
 *        femms before returning, on the empty-input path as well.
 *
 * They use 3DNow! instructions (pfmul, pfadd, pfacc, pfsubr, femms).
 * NOTE(review): presumably the caller only installs them after checking
 * that the CPU supports 3DNow! -- confirm at the call site.
 *
 * In the lane comments "hi | lo" lists the upper dword first; x0..x3 are
 * the four floats of the current source vertex and r0..r3 the results.
 * The stride is fetched with movzbl, so only its low byte is used.
 *
 * The ".byte 0x66, 0x66, 0x90" and ".byte 0x90" sequences are nop padding
 * placed by hand; instruction order is left untouched because the padding
 * depends on it.
 */

/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 *
 * Matrix floats read: byte offsets 0, 20, 40, 48, 52 and 56.
 */
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
.hidden _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride, in bytes (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	test	%ecx, %ecx
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	jz	p4_3d_no_rot_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 (%rdx)

	movd	(%rsi), %mm0		/*     | m00 */
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movd	40(%rsi), %mm2		/*     | m22 */
	movq	48(%rsi), %mm1		/* m31 | m30 */

	punpckldq 56(%rsi), %mm2	/* m32 | m22 */

p4_3d_no_rot_loop:

	prefetcht1 32(%rdi)

	movq	(%rdx), %mm4		/* x1 | x0 */
	movq	8(%rdx), %mm5		/* x3 | x2 */
	movd	12(%rdx), %mm7		/*  0 | x3 */

	movq	%mm5, %mm6		/* x3 | x2 */
	pfmul	%mm0, %mm4		/* x1*m11 | x0*m00 */

	punpckhdq %mm6, %mm6		/* x3 | x3 */
	pfmul	%mm2, %mm5		/* x3*m32 | x2*m22 */

	pfmul	%mm1, %mm6		/* x3*m31 | x3*m30 */
	pfacc	%mm7, %mm5		/* x3 | x2*m22+x3*m32 */

	pfadd	%mm6, %mm4		/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	addq	%rax, %rdx		/* advance source by its stride */
	movq	%mm4, (%rdi)		/* write r0, r1 */
	movq	%mm5, 8(%rdi)		/* write r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetcht1 32(%rdx)		/* leaves flags alone; jnz tests the decl */
	jnz	p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms
	ret


/*
 * _mesa_3dnow_transform_points4_perspective
 *
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 *
 * Matrix floats read: byte offsets 0, 20, 32, 36, 40 and 56.
 */
.align 16
.globl _mesa_3dnow_transform_points4_perspective
.hidden _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:

	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride, in bytes (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	test	%ecx, %ecx
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	jz	p4_perspective_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd	(%rsi), %mm0		/*     | m00 */
	pxor	%mm7, %mm7		/*  0  |  0  */
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movq	32(%rsi), %mm2		/* m21 | m20 */
	prefetcht1 (%rdx)

	movd	40(%rsi), %mm1		/*     | m22 */

	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32 | m22 */


p4_perspective_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead */

	movq	(%rdx), %mm4		/* x1 | x0 */
	movq	8(%rdx), %mm5		/* x3 | x2 */
	movd	8(%rdx), %mm3		/*  0 | x2 */

	movq	%mm5, %mm6		/* x3 | x2 */
	pfmul	%mm0, %mm4		/* x1*m11 | x0*m00 */

	punpckldq %mm5, %mm5		/* x2 | x2 */

	pfmul	%mm2, %mm5		/* x2*m21 | x2*m20 */
	pfsubr	%mm7, %mm3		/*  0 | -x2  (0 - x2) */

	pfmul	%mm1, %mm6		/* x3*m32 | x2*m22 */
	pfadd	%mm4, %mm5		/* x1*m11+x2*m21 | x0*m00+x2*m20 */

	pfacc	%mm3, %mm6		/* -x2 | x2*m22+x3*m32 */

	movq	%mm5, (%rdi)		/* write r0, r1 */
	addq	%rax, %rdx		/* advance source by its stride */
	movq	%mm6, 8(%rdi)		/* write r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetcht1 32(%rdx)		/* hopefully stride is zero */
	jnz	p4_perspective_loop

p4_perspective_done:
	femms
	ret

/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 *
 * Matrix floats read: byte offsets 0, 20, 48 and 52.
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
.hidden _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride, in bytes (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	test	%ecx, %ecx
	.byte	0x90			/* manual align += 1 */
	jz	p4_2d_no_rot_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd	(%rsi), %mm0		/*     | m00 */
	prefetcht1 (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movq	48(%rsi), %mm1		/* m31 | m30 */

p4_2d_no_rot_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead */

	movq	(%rdx), %mm4		/* x1 | x0 */
	movq	8(%rdx), %mm5		/* x3 | x2 */

	pfmul	%mm0, %mm4		/* x1*m11 | x0*m00 */
	movq	%mm5, %mm6		/* x3 | x2 */

	punpckhdq %mm6, %mm6		/* x3 | x3 */

	addq	%rax, %rdx		/* advance source by its stride */
	pfmul	%mm1, %mm6		/* x3*m31 | x3*m30 */

	prefetcht1 32(%rdx)		/* hopefully stride is zero */
	pfadd	%mm4, %mm6		/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	movq	%mm6, (%rdi)		/* write r0, r1 */
	movq	%mm5, 8(%rdi)		/* write r2, r3 (x2, x3 copied unchanged) */

	addq	$16, %rdi

	decl	%ecx
	jnz	p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms
	ret


/*
 * _mesa_3dnow_transform_points4_2d
 *
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 *
 * Matrix floats read: byte offsets 0, 4, 16, 20, 48 and 52.
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d
.hidden _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:

	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride, in bytes (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	test	%ecx, %ecx
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	jz	p4_2d_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd	(%rsi), %mm0		/*     | m00 */
	movd	4(%rsi), %mm1		/*     | m01 */

	prefetcht1 (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10 | m00 */
	.byte	0x66, 0x66, 0x90	/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11 | m01 */

	movq	48(%rsi), %mm2		/* m31 | m30 */

p4_2d_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead */

	movq	(%rdx), %mm3		/* x1 | x0 */
	movq	8(%rdx), %mm5		/* x3 | x2 */

	movq	%mm3, %mm4		/* x1 | x0 */
	movq	%mm5, %mm6		/* x3 | x2 */

	pfmul	%mm1, %mm4		/* x1*m11 | x0*m01 */
	punpckhdq %mm6, %mm6		/* x3 | x3 */

	pfmul	%mm0, %mm3		/* x1*m10 | x0*m00 */

	addq	%rax, %rdx		/* advance source by its stride */
	pfacc	%mm4, %mm3		/* x0*m01+x1*m11 | x0*m00+x1*m10 */

	pfmul	%mm2, %mm6		/* x3*m31 | x3*m30 */
	prefetcht1 32(%rdx)		/* hopefully stride is zero */

	pfadd	%mm6, %mm3		/* r1 | r0 */

	movq	%mm3, (%rdi)		/* write r0, r1 */
	movq	%mm5, 8(%rdi)		/* write r2, r3 (x2, x3 copied unchanged) */

	addq	$16, %rdi

	decl	%ecx
	jnz	p4_2d_loop

p4_2d_done:
	femms
	ret

#endif

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif