/* xform4.S revision c1f859d4 */
/*
 * Mesa 3-D graphics library
 * Version:  7.1
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 *
 * Conventions shared by every transform_points4 routine below:
 *	rdi = dest vector
 *	rsi = matrix (16 floats)
 *	rdx = source vector
 * Register comments list lanes from most significant to least significant,
 * e.g. "m3 | m2 | m1 | m0" means m0 sits in the lowest 32 bits.
 *
 * NOTE(review): the source stride is fetched with movzbl, i.e. only the low
 * byte of the field at V4F_STRIDE is used.  This matches what the previous
 * suffix-less "movzx" assembled to under GNU as; confirm against the field
 * width declared for the vector struct if strides above 255 can occur.
 */

#ifdef USE_X86_64_ASM

#include "matypes.h"

.text

/*
 * _mesa_x86_64_cpuid
 *
 * In:	rdi = pointer to four 32-bit words; word 0 supplies eax and
 *	word 2 supplies ecx for the CPUID instruction.
 * Out:	the four words are overwritten with eax, ebx, ecx, edx in that
 *	order.
 * rbx is saved and restored around CPUID.
 */
.align 16
.globl _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	pushq	%rbx
	movl	(%rdi), %eax
	movl	8(%rdi), %ecx

	cpuid

	movl	%ebx, 4(%rdi)
	movl	%eax, (%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx
	ret

/*
 * _mesa_x86_64_transform_points4_general
 *
 * Multiplies every 4-float source vertex by the full 4x4 matrix and writes
 * the results as consecutive 16-byte vertices at the dest start pointer.
 * The matrix and the dest vertices are accessed with movaps; the source
 * vertices are read with movups.
 */
.align 16
.globl _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
/*
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* does not touch the flags tested below */
	jz p4_general_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance src by stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi

	decl %ecx
	jnz p4_general_loop

p4_general_done:
	.byte 0xf3			/* rep prefix: emits "rep ret" */
	ret

.section .rodata

/*
 * Constants for _mesa_x86_64_transform_points4_3d:
 *	bytes  0..15: AND mask that keeps lanes 0-2 and clears lane 3
 *	bytes 16..31: 1.0 | 0 | 0 | 0   (1.0 in lane 3 only)
 */
.align 16
p4_constants:
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0x00, 0x00, 0x00, 0x00

.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.float 0f+1.0

.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same per-vertex work as _mesa_x86_64_transform_points4_general, but the
 * setup costs more: before the loop it forces matrix elements m3, m7 and
 * m11 to 0.0 and m15 to 1.0 (using p4_constants), so the w output of every
 * vertex equals its w input.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
_mesa_x86_64_transform_points4_3d:

	leaq p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps (%rax), %xmm9		/* 0 | ~0 | ~0 | ~0  (lane mask) */
	movaps 16(%rax), %xmm10		/* 1.0 | 0 | 0 | 0 */

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_3d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orps %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance src by stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*0 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* sum of x and y terms */
	mulps %xmm7, %xmm3		/* ow*1.0 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ... + z terms */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ... + w terms */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi

	decl %ecx
	jnz p4_3d_loop

p4_3d_done:
	.byte 0xf3			/* rep prefix: emits "rep ret" */
	ret


/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Copies every 16-byte source vertex to dest unchanged.  The matrix
 * argument is not read.
 *
 * A source stride of 16 is copied with a single rep movsq.  Any other
 * stride is copied one vertex at a time so that the source pointer
 * advances by the stride, as it does in the other routines in this file.
 *
 *	rdi = dest
 *	rsi = matrix (unused; rsi is reused as the copy source pointer)
 *	rdx = source
 * Scratch: rax, rcx, rsi, rdi, r8, r9, flags.
 */
.align 16
.globl _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx
	jz p4_identity_done

	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl $16, %eax			/* source tightly packed? */
	jne p4_identity_strided_loop

	addl %ecx, %ecx			/* two quadwords per vertex */

	rep movsq

p4_identity_done:
	.byte 0xf3			/* rep prefix: emits "rep ret" */
	ret

p4_identity_strided_loop:

	movq (%rsi), %r8		/* x1 | x0 */
	movq 8(%rsi), %r9		/* x3 | x2 */
	addq %rax, %rsi			/* advance src by stride */
	movq %r8, (%rdi)
	movq %r9, 8(%rdi)
	addq $16, %rdi

	decl %ecx
	jnz p4_identity_strided_loop

	.byte 0xf3			/* rep prefix: emits "rep ret" */
	ret


/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * Uses 3DNow!/MMX registers.  Per vertex (x0,x1,x2,x3) it writes
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2*m22 + x3*m32
 *	r3 = x3
 * where mCR names the float at byte offset 16*C + 4*R in the matrix.
 * femms is executed before returning.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_3d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch (%rdx)

	movd (%rsi), %mm0		/*                 | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movd 40(%rsi), %mm2		/*                 | m22             */
	movq 48(%rsi), %mm1		/* m31             | m30             */

	punpckldq 56(%rsi), %mm2	/* m32             | m22             */

p4_3d_no_rot_loop:

	prefetchw 32(%rdi)

	movq  (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 12(%rdx), %mm7		/*                 | x3              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckhdq %mm6, %mm6		/* x3              | x3              */
	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */

	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */

	pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	addq %rax, %rdx			/* advance src by stride */
	movq %mm4, (%rdi)		/* write r0, r1 */
	movq %mm5, 8(%rdi)		/* write r2, r3 */

	addq $16, %rdi

	decl %ecx
	prefetch 32(%rdx)		/* does not touch the flags from decl */
	jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms
	ret


/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * Uses 3DNow!/MMX registers.  Per vertex (x0,x1,x2,x3) it writes
 *	r0 = x0*m00 + x2*m20
 *	r1 = x1*m11 + x2*m21
 *	r2 = x2*m22 + x3*m32
 *	r3 = -x2
 * where mCR names the float at byte offset 16*C + 4*R in the matrix.
 * femms is executed before returning.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
.align 16
.globl _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_perspective_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	pxor %mm7, %mm7			/* 0               | 0               */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 32(%rsi), %mm2		/* m21             | m20             */
	prefetch (%rdx)

	movd 40(%rsi), %mm1		/*                 | m22             */

	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32             | m22             */


p4_perspective_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 8(%rdx), %mm3		/*                 | x2              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckldq %mm5, %mm5		/* x2              | x2              */

	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
	pfsubr %mm7, %mm3		/* 0               | 0 - x2          */

	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */

	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */

	movq %mm5, (%rdi)		/* write r0, r1 */
	addq %rax, %rdx			/* advance src by stride */
	movq %mm6, 8(%rdi)		/* write r2, r3 */

	addq $16, %rdi

	decl %ecx
	prefetch 32(%rdx)		/* hopefully stride is zero */
	jnz p4_perspective_loop

p4_perspective_done:
	femms
	ret

/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * Uses 3DNow!/MMX registers.  Per vertex (x0,x1,x2,x3) it writes
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * where mCR names the float at byte offset 16*C + 4*R in the matrix.
 * femms is executed before returning.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx
	.byte 0x90			/* manual align += 1 */
	jz p4_2d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 48(%rsi), %mm1		/* m31             | m30             */

p4_2d_no_rot_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
	movq %mm5, %mm6			/* x3              | x2              */

	punpckhdq %mm6, %mm6		/* x3              | x3              */

	addq %rax, %rdx			/* advance src by stride */
	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */

	prefetch 32(%rdx)		/* hopefully stride is zero */
	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	movq %mm6, (%rdi)		/* write r0, r1 */
	movq %mm5, 8(%rdi)		/* write r2, r3 */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms
	ret


/*
 * _mesa_3dnow_transform_points4_2d
 *
 * Uses 3DNow!/MMX registers.  Per vertex (x0,x1,x2,x3) it writes
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * where mCR names the float at byte offset 16*C + 4*R in the matrix.
 * femms is executed before returning.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_2d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	movd 4(%rsi), %mm1		/*                 | m01             */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11             | m01             */

	movq 48(%rsi), %mm2		/* m31             | m30             */

p4_2d_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm3		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	movq %mm3, %mm4			/* x1              | x0              */
	movq %mm5, %mm6			/* x3              | x2              */

	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
	punpckhdq %mm6, %mm6		/* x3              | x3              */

	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */

	addq %rax, %rdx			/* advance src by stride */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */

	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
	prefetch 32(%rdx)		/* hopefully stride is zero */

	pfadd %mm6, %mm3		/* r1              | r0              */

	movq %mm3, (%rdi)		/* write r0, r1 */
	movq %mm5, 8(%rdi)		/* write r2, r3 */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms
	ret

#endif

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif