/* xform4.S revision 7117f1b4 */
/*
 * Mesa 3-D graphics library
 * Version:  7.0.1
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * x86-64 transform routines for 4-component float vertices.
 *
 * Syntax:  GNU as, AT&T operand order, run through the C preprocessor.
 *
 * Register interface shared by every entry point in this file:
 *	rdi = dest vector descriptor
 *	rsi = matrix (16 floats; element k lives at byte offset 4*k)
 *	rdx = source vector descriptor
 * These are the first three integer argument registers of the System V
 * AMD64 calling convention.  Only volatile registers are written
 * (rax, rcx, rdx, rsi, rdi, r8, r9, xmm0-xmm10, mm0-mm7).
 *
 * Every routine:
 *	- copies the source count into the dest descriptor,
 *	- sets the dest size to 4 and ORs VEC_SIZE_4 into the dest flags,
 *	- returns without touching vertex data when count == 0,
 *	- writes dest vertices tightly packed (16 bytes apiece) and advances
 *	  the source pointer by the source stride.
 *
 * The stride is loaded with a byte-sized zero-extending move.
 * NOTE(review): if the stride field can exceed 255 this truncates; confirm
 * against the descriptor layout behind V4F_STRIDE in matypes.h.
 *
 * The ".byte 0x66, ..., 0x90" sequences are NOPs carrying operand-size
 * prefixes.  They pad the instruction stream by hand and therefore depend
 * on the exact encoding length of the neighbouring instructions; do not
 * reorder or re-encode code around them without re-checking the layout.
 *
 * ".byte 0xf3" directly before "ret" encodes "rep ret".
 *
 * The routines built from pfmul/pfadd/pfacc/pfsubr/femms use AMD 3DNow!
 * and must only be called on CPUs that implement it.
 * NOTE(review): confirm that the code installing these entry points checks
 * for 3DNow! support.
 */

#ifdef USE_X86_64_ASM

#include "matypes.h"

.text

.p2align 4

.globl _mesa_x86_64_transform_points4_general
/*
 * General 4x4 transform:
 *	dest[i] = src[i].x * M[0..3]  + src[i].y * M[4..7]
 *	        + src[i].z * M[8..11] + src[i].w * M[12..15]
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * The matrix is loaded and the dest vertices are stored with movaps, so
 * both must be 16-byte aligned.  Source vertices are loaded with movups
 * and may be unaligned.
 */
_mesa_x86_64_transform_points4_general:
	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90	/* manual align: 4-byte NOP */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* does not modify flags */
	jz p4_general_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

	/* Invariant: ecx = vertices remaining (> 0), rax = src stride. */
p4_general_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance to next src vertex */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi			/* dest is tightly packed */

	decl %ecx
	jnz p4_general_loop

p4_general_done:
	.byte 0xf3			/* rep prefix: "rep ret" */
	ret

.section .rodata

.p2align 4
/*
 * Two 16-byte constants used by _mesa_x86_64_transform_points4_3d:
 *	+0:  AND mask that keeps lanes 0-2 and clears lane 3
 *	+16: 0.0 | 0.0 | 0.0 | 1.0 (1.0 in lane 3), ORed into the last column
 */
p4_constants:
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0x00, 0x00, 0x00, 0x00

.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.float 0f+1.0

.text
.p2align 4
.globl _mesa_x86_64_transform_points4_3d
/*
 * Same arithmetic as _mesa_x86_64_transform_points4_general, but before
 * the loop the matrix copy held in registers is patched so that elements
 * m3, m7 and m11 are 0.0 and m15 is 1.0 (lane 3 of each 16-byte group).
 * That extra setup is the only difference from the general routine.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * The matrix and the dest vertices must be 16-byte aligned (movaps).
 */
_mesa_x86_64_transform_points4_3d:

	leaq p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps (%rax), %xmm9		/* lane mask: 0 | ~0 | ~0 | ~0 */
	movaps 16(%rax), %xmm10		/* 1.0 | 0.0 | 0.0 | 0.0 */

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_3d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	orps %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

	/* Invariant: ecx = vertices remaining (> 0), rax = src stride. */
p4_3d_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance to next src vertex */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*0 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* sum of the x and y terms */
	mulps %xmm7, %xmm3		/* ow*1.0 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ... plus the z term */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ... plus the w term */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi			/* dest is tightly packed */

	decl %ecx
	jnz p4_3d_loop

p4_3d_done:
	.byte 0xf3			/* rep prefix: "rep ret" */
	ret


.p2align 4
.globl _mesa_x86_64_transform_points4_identity
/*
 * Identity transform: copy count 16-byte vertices from source to dest.
 * The matrix pointer is ignored (rsi is reused as the source cursor).
 *
 *	rdi = dest
 *	rsi = matrix (unused)
 *	rdx = source
 *
 * When the source stride is exactly 16 the vertices are contiguous and a
 * single "rep movsq" copies all of them.  For any other stride each vertex
 * is copied individually so that the source stride is honoured; a block
 * copy would read bytes that are not vertex data.
 *
 * "rep movsq" copies upward only when the direction flag is clear, which
 * the System V AMD64 ABI guarantees on function entry.
 */
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* nothing to copy? */
	jz p4_identity_done

	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl $16, %eax			/* source tightly packed? */
	jne p4_identity_strided

	addl %ecx, %ecx			/* two quadwords per vertex */

	rep movsq			/* copy rcx quadwords, rsi -> rdi */

	.byte 0xf3			/* rep prefix: "rep ret" */
	ret

	/* Invariant: ecx = vertices remaining (> 0), rax = src stride. */
p4_identity_strided:
	movq (%rsi), %r8		/* y | x */
	movq 8(%rsi), %r9		/* w | z */
	addq %rax, %rsi			/* advance to next src vertex */
	movq %r8, (%rdi)
	movq %r9, 8(%rdi)
	addq $16, %rdi			/* dest is tightly packed */

	decl %ecx
	jnz p4_identity_strided

p4_identity_done:
	.byte 0xf3			/* rep prefix: "rep ret" */
	ret


.p2align 4
.globl _mesa_x86_64_transform_points4_3d_no_rot
/*
 * Scale + translate transform (3DNow!).  With matrix floats addressed by
 * byte offset, each vertex (x0, x1, x2, x3) becomes:
 *	r0 = x0*M[+0]  + x3*M[+48]
 *	r1 = x1*M[+20] + x3*M[+52]
 *	r2 = x2*M[+40] + x3*M[+56]
 *	r3 = x3
 * All other matrix elements are ignored.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * Requires 3DNow!.  MMX state is cleared with femms before returning.
 */
_mesa_x86_64_transform_points4_3d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	jz p4_3d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch (%rdx)

	movd (%rsi), %mm0		/*     | m00 */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movd 40(%rsi), %mm2		/*     | m22 */
	movq 48(%rsi), %mm1		/* m31 | m30 */

	punpckldq 56(%rsi), %mm2	/* m32 | m22 */

	/* Invariant: ecx = vertices remaining (> 0), rax = src stride. */
p4_3d_no_rot_loop:

	prefetchw 32(%rdi)

	movq (%rdx), %mm4		/* x1 | x0 */
	movq 8(%rdx), %mm5		/* x3 | x2 */
	movd 12(%rdx), %mm7		/* 0  | x3 */

	movq %mm5, %mm6			/* x3 | x2 */
	pfmul %mm0, %mm4		/* x1*m11 | x0*m00 */

	punpckhdq %mm6, %mm6		/* x3 | x3 */
	pfmul %mm2, %mm5		/* x3*m32 | x2*m22 */

	pfmul %mm1, %mm6		/* x3*m31 | x3*m30 */
	pfacc %mm7, %mm5		/* x3 | x2*m22+x3*m32 */

	pfadd %mm6, %mm4		/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	addq %rax, %rdx			/* advance to next src vertex */
	movq %mm4, (%rdi)		/* write r0, r1 */
	movq %mm5, 8(%rdi)		/* write r2, r3 */

	addq $16, %rdi			/* dest is tightly packed */

	decl %ecx
	prefetch 32(%rdx)		/* leaves the flags from decl intact */
	jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms				/* leave MMX/3DNow! state */
	ret


.p2align 4
.globl _mesa_x86_64_transform_points4_perspective
/*
 * Perspective-projection transform (3DNow!).  With matrix floats
 * addressed by byte offset, each vertex (x0, x1, x2, x3) becomes:
 *	r0 = x0*M[+0]  + x2*M[+32]
 *	r1 = x1*M[+20] + x2*M[+36]
 *	r2 = x2*M[+40] + x3*M[+56]
 *	r3 = -x2
 * All other matrix elements are ignored.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * Requires 3DNow!.  MMX state is cleared with femms before returning.
 */
_mesa_x86_64_transform_points4_perspective:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	jz p4_perspective_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*     | m00 */
	pxor %mm7, %mm7			/* 0   | 0   */
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movq 32(%rsi), %mm2		/* m21 | m20 */
	prefetch (%rdx)

	movd 40(%rsi), %mm1		/*     | m22 */

	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	punpckldq 56(%rsi), %mm1	/* m32 | m22 */


	/* Invariant: ecx = vertices remaining (> 0), rax = src stride. */
p4_perspective_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead */

	movq (%rdx), %mm4		/* x1 | x0 */
	movq 8(%rdx), %mm5		/* x3 | x2 */
	movd 8(%rdx), %mm3		/* 0  | x2 */

	movq %mm5, %mm6			/* x3 | x2 */
	pfmul %mm0, %mm4		/* x1*m11 | x0*m00 */

	punpckldq %mm5, %mm5		/* x2 | x2 */

	pfmul %mm2, %mm5		/* x2*m21 | x2*m20 */
	pfsubr %mm7, %mm3		/* 0 | -x2  (reverse subtract: 0 - x2) */

	pfmul %mm1, %mm6		/* x3*m32 | x2*m22 */
	pfadd %mm4, %mm5		/* x1*m11+x2*m21 | x0*m00+x2*m20 */

	pfacc %mm3, %mm6		/* -x2 | x2*m22+x3*m32 */

	movq %mm5, (%rdi)		/* write r0, r1 */
	addq %rax, %rdx			/* advance to next src vertex */
	movq %mm6, 8(%rdi)		/* write r2, r3 */

	addq $16, %rdi			/* dest is tightly packed */

	decl %ecx
	prefetch 32(%rdx)		/* 32 bytes past the next src vertex; flags untouched */
	jnz p4_perspective_loop

p4_perspective_done:
	femms				/* leave MMX/3DNow! state */
	ret

.p2align 4
.globl _mesa_x86_64_transform_points4_2d_no_rot
/*
 * 2D scale + translate transform (3DNow!).  With matrix floats addressed
 * by byte offset, each vertex (x0, x1, x2, x3) becomes:
 *	r0 = x0*M[+0]  + x3*M[+48]
 *	r1 = x1*M[+20] + x3*M[+52]
 *	r2 = x2
 *	r3 = x3
 * All other matrix elements are ignored.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * Requires 3DNow!.  MMX state is cleared with femms before returning.
 */
_mesa_x86_64_transform_points4_2d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x90			/* manual align: 1-byte NOP */
	jz p4_2d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*     | m00 */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movq 48(%rsi), %mm1		/* m31 | m30 */

	/* Invariant: ecx = vertices remaining (> 0), rax = src stride. */
p4_2d_no_rot_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead */

	movq (%rdx), %mm4		/* x1 | x0 */
	movq 8(%rdx), %mm5		/* x3 | x2 */

	pfmul %mm0, %mm4		/* x1*m11 | x0*m00 */
	movq %mm5, %mm6			/* x3 | x2 */

	punpckhdq %mm6, %mm6		/* x3 | x3 */

	addq %rax, %rdx			/* advance to next src vertex */
	pfmul %mm1, %mm6		/* x3*m31 | x3*m30 */

	prefetch 32(%rdx)		/* 32 bytes past the next src vertex */
	pfadd %mm4, %mm6		/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	movq %mm6, (%rdi)		/* write r0, r1 */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged) */

	addq $16, %rdi			/* dest is tightly packed */

	decl %ecx
	jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms				/* leave MMX/3DNow! state */
	ret


.p2align 4
.globl _mesa_x86_64_transform_points4_2d
/*
 * 2D transform with rotation (3DNow!).  With matrix floats addressed by
 * byte offset, each vertex (x0, x1, x2, x3) becomes:
 *	r0 = x0*M[+0] + x1*M[+16] + x3*M[+48]
 *	r1 = x0*M[+4] + x1*M[+20] + x3*M[+52]
 *	r2 = x2
 *	r3 = x3
 * All other matrix elements are ignored.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * Requires 3DNow!.  MMX state is cleared with femms before returning.
 */
_mesa_x86_64_transform_points4_2d:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte, zero-extended into rax) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	jz p4_2d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*     | m00 */
	movd 4(%rsi), %mm1		/*     | m01 */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10 | m00 */
	.byte 0x66, 0x66, 0x90		/* manual align: 3-byte NOP */
	punpckldq 20(%rsi), %mm1	/* m11 | m01 */

	movq 48(%rsi), %mm2		/* m31 | m30 */

	/* Invariant: ecx = vertices remaining (> 0), rax = src stride. */
p4_2d_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead */

	movq (%rdx), %mm3		/* x1 | x0 */
	movq 8(%rdx), %mm5		/* x3 | x2 */

	movq %mm3, %mm4			/* x1 | x0 */
	movq %mm5, %mm6			/* x3 | x2 */

	pfmul %mm1, %mm4		/* x1*m11 | x0*m01 */
	punpckhdq %mm6, %mm6		/* x3 | x3 */

	pfmul %mm0, %mm3		/* x1*m10 | x0*m00 */

	addq %rax, %rdx			/* advance to next src vertex */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11 | x0*m00+x1*m10 */

	pfmul %mm2, %mm6		/* x3*m31 | x3*m30 */
	prefetch 32(%rdx)		/* 32 bytes past the next src vertex */

	pfadd %mm6, %mm3		/* r1 | r0 */

	movq %mm3, (%rdi)		/* write r0, r1 */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged) */

	addq $16, %rdi			/* dest is tightly packed */

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms				/* leave MMX/3DNow! state */
	ret

#endif

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif