/*
 * XGI memcpy() routines (assembly)
 *
 * Copyright (C) 2004-2005 Thomas Winischhofer
 *
 * Idea and some code bits from via_memcpy.c which is
 * Copyright (C) 2004 Thomas Hellstroem, All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE CODE SUPPLIER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* #include "xgi.h" */

/* Jong 07/10/2008; use run-time debug instead except for HW acceleration routines */
/* extern bool g_bRunTimeDebug;
#define RUNTIMEDEBUG(p) if(g_bRunTimeDebug)p; */

/* Compile-time debug tracing: PDEBUG(ErrorF(...)) expands to nothing
 * unless DEBUG is defined. */
/* #define DEBUG */
#ifdef DEBUG
#define PDEBUG(p) p /* RUNTIMEDEBUG(p) */
#else
#define PDEBUG(p)
#endif

/* Jong 01/15/2009; CPU flags (for memcpy() etc.)
 * Bit mask describing which copy implementations the CPU/OS can use. */
#define XGI_CPUFL_LIBC  0x0001
#define XGI_CPUFL_BI    0x0002
#define XGI_CPUFL_SSE   0x0004
#define XGI_CPUFL_MMX   0x0008
#define XGI_CPUFL_3DNOW 0x0010
#define XGI_CPUFL_MMX2  0x0020
#define XGI_CPUFL_BI2   0x0040
#define XGI_CPUFL_SSE2  0x0080
#define XGI_CPUFL_FLAG  0x8000

#if 0 /* Debug */
#define XGIDGBMC
#endif

/* Jong 01/07/2009; debug */
/*
#ifdef XGI_DEBUG
#define PDEBUG(p) p
#else
#define PDEBUG(p)
#endif
*/

#if 0 /* Jong 01/15/2009; ignore at the moment */
extern unsigned int XGIAllocateFBMemory(ScrnInfoPtr pScrn, void **handle, int bytesize);
extern void XGIFreeFBMemory(ScrnInfoPtr pScrn, void **handle);
#endif

#define CPUBUFFERSIZE 2048        /* Size of /proc/cpuinfo buffer */
#define BUFFERSIZE (576 * 1152)   /* Matches 720x576 YUV420 */

/************************************************************************/
/*                 arch specific memcpy() routines                      */
/************************************************************************/

/* i386, AMD64 */

/* Flush the write-combining buffers after non-temporal stores. */
#define FENCE \
    __asm__ __volatile__( \
        " sfence\n" \
        : \
        : \
        : "memory");

/* Fence plus leave MMX state (emms) so the FPU is usable again. */
#define FENCEMMS \
    __asm__ __volatile__ ( \
        " sfence\n" \
        " emms\n" \
        : \
        : \
        : "memory");

/* AMD 3DNow! fast enter/leave of MMX state. */
#define FEMMS \
    __asm__ __volatile__( \
        " femms\n" \
        : \
        : \
        : "memory");

#define EMMS \
    __asm__ __volatile__( \
        " emms\n" \
        : \
        : \
        : "memory");

/* Prefetch opcode mnemonics; chosen per instruction set by the
 * PREFETCH_FUNC() instantiations further down. */
#define SSE_PREFETCH " prefetchnta "
#define NOW_PREFETCH " prefetch "

/* Warm the first ~320 bytes of the source before the copy loop starts.
 * FIX: the original sequence jumped from offset 192 straight to 256,
 * leaving the cache line at offset 224 unprefetched; added the missing
 * step so the 0..288 range is contiguous (prefetch is a pure hint, so
 * this cannot change the copy result). */
#define PREFETCH1(arch_prefetch,from) \
    __asm__ __volatile__ ( \
        arch_prefetch "(%0)\n" \
        arch_prefetch "32(%0)\n" \
        arch_prefetch "64(%0)\n" \
        arch_prefetch "96(%0)\n" \
        arch_prefetch "128(%0)\n" \
        arch_prefetch "160(%0)\n" \
        arch_prefetch "192(%0)\n" \
        arch_prefetch "224(%0)\n" \
        arch_prefetch "256(%0)\n" \
        arch_prefetch "288(%0)\n" \
        : \
        : "r" (from) );

#define PREFETCH2(arch_prefetch,from) \
    __asm__ __volatile__ ( \
        arch_prefetch "320(%0)\n" \
        : \
        : "r" (from) );

#define PREFETCH3(arch_prefetch,from) \
    __asm__ __volatile__ ( \
        arch_prefetch "288(%0)\n" \
        : \
        : "r" (from) );

/* Copy the <64-byte tail: odd byte via movsb, odd word via movsw,
 * remainder as 32-bit moves.  n is consumed from ecx. */
#define small_memcpy_i386(to,from,n) \
    { \
    __asm__ __volatile__( \
        " cld\n" \
        " shrl $1, %%ecx\n" \
        " jnc 1f\n" \
        " movsb\n" \
        "1: shrl $1, %%ecx\n" \
        " jnc 2f\n" \
        " movsw\n" \
        "2: rep ; movsl" \
        : "=&D" (to), "=&S" (from) \
        : "c" (n), "0" ((long) to), "1" ((long) from) \
        : "memory", "cc"); \
    PDEBUG(ErrorF("Jong-small_memcpy_i386(to,from,n)...\n")); \
    }

/* Same idea for AMD64, with an extra step so the bulk runs as movsq. */
#define small_memcpy_amd64(to,from,n) \
    { \
    __asm__ __volatile__( \
        " cld\n" \
        " shrq $1, %%rcx\n" \
        " jnc 1f\n" \
        " movsb\n" \
        "1: shrq $1, %%rcx\n" \
        " jnc 2f\n" \
        " movsw\n" \
        "2: shrq $1, %%rcx\n" \
        " jnc 3f\n" \
        " movsl\n" \
        "3: rep ; movsq" \
        : "=&D" (to), "=&S" (from) \
        : "c" (n), "0" ((long) to), "1" ((long) from) \
        : "memory", "cc"); \
    PDEBUG(ErrorF("Jong-small_memcpy_amd64(to,from,n)...\n")); \
    }

/* Plain MMX 64-bytes-per-iteration copy loop; lcnt = size >> 6.
 * Caller is responsible for EMMS/FEMMS afterwards (see FENCE macros). */
#define MMX_CPY(prefetch,from,to,dummy,lcnt) \
    __asm__ __volatile__ ( \
        "1:\n" \
        prefetch "320(%1)\n" \
        " movq (%1), %%mm0\n" \
        " movq 8(%1), %%mm1\n" \
        " movq 16(%1), %%mm2\n" \
        " movq 24(%1), %%mm3\n" \
        " movq %%mm0, (%0)\n" \
        " movq %%mm1, 8(%0)\n" \
        " movq %%mm2, 16(%0)\n" \
        " movq %%mm3, 24(%0)\n" \
        prefetch "352(%1)\n" \
        " movq 32(%1), %%mm0\n" \
        " movq 40(%1), %%mm1\n" \
        " movq 48(%1), %%mm2\n" \
        " movq 56(%1), %%mm3\n" \
        " leal 64(%1),%1\n" \
        " movq %%mm0, 32(%0)\n" \
        " movq %%mm1, 40(%0)\n" \
        " movq %%mm2, 48(%0)\n" \
        " movq %%mm3, 56(%0)\n" \
        " decl %2\n" \
        " leal 64(%0),%0\n" \
        " jne 1b\n" \
        : "=&D"(to), "=&S"(from), "=&r"(dummy) \
        : "0" (to), "1" (from), "2" (lcnt) \
        : "memory", "cc"); \
    PDEBUG(ErrorF("Jong-MMX_CPY(prefetch,from,to,dummy,lcnt)...\n"));

/* SSE 64-bytes-per-iteration copy with non-temporal stores (movntps).
 * The source may be unaligned (movups branch); the destination is
 * written with movntps, which requires 16-byte alignment of `to` —
 * NOTE(review): no runtime check of `to` here; presumably guaranteed by
 * the caller/table flag reqAlignment — confirm.  lcnt = size >> 6. */
#define SSE_CPY(prefetch,from,to,dummy,lcnt) \
    if((ULong) from & 15) { \
    __asm__ __volatile__ ( \
        "1:\n" \
        prefetch "320(%1)\n" \
        " movups (%1), %%xmm0\n" \
        " movups 16(%1), %%xmm1\n" \
        " movntps %%xmm0, (%0)\n" \
        " movntps %%xmm1, 16(%0)\n" \
        prefetch "352(%1)\n" \
        " movups 32(%1), %%xmm2\n" \
        " movups 48(%1), %%xmm3\n" \
        " leal 64(%1),%1\n" \
        " movntps %%xmm2, 32(%0)\n" \
        " movntps %%xmm3, 48(%0)\n" \
        " decl %2\n" \
        " leal 64(%0),%0\n" \
        " jne 1b\n" \
        : "=&D"(to), "=&S"(from), "=&r"(dummy) \
        : "0" (to), "1" (from), "2" (lcnt) \
        : "memory", "cc"); \
    PDEBUG(ErrorF("Jong-SSE_CPY(prefetch,from,to,dummy,lcnt)-1...\n")); \
    } else { \
    /* Aligned source: movaps loads are faster. */ \
    __asm__ __volatile__ ( \
        "2:\n" \
        prefetch "320(%1)\n" \
        " movaps (%1), %%xmm0\n" \
        " movaps 16(%1), %%xmm1\n" \
        " movntps %%xmm0, (%0)\n" \
        " movntps %%xmm1, 16(%0)\n" \
        prefetch "352(%1)\n" \
        " movaps 32(%1), %%xmm2\n" \
        " movaps 48(%1), %%xmm3\n" \
        " leal 64(%1),%1\n" \
        " movntps %%xmm2, 32(%0)\n" \
        " movntps %%xmm3, 48(%0)\n" \
        " decl %2\n" \
        " leal 64(%0),%0\n" \
        " jne 2b\n" \
        : "=&D"(to), "=&S"(from), "=&r"(dummy) \
        : "0" (to), "1" (from), "2" (lcnt) \
        : "memory", "cc"); \
    PDEBUG(ErrorF("Jong-SSE_CPY(prefetch,from,to,dummy,lcnt)-2...\n")); \
    }

/* 64-bit variant of SSE_CPY: identical body except pointer arithmetic
 * uses leaq instead of leal. */
#define SSE64_CPY(prefetch,from,to,dummy,lcnt) \
    if((ULong) from & 15) { \
    __asm__ __volatile__ ( \
        "1:\n" \
        prefetch "320(%1)\n" \
        " movups (%1), %%xmm0\n" \
        " movups 16(%1), %%xmm1\n" \
        " movntps %%xmm0, (%0)\n" \
        " movntps %%xmm1, 16(%0)\n" \
        prefetch "352(%1)\n" \
        " movups 32(%1), %%xmm2\n" \
        " movups 48(%1), %%xmm3\n" \
        " leaq 64(%1),%1\n" \
        " movntps %%xmm2, 32(%0)\n" \
        " movntps %%xmm3, 48(%0)\n" \
        " decl %2\n" \
        " leaq 64(%0),%0\n" \
        " jne 1b\n" \
        : "=&D"(to), "=&S"(from), "=&r"(dummy) \
        : "0" (to), "1" (from), "2" (lcnt) \
        : "memory", "cc"); \
    PDEBUG(ErrorF("Jong-SSE64_CPY(prefetch,from,to,dummy,lcnt)-1...\n")); \
    } else { \
    __asm__ __volatile__ ( \
        "2:\n" \
        prefetch "320(%1)\n" \
        " movaps (%1), %%xmm0\n" \
        " movaps 16(%1), %%xmm1\n" \
        " movntps %%xmm0, (%0)\n" \
        " movntps %%xmm1, 16(%0)\n" \
        prefetch "352(%1)\n" \
        " movaps 32(%1), %%xmm2\n" \
        " movaps 48(%1), %%xmm3\n" \
        " leaq 64(%1),%1\n" \
        " movntps %%xmm2, 32(%0)\n" \
        " movntps %%xmm3, 48(%0)\n" \
        " decl %2\n" \
        " leaq 64(%0),%0\n" \
        " jne 2b\n" \
        : "=&D"(to), "=&S"(from), "=&r"(dummy) \
        : "0" (to), "1" (from), "2" (lcnt) \
        : "memory", "cc"); \
    PDEBUG(ErrorF("Jong-SSE64_CPY(prefetch,from,to,dummy,lcnt)-2...\n")); \
    }

/* MMX-extended (SSE-integer) copy: movq loads, movntq non-temporal
 * stores, 64 bytes per iteration. */
#define MMXEXT_CPY(prefetch,from,to,dummy,lcnt) \
    __asm__ __volatile__ ( \
        ".p2align 4,,7\n" \
        "1:\n" \
        prefetch "320(%1)\n" \
        " movq (%1), %%mm0\n" \
        " movq 8(%1), %%mm1\n" \
        " movq 16(%1), %%mm2\n" \
        " movq 24(%1), %%mm3\n" \
        " movntq %%mm0, (%0)\n" \
        " movntq %%mm1, 8(%0)\n" \
        " movntq %%mm2, 16(%0)\n" \
        " movntq %%mm3, 24(%0)\n" \
        prefetch "352(%1)\n" \
        " movq 32(%1), %%mm0\n" \
        " movq 40(%1), %%mm1\n" \
        " movq 48(%1), %%mm2\n" \
        " movq 56(%1), %%mm3\n" \
        " leal 64(%1),%1\n" \
        " movntq %%mm0, 32(%0)\n" \
        " movntq %%mm1, 40(%0)\n" \
        " movntq %%mm2, 48(%0)\n" \
        " movntq %%mm3, 56(%0)\n" \
        " decl %2\n" \
        " leal 64(%0),%0\n" \
        " jne 1b\n" \
        : "=&D"(to), "=&S"(from), "=&r"(dummy) \
        : "0" (to), "1" (from), "2" (lcnt) \
        : "memory", "cc"); \
    PDEBUG(ErrorF("Jong-MMXEXT_CPY(prefetch,from,to,dummy,lcnt)...\n"));


/* Generates a complete prefix##_memcpy(to, from, size) function:
 * prefetch the head of the source (PREFETCH1), run `begin` (e.g. EMMS),
 * copy size/64 blocks with itype##_CPY, then the <64-byte tail with
 * `small`, and finish with `fence` (sfence/emms).  NOTE(review): these
 * are generated non-static, unlike NOPREFETCH_FUNC below — presumably
 * so they can be referenced externally; confirm. */
#define PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small) \
    \
/* static void prefix##_memcpy(UChar *to, */ \
    void prefix##_memcpy(UChar *to, \
                const UChar *from, \
                int size) \
    { \
        int lcnt = size >> 6; \
        int rest = size & 63; \
        register int dummy; \
    \
        PREFETCH1(ptype##_PREFETCH,from); \
        begin; \
        PDEBUG(ErrorF("Jong-After-begin()...\n")); \
        if(lcnt) { \
            PDEBUG(ErrorF("Jong-Before-SSE_CPY()...\n")); \
            itype##_CPY(ptype##_PREFETCH,from,to,dummy,lcnt); \
            PDEBUG(ErrorF("Jong-After-SSE_CPY()...\n")); \
        } \
        if(rest) { \
            PDEBUG(ErrorF("Jong-Before-PREFETCH2()...\n")); \
            PREFETCH2(ptype##_PREFETCH,from); \
            PDEBUG(ErrorF("Jong-After-PREFETCH2()...\n")); \
            small(to, from, rest); \
            PDEBUG(ErrorF("Jong-After-small...\n")); \
            PREFETCH3(ptype##_PREFETCH,from); \
            PDEBUG(ErrorF("Jong-After-PREFETCH3()...\n")); \
        } \
        PDEBUG(ErrorF("Jong-Before-fence...\n")); \
        fence; \
        PDEBUG(ErrorF("Jong-PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small)-end...\n")); \
    }

/* Same generator without prefetching; "#" is passed as the prefetch
 * string so the prefetch lines become comments in the asm text. */
#define NOPREFETCH_FUNC(prefix,itype,begin,fence,small) \
    \
    static void prefix##_memcpy(UChar *to, \
                const UChar *from, \
                int size) \
    { \
        int lcnt = size >> 6; \
        int rest = size & 63; \
        register int dummy; \
    \
        begin; \
        if(lcnt) { \
            itype##_CPY("#",from,to,dummy,lcnt); \
        } \
        if(rest) { \
            small(to, from, rest); \
        } \
        fence; \
        PDEBUG(ErrorF("Jong-NOPREFETCH_FUNC(prefix,itype,ptype,begin,fence,small)...\n")); \
    }

/* Other archs */

/* ...
 */


#if 0 /* Jong 01/15/2009; ignore at the moment */
/* Type for table for benchmark list */
typedef struct {
    vidCopyFunc  mFunc;       /* copy routine */
    char        *mName;       /* human-readable name for log output */
    unsigned int mycpuflag;   /* XGI_CPUFL_* bit that enables this entry */
    int          grade;       /* static ranking, system->video direction */
    int          gradefrom;   /* static ranking, video->system direction */
    Bool         reqAlignment;/* TRUE if routine needs aligned buffers */
} XGIMCFuncData;
#endif

/************************************************************************/
/* libc memcpy() wrapper - generic */
/************************************************************************/
#define UChar unsigned char

/* Fallback copy routine: plain libc memcpy with the driver's
 * vidCopyFunc signature. Always safe, always available. */
void XGI_libc_memcpy(UChar *dst, const UChar *src, int size)
{
    PDEBUG(ErrorF("Jong-XGI_libc_memcpy()...\n"));
    memcpy(dst, src, size);
}

/************************************************************************/
/* We only do all that stuff under gcc; no idea what other compilers */
/* would do with our asm code. */
/************************************************************************/

#ifndef __GNUC__

/* Non-gcc build: no CPU feature detection, libc memcpy only. */
unsigned int XGIGetCPUFlags(ScrnInfoPtr pScrn)
{
    PDEBUG(ErrorF("Jong-XGIGetCPUFlags(ScrnInfoPtr pScrn)...\n"));
    return 0;
}

vidCopyFunc XGIVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
    PDEBUG(ErrorF("Jong-XGIVidCopyInit()...\n"));
    *UMemCpy = XGI_libc_memcpy;
    return XGI_libc_memcpy;
}

vidCopyFunc XGIVidCopyGetDefault(void)
{
    PDEBUG(ErrorF("Jong-XGIVidCopyGetDefault()...\n"));
    return XGI_libc_memcpy;
}

#else /* ! Everything below is gcc specific !
 */

/************************************************************************/
/* Definitions for archs and OSes */
/************************************************************************/

#undef XGI_checkosforsse
#undef XGI_canBenchmark
#undef XGI_haveProc
#undef XGI_haveBuiltInMC

/* Jong Lin */
#if defined(__arm__)
/* ARM word-at-a-time copy: 8 words per ldmia/stmia iteration, then the
 * remaining words one at a time.
 * NOTE(review): the asm assumes to/from/n arrive in r0/r1/r2, but the
 * "r" constraints do not guarantee any particular registers, the inputs
 * are modified despite being declared inputs, and r4-r11 are used
 * (saved/restored via stmfd/ldmfd).  Also copies whole words only — a
 * size that is not a multiple of 4 is not handled.  d1,d2,d3 are unused.
 * Verify against actual ARM builds before relying on this. */
void XGI_builtin_memcpy_arm(UChar *to, const UChar *from, int n)
{
    long d1,d2,d3;

    PDEBUG(ErrorF("XGI_builtin_memcpy_arm_begin-size=%d...\n", n));

    __asm__ __volatile__(
        " start: \n"
        /* " stmfd sp!, {r0-r12}\n" */   /* Save some working registers */
        " blockcopy: \n"
        " movs r3,r2, lsr #3\n"          /* Number of eight word multiples */
        " beq copywords\n"               /* Less than eight words to move? */
        " stmfd sp!, {r4-r11}\n"         /* Save some working registers */
        " octcopy: \n"
        " ldmia r1!, {r4-r11}\n"         /* Load 8 words from the source */
        " stmia r0!, {r4-r11}\n"         /* Put them at the destination */
        " subs r3, r3, #1\n"             /* Decrement the counter */
        " bne octcopy\n"                 /* ... copy more */
        " ldmfd sp!, {r4-r11}\n"         /* Don't need these now - restore */
        " copywords: \n"
        " ands r2, r2, #7\n"             /* Number of odd words to copy */
        " beq stop\n"                    /* No words left to copy? */
        " wordcopy: \n"
        " ldr r3, [r1], #4\n"            /* Load a word from the source */
        " str r3, [r0], #4\n"            /* and store it to the destination */
        " subs r2, r2, #1\n"             /* Decrement the counter */
        " bne wordcopy\n"                /* ... copy more */
        " stop: \n"
        /* "ldmfd sp!, {r0-r12}\n" */    /* Don't need these now - restore */
        :
        :"r" ((long) to), "r" ((long) from), "r" ((unsigned long) n)
        : "memory", "r3");

}
#endif

#if defined(__i386__) /* ***************************************** i386 */

#define XGI_checkosforsse /* Does this cpu support sse and do we need to check os? */
#define XGI_canBenchmark /* Can we perform a benchmark?
*/ 494#ifdef XGI_LINUX 495#define XGI_haveProc /* Do we have /proc/cpuinfo or similar? */ 496#endif 497#define XGI_haveBuiltInMC /* Is there a built-in memcpy for this arch? */ 498 499/* Built-in memcpy for i386 */ 500#define size_t int 501static __inline void * builtin_memcpy(void * to, const void * from, size_t n) 502{ 503 int d1,d2,d3; 504 505 __asm__ __volatile__( 506 " cld\n" 507 " shrl $1, %%ecx\n" 508 " jnc 1f\n" 509 " movsb\n" 510 "1: shrl $1, %%ecx\n" 511 " jnc 2f\n" 512 " movsw\n" 513 "2: rep ; movsl\n" 514 : "=&c" (d1), "=&D" (d2), "=&S" (d3) 515 : "0" (n), "1" ((long) to), "2" ((long) from) 516 : "memory", "cc"); 517 518 PDEBUG(ErrorF("Jong-__inline builtin_memcpy() for i386...\n")); 519 return(to); 520} 521 522/* Alternative for 586: Unroll loop, copy 32 bytes at a time */ 523static void XGI_builtin_memcp2(UChar *to, const UChar *from, int n) 524{ 525 int d1,d2,d3; 526 527 __asm__ __volatile__( 528 " movl %%edi, %%eax\n" 529 " cmpl $32, %%ecx\n" 530 " cld\n" 531 " jbe 3f\n" 532 " negl %%eax\n" /* Align dest */ 533 " andl $3, %%eax\n" 534 " subl %%eax, %%ecx\n" 535 " xchgl %%eax, %%ecx\n" 536 " rep ; movsb\n" 537 " movl %%eax, %%ecx\n" 538 " subl $32, %%ecx\n" 539 " js 2f\n" 540 " movl (%%edi), %%eax\n" 541 "1: movl 28(%%edi), %%edx\n" /* Trick: Read-ahead */ 542 " subl $32, %%ecx\n" 543 " movl (%%esi), %%eax\n" 544 " movl 4(%%esi), %%edx\n" 545 " movl %%eax, (%%edi)\n" 546 " movl %%edx, 4(%%edi)\n" 547 " movl 8(%%esi), %%eax\n" 548 " movl 12(%%esi), %%edx\n" 549 " movl %%eax, 8(%%edi)\n" 550 " movl %%edx, 12(%%edi)\n" 551 " movl 16(%%esi), %%eax\n" 552 " movl 20(%%esi), %%edx\n" 553 " movl %%eax, 16(%%edi)\n" 554 " movl %%edx, 20(%%edi)\n" 555 " movl 24(%%esi), %%eax\n" 556 " movl 28(%%esi), %%edx\n" 557 " movl %%eax, 24(%%edi)\n" 558 " movl %%edx, 28(%%edi)\n" 559 " leal 32(%%esi), %%esi\n" 560 " leal 32(%%edi), %%edi\n" 561 " jns 1b\n" 562 "2: addl $32, %%ecx\n" 563 "3: rep ; movsb" 564 : "=&c" (d1), "=&D" (d2), "=&S" (d3) 565 : "0" (n), "1" ((long) 
to), "2" ((long) from) 566 : "eax", "edx", "memory", "cc"); 567 568 PDEBUG(ErrorF("Jong-XGI_builtin_memcp2()-copy 32 bytes at a time-586...\n")); 569} 570 571static unsigned int taketime(void) /* get current time (for benchmarking) */ 572{ 573 unsigned int eax; 574 575 __asm__ volatile ( 576 " pushl %%ebx\n" 577 " cpuid\n" 578 " rdtsc\n" 579 " popl %%ebx\n" 580 : "=a" (eax) 581 : "0" (0) 582 : "ecx", "edx", "cc"); 583 584 return(eax); 585} 586 587#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /***************** AMD64 */ 588 589#define XGI_checkosforsse /* Does this cpu support sse and do we need to check os? */ 590#define XGI_canBenchmark /* Can we perform a benchmark? */ 591#ifdef XGI_LINUX 592#define XGI_haveProc /* Do we have /proc/cpuinfo or similar? */ 593#endif 594#define XGI_haveBuiltInMC /* Is there a built-in memcpy for this arch? */ 595 596/* Built-in memcpy for AMD64 */ 597static __inline void * builtin_memcpy(void * to, const void * from, int n) 598{ 599 long d1, d2, d3; 600 601 __asm__ __volatile__ ( 602 " cld\n" 603 " rep ; movsq\n" 604 " movq %4, %%rcx\n" 605 " rep ; movsb" 606 : "=%c" (d1), "=&D" (d2), "=&S" (d3) 607 : "0" ((ULong)(n >> 3)), "q" ((ULong)(n & 7)), 608 "1" ((long) to), "2" ((long) from) 609 : "memory"); 610 611 PDEBUG(ErrorF("Jong-builtin_memcpy() for AMD64...\n")); 612 return(to); 613} 614 615/* Alternative: Unroll loop, copy 32 bytes at a time */ 616static void XGI_builtin_memcp2(UChar *to, const UChar *from, int n) 617{ 618 long d1,d2,d3; 619 620 __asm__ __volatile__( 621 " movq %%rdi, %%rax\n" 622 " cmpq $32, %%rcx\n" 623 " cld\n" /* Pipeline; no other flags but DF */ 624 " jbe 1f\n" 625 " negq %%rax\n" /* Align dest */ 626 " andq $7, %%rax\n" 627 " subq %%rax, %%rcx\n" 628 " xchgq %%rax, %%rcx\n" 629 " rep ; movsb\n" 630 " movq %%rax, %%rcx\n" 631 " subq $32, %%rcx\n" 632 " js 2f\n" 633 ".p2align 4\n" 634 "3: subq $32, %%rcx\n" 635 " movq (%%rsi), %%rax\n" 636 " movq 8(%%rsi), %%rdx\n" 637 " movq 16(%%rsi), 
%%r8\n" 638 " movq 24(%%rsi), %%r9\n" 639 " movq %%rax, (%%rdi)\n" 640 " movq %%rdx, 8(%%rdi)\n" 641 " movq %%r8, 16(%%rdi)\n" 642 " movq %%r9, 24(%%rdi)\n" 643 " leaq 32(%%rsi), %%rsi\n" 644 " leaq 32(%%rdi), %%rdi\n" 645 " jns 3b\n" 646 "2: addq $32, %%rcx\n" 647 "1: rep ; movsb" 648 : "=&c" (d1), "=&D" (d2), "=&S" (d3) 649 :"0" ((ULong) n), "1" ((long) to), "2" ((long) from) 650 : "rax", "rdx", "r8", "r9", "memory", "cc"); 651 652 PDEBUG(ErrorF("Jong-XGI_builtin_memcp2()-copy 32 bytes at a time-AMD...\n")); 653} 654 655static unsigned int taketime(void) /* get current time (for benchmarking) */ 656{ 657 unsigned int eax; 658 659 __asm__ volatile ( 660 " pushq %%rbx\n" 661 " cpuid\n" 662 " rdtsc\n" 663 " popq %%rbx\n" 664 : "=a" (eax) 665 : "0" (0) 666 : "rcx", "rdx", "cc"); 667 668 return(eax); 669} 670 671#else /* **************************************** Other archs */ 672 673/* 1. Can we do a benchmark? */ 674/* #define XGI_canBenchmark */ 675 676/* 2. Do we have /proc filesystem or similar for CPU information? */ 677/* #define XGI_haveproc */ 678 679/* 3. Optional: build-in memcpy() */ 680/* #define XGI_haveBuiltInMC */ 681/* static __inline void * builtin_memcpy(void * to, const void * from, int n) 682 { 683 } 684*/ 685 686/* 4. 
Function for getting current time (for benchmarking) */
/* static unsigned int taketime(void)
 {
 }
*/

#endif

/************************************************************************/
/* Generic built-in memcpy wrapper */
/************************************************************************/

#ifdef XGI_haveBuiltInMC
/* Adapts the arch-specific builtin_memcpy() to the vidCopyFunc
 * signature (return value discarded). */
static void XGI_builtin_memcpy(UChar *dst, const UChar *src, int size)
{
    PDEBUG(ErrorF("Jong-XGI_builtin_memcpy()...\n"));
    builtin_memcpy(dst, src, size);
}
#endif

#if 0 /* Jong 01/15/2009; ignore at the moment */
/************************************************************************/
/* Generic routines if Benchmark can be performed (all archs, all OSes) */
/************************************************************************/

#ifdef XGI_canBenchmark

/* Get time (unsigned int): TSC delta around one copy of `size` bytes;
 * the else-branch handles counter wrap-around. */
static unsigned int time_function(vidCopyFunc mf, UChar *buf1, UChar *buf2, int size)
{
    unsigned int t1, t2;

    t1 = taketime();

    (*mf)(buf1, buf2, size);

    t2 = taketime();

    return((t1 < t2) ? t2 - t1 : 0xFFFFFFFFU - (t1 - t2 - 1));
}

/* Allocate an area of offscreen FB memory (buf1), a simulated video
 * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
 * buf1 is rounded up to 32-byte alignment inside the FB allocation;
 * returns the FB allocation handle, or NULL on any failure (with
 * already-acquired resources released). */
static void *
XGI_AllocBuffers(ScrnInfoPtr pScrn, UChar **buf1, UChar **buf2, UChar **buf3)
{
    XGIPtr pXGI = XGIPTR(pScrn);
    unsigned int offset;
    void *handle = NULL;

    if(!(offset = XGIAllocateFBMemory(pScrn, &handle, BUFFERSIZE + 31))) {
        return NULL;
    }
    (*buf1) = (UChar *)pXGI->FbBase + offset;
    (*buf1) = (UChar *)(((ULong)(*buf1) + 31) & ~31);

    if(!((*buf2) = (UChar *)xalloc(BUFFERSIZE + 15))) {
        XGIFreeFBMemory(pScrn, &handle);
        return NULL;
    }

    if(!((*buf3) = (UChar *)xalloc(BUFFERSIZE + 15))) {
        xfree((*buf2));
        XGIFreeFBMemory(pScrn, &handle);
        return NULL;
    }

    return handle;
}
#endif

/* Perform Benchmark: time every enabled copy routine 4 times (keeping
 * the best run), log the throughput, and return the index of the
 * overall fastest; *best2 receives the fastest routine that does not
 * require alignment.  `from` selects video->system vs system->video. */
static int XGI_BenchmarkMemcpy(ScrnInfoPtr pScrn, XGIMCFuncData *MCFunctions,
                unsigned int myCPUflags, UChar *buf1, UChar *buf2,
                UChar *buf3, char *frqBuf, double cpuFreq,
                vidCopyFunc *UMemCpy, int *best2, Bool from)
{
    XGIMCFuncData *curData;
    int j = 0, bestSoFar = 0;
    unsigned int tmp1, tmp2, best = 0xFFFFFFFFU, sbest = 0xFFFFFFFFU;

    (*best2) = 0;

    /* Make probable buf1 and buf2 are not paged out by referencing them */
    XGI_libc_memcpy(buf1, buf2, BUFFERSIZE);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
        "Benchmarking %s RAM to %s RAM memory transfer methods:\n",
        from ? "video" : "system",
        from ? "system" : "video");

#ifdef TWDEBUG
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Benchmark: CPUFlags %x\n", myCPUflags);
#endif

    j = 0;
    while(MCFunctions[j].mFunc) {
        PDEBUG(ErrorF("Check function[%d]-%s...\n", j, MCFunctions[j].mName));

        curData = MCFunctions + j;

        if(myCPUflags & curData->mycpuflag) {

            /* Simulate setup of the video buffer and copy result to framebuffer */
            /* Do this 4 times to verify results */
            if(!from) {
                XGI_builtin_memcpy(buf2, buf3, BUFFERSIZE);
                tmp1 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
                XGI_builtin_memcpy(buf2, buf3, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
                XGI_builtin_memcpy(buf2, buf3, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
                XGI_builtin_memcpy(buf2, buf3, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
            } else {
                XGI_builtin_memcpy(buf3, buf2, BUFFERSIZE);
                tmp1 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
                XGI_builtin_memcpy(buf3, buf2, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
                XGI_builtin_memcpy(buf3, buf2, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
                XGI_builtin_memcpy(buf3, buf2, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
            }

            if((!frqBuf) || (tmp1 == 0)) {
                /* No CPU frequency known: log raw TSC ticks. */
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                    "\tChecked %s memcpy()... \t%u\n",curData->mName, tmp1);
            } else {
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                    "\tChecked %s memcpy()... \t%.1f MiB/s\n",
                    curData->mName,
                    cpuFreq * 1.e6 * (double)BUFFERSIZE / ((double)(tmp1) * (double)(0x100000)));
            }

            if(tmp1 < best) {
                best = tmp1;
                bestSoFar = j;
                PDEBUG(ErrorF("* bestSoFar is %d...\n", bestSoFar));
            }

            if(!curData->reqAlignment) {
                if(tmp1 < sbest) {
                    sbest = tmp1;
                    (*best2) = j;
                }
            }

        }

        j++;
    }

    PDEBUG(ErrorF("***** The best is func(%d)-%s...\n", bestSoFar, MCFunctions[bestSoFar].mName));

    /* Jong 01/07/2009; test for performance */
    /* bestSoFar = 2; */ /* the worst case; buit-in-2 */
    return bestSoFar;
}

/* Fallback selection when the benchmark cannot run: pick the routine
 * with the lowest static grade among those enabled by myCPUflags.
 * Returns the aligned-transfer routine; *UMemCpy receives the best
 * routine that works on unaligned buffers (lower grade = better). */
static vidCopyFunc XGI_GetBestByGrade(ScrnInfoPtr pScrn, XGIMCFuncData *MCFunctions,
                unsigned int myCPUflags, vidCopyFunc *UMemCpy, Bool from)
{
    int j = 0, best = -1, secondbest = -1, bestSoFar = 10, best2SoFar = 10;
    int grade;

    *UMemCpy = XGI_libc_memcpy;

    while(MCFunctions[j].mFunc) {
        if(myCPUflags & MCFunctions[j].mycpuflag) {
            grade = from ? MCFunctions[j].gradefrom : MCFunctions[j].grade;
            if(grade < bestSoFar) {
                best = j;
                bestSoFar = grade;
            }
            if(grade < best2SoFar) {
                if(!MCFunctions[j].reqAlignment) {
                    secondbest = j;
                    best2SoFar = grade;
                }
            }
        }
        j++;
    }
    if(best >= 0) {
        xf86DrvMsg(pScrn->scrnIndex, X_INFO,
            "Chose %s method for aligned data transfers %s video RAM\n",
            MCFunctions[best].mName,
            from ? "from" : "to");
        if(secondbest >= 0) {
            xf86DrvMsg(pScrn->scrnIndex, X_INFO,
                "Chose %s method for unaligned data transfers %s video RAM\n",
                MCFunctions[secondbest].mName,
                from ? "from" : "to");
            *UMemCpy = MCFunctions[secondbest].mFunc;
        }
        return MCFunctions[best].mFunc;
    }

    return XGI_libc_memcpy;
}
#endif /* canBenchmark */

/**********************************************************************/
/* Generic routines if /proc filesystem is available (Linux) */
/**********************************************************************/

#ifdef XGI_haveProc
/* Linux: Read file (/proc/cpuinfo) into buffer.  Returns the byte
 * count on success, 0 on open/read error or if the file (nearly)
 * filled the buffer (treated as overflow). */
static int XGI_ReadProc(char *buf, char *filename)
{
    FILE *cpuInfoFile;
    int count;

    if((cpuInfoFile = fopen(filename, "r")) == NULL) {
        return 0;
    }

    count = fread(buf, 1, CPUBUFFERSIZE, cpuInfoFile);
    if(ferror(cpuInfoFile)) {
        fclose(cpuInfoFile);
        return 0;
    }

    fclose(cpuInfoFile);

    if(count >= CPUBUFFERSIZE - 2) {
        return 0;
    }

    buf[count] = 0;

    return count;
}

/* Linux: Extract CPU speed from /proc/cpuinfo.  Returns a pointer just
 * past the "cpu MHz" label on success (NULL on parse failure or an
 * implausible value) and stores the frequency in *cpuFreq. */
static char *XGI_GetCPUFreq(ScrnInfoPtr pScrn, char *buf, double *cpuFreq)
{
    char *frqBuf, *endBuf;

    (*cpuFreq) = 0.0;

    if((frqBuf = strstr(buf,"cpu MHz\t\t:"))) {
        frqBuf += 11;
        (*cpuFreq) = strtod(frqBuf, &endBuf);
        if(endBuf == frqBuf) frqBuf = NULL;
        if((*cpuFreq) < 10.0) frqBuf = NULL; /* sanity check */
        if(frqBuf) {
            xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU frequency %.2fMhz\n", (*cpuFreq));
        }
    }

    return frqBuf;
}
#endif /* haveProc */

/**********************************************************************/
/* Arch-specific routines */
/**********************************************************************/

#ifdef XGI_checkosforsse /* Common i386, AMD64 */

#ifdef XGICHECKOSSSE

#ifndef XFree86LOADER
#include <setjmp.h>
#endif

/* SIGILL trampoline used to probe whether the OS saves/restores SSE
 * state: executing an SSE instruction raises SIGILL when it doesn't. */
static jmp_buf sigill_return;

static void sigill_handler(void)
{
    longjmp(sigill_return, 1);
}
#endif

#if 0 /* Jong 01/15/2009;
ignore at the moment */
/* Probe whether the OS supports SSE: execute xorps under a SIGILL
 * handler; a caught signal 4 means no OS support.  When the signal
 * interception API is unavailable, fall back to the user's UseSSE
 * option. */
static Bool CheckOSforSSE(ScrnInfoPtr pScrn)
{
#ifdef XGICHECKOSSSE /* Check OS for SSE possible: */
    int signo = -1;

#ifdef XGIDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Checking OS SSE support\n");
#endif

    xf86InterceptSigIll(&sigill_handler);

    if(setjmp(sigill_return)) {
        signo = 4;
    } else {
        __asm__ __volatile__ (" xorps %xmm0, %xmm0\n");
        /* __asm__ __volatile__ (" .byte 0xff\n"); */ /* For test */
    }

    xf86InterceptSigIll(NULL);

#ifdef XGIDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "OS SSE support signal %d\n", signo);
#endif

    if(signo != -1) {
        xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
            "OS does not support SSE instructions\n");
    }

    return (signo >= 0) ? FALSE : TRUE;

#else /* no check for SSE possible: */

    XGIPtr pXGI = XGIPTR(pScrn);

    xf86DrvMsg(pScrn->scrnIndex, pXGI->XvSSEMemcpy ? X_WARNING : X_INFO,
        "Checking OS for SSE support is not supported in this version of " XGIMYSERVERNAME "\n");

    if(pXGI->XvSSEMemcpy) {
        xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
            "If you get a signal 4 here, set the option \"UseSSE\" to \"off\".\n");
        return TRUE;
    } else {
        xf86DrvMsg(pScrn->scrnIndex, X_INFO,
            "If your OS supports SSE, set the option \"UseSSE\" to \"on\".\n");
        return FALSE;
    }
#endif
}

#endif /* XGI_checkosforsse */
#endif

#ifdef __i386__ /* i386 specific *************************************/
#define ULong unsigned long

/* Instantiate the copy functions for i386:
 *   XGI_sse_memcpy    - SSE, prefetchnta, sfence
 *   XGI_mmxext_memcpy - MMX-ext (movntq), prefetchnta, sfence+emms
 *   XGI_now_memcpy    - MMX with 3DNow! prefetch, femms
 *   XGI_mmx_memcpy    - plain MMX, no prefetch, emms */
PREFETCH_FUNC(XGI_sse,SSE,SSE,,FENCE,small_memcpy_i386)
PREFETCH_FUNC(XGI_mmxext,MMXEXT,SSE,EMMS,FENCEMMS,small_memcpy_i386)
PREFETCH_FUNC(XGI_now,MMX,NOW,FEMMS,FEMMS,small_memcpy_i386)
NOPREFETCH_FUNC(XGI_mmx,MMX,EMMS,EMMS,small_memcpy_i386)

#if 0 /* Jong 01/15/2009; ignore at the moment */
/* Benchmark candidate table for i386: {func, name, enabling CPU flag,
 * grade to-video, grade from-video, needs alignment}. */
static XGIMCFuncData MCFunctions_i386[] = {
    {XGI_libc_memcpy,   "libc",      XGI_CPUFL_LIBC,  4, 4, FALSE},
    {XGI_builtin_memcpy,"built-in-1",XGI_CPUFL_BI,    5, 5, FALSE},
    {XGI_builtin_memcp2,"built-in-2",XGI_CPUFL_BI2,   6, 6, FALSE},
    {XGI_mmx_memcpy,    "MMX",       XGI_CPUFL_MMX,   3, 3, FALSE},
    {XGI_sse_memcpy,    "SSE",       XGI_CPUFL_SSE,   1, 0, TRUE},
    {XGI_now_memcpy,    "3DNow!",    XGI_CPUFL_3DNOW, 2, 2, FALSE},
    {XGI_mmxext_memcpy, "MMX2",      XGI_CPUFL_MMX2,  0, 1, FALSE},
    {NULL,              "",          0,              10, 10, FALSE}
};
#endif

#define Def_FL (XGI_CPUFL_LIBC | XGI_CPUFL_BI | XGI_CPUFL_BI2) /* Default methods */

/* cpuid wrapper; ebx is preserved by hand because PIC reserves it. */
#define cpuid(op, eax, ebx, ecx, edx) \
    __asm__ __volatile__ ( \
        " pushl %%ebx\n" \
        " cpuid\n" \
        " movl %%ebx, %1\n" \
        " popl %%ebx\n" \
        : "=a" (eax), "=r" (ebx), \
          "=c" (ecx), "=d" (edx) \
        : "a" (op) \
        : "cc")

#if 0 /* Jong 01/15/2009; ignore at the moment */
/* Detect the CPUID instruction (EFLAGS.ID bit togglable) and the RDTSC
 * feature bit; both are required for feature detection/benchmarking. */
static Bool cpuIDSupported(ScrnInfoPtr pScrn)
{
    int eax, ebx, ecx, edx;

    /* Check for cpuid instruction */
    __asm__ __volatile__ (
        " pushf\n"
        " popl %0\n"
        " movl %0, %1\n"
        " xorl $0x200000, %0\n"
        " push %0\n"
        " popf\n"
        " pushf\n"
        " popl %0\n"
        : "=a" (eax), "=c" (ecx)
        :
        : "cc");

    if(eax == ecx) {
        xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support CPUID instruction\n");
        return FALSE;
    }

    /* Check for cpuid level */
    cpuid(0x00000000, eax, ebx, ecx, edx);
    if(!eax) {
        return FALSE;
    }

    /* Check for RDTSC */
    cpuid(0x00000001, eax, ebx, ecx, edx);

    if(!(edx & 0x10)) {
        xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support RDTSC instruction\n");
        return FALSE;
    }

    return TRUE;
}

/* Translate CPUID feature bits into XGI_CPUFL_* flags; the extended
 * leaf adds 3DNow! and the AMD-only MMX extensions. */
static unsigned int XGI_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    unsigned int flags = 0, eax, ebx, ecx, edx;
    Bool IsAMD;

    /* Check if cpuid and rdtsc instructions are supported */
    if(!cpuIDSupported(pScrn)) {
        return 0;
    }

    cpuid(0x00000000, eax, ebx, ecx, edx);

    /* "AuthenticAMD" vendor string in ebx/edx/ecx */
    IsAMD = (ebx == 0x68747541) && (edx == 0x69746e65) && (ecx == 0x444d4163);

    cpuid(0x00000001, eax, ebx, ecx, edx);
    /* MMX */
    if(edx & 0x00800000) flags |= XGI_CPUFL_MMX;
    /* SSE, MMXEXT */
    if(edx & 0x02000000) flags |= (XGI_CPUFL_SSE | XGI_CPUFL_MMX2);
    /* SSE2 - don't need this one directly, set SSE instead */
    if(edx & 0x04000000) flags |= (XGI_CPUFL_SSE | XGI_CPUFL_SSE2);

    cpuid(0x80000000, eax, ebx, ecx, edx);
    if(eax >= 0x80000001) {
        cpuid(0x80000001, eax, ebx, ecx, edx);
        /* 3DNow! */
        if(edx & 0x80000000) flags |= XGI_CPUFL_3DNOW;
        /* AMD MMXEXT */
        if(IsAMD && (edx & 0x00400000)) flags |= XGI_CPUFL_MMX2;
    }

    return flags;
}
#endif

#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /* AMD64 specific ***** */

/* AMD64 always has SSE/SSE2; only the SSE copy function is generated. */
PREFETCH_FUNC(XGI_sse,SSE64,SSE,,FENCE,small_memcpy_amd64)

#if 0 /* Jong 01/15/2009; ignore at the moment */

static XGIMCFuncData MCFunctions_AMD64[] = {
    {XGI_libc_memcpy,   "libc",      XGI_CPUFL_LIBC, 2, 2, FALSE},
    {XGI_builtin_memcpy,"built-in-1",XGI_CPUFL_BI,   1, 1, FALSE},
    {XGI_builtin_memcp2,"built-in-2",XGI_CPUFL_BI2,  3, 3, FALSE},
    {XGI_sse_memcpy,    "SSE",       XGI_CPUFL_SSE,  0, 0, TRUE},
    {NULL,              "",          0,             10, 10, FALSE}
};

#define Def_FL (XGI_CPUFL_LIBC | XGI_CPUFL_BI | XGI_CPUFL_BI2)

static unsigned int XGI_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    /* SSE/SSE2 are architectural on AMD64; no probing needed. */
    return((unsigned int)(XGI_CPUFL_SSE|XGI_CPUFL_SSE2));
}
#endif

#else /* Specific for other archs ******************************** */

/* Fill in here */
#if 0 /* Jong 01/15/2009; ignore at the moment */

#define Def_FL (XGI_CPUFL_LIBC)

static unsigned int XGI_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    return((unsigned int)(0));
}

#endif

#endif

/**********************************************************************/
/* Benchmark the video copy routines and choose the fastest           */
/**********************************************************************/

#if 0 /* Jong 01/15/2009; ignore at the moment */
#ifdef XGI_canBenchmark
/*
 * XGIVidCopyInitGen() - benchmark the candidate memcpy() methods in
 * MCFunctions and return the fastest for aligned transfers; the fastest
 * method for unaligned transfers is returned through *UMemCpy.
 * 'from' selects the transfer direction (TRUE = from video RAM).
 * Falls back to grade-based selection when the video-RAM benchmark
 * buffers cannot be allocated, and to libc memcpy() when the user
 * disabled benchmarking.
 */
static vidCopyFunc
XGIVidCopyInitGen(ScreenPtr pScreen, XGIMCFuncData *MCFunctions, vidCopyFunc *UMemCpy, Bool from)
{
	ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
	XGIPtr pXGI = XGIPTR(pScrn);
	void *fbhandle = NULL;
	char *frqBuf = NULL;
	UChar *buf1, *buf2, *buf3;
	double cpuFreq = 0.0;
	/* Always include the default (non-SIMD) methods in the candidate set */
	unsigned int myCPUflags = pXGI->CPUFlags | Def_FL;
	int best, secondbest;
#ifdef XGI_haveProc
	char buf[CPUBUFFERSIZE];
#endif

	PDEBUG(ErrorF("---XGIVidCopyInitGen()...begin\n"));
	*UMemCpy = XGI_libc_memcpy;

	/* Bail out if user disabled benchmarking */
	if(!pXGI->BenchMemCpy) {
		PDEBUG(ErrorF("---return XGI_libc_memcpy() as vidCopyFunc...\n"));
		return XGI_libc_memcpy;
	}

#ifdef XGI_haveProc
	/* Read /proc/cpuinfo into buf */
	if(XGI_ReadProc(buf, "/proc/cpuinfo")) {

		/* Extract CPU frequency (for reporting the benchmark results) */
		frqBuf = XGI_GetCPUFreq(pScrn, buf, &cpuFreq);

	}
#endif

	/* Allocate buffers; buf1:LFB; buf2,buf3:system memory */
	if(!(fbhandle = XGI_AllocBuffers(pScrn, &buf1, &buf2, &buf3))) {
		xf86DrvMsg(pScrn->scrnIndex, X_INFO,
			"Failed to allocate video RAM for video data transfer benchmark\n");
		/* No benchmark possible: pick methods by their static grades */
		return XGI_GetBestByGrade(pScrn, MCFunctions, myCPUflags, UMemCpy, from);
	}

	/* Perform Benchmark; buf2/buf3 are rounded up to 16-byte alignment */
	PDEBUG(ErrorF("---Perform XGI_BenchmarkMemcpy()...\n"));
	best = XGI_BenchmarkMemcpy(pScrn, MCFunctions, myCPUflags, buf1,
			(UChar *)(((unsigned long)buf2 + 15) & ~15),
			(UChar *)(((unsigned long)buf3 + 15) & ~15),
			frqBuf, cpuFreq, UMemCpy, &secondbest, from);

	/* Free buffers */
	XGIFreeFBMemory(pScrn, &fbhandle);
	xfree(buf2);
	xfree(buf3);

	xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
		"Using %s method for aligned data transfers %s video RAM\n",
		MCFunctions[best].mName,
		from ? "from" : "to");

	xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
		"Using %s method for unaligned data transfers %s video RAM\n",
		MCFunctions[secondbest].mName,
		from ? "from" : "to");

	return MCFunctions[best].mFunc;
}
#endif /* canBenchmark */
#endif

/**********************************************************************/
/* XGIGetCPUFlags(): Get CPU capabilities                             */
/* (called externally)                                                */
/**********************************************************************/
#if 0 /* Jong 01/15/2009; ignore at the moment */

/*
 * XGIGetCPUFlags() - return the XGI_CPUFL_* feature mask for this CPU,
 * with SSE/SSE2 cleared when the OS cannot handle SSE instructions.
 */
unsigned int
XGIGetCPUFlags(ScrnInfoPtr pScrn)
{
	unsigned int myCPUflags = XGI_GetCpuFeatures(pScrn);

#ifdef XGI_checkosforsse
	if(myCPUflags & (XGI_CPUFL_SSE | XGI_CPUFL_SSE2)) {

		/* Check if OS supports usage of SSE instructions */
		if(!(CheckOSforSSE(pScrn))) {
			myCPUflags &= ~(XGI_CPUFL_SSE | XGI_CPUFL_SSE2);
		}

	}
#endif

	return myCPUflags;
}

#endif

/**********************************************************************/
/* XGIVidCopyInit()                                                   */
/* (called externally)                                                */
/* (XGIGetCPUFlags must be called before this one)                    */
/**********************************************************************/

#if 0 /* Jong 01/15/2009; ignore at the moment */
/*
 * XGIVidCopyInit() - entry point: select the video copy routine for the
 * current architecture, benchmarking when possible, otherwise libc.
 */
vidCopyFunc XGIVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
/* Jong 01/08/2009; test for performance */
#if defined(__i386__) && defined(XGI_canBenchmark)
	PDEBUG(ErrorF("XGIVidCopyInit()-i386...\n"));
	return(XGIVidCopyInitGen(pScreen, MCFunctions_i386, UMemCpy, from));
#elif (defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__)) && defined(XGI_canBenchmark)
	PDEBUG(ErrorF("XGIVidCopyInit()-AMD64-x86_64...\n"));
	return(XGIVidCopyInitGen(pScreen, MCFunctions_AMD64, UMemCpy, from));
#else /* Other cases: Use libc memcpy() */
	*UMemCpy = XGI_libc_memcpy;
	PDEBUG(ErrorF("XGIVidCopyInit()-generic...\n"));
	return XGI_libc_memcpy;
#endif
}
#endif

#if 0 /* Jong 01/15/2009; ignore at the moment */
/* XGIVidCopyGetDefault() - the fallback copy routine (libc memcpy) */
vidCopyFunc XGIVidCopyGetDefault(void)
{
	return XGI_libc_memcpy;
}
#endif

#endif /* GNU C */