1/* 2 * SiS memcpy() routines (assembly) 3 * 4 * Copyright (C) 2004-2005 Thomas Winischhofer 5 * 6 * Idea and some code bits from via_memcpy.c which is 7 * Copyright (C) 2004 Thomas Hellstroem, All Rights Reserved. 8 * 9 * Permission is hereby granted, free of charge, to any person obtaining a 10 * copy of this software and associated documentation files (the "Software"), 11 * to deal in the Software without restriction, including without limitation 12 * the rights to use, copy, modify, merge, publish, distribute, sub license, 13 * and/or sell copies of the Software, and to permit persons to whom the 14 * Software is furnished to do so, subject to the following conditions: 15 * 16 * The above copyright notice and this permission notice (including the 17 * next paragraph) shall be included in all copies or substantial portions 18 * of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 23 * THE CODE SUPPLIER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR 24 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 25 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 * DEALINGS IN THE SOFTWARE. 27 * 28 */ 29 30#ifdef HAVE_CONFIG_H 31#include "config.h" 32#endif 33 34#include <stdlib.h> 35#include "sis.h" 36 37#if 0 /* Debug */ 38#define SISDGBMC 39#endif 40 41extern unsigned int SISAllocateFBMemory(ScrnInfoPtr pScrn, void **handle, int bytesize); 42extern void SISFreeFBMemory(ScrnInfoPtr pScrn, void **handle); 43 44#define CPUBUFFERSIZE 2048 /* Size of /proc/cpuinfo buffer */ 45#define BUFFERSIZE (576 * 1152) /* Matches 720x576 YUV420 */ 46 47/************************************************************************/ 48/* arch specific memcpy() routines */ 49/************************************************************************/ 50 51/* i386, AMD64 */ 52 53#define FENCE \ 54 __asm__ __volatile__( \ 55 " sfence\n" \ 56 : \ 57 : \ 58 : "memory"); 59 60#define FENCEMMS \ 61 __asm__ __volatile__ ( \ 62 " sfence\n" \ 63 " emms\n" \ 64 : \ 65 : \ 66 : "memory"); 67 68#define FEMMS \ 69 __asm__ __volatile__( \ 70 " femms\n" \ 71 : \ 72 : \ 73 : "memory"); 74 75#define EMMS \ 76 __asm__ __volatile__( \ 77 " emms\n" \ 78 : \ 79 : \ 80 : "memory"); 81 82#define SSE_PREFETCH " prefetchnta " 83#define NOW_PREFETCH " prefetch " 84 85#define PREFETCH1(arch_prefetch,from) \ 86 __asm__ __volatile__ ( \ 87 arch_prefetch "(%0)\n" \ 88 arch_prefetch "32(%0)\n" \ 89 arch_prefetch "64(%0)\n" \ 90 arch_prefetch "96(%0)\n" \ 91 arch_prefetch "128(%0)\n" \ 92 arch_prefetch "160(%0)\n" \ 93 arch_prefetch "192(%0)\n" \ 94 arch_prefetch "256(%0)\n" \ 95 arch_prefetch "288(%0)\n" \ 96 : \ 97 : "r" (from) ); 98 99#define PREFETCH2(arch_prefetch,from) \ 100 __asm__ __volatile__ ( \ 101 arch_prefetch "320(%0)\n" \ 102 : \ 103 : "r" (from) ); 104 105#define PREFETCH3(arch_prefetch,from) \ 106 __asm__ __volatile__ ( \ 107 arch_prefetch "288(%0)\n" \ 108 : \ 109 : "r" (from) ); 110 111#define small_memcpy_i386(to,from,n) \ 112 { \ 113 __asm__ __volatile__( \ 114 " cld\n" \ 115 " shrl $1, %%ecx\n" \ 116 " jnc 1f\n" \ 117 " movsb\n" \ 118 "1: shrl $1, %%ecx\n" \ 119 " jnc 2f\n" \ 120 " movsw\n" \ 121 "2: rep ; movsl" \ 122 : "=&D" (to), "=&S" (from) \ 123 : "c" (n), "0" ((long) to), "1" ((long) from) \ 124 : "memory", "cc"); \ 125 } 126 127#define small_memcpy_amd64(to,from,n) \ 128 { \ 129 __asm__ __volatile__( \ 130 " cld\n" \ 131 " shrq $1, %%rcx\n" \ 132 " jnc 1f\n" \ 133 " movsb\n" \ 134 "1: shrq $1, %%rcx\n" \ 135 " jnc 2f\n" \ 136 " movsw\n" \ 137 "2: shrq $1, %%rcx\n" \ 138 " jnc 3f\n" \ 139 " movsl\n" \ 140 "3: rep ; movsq" \ 141 : "=&D" (to), "=&S" (from) \ 142 : "c" (n), "0" ((long) to), "1" ((long) from) \ 143 : "memory", "cc"); \ 144 } 145 146#define MMX_CPY(prefetch,from,to,dummy,lcnt) \ 147 __asm__ __volatile__ ( \ 148 "1:\n" \ 149 prefetch "320(%1)\n" \ 150 " movq (%1), %%mm0\n" \ 151 " movq 8(%1), %%mm1\n" \ 152 " movq 16(%1), %%mm2\n" \ 153 " movq 24(%1), %%mm3\n" \ 154 " movq %%mm0, (%0)\n" \ 155 " movq %%mm1, 8(%0)\n" \ 156 " movq %%mm2, 16(%0)\n" \ 157 " movq %%mm3, 24(%0)\n" \ 158 prefetch "352(%1)\n" \ 159 " movq 32(%1), %%mm0\n" \ 160 " movq 40(%1), %%mm1\n" \ 161 " movq 48(%1), %%mm2\n" \ 162 " movq 56(%1), %%mm3\n" \ 163 " leal 64(%1),%1\n" \ 164 " movq %%mm0, 32(%0)\n" \ 165 " movq %%mm1, 40(%0)\n" \ 166 " movq %%mm2, 48(%0)\n" \ 167 " movq %%mm3, 56(%0)\n" \ 168 " decl %2\n" \ 169 " leal 64(%0),%0\n" \ 170 " jne 1b\n" \ 171 : "=&D"(to), "=&S"(from), "=&r"(dummy) \ 172 : "0" (to), "1" (from), "2" (lcnt) \ 173 : "memory", "cc"); 174 175#define SSE_CPY(prefetch,from,to,dummy,lcnt) \ 176 if((ULong) from & 15) { \ 177 __asm__ __volatile__ ( \ 178 "1:\n" \ 179 prefetch "320(%1)\n" \ 180 " movups (%1), %%xmm0\n" \ 181 " movups 16(%1), %%xmm1\n" \ 182 " movntps %%xmm0, (%0)\n" \ 183 " movntps %%xmm1, 16(%0)\n" \ 184 prefetch "352(%1)\n" \ 185 " movups 32(%1), %%xmm2\n" \ 186 " movups 48(%1), %%xmm3\n" \ 187 " leal 64(%1),%1\n" \ 188 " movntps %%xmm2, 32(%0)\n" \ 189 " movntps %%xmm3, 48(%0)\n" \ 190 " decl %2\n" \ 191 " leal 64(%0),%0\n" \ 192 " jne 1b\n" \ 193 : "=&D"(to), "=&S"(from), "=&r"(dummy) \ 194 : "0" (to), "1" (from), "2" (lcnt) \ 195 : "memory", "cc"); \ 196 } else { \ 197 __asm__ __volatile__ ( \ 198 "2:\n" \ 199 prefetch "320(%1)\n" \ 200 " movaps (%1), %%xmm0\n" \ 201 " movaps 16(%1), %%xmm1\n" \ 202 " movntps %%xmm0, (%0)\n" \ 203 " movntps %%xmm1, 16(%0)\n" \ 204 prefetch "352(%1)\n" \ 205 " movaps 32(%1), %%xmm2\n" \ 206 " movaps 48(%1), %%xmm3\n" \ 207 " leal 64(%1),%1\n" \ 208 " movntps %%xmm2, 32(%0)\n" \ 209 " movntps %%xmm3, 48(%0)\n" \ 210 " decl %2\n" \ 211 " leal 64(%0),%0\n" \ 212 " jne 2b\n" \ 213 : "=&D"(to), "=&S"(from), "=&r"(dummy) \ 214 : "0" (to), "1" (from), "2" (lcnt) \ 215 : "memory", "cc"); \ 216 } 217 218#define SSE64_CPY(prefetch,from,to,dummy,lcnt) \ 219 if((ULong) from & 15) { \ 220 __asm__ __volatile__ ( \ 221 "1:\n" \ 222 prefetch "320(%1)\n" \ 223 " movups (%1), %%xmm0\n" \ 224 " movups 16(%1), %%xmm1\n" \ 225 " movntps %%xmm0, (%0)\n" \ 226 " movntps %%xmm1, 16(%0)\n" \ 227 prefetch "352(%1)\n" \ 228 " movups 32(%1), %%xmm2\n" \ 229 " movups 48(%1), %%xmm3\n" \ 230 " leaq 64(%1),%1\n" \ 231 " movntps %%xmm2, 32(%0)\n" \ 232 " movntps %%xmm3, 48(%0)\n" \ 233 " decl %2\n" \ 234 " leaq 64(%0),%0\n" \ 235 " jne 1b\n" \ 236 : "=&D"(to), "=&S"(from), "=&r"(dummy) \ 237 : "0" (to), "1" (from), "2" (lcnt) \ 238 : "memory", "cc"); \ 239 } else { \ 240 __asm__ __volatile__ ( \ 241 "2:\n" \ 242 prefetch "320(%1)\n" \ 243 " movaps (%1), %%xmm0\n" \ 244 " movaps 16(%1), %%xmm1\n" \ 245 " movntps %%xmm0, (%0)\n" \ 246 " movntps %%xmm1, 16(%0)\n" \ 247 prefetch "352(%1)\n" \ 248 " movaps 32(%1), %%xmm2\n" \ 249 " movaps 48(%1), %%xmm3\n" \ 250 " leaq 64(%1),%1\n" \ 251 " movntps %%xmm2, 32(%0)\n" \ 252 " movntps %%xmm3, 48(%0)\n" \ 253 " decl %2\n" \ 254 " leaq 64(%0),%0\n" \ 255 " jne 2b\n" \ 256 : "=&D"(to), "=&S"(from), "=&r"(dummy) \ 257 : "0" (to), "1" (from), "2" (lcnt) \ 258 : "memory", "cc"); \ 259 } 260 261#define MMXEXT_CPY(prefetch,from,to,dummy,lcnt) \ 262 __asm__ __volatile__ ( \ 263 ".p2align 4,,7\n" \ 264 "1:\n" \ 265 prefetch "320(%1)\n" \ 266 " movq (%1), %%mm0\n" \ 267 " movq 8(%1), %%mm1\n" \ 268 " movq 16(%1), %%mm2\n" \ 269 " movq 24(%1), %%mm3\n" \ 270 " movntq %%mm0, (%0)\n" \ 271 " movntq %%mm1, 8(%0)\n" \ 272 " movntq %%mm2, 16(%0)\n" \ 273 " movntq %%mm3, 24(%0)\n" \ 274 prefetch "352(%1)\n" \ 275 " movq 32(%1), %%mm0\n" \ 276 " movq 40(%1), %%mm1\n" \ 277 " movq 48(%1), %%mm2\n" \ 278 " movq 56(%1), %%mm3\n" \ 279 " leal 64(%1),%1\n" \ 280 " movntq %%mm0, 32(%0)\n" \ 281 " movntq %%mm1, 40(%0)\n" \ 282 " movntq %%mm2, 48(%0)\n" \ 283 " movntq %%mm3, 56(%0)\n" \ 284 " decl %2\n" \ 285 " leal 64(%0),%0\n" \ 286 " jne 1b\n" \ 287 : "=&D"(to), "=&S"(from), "=&r"(dummy) \ 288 : "0" (to), "1" (from), "2" (lcnt) \ 289 : "memory", "cc"); 290 291 292#define PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small) \ 293 \ 294 static void prefix##_memcpy(UChar *to, \ 295 const UChar *from, \ 296 int size) \ 297 { \ 298 int lcnt = size >> 6; \ 299 int rest = size & 63; \ 300 register int dummy; \ 301 \ 302 PREFETCH1(ptype##_PREFETCH,from); \ 303 \ 304 begin; \ 305 if(lcnt) { \ 306 itype##_CPY(ptype##_PREFETCH,from,to,dummy,lcnt); \ 307 } \ 308 if(rest) { \ 309 PREFETCH2(ptype##_PREFETCH,from); \ 310 small(to, from, rest); \ 311 PREFETCH3(ptype##_PREFETCH,from); \ 312 } \ 313 fence; \ 314 } 315 316#define NOPREFETCH_FUNC(prefix,itype,begin,fence,small) \ 317 \ 318 static void prefix##_memcpy(UChar *to, \ 319 const UChar *from, \ 320 int size) \ 321 { \ 322 int lcnt = size >> 6; \ 323 int rest = size & 63; \ 324 register int dummy; \ 325 \ 326 begin; \ 327 if(lcnt) { \ 328 itype##_CPY("#",from,to,dummy,lcnt); \ 329 } \ 330 if(rest) { \ 331 small(to, from, rest); \ 332 } \ 333 fence; \ 334 } 335 336/* Other archs */ 337 338/* ... */ 339 340 341/* Type for table for benchmark list */ 342 343typedef struct { 344 vidCopyFunc mFunc; 345 char *mName; 346 unsigned int mycpuflag; 347 int grade; 348 int gradefrom; 349 Bool reqAlignment; 350} SISMCFuncData; 351 352/************************************************************************/ 353/* libc memcpy() wrapper - generic */ 354/************************************************************************/ 355 356static void SiS_libc_memcpy(UChar *dst, const UChar *src, int size) 357{ 358 memcpy(dst, src, size); 359} 360 361/************************************************************************/ 362/* We only do all that stuff under gcc; no idea what other compilers */ 363/* would do with our asm code. */ 364/************************************************************************/ 365 366#ifndef __GNUC__ 367 368unsigned int SiSGetCPUFlags(ScrnInfoPtr pScrn) 369{ 370 return 0; 371} 372 373vidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from) 374{ 375 *UMemCpy = SiS_libc_memcpy; 376 return SiS_libc_memcpy; 377} 378 379vidCopyFunc SiSVidCopyGetDefault(void) 380{ 381 return SiS_libc_memcpy; 382} 383 384#else /* ! Everything below is gcc specific ! */ 385 386/************************************************************************/ 387/* Definitions for archs and OSes */ 388/************************************************************************/ 389 390#undef SiS_checkosforsse 391#undef SiS_canBenchmark 392#undef SiS_haveProc 393#undef SiS_haveBuiltInMC 394 395#if defined(__i386__) /* ***************************************** i386 */ 396 397#define SiS_checkosforsse /* Does this cpu support sse and do we need to check os? */ 398#define SiS_canBenchmark /* Can we perform a benchmark? */ 399#ifdef SIS_LINUX 400#define SiS_haveProc /* Do we have /proc/cpuinfo or similar? */ 401#endif 402#define SiS_haveBuiltInMC /* Is there a built-in memcpy for this arch? */ 403 404/* Built-in memcpy for i386 */ 405static __inline void * builtin_memcpy(void * to, const void * from, size_t n) 406{ 407 int d1,d2,d3; 408 409 __asm__ __volatile__( 410 " cld\n" 411 " shrl $1, %%ecx\n" 412 " jnc 1f\n" 413 " movsb\n" 414 "1: shrl $1, %%ecx\n" 415 " jnc 2f\n" 416 " movsw\n" 417 "2: rep ; movsl\n" 418 : "=&c" (d1), "=&D" (d2), "=&S" (d3) 419 : "0" (n), "1" ((long) to), "2" ((long) from) 420 : "memory", "cc"); 421 422 return(to); 423} 424 425/* Alternative for 586: Unroll loop, copy 32 bytes at a time */ 426static void SiS_builtin_memcp2(UChar *to, const UChar *from, int n) 427{ 428 int d1,d2,d3; 429 430 __asm__ __volatile__( 431 " movl %%edi, %%eax\n" 432 " cmpl $32, %%ecx\n" 433 " cld\n" 434 " jbe 3f\n" 435 " negl %%eax\n" /* Align dest */ 436 " andl $3, %%eax\n" 437 " subl %%eax, %%ecx\n" 438 " xchgl %%eax, %%ecx\n" 439 " rep ; movsb\n" 440 " movl %%eax, %%ecx\n" 441 " subl $32, %%ecx\n" 442 " js 2f\n" 443 " movl (%%edi), %%eax\n" 444 "1: movl 28(%%edi), %%edx\n" /* Trick: Read-ahead */ 445 " subl $32, %%ecx\n" 446 " movl (%%esi), %%eax\n" 447 " movl 4(%%esi), %%edx\n" 448 " movl %%eax, (%%edi)\n" 449 " movl %%edx, 4(%%edi)\n" 450 " movl 8(%%esi), %%eax\n" 451 " movl 12(%%esi), %%edx\n" 452 " movl %%eax, 8(%%edi)\n" 453 " movl %%edx, 12(%%edi)\n" 454 " movl 16(%%esi), %%eax\n" 455 " movl 20(%%esi), %%edx\n" 456 " movl %%eax, 16(%%edi)\n" 457 " movl %%edx, 20(%%edi)\n" 458 " movl 24(%%esi), %%eax\n" 459 " movl 28(%%esi), %%edx\n" 460 " movl %%eax, 24(%%edi)\n" 461 " movl %%edx, 28(%%edi)\n" 462 " leal 32(%%esi), %%esi\n" 463 " leal 32(%%edi), %%edi\n" 464 " jns 1b\n" 465 "2: addl $32, %%ecx\n" 466 "3: rep ; movsb" 467 : "=&c" (d1), "=&D" (d2), "=&S" (d3) 468 : "0" (n), "1" ((long) to), "2" ((long) from) 469 : "eax", "edx", "memory", "cc"); 470 471} 472 473static unsigned int taketime(void) /* get current time (for benchmarking) */ 474{ 475 unsigned int eax; 476 477 __asm__ volatile ( 478 " pushl %%ebx\n" 479 " cpuid\n" 480 " rdtsc\n" 481 " popl %%ebx\n" 482 : "=a" (eax) 483 : "0" (0) 484 : "ecx", "edx", "cc"); 485 486 return(eax); 487} 488 489#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /***************** AMD64 */ 490 491#define SiS_checkosforsse /* Does this cpu support sse and do we need to check os? */ 492#define SiS_canBenchmark /* Can we perform a benchmark? */ 493#ifdef SIS_LINUX 494#define SiS_haveProc /* Do we have /proc/cpuinfo or similar? */ 495#endif 496#define SiS_haveBuiltInMC /* Is there a built-in memcpy for this arch? */ 497 498/* Built-in memcpy for AMD64 */ 499static __inline void * builtin_memcpy(void * to, const void * from, int n) 500{ 501 long d1, d2, d3; 502 503 __asm__ __volatile__ ( 504 " cld\n" 505 " rep ; movsq\n" 506 " movq %4, %%rcx\n" 507 " rep ; movsb" 508 : "=%c" (d1), "=&D" (d2), "=&S" (d3) 509 : "0" ((ULong)(n >> 3)), "q" ((ULong)(n & 7)), 510 "1" ((long) to), "2" ((long) from) 511 : "memory"); 512 513 return(to); 514} 515 516/* Alternative: Unroll loop, copy 32 bytes at a time */ 517static void SiS_builtin_memcp2(UChar *to, const UChar *from, int n) 518{ 519 long d1,d2,d3; 520 521 __asm__ __volatile__( 522 " movq %%rdi, %%rax\n" 523 " cmpq $32, %%rcx\n" 524 " cld\n" /* Pipeline; no other flags but DF */ 525 " jbe 1f\n" 526 " negq %%rax\n" /* Align dest */ 527 " andq $7, %%rax\n" 528 " subq %%rax, %%rcx\n" 529 " xchgq %%rax, %%rcx\n" 530 " rep ; movsb\n" 531 " movq %%rax, %%rcx\n" 532 " subq $32, %%rcx\n" 533 " js 2f\n" 534 ".p2align 4\n" 535 "3: subq $32, %%rcx\n" 536 " movq (%%rsi), %%rax\n" 537 " movq 8(%%rsi), %%rdx\n" 538 " movq 16(%%rsi), %%r8\n" 539 " movq 24(%%rsi), %%r9\n" 540 " movq %%rax, (%%rdi)\n" 541 " movq %%rdx, 8(%%rdi)\n" 542 " movq %%r8, 16(%%rdi)\n" 543 " movq %%r9, 24(%%rdi)\n" 544 " leaq 32(%%rsi), %%rsi\n" 545 " leaq 32(%%rdi), %%rdi\n" 546 " jns 3b\n" 547 "2: addq $32, %%rcx\n" 548 "1: rep ; movsb" 549 : "=&c" (d1), "=&D" (d2), "=&S" (d3) 550 :"0" ((ULong) n), "1" ((long) to), "2" ((long) from) 551 : "rax", "rdx", "r8", "r9", "memory", "cc"); 552 553} 554 555static unsigned int taketime(void) /* get current time (for benchmarking) */ 556{ 557 unsigned int eax; 558 559 __asm__ volatile ( 560 " pushq %%rbx\n" 561 " cpuid\n" 562 " rdtsc\n" 563 " popq %%rbx\n" 564 : "=a" (eax) 565 : "0" (0) 566 : "rcx", "rdx", "cc"); 567 568 return(eax); 569} 570 571#else /* **************************************** Other archs */ 572 573/* 1. Can we do a benchmark? */ 574/* #define SiS_canBenchmark */ 575 576/* 2. Do we have /proc filesystem or similar for CPU information? */ 577/* #define SiS_haveproc */ 578 579/* 3. Optional: build-in memcpy() */ 580/* #define SiS_haveBuiltInMC */ 581/* static __inline void * builtin_memcpy(void * to, const void * from, int n) 582 { 583 } 584*/ 585 586/* 4. Function for getting current time (for benchmarking) */ 587/* static unsigned int taketime(void) 588 { 589 } 590*/ 591 592#endif 593 594/************************************************************************/ 595/* Generic built-in memcpy wrapper */ 596/************************************************************************/ 597 598#ifdef SiS_haveBuiltInMC 599static void SiS_builtin_memcpy(UChar *dst, const UChar *src, int size) 600{ 601 builtin_memcpy(dst, src, size); 602} 603#endif 604 605/************************************************************************/ 606/* Generic routines if Benchmark can be performed (all archs, all OSes) */ 607/************************************************************************/ 608 609#ifdef SiS_canBenchmark 610 611/* Get time (unsigned int) */ 612static unsigned int time_function(vidCopyFunc mf, UChar *buf1, UChar *buf2, int size) 613{ 614 unsigned int t1, t2; 615 616 t1 = taketime(); 617 618 (*mf)(buf1, buf2, size); 619 620 t2 = taketime(); 621 622 return((t1 < t2) ? t2 - t1 : 0xFFFFFFFFU - (t1 - t2 - 1)); 623} 624 625/* Allocate an area of offscreen FB memory (buf1), a simulated video 626 * player buffer (buf2) and a pool of uninitialized "video" data (buf3). 627 */ 628static void * 629SiS_AllocBuffers(ScrnInfoPtr pScrn, UChar **buf1, UChar **buf2, UChar **buf3) 630{ 631 SISPtr pSiS = SISPTR(pScrn); 632 unsigned int offset; 633 void *handle = NULL; 634 635 if(!(offset = SISAllocateFBMemory(pScrn, &handle, BUFFERSIZE + 31))) { 636 return NULL; 637 } 638 (*buf1) = (UChar *)pSiS->FbBase + offset; 639 (*buf1) = (UChar *)(((ULong)(*buf1) + 31) & ~31); 640 641 if(!((*buf2) = (UChar *)malloc(BUFFERSIZE + 15))) { 642 SISFreeFBMemory(pScrn, &handle); 643 return NULL; 644 } 645 646 if(!((*buf3) = (UChar *)malloc(BUFFERSIZE + 15))) { 647 free((*buf2)); 648 SISFreeFBMemory(pScrn, &handle); 649 return NULL; 650 } 651 652 return handle; 653} 654 655/* Perform Benchmark */ 656static int SiS_BenchmarkMemcpy(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions, 657 unsigned int myCPUflags, UChar *buf1, UChar *buf2, 658 UChar *buf3, char *frqBuf, double cpuFreq, 659 vidCopyFunc *UMemCpy, int *best2, Bool from) 660{ 661 SISMCFuncData *curData; 662 int j = 0, bestSoFar = 0; 663 unsigned int tmp1, tmp2, best = 0xFFFFFFFFU, sbest = 0xFFFFFFFFU; 664 665 (*best2) = 0; 666 667 /* Make probable buf1 and buf2 are not paged out by referencing them */ 668 SiS_libc_memcpy(buf1, buf2, BUFFERSIZE); 669 670 xf86DrvMsg(pScrn->scrnIndex, X_INFO, 671 "Benchmarking %s RAM to %s RAM memory transfer methods:\n", 672 from ? "video" : "system", 673 from ? "system" : "video"); 674 675#ifdef TWDEBUG 676 xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Benchmark: CPUFlags %x\n", myCPUflags); 677#endif 678 679 j = 0; 680 while(MCFunctions[j].mFunc) { 681 682 curData = MCFunctions + j; 683 684 if(myCPUflags & curData->mycpuflag) { 685 686 /* Simulate setup of the video buffer and copy result to framebuffer */ 687 /* Do this 4 times to verify results */ 688 if(!from) { 689 SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE); 690 tmp1 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE); 691 SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE); 692 tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE); 693 tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1; 694 SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE); 695 tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE); 696 tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1; 697 SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE); 698 tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE); 699 tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1; 700 } else { 701 SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE); 702 tmp1 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE); 703 SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE); 704 tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE); 705 tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1; 706 SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE); 707 tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE); 708 tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1; 709 SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE); 710 tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE); 711 tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1; 712 } 713 714 if((!frqBuf) || (tmp1 == 0)) { 715 xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 716 "\tChecked %s memcpy()... \t%u\n",curData->mName, tmp1); 717 } else { 718 xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 719 "\tChecked %s memcpy()... \t%.1f MiB/s\n", 720 curData->mName, 721 cpuFreq * 1.e6 * (double)BUFFERSIZE / ((double)(tmp1) * (double)(0x100000))); 722 } 723 724 if(tmp1 < best) { 725 best = tmp1; 726 bestSoFar = j; 727 } 728 729 if(!curData->reqAlignment) { 730 if(tmp1 < sbest) { 731 sbest = tmp1; 732 (*best2) = j; 733 } 734 } 735 736 } 737 738 j++; 739 } 740 741 return bestSoFar; 742} 743 744static vidCopyFunc SiS_GetBestByGrade(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions, 745 unsigned int myCPUflags, vidCopyFunc *UMemCpy, Bool from) 746{ 747 int j = 0, best = -1, secondbest = -1, bestSoFar = 10, best2SoFar = 10; 748 int grade; 749 750 *UMemCpy = SiS_libc_memcpy; 751 752 while(MCFunctions[j].mFunc) { 753 if(myCPUflags & MCFunctions[j].mycpuflag) { 754 grade = from ? MCFunctions[j].gradefrom : MCFunctions[j].grade; 755 if(grade < bestSoFar) { 756 best = j; 757 bestSoFar = grade; 758 } 759 if(grade < best2SoFar) { 760 if(!MCFunctions[j].reqAlignment) { 761 secondbest = j; 762 best2SoFar = grade; 763 } 764 } 765 } 766 j++; 767 } 768 if(best >= 0) { 769 xf86DrvMsg(pScrn->scrnIndex, X_INFO, 770 "Chose %s method for aligned data transfers %s video RAM\n", 771 MCFunctions[best].mName, 772 from ? "from" : "to"); 773 if(secondbest >= 0) { 774 xf86DrvMsg(pScrn->scrnIndex, X_INFO, 775 "Chose %s method for unaligned data transfers %s video RAM\n", 776 MCFunctions[secondbest].mName, 777 from ? "from" : "to"); 778 *UMemCpy = MCFunctions[secondbest].mFunc; 779 } 780 return MCFunctions[best].mFunc; 781 } 782 783 return SiS_libc_memcpy; 784} 785#endif /* canBenchmark */ 786 787/**********************************************************************/ 788/* Generic routines if /proc filesystem is available (Linux) */ 789/**********************************************************************/ 790 791#ifdef SiS_haveProc 792/* Linux: Read file (/proc/cpuinfo) into buffer */ 793static int SiS_ReadProc(char *buf, char *filename) 794{ 795 FILE *cpuInfoFile; 796 int count; 797 798 if((cpuInfoFile = fopen(filename, "r")) == NULL) { 799 return 0; 800 } 801 802 count = fread(buf, 1, CPUBUFFERSIZE, cpuInfoFile); 803 if(ferror(cpuInfoFile)) { 804 fclose(cpuInfoFile); 805 return 0; 806 } 807 808 fclose(cpuInfoFile); 809 810 if(count >= CPUBUFFERSIZE - 2) { 811 return 0; 812 } 813 814 buf[count] = 0; 815 816 return count; 817} 818 819/* Linux: Extract CPU speed from /proc/cpuinfo */ 820static char *SiS_GetCPUFreq(ScrnInfoPtr pScrn, char *buf, double *cpuFreq) 821{ 822 char *frqBuf, *endBuf; 823 824 (*cpuFreq) = 0.0; 825 826 if((frqBuf = strstr(buf,"cpu MHz\t\t:"))) { 827 frqBuf += 11; 828 (*cpuFreq) = strtod(frqBuf, &endBuf); 829 if(endBuf == frqBuf) frqBuf = NULL; 830 if((*cpuFreq) < 10.0) frqBuf = NULL; /* sanity check */ 831 if(frqBuf) { 832 xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU frequency %.2fMhz\n", (*cpuFreq)); 833 } 834 } 835 836 return frqBuf; 837} 838#endif /* haveProc */ 839 840/**********************************************************************/ 841/* Arch-specific routines */ 842/**********************************************************************/ 843 844#ifdef SiS_checkosforsse /* Common i386, AMD64 */ 845 846#ifdef SISCHECKOSSSE 847 848#ifndef XFree86LOADER 849#include <setjmp.h> 850#endif 851 852static jmp_buf sigill_return; 853 854static void sigill_handler(void) 855{ 856 longjmp(sigill_return, 1); 857} 858#endif 859 860static Bool CheckOSforSSE(ScrnInfoPtr pScrn) 861{ 862#ifdef SISCHECKOSSSE /* Check OS for SSE possible: */ 863 int signo = -1; 864 865#ifdef SISDGBMC 866 xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Checking OS SSE support\n"); 867#endif 868 869 xf86InterceptSigIll(&sigill_handler); 870 871 if(setjmp(sigill_return)) { 872 signo = 4; 873 } else { 874 __asm__ __volatile__ (" xorps %xmm0, %xmm0\n"); 875 /* __asm__ __volatile__ (" .byte 0xff\n"); */ /* For test */ 876 } 877 878 xf86InterceptSigIll(NULL); 879 880#ifdef SISDGBMC 881 xf86DrvMsg(pScrn->scrnIndex, X_INFO, "OS SSE support signal %d\n", signo); 882#endif 883 884 if(signo != -1) { 885 xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 886 "OS does not support SSE instructions\n"); 887 } 888 889 return (signo >= 0) ? FALSE : TRUE; 890 891#else /* no check for SSE possible: */ 892 893 SISPtr pSiS = SISPTR(pScrn); 894 895 xf86DrvMsg(pScrn->scrnIndex, pSiS->XvSSEMemcpy ? X_WARNING : X_INFO, 896 "Checking OS for SSE support is not supported in this version of " SISMYSERVERNAME "\n"); 897 898 if(pSiS->XvSSEMemcpy) { 899 xf86DrvMsg(pScrn->scrnIndex, X_WARNING, 900 "If you get a signal 4 here, set the option \"UseSSE\" to \"off\".\n"); 901 return TRUE; 902 } else { 903 xf86DrvMsg(pScrn->scrnIndex, X_INFO, 904 "If your OS supports SSE, set the option \"UseSSE\" to \"on\".\n"); 905 return FALSE; 906 } 907#endif 908} 909 910#endif /* SiS_checkosforsse */ 911 912#ifdef __i386__ /* i386 specific *************************************/ 913 914PREFETCH_FUNC(SiS_sse,SSE,SSE,,FENCE,small_memcpy_i386) 915PREFETCH_FUNC(SiS_mmxext,MMXEXT,SSE,EMMS,FENCEMMS,small_memcpy_i386) 916PREFETCH_FUNC(SiS_now,MMX,NOW,FEMMS,FEMMS,small_memcpy_i386) 917NOPREFETCH_FUNC(SiS_mmx,MMX,EMMS,EMMS,small_memcpy_i386) 918 919static SISMCFuncData MCFunctions_i386[] = { 920 {SiS_libc_memcpy, "libc", SIS_CPUFL_LIBC, 4, 4, FALSE}, 921 {SiS_builtin_memcpy,"built-in-1",SIS_CPUFL_BI, 5, 5, FALSE}, 922 {SiS_builtin_memcp2,"built-in-2",SIS_CPUFL_BI2, 6, 6, FALSE}, 923 {SiS_mmx_memcpy, "MMX", SIS_CPUFL_MMX, 3, 3, FALSE}, 924 {SiS_sse_memcpy, "SSE", SIS_CPUFL_SSE, 1, 0, TRUE}, 925 {SiS_now_memcpy, "3DNow!", SIS_CPUFL_3DNOW, 2, 2, FALSE}, 926 {SiS_mmxext_memcpy, "MMX2", SIS_CPUFL_MMX2, 0, 1, FALSE}, 927 {NULL, "", 0, 10, 10, FALSE} 928}; 929 930#define Def_FL (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2) /* Default methods */ 931 932#define cpuid(op, eax, ebx, ecx, edx) \ 933 __asm__ __volatile__ ( \ 934 " pushl %%ebx\n" \ 935 " cpuid\n" \ 936 " movl %%ebx, %1\n" \ 937 " popl %%ebx\n" \ 938 : "=a" (eax), "=r" (ebx), \ 939 "=c" (ecx), "=d" (edx) \ 940 : "a" (op) \ 941 : "cc") 942 943static Bool cpuIDSupported(ScrnInfoPtr pScrn) 944{ 945 int eax, ebx, ecx, edx; 946 947 /* Check for cpuid instruction */ 948 __asm__ __volatile__ ( 949 " pushf\n" 950 " popl %0\n" 951 " movl %0, %1\n" 952 " xorl $0x200000, %0\n" 953 " push %0\n" 954 " popf\n" 955 " pushf\n" 956 " popl %0\n" 957 : "=a" (eax), "=c" (ecx) 958 : 959 : "cc"); 960 961 if(eax == ecx) { 962 xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support CPUID instruction\n"); 963 return FALSE; 964 } 965 966 /* Check for cpuid level */ 967 cpuid(0x00000000, eax, ebx, ecx, edx); 968 if(!eax) { 969 return FALSE; 970 } 971 972 /* Check for RDTSC */ 973 cpuid(0x00000001, eax, ebx, ecx, edx); 974 975 if(!(edx & 0x10)) { 976 xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support RDTSC instruction\n"); 977 return FALSE; 978 } 979 980 return TRUE; 981} 982 983static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn) 984{ 985 unsigned int flags = 0, eax, ebx, ecx, edx; 986 Bool IsAMD; 987 988 /* Check if cpuid and rdtsc instructions are supported */ 989 if(!cpuIDSupported(pScrn)) { 990 return 0; 991 } 992 993 cpuid(0x00000000, eax, ebx, ecx, edx); 994 995 IsAMD = (ebx == 0x68747541) && (edx == 0x69746e65) && (ecx == 0x444d4163); 996 997 cpuid(0x00000001, eax, ebx, ecx, edx); 998 /* MMX */ 999 if(edx & 0x00800000) flags |= SIS_CPUFL_MMX; 1000 /* SSE, MMXEXT */ 1001 if(edx & 0x02000000) flags |= (SIS_CPUFL_SSE | SIS_CPUFL_MMX2); 1002 /* SSE2 - don't need this one directly, set SSE instead */ 1003 if(edx & 0x04000000) flags |= (SIS_CPUFL_SSE | SIS_CPUFL_SSE2); 1004 1005 cpuid(0x80000000, eax, ebx, ecx, edx); 1006 if(eax >= 0x80000001) { 1007 cpuid(0x80000001, eax, ebx, ecx, edx); 1008 /* 3DNow! */ 1009 if(edx & 0x80000000) flags |= SIS_CPUFL_3DNOW; 1010 /* AMD MMXEXT */ 1011 if(IsAMD && (edx & 0x00400000)) flags |= SIS_CPUFL_MMX2; 1012 } 1013 1014 return flags; 1015} 1016 1017#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /* AMD64 specific ***** */ 1018 1019PREFETCH_FUNC(SiS_sse,SSE64,SSE,,FENCE,small_memcpy_amd64) 1020 1021static SISMCFuncData MCFunctions_AMD64[] = { 1022 {SiS_libc_memcpy, "libc", SIS_CPUFL_LIBC, 2, 2, FALSE}, 1023 {SiS_builtin_memcpy,"built-in-1",SIS_CPUFL_BI, 1, 1, FALSE}, 1024 {SiS_builtin_memcp2,"built-in-2",SIS_CPUFL_BI2, 3, 3, FALSE}, 1025 {SiS_sse_memcpy, "SSE", SIS_CPUFL_SSE, 0, 0, TRUE}, 1026 {NULL, "", 0, 10, 10, FALSE} 1027}; 1028 1029#define Def_FL (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2) 1030 1031static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn) 1032{ 1033 return((unsigned int)(SIS_CPUFL_SSE|SIS_CPUFL_SSE2)); 1034} 1035 1036#else /* Specific for other archs ******************************** */ 1037 1038/* Fill in here */ 1039 1040#define Def_FL (SIS_CPUFL_LIBC) 1041 1042static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn) 1043{ 1044 return((unsigned int)(0)); 1045} 1046 1047#endif 1048 1049/**********************************************************************/ 1050/* Benchmark the video copy routines and choose the fastest */ 1051/**********************************************************************/ 1052 1053#ifdef SiS_canBenchmark 1054static vidCopyFunc 1055SiSVidCopyInitGen(ScreenPtr pScreen, SISMCFuncData *MCFunctions, vidCopyFunc *UMemCpy, Bool from) 1056{ 1057 ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen); 1058 SISPtr pSiS = SISPTR(pScrn); 1059 void *fbhandle = NULL; 1060 char *frqBuf = NULL; 1061 UChar *buf1, *buf2, *buf3; 1062 double cpuFreq = 0.0; 1063 unsigned int myCPUflags = pSiS->CPUFlags | Def_FL; 1064 int best, secondbest; 1065#ifdef SiS_haveProc 1066 char buf[CPUBUFFERSIZE]; 1067#endif 1068 1069 *UMemCpy = SiS_libc_memcpy; 1070 1071 /* Bail out if user disabled benchmarking */ 1072 if(!pSiS->BenchMemCpy) { 1073 return SiS_libc_memcpy; 1074 } 1075 1076#ifdef SiS_haveProc 1077 /* Read /proc/cpuinfo into buf */ 1078 if(SiS_ReadProc(buf, "/proc/cpuinfo")) { 1079 1080 /* Extract CPU frequency */ 1081 frqBuf = SiS_GetCPUFreq(pScrn, buf, &cpuFreq); 1082 1083 } 1084#endif 1085 1086 /* Allocate buffers */ 1087 if(!(fbhandle = SiS_AllocBuffers(pScrn, &buf1, &buf2, &buf3))) { 1088 xf86DrvMsg(pScrn->scrnIndex, X_INFO, 1089 "Failed to allocate video RAM for video data transfer benchmark\n"); 1090 return SiS_GetBestByGrade(pScrn, MCFunctions, myCPUflags, UMemCpy, from); 1091 } 1092 1093 /* Perform Benchmark */ 1094 best = SiS_BenchmarkMemcpy(pScrn, MCFunctions, myCPUflags, buf1, 1095 (UChar *)(((unsigned long)buf2 + 15) & ~15), 1096 (UChar *)(((unsigned long)buf3 + 15) & ~15), 1097 frqBuf, cpuFreq, UMemCpy, &secondbest, from); 1098 1099 /* Free buffers */ 1100 SISFreeFBMemory(pScrn, &fbhandle); 1101 free(buf2); 1102 free(buf3); 1103 1104 xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 1105 "Using %s method for aligned data transfers %s video RAM\n", 1106 MCFunctions[best].mName, 1107 from ? "from" : "to"); 1108 1109 xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 1110 "Using %s method for unaligned data transfers %s video RAM\n", 1111 MCFunctions[secondbest].mName, 1112 from ? "from" : "to"); 1113 1114 return MCFunctions[best].mFunc; 1115} 1116#endif /* canBenchmark */ 1117 1118/**********************************************************************/ 1119/* main(): Get CPU capabilities */ 1120/* (called externally) */ 1121/**********************************************************************/ 1122 1123unsigned int 1124SiSGetCPUFlags(ScrnInfoPtr pScrn) 1125{ 1126 unsigned int myCPUflags = SiS_GetCpuFeatures(pScrn); 1127 1128#ifdef SiS_checkosforsse 1129 if(myCPUflags & (SIS_CPUFL_SSE | SIS_CPUFL_SSE2)) { 1130 1131 /* Check if OS supports usage of SSE instructions */ 1132 if(!(CheckOSforSSE(pScrn))) { 1133 myCPUflags &= ~(SIS_CPUFL_SSE | SIS_CPUFL_SSE2); 1134 } 1135 1136 } 1137#endif 1138 1139 return myCPUflags; 1140} 1141 1142/**********************************************************************/ 1143/* main(): SiSVidCopyInit() */ 1144/* (called externally) */ 1145/* (SiSGetCPUFlags must be called before this one) */ 1146/**********************************************************************/ 1147 1148vidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from) 1149{ 1150#if defined(__i386__) && defined(SiS_canBenchmark) 1151 return(SiSVidCopyInitGen(pScreen, MCFunctions_i386, UMemCpy, from)); 1152#elif (defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__)) && defined(SiS_canBenchmark) 1153 return(SiSVidCopyInitGen(pScreen, MCFunctions_AMD64, UMemCpy, from)); 1154#else /* Other cases: Use libc memcpy() */ 1155 *UMemCpy = SiS_libc_memcpy; 1156 return SiS_libc_memcpy; 1157#endif 1158} 1159 1160vidCopyFunc SiSVidCopyGetDefault(void) 1161{ 1162 return SiS_libc_memcpy; 1163} 1164 1165#endif /* GNU C */ 1166 1167 1168