/*
 * Copyright (C) 2004 Thomas Hellström, All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "via_driver.h"
#include "compiler.h"


#define BSIZ 2048                  /* size of /proc/cpuinfo buffer */
#define BSIZW 720                  /* typical copy width (YUV420) */
#define BSIZA 736                  /* multiple of 32 bytes */
#define BSIZH 576                  /* typical copy height */

#define SSE_PREFETCH " prefetchnta "
#define FENCE __asm__ __volatile__ ("sfence":::"memory");
#define FENCEMMS __asm__ __volatile__ ("\t" \
        "sfence\n\t" \
        "emms\n\t" \
        :::"memory");
#define FEMMS __asm__ __volatile__("femms":::"memory");
#define EMMS __asm__ __volatile__("emms":::"memory");

#define NOW_PREFETCH " prefetch "


/* Prime the prefetch queue with the first ~320 bytes of the source. */
#define PREFETCH1(arch_prefetch, from) \
    __asm__ __volatile__ ( \
        "1: " arch_prefetch "(%0)\n" \
        arch_prefetch "32(%0)\n" \
        arch_prefetch "64(%0)\n" \
        arch_prefetch "96(%0)\n" \
        arch_prefetch "128(%0)\n" \
        arch_prefetch "160(%0)\n" \
        arch_prefetch "192(%0)\n" \
        arch_prefetch "224(%0)\n" \
        arch_prefetch "256(%0)\n" \
        arch_prefetch "288(%0)\n" \
        "2:\n" \
        : : "r" (from) );

#define PREFETCH2(arch_prefetch, from) \
    __asm__ __volatile__ ( \
        arch_prefetch "320(%0)\n" \
        : : "r" (from) );
#define PREFETCH3(arch_prefetch, from) \
    __asm__ __volatile__ ( \
        arch_prefetch "288(%0)\n" \
        : : "r" (from) );


/* Copy the last (n < 64) bytes of a row: dwords, then a word, then a byte. */
#define small_memcpy(to, from, n) \
    { \
        __asm__ __volatile__( \
            "movl %2,%%ecx\n\t" \
            "sarl $2,%%ecx\n\t" \
            "rep ; movsl\n\t" \
            "testb $2,%b2\n\t" \
            "je 1f\n\t" \
            "movsw\n" \
            "1:\ttestb $1,%b2\n\t" \
            "je 2f\n\t" \
            "movsb\n" \
            "2:" \
            :"=&D" (to), "=&S" (from) \
            :"q" (n), "0" ((long) to), "1" ((long) from) \
            : "%ecx", "memory"); \
    }
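/*
 * Inner-loop copy macros.  Each loop iteration moves 64 bytes.  SSE_CPY
 * and MMXEXT_CPY use non-temporal stores (movntps/movntq) so the data
 * bypasses the cache on its way to the frame buffer.  The "prefetch"
 * argument is an instruction name (prefetchnta or prefetch) or "#" to
 * disable prefetching; when enabled, reads stay roughly 320 bytes ahead
 * of the copy.
 */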
#define SSE_CPY(prefetch, from, to, dummy, lcnt) \
    if ((unsigned long) from & 15) { \
        __asm__ __volatile__ ( \
            "1:\n" \
            prefetch "320(%1)\n" \
            " movups (%1), %%xmm0\n" \
            " movups 16(%1), %%xmm1\n" \
            " movntps %%xmm0, (%0)\n" \
            " movntps %%xmm1, 16(%0)\n" \
            prefetch "352(%1)\n" \
            " movups 32(%1), %%xmm2\n" \
            " movups 48(%1), %%xmm3\n" \
            " movntps %%xmm2, 32(%0)\n" \
            " movntps %%xmm3, 48(%0)\n" \
            " addl $64,%0\n" \
            " addl $64,%1\n" \
            " decl %2\n" \
            " jne 1b\n" \
            :"=&D"(to), "=&S"(from), "=&r"(dummy) \
            :"0" (to), "1" (from), "2" (lcnt): "memory"); \
    } else { \
        __asm__ __volatile__ ( \
            "2:\n" \
            prefetch "320(%1)\n" \
            " movaps (%1), %%xmm0\n" \
            " movaps 16(%1), %%xmm1\n" \
            " movntps %%xmm0, (%0)\n" \
            " movntps %%xmm1, 16(%0)\n" \
            prefetch "352(%1)\n" \
            " movaps 32(%1), %%xmm2\n" \
            " movaps 48(%1), %%xmm3\n" \
            " movntps %%xmm2, 32(%0)\n" \
            " movntps %%xmm3, 48(%0)\n" \
            " addl $64,%0\n" \
            " addl $64,%1\n" \
            " decl %2\n" \
            " jne 2b\n" \
            :"=&D"(to), "=&S"(from), "=&r"(dummy) \
            :"0" (to), "1" (from), "2" (lcnt): "memory"); \
    }

#define MMX_CPY(prefetch, from, to, dummy, lcnt) \
    __asm__ __volatile__ ( \
        "1:\n" \
        prefetch "320(%1)\n" \
        "2: movq (%1), %%mm0\n" \
        " movq 8(%1), %%mm1\n" \
        " movq 16(%1), %%mm2\n" \
        " movq 24(%1), %%mm3\n" \
        " movq %%mm0, (%0)\n" \
        " movq %%mm1, 8(%0)\n" \
        " movq %%mm2, 16(%0)\n" \
        " movq %%mm3, 24(%0)\n" \
        prefetch "352(%1)\n" \
        " movq 32(%1), %%mm0\n" \
        " movq 40(%1), %%mm1\n" \
        " movq 48(%1), %%mm2\n" \
        " movq 56(%1), %%mm3\n" \
        " movq %%mm0, 32(%0)\n" \
        " movq %%mm1, 40(%0)\n" \
        " movq %%mm2, 48(%0)\n" \
        " movq %%mm3, 56(%0)\n" \
        " addl $64,%0\n" \
        " addl $64,%1\n" \
        " decl %2\n" \
        " jne 1b\n" \
        :"=&D"(to), "=&S"(from), "=&r"(dummy) \
        :"0" (to), "1" (from), "2" (lcnt) : "memory");

#define MMXEXT_CPY(prefetch, from, to, dummy, lcnt) \
    __asm__ __volatile__ ( \
        ".p2align 4,,7\n" \
        "1:\n" \
        prefetch "320(%1)\n" \
        " movq (%1), %%mm0\n" \
        " movq 8(%1), %%mm1\n" \
        " movq 16(%1), %%mm2\n" \
        " movq 24(%1), %%mm3\n" \
        " movntq %%mm0, (%0)\n" \
        " movntq %%mm1, 8(%0)\n" \
        " movntq %%mm2, 16(%0)\n" \
        " movntq %%mm3, 24(%0)\n" \
        prefetch "352(%1)\n" \
        " movq 32(%1), %%mm0\n" \
        " movq 40(%1), %%mm1\n" \
        " movq 48(%1), %%mm2\n" \
        " movq 56(%1), %%mm3\n" \
        " movntq %%mm0, 32(%0)\n" \
        " movntq %%mm1, 40(%0)\n" \
        " movntq %%mm2, 48(%0)\n" \
        " movntq %%mm3, 56(%0)\n" \
        " addl $64,%0\n" \
        " addl $64,%1\n" \
        " decl %2\n" \
        " jne 1b\n" \
        :"=&D"(to), "=&S"(from), "=&r"(dummy) \
        :"0" (to), "1" (from), "2" (lcnt) : "memory");
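/*
 * PREFETCH_FUNC and NOPREFETCH_FUNC expand to complete YUV42X blit
 * routines built from the macros above.  When the destination pitch
 * equals the width the whole buffer is copied as one long row;
 * otherwise the Y plane is copied row by row and, for planar YUV420,
 * the half-width chroma rows follow.  In the prefetching variants the
 * tail of the final row is copied without prefetching so that reads
 * never run past the end of the source buffer.
 */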
#define PREFETCH_FUNC(prefix, itype, ptype, begin, fence) \
\
    static void prefix##_YUV42X(unsigned char *to, \
                                const unsigned char *from, \
                                int dstPitch, \
                                int w, \
                                int h, \
                                int yuv422) \
    { \
        int dadd, rest, count, hc, lcnt; \
        register int dummy; \
        PREFETCH1(ptype##_PREFETCH, from); \
        begin; \
        count = 2; \
\
        /* If destination pitch equals width, do it all in one go. */ \
\
        if (yuv422) { \
            w <<= 1; \
            if (w == dstPitch) { \
                w *= h; \
                h = 1; \
                dstPitch = w; \
                count = 0; \
            } else { \
                h -= 1; \
                count = 1; \
            } \
        } else if (w == dstPitch) { \
            w = h * (w + (w >> 1)); \
            count = 0; \
            h = 1; \
            dstPitch = w; \
        } \
\
        lcnt = w >> 6; \
        rest = w & 63; \
        while (count--) { \
            hc = h; \
            lcnt = w >> 6; \
            rest = w & 63; \
            dadd = dstPitch - w; \
            while (hc--) { \
                if (lcnt) { \
                    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt); \
                } \
                if (rest) { \
                    PREFETCH2(ptype##_PREFETCH, from); \
                    small_memcpy(to, from, rest); \
                    PREFETCH3(ptype##_PREFETCH, from); \
                } \
                to += dadd; \
            } \
            w >>= 1; \
            dstPitch >>= 1; \
            h -= 1; \
        } \
        if (lcnt > 5) { \
            lcnt -= 5; \
            itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt); \
            lcnt = 5; \
        } \
        if (lcnt) { \
            itype##_CPY("#", from, to, dummy, lcnt); \
        } \
        if (rest) small_memcpy(to, from, rest); \
        fence; \
    }

#define NOPREFETCH_FUNC(prefix, itype, begin, fence) \
    static void prefix##_YUV42X(unsigned char *to, \
                                const unsigned char *from, \
                                int dstPitch, \
                                int w, \
                                int h, \
                                int yuv422) \
\
    { \
        int dadd, rest, count, hc, lcnt; \
        register int dummy; \
        begin; \
        count = 2; \
\
        /* If destination pitch equals width, do it all in one go. */ \
\
        if (yuv422) { \
            w <<= 1; \
            count = 1; \
            if (w == dstPitch) { \
                w *= h; \
                h = 1; \
                dstPitch = w; \
            } \
        } else if (w == dstPitch) { \
            w = h * (w + (w >> 1)); \
            count = 1; \
            h = 1; \
            dstPitch = w; \
        } \
\
        lcnt = w >> 6; \
        rest = w & 63; \
        while (count--) { \
            hc = h; \
            dadd = dstPitch - w; \
            lcnt = w >> 6; \
            rest = w & 63; \
            while (hc--) { \
                if (lcnt) { \
                    itype##_CPY("#", from, to, dummy, lcnt); \
                } \
                if (rest) small_memcpy(to, from, rest); \
                to += dadd; \
            } \
            w >>= 1; \
            dstPitch >>= 1; \
        } \
        fence; \
    }


static void
libc_YUV42X(unsigned char *dst, const unsigned char *src,
            int dstPitch, int w, int h, int yuv422)
{
    if (yuv422)
        w <<= 1;
    if (dstPitch == w) {
        int size = h * ((yuv422) ? w : (w + (w >> 1)));

        memcpy(dst, src, size);
        return;
    } else {
        int count;

        /* Copy Y component to video memory. */
        count = h;
        while (count--) {
            memcpy(dst, src, w);
            src += w;
            dst += dstPitch;
        }

        /* UV component is 1/2 of Y. */
        if (!yuv422) {
            w >>= 1;
            dstPitch >>= 1;

            /* Copy V(Cr),U(Cb) components to video memory. */
            count = h;
            while (count--) {
                memcpy(dst, src, w);
                src += w;
                dst += dstPitch;
            }
        }
    }
}
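/*
 * Everything below relies on i386 inline assembly; other architectures
 * fall back to the plain libc routine above (see the #else branch at the
 * end of this file).
 */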
#ifdef __i386__

/* Linux kernel __memcpy. */
static __inline void *
__memcpy(void *to, const void *from, size_t n)
{
    int d1, d2, d3;

    __asm__ __volatile__(
        "rep ; movsl\n\t"
        "testb $2,%b4\n\t"
        "je 1f\n\t"
        "movsw\n"
        "1:\ttestb $1,%b4\n\t"
        "je 2f\n\t"
        "movsb\n"
        "2:"
        :"=&c"(d1), "=&D"(d2), "=&S"(d3)
        :"0"(n >> 2), "q"(n), "1"((long)to), "2"((long)from)
        :"memory");

    return (to);
}


static void
kernel_YUV42X(unsigned char *dst, const unsigned char *src,
              int dstPitch, int w, int h, int yuv422)
{
    if (yuv422)
        w <<= 1;
    if (dstPitch == w) {
        int size = h * ((yuv422) ? w : (w + (w >> 1)));

        __memcpy(dst, src, size);
        return;
    } else {
        int count;

        /* Copy Y component to video memory. */
        count = h;
        while (count--) {
            __memcpy(dst, src, w);
            src += w;
            dst += dstPitch;
        }

        /* UV component is 1/2 of Y. */
        if (!yuv422) {
            w >>= 1;
            dstPitch >>= 1;

            /* Copy V(Cr),U(Cb) components to video memory. */
            count = h;
            while (count--) {
                __memcpy(dst, src, w);
                src += w;
                dst += dstPitch;
            }
        }
    }
}

PREFETCH_FUNC(sse, SSE, SSE,, FENCE)
PREFETCH_FUNC(mmxext, MMXEXT, SSE, EMMS, FENCEMMS)
PREFETCH_FUNC(now, MMX, NOW, FEMMS, FEMMS)
NOPREFETCH_FUNC(mmx, MMX, EMMS, EMMS)

static void *
kernel_memcpy(void *to, const void *from, size_t len)
{
    return __memcpy(to, from, len);
}

/* Serializing read of the time-stamp counter (0x0f 0x31 is rdtsc). */
static unsigned
fastrdtsc(void)
{
    unsigned eax;

    __asm__ volatile ("\t"
                      "pushl %%ebx\n\t"
                      "cpuid\n\t"
                      ".byte 0x0f, 0x31\n\t"
                      "popl %%ebx\n"
                      :"=a" (eax)
                      :"0"(0)
                      :"ecx", "edx", "cc");

    return eax;
}


/* Time one YUV420 copy of a BSIZW x BSIZH frame, in CPU cycles. */
static unsigned
time_function(vidCopyFunc mf, unsigned char *buf1, unsigned char *buf2)
{
    unsigned t, t2;

    t = fastrdtsc();

    (*mf) (buf1, buf2, BSIZA, BSIZW, BSIZH, 0);

    t2 = fastrdtsc();
    return ((t < t2) ? t2 - t : 0xFFFFFFFFU - (t - t2 - 1));
}
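/*
 * Benchmark candidates.  Each entry pairs a copy routine with the
 * /proc/cpuinfo flags that enable it; a routine is eligible when any
 * one of its listed flags is present for every CPU in the system.
 */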
enum
{ libc = 0, kernel, sse, mmx, now, mmxext, totNum };

typedef struct
{
    vidCopyFunc mFunc;
    char *mName, **cpuFlag;
} McFuncData;

static char *libc_cpuflags[] = { " ", 0 };
static char *kernel_cpuflags[] = { " ", 0 };
static char *sse_cpuflags[] = { " sse ", 0 };
static char *mmx_cpuflags[] = { " mmx ", 0 };
static char *now_cpuflags[] = { " 3dnow ", 0 };
static char *mmx2_cpuflags[] = { " mmxext ", " sse ", 0 };

static McFuncData mcFunctions[totNum] = {
    {libc_YUV42X, "libc", libc_cpuflags},
    {kernel_YUV42X, "kernel", kernel_cpuflags},
    {sse_YUV42X, "SSE", sse_cpuflags},
    {mmx_YUV42X, "MMX", mmx_cpuflags},
    {now_YUV42X, "3DNow!", now_cpuflags},
    {mmxext_YUV42X, "MMX2", mmx2_cpuflags}
};


/* Check that "flag" is present for every processor listed in cpuinfo. */
static int
flagValid(const char *cpuinfo, char *flag)
{
    const char *flagLoc, *nextProc;
    int located = 0;

    while ((cpuinfo = strstr(cpuinfo, "processor\t:"))) {
        located = 1;
        cpuinfo += 11;
        if ((flagLoc = strstr(cpuinfo, flag))) {
            if ((nextProc = strstr(cpuinfo, "processor\t:"))) {
                if (nextProc < flagLoc)
                    return 0;
            }
        } else {
            return 0;
        }
    }
    return located;
}


static int
cpuValid(const char *cpuinfo, char **flags)
{
    for (; *flags != 0; flags++) {
        if (flagValid(cpuinfo, *flags))
            return 1;
    }
    return 0;
}

/*
 * Benchmark the video copy routines and choose the fastest.
 */
vidCopyFunc
viaVidCopyInit(char *copyType, ScreenPtr pScreen)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);

    char buf[BSIZ];
    unsigned char *buf1, *buf2, *buf3;
    char *tmpBuf, *endBuf;
    int count, j, bestSoFar;
    unsigned best, tmp, testSize, alignSize, tmp2;
    struct buffer_object *tmpFbBuffer;
    McFuncData *curData;
    FILE *cpuInfoFile;
    double cpuFreq;

    if (NULL == (cpuInfoFile = fopen("/proc/cpuinfo", "r"))) {
        return libc_YUV42X;
    }
    count = fread(buf, 1, BSIZ, cpuInfoFile);
    if (ferror(cpuInfoFile)) {
        fclose(cpuInfoFile);
        return libc_YUV42X;
    }
    fclose(cpuInfoFile);
    if (BSIZ == count) {
        xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
                   "\"/proc/cpuinfo\" file too long. "
                   "Using default libc video copy.\n");
        return libc_YUV42X;
    }
    buf[count] = 0;

    while (count--)
        if ('\n' == buf[count])
            buf[count] = ' ';

    /* Extract the CPU frequency. */
    cpuFreq = 0.;
    if (NULL != (tmpBuf = strstr(buf, "cpu MHz"))) {
        if (NULL != (tmpBuf = strstr(tmpBuf, ":"))) {
            tmpBuf++;
            cpuFreq = strtod(tmpBuf, &endBuf);
            if (endBuf == tmpBuf)
                tmpBuf = NULL;
        }
    }

    alignSize = BSIZH * (BSIZA + (BSIZA >> 1));
    testSize = BSIZH * (BSIZW + (BSIZW >> 1));
    /*
     * Allocate an area of offscreen FB memory (buf1), a simulated video
     * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
     */
    tmpFbBuffer = drm_bo_alloc(pScrn, alignSize, 32, TTM_PL_FLAG_VRAM);
    if (!tmpFbBuffer)
        return libc_YUV42X;
    if (NULL == (buf2 = (unsigned char *)malloc(testSize))) {
        drm_bo_free(pScrn, tmpFbBuffer);
        return libc_YUV42X;
    }
    if (NULL == (buf3 = (unsigned char *)malloc(testSize))) {
        free(buf2);
        drm_bo_free(pScrn, tmpFbBuffer);
        return libc_YUV42X;
    }
    buf1 = drm_bo_map(pScrn, tmpFbBuffer);
    bestSoFar = 0;
    best = 0xFFFFFFFFU;

    /* Make it probable that buf1 and buf2 are in memory by referencing them. */
    libc_YUV42X(buf1, buf2, BSIZA, BSIZW, BSIZH, 0);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
               "Benchmarking %s copy. Less time is better.\n", copyType);
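    /*
     * Each candidate is timed twice (the second run reduces the impact of
     * context switches) and the faster run, in CPU cycles, is kept.  The
     * throughput printed below is testSize bytes divided by
     * tmp / (cpuFreq * 1e6) seconds, scaled to MiB/s.
     */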
    for (j = 0; j < totNum; ++j) {
        curData = mcFunctions + j;

        if (cpuValid(buf, curData->cpuFlag)) {

            /* Simulate setup of the video buffer. */
            kernel_memcpy(buf2, buf3, testSize);

            /* Copy the video buffer to frame-buffer memory. */
            tmp = time_function(curData->mFunc, buf1, buf2);

            /* Do it again to avoid context-switch effects. */
            kernel_memcpy(buf2, buf3, testSize);
            tmp2 = time_function(curData->mFunc, buf1, buf2);
            tmp = (tmp2 < tmp) ? tmp2 : tmp;

            if (NULL == tmpBuf) {
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                           "Timed %6s YUV420 copy... %u.\n",
                           curData->mName, tmp);
            } else {
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                           "Timed %6s YUV420 copy... %u. "
                           "Throughput: %.1f MiB/s.\n",
                           curData->mName, tmp,
                           cpuFreq * 1.e6 * (double)testSize /
                           ((double)(tmp) * (double)(0x100000)));
            }
            if (tmp < best) {
                best = tmp;
                bestSoFar = j;
            }
        } else {
            xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                       "Ditching %6s YUV420 copy. Not supported by CPU.\n",
                       curData->mName);
        }
    }
    free(buf3);
    free(buf2);
    drm_bo_unmap(pScrn, tmpFbBuffer);
    drm_bo_free(pScrn, tmpFbBuffer);
    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
               "Using %s YUV42X copy for %s.\n",
               mcFunctions[bestSoFar].mName, copyType);
    return mcFunctions[bestSoFar].mFunc;
}

#else

vidCopyFunc
viaVidCopyInit(char *copyType, ScreenPtr pScreen)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
               "Using default xfree86 memcpy for video.\n");
    return libc_YUV42X;
}

#endif /* __i386__ */