17e6fb56fSmrg/* 27e6fb56fSmrg * Copyright (C) 2004 Thomas Hellström, All Rights Reserved. 37e6fb56fSmrg * 47e6fb56fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 57e6fb56fSmrg * copy of this software and associated documentation files (the "Software"), 67e6fb56fSmrg * to deal in the Software without restriction, including without limitation 77e6fb56fSmrg * the rights to use, copy, modify, merge, publish, distribute, sub license, 87e6fb56fSmrg * and/or sell copies of the Software, and to permit persons to whom the 97e6fb56fSmrg * Software is furnished to do so, subject to the following conditions: 107e6fb56fSmrg * 117e6fb56fSmrg * The above copyright notice and this permission notice (including the 127e6fb56fSmrg * next paragraph) shall be included in all copies or substantial portions 137e6fb56fSmrg * of the Software. 147e6fb56fSmrg * 157e6fb56fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167e6fb56fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177e6fb56fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 187e6fb56fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197e6fb56fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 207e6fb56fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 217e6fb56fSmrg * DEALINGS IN THE SOFTWARE. 227e6fb56fSmrg */ 237e6fb56fSmrg 247e6fb56fSmrg#ifdef HAVE_CONFIG_H 257e6fb56fSmrg#include "config.h" 267e6fb56fSmrg#endif 277e6fb56fSmrg 287e6fb56fSmrg#include "via_driver.h" 297e6fb56fSmrg#include "compiler.h" 307e6fb56fSmrg 317e6fb56fSmrg 327e6fb56fSmrg#define BSIZ 2048 /* size of /proc/cpuinfo buffer */ 337e6fb56fSmrg#define BSIZW 720 /* typical copy width (YUV420) */ 347e6fb56fSmrg#define BSIZA 736 /* multiple of 32 bytes */ 357e6fb56fSmrg#define BSIZH 576 /* typical copy height */ 367e6fb56fSmrg 377e6fb56fSmrg#define SSE_PREFETCH " prefetchnta " 387e6fb56fSmrg#define FENCE __asm__ __volatile__ ("sfence":::"memory"); 397e6fb56fSmrg#define FENCEMMS __asm__ __volatile__ ("\t" \ 407e6fb56fSmrg "sfence\n\t" \ 417e6fb56fSmrg "emms\n\t" \ 427e6fb56fSmrg :::"memory"); 437e6fb56fSmrg#define FEMMS __asm__ __volatile__("femms":::"memory"); 447e6fb56fSmrg#define EMMS __asm__ __volatile__("emms":::"memory"); 457e6fb56fSmrg 467e6fb56fSmrg#define NOW_PREFETCH " prefetch " 477e6fb56fSmrg 487e6fb56fSmrg 497e6fb56fSmrg#define PREFETCH1(arch_prefetch,from) \ 507e6fb56fSmrg __asm__ __volatile__ ( \ 517e6fb56fSmrg "1: " arch_prefetch "(%0)\n" \ 527e6fb56fSmrg arch_prefetch "32(%0)\n" \ 537e6fb56fSmrg arch_prefetch "64(%0)\n" \ 547e6fb56fSmrg arch_prefetch "96(%0)\n" \ 557e6fb56fSmrg arch_prefetch "128(%0)\n" \ 567e6fb56fSmrg arch_prefetch "160(%0)\n" \ 577e6fb56fSmrg arch_prefetch "192(%0)\n" \ 587e6fb56fSmrg arch_prefetch "256(%0)\n" \ 597e6fb56fSmrg arch_prefetch "288(%0)\n" \ 607e6fb56fSmrg "2:\n" \ 617e6fb56fSmrg : : "r" (from) ); 627e6fb56fSmrg 637e6fb56fSmrg#define PREFETCH2(arch_prefetch,from) \ 647e6fb56fSmrg __asm__ __volatile__ ( \ 657e6fb56fSmrg arch_prefetch "320(%0)\n" \ 667e6fb56fSmrg : : "r" (from) ); 677e6fb56fSmrg#define PREFETCH3(arch_prefetch,from) \ 687e6fb56fSmrg __asm__ __volatile__ ( \ 697e6fb56fSmrg arch_prefetch "288(%0)\n" \ 707e6fb56fSmrg : : "r" (from) ); 717e6fb56fSmrg 727e6fb56fSmrg 737e6fb56fSmrg#define small_memcpy(to, from, n) \ 747e6fb56fSmrg { \ 757e6fb56fSmrg __asm__ __volatile__( \ 767e6fb56fSmrg "movl %2,%%ecx\n\t" \ 777e6fb56fSmrg "sarl $2,%%ecx\n\t" \ 787e6fb56fSmrg "rep ; movsl\n\t" \ 797e6fb56fSmrg "testb $2,%b2\n\t" \ 807e6fb56fSmrg "je 1f\n\t" \ 817e6fb56fSmrg "movsw\n" \ 827e6fb56fSmrg "1:\ttestb $1,%b2\n\t" \ 837e6fb56fSmrg "je 2f\n\t" \ 847e6fb56fSmrg "movsb\n" \ 857e6fb56fSmrg "2:" \ 867e6fb56fSmrg :"=&D" (to), "=&S" (from) \ 877e6fb56fSmrg :"q" (n),"0" ((long) to),"1" ((long) from) \ 887e6fb56fSmrg : "%ecx","memory"); \ 897e6fb56fSmrg } 907e6fb56fSmrg 917e6fb56fSmrg 927e6fb56fSmrg#define SSE_CPY(prefetch, from, to, dummy, lcnt) \ 937e6fb56fSmrg if ((unsigned long) from & 15) { \ 947e6fb56fSmrg __asm__ __volatile__ ( \ 957e6fb56fSmrg "1:\n" \ 967e6fb56fSmrg prefetch "320(%1)\n" \ 977e6fb56fSmrg " movups (%1), %%xmm0\n" \ 987e6fb56fSmrg " movups 16(%1), %%xmm1\n" \ 997e6fb56fSmrg " movntps %%xmm0, (%0)\n" \ 1007e6fb56fSmrg " movntps %%xmm1, 16(%0)\n" \ 1017e6fb56fSmrg prefetch "352(%1)\n" \ 1027e6fb56fSmrg " movups 32(%1), %%xmm2\n" \ 1037e6fb56fSmrg " movups 48(%1), %%xmm3\n" \ 1047e6fb56fSmrg " movntps %%xmm2, 32(%0)\n" \ 1057e6fb56fSmrg " movntps %%xmm3, 48(%0)\n" \ 1067e6fb56fSmrg " addl $64,%0\n" \ 1077e6fb56fSmrg " addl $64,%1\n" \ 1087e6fb56fSmrg " decl %2\n" \ 1097e6fb56fSmrg " jne 1b\n" \ 1107e6fb56fSmrg :"=&D"(to), "=&S"(from), "=&r"(dummy) \ 1117e6fb56fSmrg :"0" (to), "1" (from), "2" (lcnt): "memory"); \ 1127e6fb56fSmrg } else { \ 1137e6fb56fSmrg __asm__ __volatile__ ( \ 1147e6fb56fSmrg "2:\n" \ 1157e6fb56fSmrg prefetch "320(%1)\n" \ 1167e6fb56fSmrg " movaps (%1), %%xmm0\n" \ 1177e6fb56fSmrg " movaps 16(%1), %%xmm1\n" \ 1187e6fb56fSmrg " movntps %%xmm0, (%0)\n" \ 1197e6fb56fSmrg " movntps %%xmm1, 16(%0)\n" \ 1207e6fb56fSmrg prefetch "352(%1)\n" \ 1217e6fb56fSmrg " movaps 32(%1), %%xmm2\n" \ 1227e6fb56fSmrg " movaps 48(%1), %%xmm3\n" \ 1237e6fb56fSmrg " movntps %%xmm2, 32(%0)\n" \ 1247e6fb56fSmrg " movntps %%xmm3, 48(%0)\n" \ 1257e6fb56fSmrg " addl $64,%0\n" \ 1267e6fb56fSmrg " addl $64,%1\n" \ 1277e6fb56fSmrg " decl %2\n" \ 1287e6fb56fSmrg " jne 2b\n" \ 1297e6fb56fSmrg :"=&D"(to), "=&S"(from), "=&r"(dummy) \ 1307e6fb56fSmrg :"0" (to), "1" (from), "2" (lcnt): "memory"); \ 1317e6fb56fSmrg } 1327e6fb56fSmrg 1337e6fb56fSmrg#define MMX_CPY(prefetch, from, to, dummy, lcnt) \ 1347e6fb56fSmrg __asm__ __volatile__ ( \ 1357e6fb56fSmrg "1:\n" \ 1367e6fb56fSmrg prefetch "320(%1)\n" \ 1377e6fb56fSmrg "2: movq (%1), %%mm0\n" \ 1387e6fb56fSmrg " movq 8(%1), %%mm1\n" \ 1397e6fb56fSmrg " movq 16(%1), %%mm2\n" \ 1407e6fb56fSmrg " movq 24(%1), %%mm3\n" \ 1417e6fb56fSmrg " movq %%mm0, (%0)\n" \ 1427e6fb56fSmrg " movq %%mm1, 8(%0)\n" \ 1437e6fb56fSmrg " movq %%mm2, 16(%0)\n" \ 1447e6fb56fSmrg " movq %%mm3, 24(%0)\n" \ 1457e6fb56fSmrg prefetch "352(%1)\n" \ 1467e6fb56fSmrg " movq 32(%1), %%mm0\n" \ 1477e6fb56fSmrg " movq 40(%1), %%mm1\n" \ 1487e6fb56fSmrg " movq 48(%1), %%mm2\n" \ 1497e6fb56fSmrg " movq 56(%1), %%mm3\n" \ 1507e6fb56fSmrg " movq %%mm0, 32(%0)\n" \ 1517e6fb56fSmrg " movq %%mm1, 40(%0)\n" \ 1527e6fb56fSmrg " movq %%mm2, 48(%0)\n" \ 1537e6fb56fSmrg " movq %%mm3, 56(%0)\n" \ 1547e6fb56fSmrg " addl $64,%0\n" \ 1557e6fb56fSmrg " addl $64,%1\n" \ 1567e6fb56fSmrg " decl %2\n" \ 1577e6fb56fSmrg " jne 1b\n" \ 1587e6fb56fSmrg :"=&D"(to), "=&S"(from), "=&r"(dummy) \ 1597e6fb56fSmrg :"0" (to), "1" (from), "2" (lcnt) : "memory"); 1607e6fb56fSmrg 1617e6fb56fSmrg#define MMXEXT_CPY(prefetch, from, to, dummy, lcnt) \ 1627e6fb56fSmrg __asm__ __volatile__ ( \ 1637e6fb56fSmrg ".p2align 4,,7\n" \ 1647e6fb56fSmrg "1:\n" \ 1657e6fb56fSmrg prefetch "320(%1)\n" \ 1667e6fb56fSmrg " movq (%1), %%mm0\n" \ 1677e6fb56fSmrg " movq 8(%1), %%mm1\n" \ 1687e6fb56fSmrg " movq 16(%1), %%mm2\n" \ 1697e6fb56fSmrg " movq 24(%1), %%mm3\n" \ 1707e6fb56fSmrg " movntq %%mm0, (%0)\n" \ 1717e6fb56fSmrg " movntq %%mm1, 8(%0)\n" \ 1727e6fb56fSmrg " movntq %%mm2, 16(%0)\n" \ 1737e6fb56fSmrg " movntq %%mm3, 24(%0)\n" \ 1747e6fb56fSmrg prefetch "352(%1)\n" \ 1757e6fb56fSmrg " movq 32(%1), %%mm0\n" \ 1767e6fb56fSmrg " movq 40(%1), %%mm1\n" \ 1777e6fb56fSmrg " movq 48(%1), %%mm2\n" \ 1787e6fb56fSmrg " movq 56(%1), %%mm3\n" \ 1797e6fb56fSmrg " movntq %%mm0, 32(%0)\n" \ 1807e6fb56fSmrg " movntq %%mm1, 40(%0)\n" \ 1817e6fb56fSmrg " movntq %%mm2, 48(%0)\n" \ 1827e6fb56fSmrg " movntq %%mm3, 56(%0)\n" \ 1837e6fb56fSmrg " addl $64,%0\n" \ 1847e6fb56fSmrg " addl $64,%1\n" \ 1857e6fb56fSmrg " decl %2\n" \ 1867e6fb56fSmrg " jne 1b\n" \ 1877e6fb56fSmrg :"=&D"(to), "=&S"(from), "=&r"(dummy) \ 1887e6fb56fSmrg :"0" (to), "1" (from), "2" (lcnt) : "memory"); 1897e6fb56fSmrg 1907e6fb56fSmrg 1917e6fb56fSmrg#define PREFETCH_FUNC(prefix, itype, ptype, begin, fence) \ 1927e6fb56fSmrg \ 1937e6fb56fSmrg static void prefix##_YUV42X(unsigned char *to, \ 1947e6fb56fSmrg const unsigned char *from, \ 1957e6fb56fSmrg int dstPitch, \ 1967e6fb56fSmrg int w, \ 1977e6fb56fSmrg int h, \ 1987e6fb56fSmrg int yuv422) \ 1997e6fb56fSmrg { \ 2007e6fb56fSmrg int dadd, rest, count, hc, lcnt; \ 2017e6fb56fSmrg register int dummy; \ 2027e6fb56fSmrg PREFETCH1(ptype##_PREFETCH, from); \ 2037e6fb56fSmrg begin; \ 2047e6fb56fSmrg count = 2; \ 2057e6fb56fSmrg \ 2067e6fb56fSmrg /* If destination pitch equals width, do it all in one go. */ \ 2077e6fb56fSmrg \ 2087e6fb56fSmrg if (yuv422) { \ 2097e6fb56fSmrg w <<= 1; \ 2107e6fb56fSmrg if (w == dstPitch) { \ 2117e6fb56fSmrg w *= h; \ 2127e6fb56fSmrg h = 1; \ 2137e6fb56fSmrg dstPitch = w; \ 2147e6fb56fSmrg count = 0; \ 2157e6fb56fSmrg } else { \ 2167e6fb56fSmrg h -= 1; \ 2177e6fb56fSmrg count = 1; \ 2187e6fb56fSmrg } \ 2197e6fb56fSmrg } else if (w == dstPitch) { \ 2207e6fb56fSmrg w = h*(w + (w >> 1)); \ 2217e6fb56fSmrg count = 0; \ 2227e6fb56fSmrg h = 1; \ 2237e6fb56fSmrg dstPitch = w; \ 2247e6fb56fSmrg } \ 2257e6fb56fSmrg \ 2267e6fb56fSmrg lcnt = w >> 6; \ 2277e6fb56fSmrg rest = w & 63; \ 2287e6fb56fSmrg while (count--) { \ 2297e6fb56fSmrg hc = h; \ 2307e6fb56fSmrg lcnt = w >> 6; \ 2317e6fb56fSmrg rest = w & 63; \ 2327e6fb56fSmrg dadd = dstPitch - w; \ 2337e6fb56fSmrg while (hc--) { \ 2347e6fb56fSmrg if (lcnt) { \ 2357e6fb56fSmrg itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt); \ 2367e6fb56fSmrg } \ 2377e6fb56fSmrg if (rest) { \ 2387e6fb56fSmrg PREFETCH2(ptype##_PREFETCH, from); \ 2397e6fb56fSmrg small_memcpy(to, from, rest); \ 2407e6fb56fSmrg PREFETCH3(ptype##_PREFETCH, from); \ 2417e6fb56fSmrg } \ 2427e6fb56fSmrg to += dadd; \ 2437e6fb56fSmrg } \ 2447e6fb56fSmrg w >>= 1; \ 2457e6fb56fSmrg dstPitch >>= 1; \ 2467e6fb56fSmrg h -= 1; \ 2477e6fb56fSmrg } \ 2487e6fb56fSmrg if (lcnt > 5) { \ 2497e6fb56fSmrg lcnt -= 5; \ 2507e6fb56fSmrg itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt); \ 2517e6fb56fSmrg lcnt = 5; \ 2527e6fb56fSmrg } \ 2537e6fb56fSmrg if (lcnt) { \ 2547e6fb56fSmrg itype##_CPY("#", from, to, dummy, lcnt); \ 2557e6fb56fSmrg } \ 2567e6fb56fSmrg if (rest) small_memcpy(to, from, rest); \ 2577e6fb56fSmrg fence; \ 2587e6fb56fSmrg } 2597e6fb56fSmrg 2607e6fb56fSmrg#define NOPREFETCH_FUNC(prefix, itype, begin, fence) \ 2617e6fb56fSmrg static void prefix##_YUV42X(unsigned char *to, \ 2627e6fb56fSmrg const unsigned char *from, \ 2637e6fb56fSmrg int dstPitch, \ 2647e6fb56fSmrg int w, \ 2657e6fb56fSmrg int h, \ 2667e6fb56fSmrg int yuv422) \ 2677e6fb56fSmrg \ 2687e6fb56fSmrg { \ 2697e6fb56fSmrg int dadd, rest, count, hc, lcnt; \ 2707e6fb56fSmrg register int dummy; \ 2717e6fb56fSmrg begin; \ 2727e6fb56fSmrg count = 2; \ 2737e6fb56fSmrg \ 2747e6fb56fSmrg /* If destination pitch equals width, do it all in one go. */ \ 2757e6fb56fSmrg \ 2767e6fb56fSmrg if (yuv422) { \ 2777e6fb56fSmrg w <<= 1; \ 2787e6fb56fSmrg count = 1; \ 2797e6fb56fSmrg if (w == dstPitch) { \ 2807e6fb56fSmrg w *= h; \ 2817e6fb56fSmrg h = 1; \ 2827e6fb56fSmrg dstPitch = w; \ 2837e6fb56fSmrg } \ 2847e6fb56fSmrg } else if (w == dstPitch) { \ 2857e6fb56fSmrg w = h*(w + (w >> 1)); \ 2867e6fb56fSmrg count = 1; \ 2877e6fb56fSmrg h = 1; \ 2887e6fb56fSmrg dstPitch = w; \ 2897e6fb56fSmrg } \ 2907e6fb56fSmrg \ 2917e6fb56fSmrg lcnt = w >> 6; \ 2927e6fb56fSmrg rest = w & 63; \ 2937e6fb56fSmrg while (count--) { \ 2947e6fb56fSmrg hc = h; \ 2957e6fb56fSmrg dadd = dstPitch - w; \ 2967e6fb56fSmrg lcnt = w >> 6; \ 2977e6fb56fSmrg rest = w & 63; \ 2987e6fb56fSmrg while (hc--) { \ 2997e6fb56fSmrg if (lcnt) { \ 3007e6fb56fSmrg itype##_CPY("#", from, to, dummy, lcnt); \ 3017e6fb56fSmrg } \ 3027e6fb56fSmrg if (rest) small_memcpy(to, from, rest); \ 3037e6fb56fSmrg to += dadd; \ 3047e6fb56fSmrg } \ 3057e6fb56fSmrg w >>= 1; \ 3067e6fb56fSmrg dstPitch >>= 1; \ 3077e6fb56fSmrg } \ 3087e6fb56fSmrg fence; \ 3097e6fb56fSmrg } 3107e6fb56fSmrg 3117e6fb56fSmrg 3127e6fb56fSmrgstatic void 3137e6fb56fSmrglibc_YUV42X(unsigned char *dst, const unsigned char *src, 3147e6fb56fSmrg int dstPitch, int w, int h, int yuv422) 3157e6fb56fSmrg{ 3167e6fb56fSmrg if (yuv422) 3177e6fb56fSmrg w <<= 1; 3187e6fb56fSmrg if (dstPitch == w) { 3197e6fb56fSmrg int size = h * ((yuv422) ? w : (w + (w >> 1))); 3207e6fb56fSmrg 3217e6fb56fSmrg memcpy(dst, src, size); 3227e6fb56fSmrg return; 3237e6fb56fSmrg } else { 3247e6fb56fSmrg int count; 3257e6fb56fSmrg 3267e6fb56fSmrg /* Copy Y component to video memory. */ 3277e6fb56fSmrg count = h; 3287e6fb56fSmrg while (count--) { 3297e6fb56fSmrg memcpy(dst, src, w); 3307e6fb56fSmrg src += w; 3317e6fb56fSmrg dst += dstPitch; 3327e6fb56fSmrg } 3337e6fb56fSmrg 3347e6fb56fSmrg /* UV component is 1/2 of Y. */ 3357e6fb56fSmrg if (!yuv422) { 3367e6fb56fSmrg w >>= 1; 3377e6fb56fSmrg dstPitch >>= 1; 3387e6fb56fSmrg 3397e6fb56fSmrg /* Copy V(Cr),U(Cb) components to video memory. */ 3407e6fb56fSmrg count = h; 3417e6fb56fSmrg while (count--) { 3427e6fb56fSmrg memcpy(dst, src, w); 3437e6fb56fSmrg src += w; 3447e6fb56fSmrg dst += dstPitch; 3457e6fb56fSmrg } 3467e6fb56fSmrg } 3477e6fb56fSmrg } 3487e6fb56fSmrg} 3497e6fb56fSmrg 3507e6fb56fSmrg#ifdef __i386__ 3517e6fb56fSmrg 3527e6fb56fSmrg/* Linux kernel __memcpy. */ 3537e6fb56fSmrgstatic __inline void * 3547e6fb56fSmrg__memcpy(void *to, const void *from, size_t n) 3557e6fb56fSmrg{ 3567e6fb56fSmrg int d1, d2, d3; 3577e6fb56fSmrg 3587e6fb56fSmrg __asm__ __volatile__( 3597e6fb56fSmrg "rep ; movsl\n\t" 3607e6fb56fSmrg "testb $2,%b4\n\t" 3617e6fb56fSmrg "je 1f\n\t" 3627e6fb56fSmrg "movsw\n" 3637e6fb56fSmrg "1:\ttestb $1,%b4\n\t" 3647e6fb56fSmrg "je 2f\n\t" 3657e6fb56fSmrg "movsb\n" 3667e6fb56fSmrg "2:" 3677e6fb56fSmrg :"=&c"(d1), "=&D"(d2), "=&S"(d3) 3687e6fb56fSmrg :"0"(n >> 2), "q"(n), "1"((long)to), "2"((long)from) 3697e6fb56fSmrg :"memory"); 3707e6fb56fSmrg 3717e6fb56fSmrg return (to); 3727e6fb56fSmrg} 3737e6fb56fSmrg 3747e6fb56fSmrg 3757e6fb56fSmrgstatic void 3767e6fb56fSmrgkernel_YUV42X(unsigned char *dst, const unsigned char *src, 3777e6fb56fSmrg int dstPitch, int w, int h, int yuv422) 3787e6fb56fSmrg{ 3797e6fb56fSmrg if (yuv422) 3807e6fb56fSmrg w <<= 1; 3817e6fb56fSmrg if (dstPitch == w) { 3827e6fb56fSmrg int size = h * ((yuv422) ? w : (w + (w >> 1))); 3837e6fb56fSmrg 3847e6fb56fSmrg __memcpy(dst, src, size); 3857e6fb56fSmrg return; 3867e6fb56fSmrg } else { 3877e6fb56fSmrg int count; 3887e6fb56fSmrg 3897e6fb56fSmrg /* Copy Y component to video memory. */ 3907e6fb56fSmrg count = h; 3917e6fb56fSmrg while (count--) { 3927e6fb56fSmrg __memcpy(dst, src, w); 3937e6fb56fSmrg src += w; 3947e6fb56fSmrg dst += dstPitch; 3957e6fb56fSmrg } 3967e6fb56fSmrg 3977e6fb56fSmrg /* UV component is 1/2 of Y. */ 3987e6fb56fSmrg if (!yuv422) { 3997e6fb56fSmrg 4007e6fb56fSmrg w >>= 1; 4017e6fb56fSmrg dstPitch >>= 1; 4027e6fb56fSmrg 4037e6fb56fSmrg /* Copy V(Cr),U(Cb) components to video memory. */ 4047e6fb56fSmrg count = h; 4057e6fb56fSmrg while (count--) { 4067e6fb56fSmrg __memcpy(dst, src, w); 4077e6fb56fSmrg src += w; 4087e6fb56fSmrg dst += dstPitch; 4097e6fb56fSmrg } 4107e6fb56fSmrg } 4117e6fb56fSmrg } 4127e6fb56fSmrg} 4137e6fb56fSmrg 4147e6fb56fSmrgPREFETCH_FUNC(sse, SSE, SSE,, FENCE) 4157e6fb56fSmrgPREFETCH_FUNC(mmxext, MMXEXT, SSE, EMMS, FENCEMMS) 4167e6fb56fSmrgPREFETCH_FUNC(now, MMX, NOW, FEMMS, FEMMS) 4177e6fb56fSmrgNOPREFETCH_FUNC(mmx, MMX, EMMS, EMMS) 4187e6fb56fSmrg 4197e6fb56fSmrgstatic void 4207e6fb56fSmrg*kernel_memcpy(void *to, const void *from, size_t len) 4217e6fb56fSmrg{ 4227e6fb56fSmrg return __memcpy(to, from, len); 4237e6fb56fSmrg} 4247e6fb56fSmrg 4257e6fb56fSmrgstatic unsigned 4267e6fb56fSmrgfastrdtsc(void) 4277e6fb56fSmrg{ 4287e6fb56fSmrg unsigned eax; 4297e6fb56fSmrg 4307e6fb56fSmrg __asm__ volatile ("\t" 4317e6fb56fSmrg "pushl %%ebx\n\t" 4327e6fb56fSmrg "cpuid\n\t" 4337e6fb56fSmrg ".byte 0x0f, 0x31\n\t" 4347e6fb56fSmrg "popl %%ebx\n" 4357e6fb56fSmrg :"=a" (eax) 4367e6fb56fSmrg :"0"(0) 4377e6fb56fSmrg :"ecx", "edx", "cc"); 4387e6fb56fSmrg 4397e6fb56fSmrg return eax; 4407e6fb56fSmrg} 4417e6fb56fSmrg 4427e6fb56fSmrg 4437e6fb56fSmrgstatic unsigned 4447e6fb56fSmrgtime_function(vidCopyFunc mf, unsigned char *buf1, unsigned char *buf2) 4457e6fb56fSmrg{ 4467e6fb56fSmrg unsigned t, t2; 4477e6fb56fSmrg 4487e6fb56fSmrg t = fastrdtsc(); 4497e6fb56fSmrg 4507e6fb56fSmrg (*mf) (buf1, buf2, BSIZA, BSIZW, BSIZH, 0); 4517e6fb56fSmrg 4527e6fb56fSmrg t2 = fastrdtsc(); 4537e6fb56fSmrg return ((t < t2) ? t2 - t : 0xFFFFFFFFU - (t - t2 - 1)); 4547e6fb56fSmrg} 4557e6fb56fSmrg 4567e6fb56fSmrgenum 4577e6fb56fSmrg{ libc = 0, kernel, sse, mmx, now, mmxext, totNum }; 4587e6fb56fSmrg 4597e6fb56fSmrgtypedef struct 4607e6fb56fSmrg{ 4617e6fb56fSmrg vidCopyFunc mFunc; 4627e6fb56fSmrg char *mName, **cpuFlag; 4637e6fb56fSmrg} McFuncData; 4647e6fb56fSmrg 4657e6fb56fSmrgstatic char *libc_cpuflags[] = { " ", 0 }; 4667e6fb56fSmrgstatic char *kernel_cpuflags[] = { " ", 0 }; 4677e6fb56fSmrgstatic char *sse_cpuflags[] = { " sse ", 0 }; 4687e6fb56fSmrgstatic char *mmx_cpuflags[] = { " mmx ", 0 }; 4697e6fb56fSmrgstatic char *now_cpuflags[] = { " 3dnow ", 0 }; 4707e6fb56fSmrgstatic char *mmx2_cpuflags[] = { " mmxext ", " sse ", 0 }; 4717e6fb56fSmrg 4727e6fb56fSmrgstatic McFuncData mcFunctions[totNum] = { 4737e6fb56fSmrg{libc_YUV42X, "libc", libc_cpuflags}, 4747e6fb56fSmrg{kernel_YUV42X, "kernel", kernel_cpuflags}, 4757e6fb56fSmrg{sse_YUV42X, "SSE", sse_cpuflags}, 4767e6fb56fSmrg{mmx_YUV42X, "MMX", mmx_cpuflags}, 4777e6fb56fSmrg{now_YUV42X, "3DNow!", now_cpuflags}, 4787e6fb56fSmrg{mmxext_YUV42X, "MMX2", mmx2_cpuflags} 4797e6fb56fSmrg}; 4807e6fb56fSmrg 4817e6fb56fSmrg 4827e6fb56fSmrgstatic int 4837e6fb56fSmrgflagValid(const char *cpuinfo, char *flag) 4847e6fb56fSmrg{ 4857e6fb56fSmrg const char *flagLoc, *nextProc; 4867e6fb56fSmrg int located = 0; 4877e6fb56fSmrg 4887e6fb56fSmrg while ((cpuinfo = strstr(cpuinfo, "processor\t:"))) { 4897e6fb56fSmrg located = 1; 4907e6fb56fSmrg cpuinfo += 11; 4917e6fb56fSmrg if ((flagLoc = strstr(cpuinfo, flag))) { 4927e6fb56fSmrg if ((nextProc = strstr(cpuinfo, "processor\t:"))) { 4937e6fb56fSmrg if (nextProc < flagLoc) 4947e6fb56fSmrg return 0; 4957e6fb56fSmrg } 4967e6fb56fSmrg } else { 4977e6fb56fSmrg return 0; 4987e6fb56fSmrg } 4997e6fb56fSmrg } 5007e6fb56fSmrg return located; 5017e6fb56fSmrg} 5027e6fb56fSmrg 5037e6fb56fSmrg 5047e6fb56fSmrgstatic int 5057e6fb56fSmrgcpuValid(const char *cpuinfo, char **flags) 5067e6fb56fSmrg{ 5077e6fb56fSmrg for (; *flags != 0; flags++) { 5087e6fb56fSmrg if (flagValid(cpuinfo, *flags)) 5097e6fb56fSmrg return 1; 5107e6fb56fSmrg } 5117e6fb56fSmrg return 0; 5127e6fb56fSmrg} 5137e6fb56fSmrg 5147e6fb56fSmrg/* 5157e6fb56fSmrg * Benchmark the video copy routines and choose the fastest. 5167e6fb56fSmrg */ 5177e6fb56fSmrgvidCopyFunc 5187e6fb56fSmrgviaVidCopyInit(char *copyType, ScreenPtr pScreen) 5197e6fb56fSmrg{ 5208aedb4f6Smrg ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen); 5217e6fb56fSmrg 5227e6fb56fSmrg char buf[BSIZ]; 5237e6fb56fSmrg unsigned char *buf1, *buf2, *buf3; 5247e6fb56fSmrg char *tmpBuf, *endBuf; 5257e6fb56fSmrg int count, j, bestSoFar; 5267e6fb56fSmrg unsigned best, tmp, testSize, alignSize, tmp2; 5278aedb4f6Smrg struct buffer_object *tmpFbBuffer; 5287e6fb56fSmrg McFuncData *curData; 5297e6fb56fSmrg FILE *cpuInfoFile; 5307e6fb56fSmrg double cpuFreq; 5317e6fb56fSmrg 5327e6fb56fSmrg if (NULL == (cpuInfoFile = fopen("/proc/cpuinfo", "r"))) { 5337e6fb56fSmrg return libc_YUV42X; 5347e6fb56fSmrg } 5357e6fb56fSmrg count = fread(buf, 1, BSIZ, cpuInfoFile); 5367e6fb56fSmrg if (ferror(cpuInfoFile)) { 5377e6fb56fSmrg fclose(cpuInfoFile); 5387e6fb56fSmrg return libc_YUV42X; 5397e6fb56fSmrg } 5407e6fb56fSmrg fclose(cpuInfoFile); 5417e6fb56fSmrg if (BSIZ == count) { 5427e6fb56fSmrg xf86DrvMsg(pScrn->scrnIndex, X_WARNING, 5437e6fb56fSmrg "\"/proc/cpuinfo\" file too long. " 5447e6fb56fSmrg "Using Linux kernel memcpy.\n"); 5457e6fb56fSmrg return libc_YUV42X; 5467e6fb56fSmrg } 5477e6fb56fSmrg buf[count] = 0; 5487e6fb56fSmrg 5497e6fb56fSmrg while (count--) 5507e6fb56fSmrg if ('\n' == buf[count]) 5517e6fb56fSmrg buf[count] = ' '; 5527e6fb56fSmrg 5537e6fb56fSmrg /* Extract the CPU frequency. */ 5547e6fb56fSmrg cpuFreq = 0.; 5557e6fb56fSmrg if (NULL != (tmpBuf = strstr(buf, "cpu MHz"))) { 5567e6fb56fSmrg if (NULL != (tmpBuf = strstr(tmpBuf, ":") + 1)) { 5577e6fb56fSmrg cpuFreq = strtod(tmpBuf, &endBuf); 5587e6fb56fSmrg if (endBuf == tmpBuf) 5597e6fb56fSmrg tmpBuf = NULL; 5607e6fb56fSmrg } 5617e6fb56fSmrg } 5627e6fb56fSmrg 5637e6fb56fSmrg alignSize = BSIZH * (BSIZA + (BSIZA >> 1)); 5647e6fb56fSmrg testSize = BSIZH * (BSIZW + (BSIZW >> 1)); 5657e6fb56fSmrg /* 5667e6fb56fSmrg * Allocate an area of offscreen FB memory, (buf1), a simulated video 5678aedb4f6Smrg * player buffer (buf2) and a pool of uninitialized "video" data (buf3). 5687e6fb56fSmrg */ 5698aedb4f6Smrg tmpFbBuffer = drm_bo_alloc(pScrn, alignSize, 32, TTM_PL_FLAG_VRAM); 5708aedb4f6Smrg if (!tmpFbBuffer) 5717e6fb56fSmrg return libc_YUV42X; 5728aedb4f6Smrg if (NULL == (buf2 = (unsigned char *)malloc(testSize))) { 5738aedb4f6Smrg drm_bo_free(pScrn, tmpFbBuffer); 5747e6fb56fSmrg return libc_YUV42X; 5757e6fb56fSmrg } 5768aedb4f6Smrg if (NULL == (buf3 = (unsigned char *)malloc(testSize))) { 5778aedb4f6Smrg free(buf2); 5788aedb4f6Smrg drm_bo_free(pScrn, tmpFbBuffer); 5797e6fb56fSmrg return libc_YUV42X; 5807e6fb56fSmrg } 5818aedb4f6Smrg buf1 = drm_bo_map(pScrn, tmpFbBuffer); 5827e6fb56fSmrg bestSoFar = 0; 5837e6fb56fSmrg best = 0xFFFFFFFFU; 5847e6fb56fSmrg 5857e6fb56fSmrg /* Make probable that buf1 and buf2 are in memory by referencing them. */ 5867e6fb56fSmrg libc_YUV42X(buf1, buf2, BSIZA, BSIZW, BSIZH, 0); 5877e6fb56fSmrg 5887e6fb56fSmrg xf86DrvMsg(pScrn->scrnIndex, X_INFO, 5897e6fb56fSmrg "Benchmarking %s copy. Less time is better.\n", copyType); 5907e6fb56fSmrg for (j = 0; j < totNum; ++j) { 5917e6fb56fSmrg curData = mcFunctions + j; 5927e6fb56fSmrg 5937e6fb56fSmrg if (cpuValid(buf, curData->cpuFlag)) { 5947e6fb56fSmrg 5957e6fb56fSmrg /* Simulate setup of the video buffer. */ 5967e6fb56fSmrg kernel_memcpy(buf2, buf3, testSize); 5977e6fb56fSmrg 5987e6fb56fSmrg /* Copy the video buffer to frame-buffer memory. */ 5997e6fb56fSmrg tmp = time_function(curData->mFunc, buf1, buf2); 6007e6fb56fSmrg 6017e6fb56fSmrg /* Do it again to avoid context-switch effects. */ 6027e6fb56fSmrg kernel_memcpy(buf2, buf3, testSize); 6037e6fb56fSmrg tmp2 = time_function(curData->mFunc, buf1, buf2); 6047e6fb56fSmrg tmp = (tmp2 < tmp) ? tmp2 : tmp; 6057e6fb56fSmrg 6067e6fb56fSmrg if (NULL == tmpBuf) { 6077e6fb56fSmrg xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 6087e6fb56fSmrg "Timed %6s YUV420 copy... %u.\n", 6097e6fb56fSmrg curData->mName, tmp); 6107e6fb56fSmrg } else { 6117e6fb56fSmrg xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 6127e6fb56fSmrg "Timed %6s YUV420 copy... %u. " 6137e6fb56fSmrg "Throughput: %.1f MiB/s.\n", 6147e6fb56fSmrg curData->mName, tmp, 6157e6fb56fSmrg cpuFreq * 1.e6 * (double)testSize / 6167e6fb56fSmrg ((double)(tmp) * (double)(0x100000))); 6177e6fb56fSmrg } 6187e6fb56fSmrg if (tmp < best) { 6197e6fb56fSmrg best = tmp; 6207e6fb56fSmrg bestSoFar = j; 6217e6fb56fSmrg } 6227e6fb56fSmrg } else { 6237e6fb56fSmrg xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 6247e6fb56fSmrg "Ditching %6s YUV420 copy. Not supported by CPU.\n", 6257e6fb56fSmrg curData->mName); 6267e6fb56fSmrg } 6277e6fb56fSmrg } 6288aedb4f6Smrg free(buf3); 6298aedb4f6Smrg free(buf2); 6308aedb4f6Smrg drm_bo_unmap(pScrn, tmpFbBuffer); 6318aedb4f6Smrg drm_bo_free(pScrn, tmpFbBuffer); 6327e6fb56fSmrg xf86DrvMsg(pScrn->scrnIndex, X_PROBED, 6337e6fb56fSmrg "Using %s YUV42X copy for %s.\n", 6347e6fb56fSmrg mcFunctions[bestSoFar].mName, copyType); 6357e6fb56fSmrg return mcFunctions[bestSoFar].mFunc; 6367e6fb56fSmrg} 6377e6fb56fSmrg 6387e6fb56fSmrg#else 6397e6fb56fSmrg 6407e6fb56fSmrgvidCopyFunc 6417e6fb56fSmrgviaVidCopyInit(char *copyType, ScreenPtr pScreen) 6427e6fb56fSmrg{ 6438aedb4f6Smrg ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen); 6447e6fb56fSmrg 6457e6fb56fSmrg xf86DrvMsg(pScrn->scrnIndex, X_INFO, 6467e6fb56fSmrg "Using default xfree86 memcpy for video.\n"); 6477e6fb56fSmrg return libc_YUV42X; 6487e6fb56fSmrg} 6497e6fb56fSmrg 6507e6fb56fSmrg#endif /* __i386__ */ 651