/*
 * Copyright (C) 2004 Thomas Hellström, All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
237e6fb56fSmrg
247e6fb56fSmrg#ifdef HAVE_CONFIG_H
257e6fb56fSmrg#include "config.h"
267e6fb56fSmrg#endif
277e6fb56fSmrg
287e6fb56fSmrg#include "via_driver.h"
297e6fb56fSmrg#include "compiler.h"
307e6fb56fSmrg
317e6fb56fSmrg
/*
 * Sizes used by the start-up benchmark in viaVidCopyInit():
 * the /proc/cpuinfo read buffer and the geometry of the test image.
 */
#define BSIZ 2048  /* size of the /proc/cpuinfo read buffer, in bytes */
#define BSIZW 720  /* benchmark copy width: typical YUV420 line width */
#define BSIZA 736  /* benchmark destination pitch; a multiple of 32 bytes */
#define BSIZH 576  /* benchmark copy height: typical number of lines */
367e6fb56fSmrg
/*
 * Prefetch mnemonic pasted in front of an address operand when a copy
 * routine is generated with ptype == SSE.
 */
#define SSE_PREFETCH "  prefetchnta "
/*
 * Barrier / state-reset helpers.  Each expands to a complete asm statement
 * including its trailing semicolon and carries a "memory" clobber so the
 * compiler does not move memory accesses across it.
 */
/* sfence only. */
#define FENCE __asm__ __volatile__ ("sfence":::"memory");
/* sfence followed by emms. */
#define FENCEMMS __asm__ __volatile__ ("\t"		\
				       "sfence\n\t"	\
				       "emms\n\t"	\
				       :::"memory");
/* femms (used by the 3DNow! variant both as begin and as fence). */
#define FEMMS __asm__ __volatile__("femms":::"memory");
/* emms (used by the MMX-based variants). */
#define EMMS __asm__ __volatile__("emms":::"memory");

/*
 * Prefetch mnemonic pasted in front of an address operand when a copy
 * routine is generated with ptype == NOW.
 */
#define NOW_PREFETCH "  prefetch "
477e6fb56fSmrg
487e6fb56fSmrg
/*
 * PREFETCH1(arch_prefetch, from)
 *
 * Issues a burst of prefetches on the source pointer `from`, at offsets
 * 0, 32, ..., 192 and then 256 and 288.  `arch_prefetch` is the mnemonic
 * string (SSE_PREFETCH or NOW_PREFETCH).  `from` is only read.
 *
 * NOTE(review): offset 224 is absent from the otherwise regular 32-byte
 * stride -- possibly an omission; confirm before changing.
 * The local labels "1:" and "2:" are not referenced inside this macro.
 */
#define PREFETCH1(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  "1:  " arch_prefetch "(%0)\n"	\
			  arch_prefetch "32(%0)\n"	\
			  arch_prefetch "64(%0)\n"	\
			  arch_prefetch "96(%0)\n"	\
			  arch_prefetch "128(%0)\n"	\
			  arch_prefetch "160(%0)\n"	\
			  arch_prefetch "192(%0)\n"	\
			  arch_prefetch "256(%0)\n"	\
			  arch_prefetch "288(%0)\n"	\
			  "2:\n"			\
			  : : "r" (from) );

/* PREFETCH2: single prefetch at from + 320. */
#define PREFETCH2(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  arch_prefetch "320(%0)\n"	\
			  : : "r" (from) );
/* PREFETCH3: single prefetch at from + 288. */
#define PREFETCH3(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  arch_prefetch "288(%0)\n"	\
			  : : "r" (from) );
717e6fb56fSmrg
727e6fb56fSmrg
/*
 * small_memcpy(to, from, n)
 *
 * Copies n bytes with string instructions: n >> 2 dwords via "rep movsl",
 * then one word if bit 1 of n is set, then one byte if bit 0 is set.
 *
 * `to` and `from` are tied to EDI/ESI as in/out operands, so both lvalues
 * are left pointing just past the copied data after the macro runs.
 * `n` is placed in a byte-addressable register ("q") because its low byte
 * is tested with testb.  ECX is used as the dword counter and is listed as
 * clobbered.
 */
#define small_memcpy(to, from, n)					\
    {									\
	__asm__ __volatile__(						\
			     "movl %2,%%ecx\n\t"			\
			     "sarl $2,%%ecx\n\t"			\
			     "rep ; movsl\n\t"				\
			     "testb $2,%b2\n\t"				\
			     "je 1f\n\t"				\
			     "movsw\n"					\
			     "1:\ttestb $1,%b2\n\t"			\
			     "je 2f\n\t"				\
			     "movsb\n"					\
			     "2:"					\
			     :"=&D" (to), "=&S" (from)			\
			     :"q" (n),"0" ((long) to),"1" ((long) from) \
			     : "%ecx","memory");			\
    }
907e6fb56fSmrg
917e6fb56fSmrg
/*
 * SSE_CPY(prefetch, from, to, dummy, lcnt)
 *
 * Copies lcnt blocks of 64 bytes from `from` to `to` through xmm0-xmm3,
 * storing with movntps.  Loads use movups when `from` is not 16-byte
 * aligned and movaps otherwise.
 *
 * `prefetch` is a string pasted in front of "320(%1)" / "352(%1)"; callers
 * in this file pass either a prefetch mnemonic or "#", which turns those
 * two lines into assembler comments (GNU as syntax).
 *
 * `to` and `from` are in/out operands and are advanced by 64 * lcnt.
 * `dummy` receives the loop counter.  The loop body runs before the
 * counter is tested, so lcnt must be non-zero; callers guard with
 * "if (lcnt)".
 *
 * The macro expands to a bare if/else statement, so it must be used inside
 * braces.
 * NOTE(review): only `from` is alignment-checked; `to` is presumably
 * expected to be 16-byte aligned for movntps -- confirm against callers.
 */
#define SSE_CPY(prefetch, from, to, dummy, lcnt)			\
    if ((unsigned long) from & 15) {					\
	__asm__ __volatile__ (						\
			      "1:\n"					\
			      prefetch "320(%1)\n"			\
			      "  movups (%1), %%xmm0\n"			\
			      "  movups 16(%1), %%xmm1\n"		\
			      "  movntps %%xmm0, (%0)\n"		\
			      "  movntps %%xmm1, 16(%0)\n"		\
                              prefetch "352(%1)\n"			\
			      "  movups 32(%1), %%xmm2\n"		\
			      "  movups 48(%1), %%xmm3\n"		\
			      "  movntps %%xmm2, 32(%0)\n"		\
			      "  movntps %%xmm3, 48(%0)\n"		\
			      "  addl $64,%0\n"				\
			      "  addl $64,%1\n"				\
			      "  decl %2\n"				\
			      "  jne 1b\n"				\
			      :"=&D"(to), "=&S"(from), "=&r"(dummy)	\
			      :"0" (to), "1" (from), "2" (lcnt): "memory"); \
    } else {								\
	__asm__ __volatile__ (						\
			      "2:\n"					\
			      prefetch "320(%1)\n"			\
			      "  movaps (%1), %%xmm0\n"			\
			      "  movaps 16(%1), %%xmm1\n"		\
			      "  movntps %%xmm0, (%0)\n"		\
			      "  movntps %%xmm1, 16(%0)\n"		\
			      prefetch "352(%1)\n"			\
			      "  movaps 32(%1), %%xmm2\n"		\
			      "  movaps 48(%1), %%xmm3\n"		\
			      "  movntps %%xmm2, 32(%0)\n"		\
			      "  movntps %%xmm3, 48(%0)\n"		\
			      "  addl $64,%0\n"				\
			      "  addl $64,%1\n"				\
			      "  decl %2\n"				\
			      "  jne 2b\n"				\
			      :"=&D"(to), "=&S"(from), "=&r"(dummy)	\
			      :"0" (to), "1" (from), "2" (lcnt): "memory"); \
    }
1327e6fb56fSmrg
/*
 * MMX_CPY(prefetch, from, to, dummy, lcnt)
 *
 * Copies lcnt blocks of 64 bytes from `from` to `to` through mm0-mm3 using
 * ordinary movq loads and stores (two groups of 32 bytes per iteration).
 *
 * `prefetch` is pasted in front of "320(%1)" / "352(%1)"; passing "#"
 * turns those lines into assembler comments.  `to` and `from` are in/out
 * operands and are advanced by 64 * lcnt; `dummy` receives the loop
 * counter.  lcnt must be non-zero because the body runs before the test.
 * The label "2:" is not referenced inside this macro.
 */
#define MMX_CPY(prefetch, from, to, dummy, lcnt)			\
    __asm__ __volatile__ (						\
			  "1:\n"					\
			  prefetch "320(%1)\n"				\
			  "2:  movq (%1), %%mm0\n"			\
			  "  movq 8(%1), %%mm1\n"			\
			  "  movq 16(%1), %%mm2\n"			\
			  "  movq 24(%1), %%mm3\n"			\
			  "  movq %%mm0, (%0)\n"			\
			  "  movq %%mm1, 8(%0)\n"			\
			  "  movq %%mm2, 16(%0)\n"			\
			  "  movq %%mm3, 24(%0)\n"			\
			  prefetch "352(%1)\n"				\
			  "  movq 32(%1), %%mm0\n"			\
			  "  movq 40(%1), %%mm1\n"			\
			  "  movq 48(%1), %%mm2\n"			\
			  "  movq 56(%1), %%mm3\n"			\
			  "  movq %%mm0, 32(%0)\n"			\
			  "  movq %%mm1, 40(%0)\n"			\
			  "  movq %%mm2, 48(%0)\n"			\
			  "  movq %%mm3, 56(%0)\n"			\
			  "  addl $64,%0\n"				\
			  "  addl $64,%1\n"				\
			  "  decl %2\n"					\
			  "  jne 1b\n"					\
			  :"=&D"(to), "=&S"(from), "=&r"(dummy)		\
			  :"0" (to), "1" (from), "2" (lcnt) : "memory");
1607e6fb56fSmrg
/*
 * MMXEXT_CPY(prefetch, from, to, dummy, lcnt)
 *
 * Same 64-bytes-per-iteration loop shape as MMX_CPY, but the stores use
 * movntq instead of movq, and the loop head is aligned with
 * ".p2align 4,,7".
 *
 * `prefetch` is pasted in front of "320(%1)" / "352(%1)"; passing "#"
 * turns those lines into assembler comments.  `to` and `from` are in/out
 * operands and are advanced by 64 * lcnt; `dummy` receives the loop
 * counter.  lcnt must be non-zero because the body runs before the test.
 */
#define MMXEXT_CPY(prefetch, from, to, dummy, lcnt)			\
    __asm__ __volatile__ (						\
			  ".p2align 4,,7\n"				\
			  "1:\n"					\
			  prefetch "320(%1)\n"				\
			  "  movq (%1), %%mm0\n"			\
			  "  movq 8(%1), %%mm1\n"			\
			  "  movq 16(%1), %%mm2\n"			\
			  "  movq 24(%1), %%mm3\n"			\
			  "  movntq %%mm0, (%0)\n"			\
			  "  movntq %%mm1, 8(%0)\n"			\
			  "  movntq %%mm2, 16(%0)\n"			\
			  "  movntq %%mm3, 24(%0)\n"			\
			  prefetch "352(%1)\n"				\
			  "  movq 32(%1), %%mm0\n"			\
			  "  movq 40(%1), %%mm1\n"			\
			  "  movq 48(%1), %%mm2\n"			\
			  "  movq 56(%1), %%mm3\n"			\
			  "  movntq %%mm0, 32(%0)\n"			\
			  "  movntq %%mm1, 40(%0)\n"			\
			  "  movntq %%mm2, 48(%0)\n"			\
			  "  movntq %%mm3, 56(%0)\n"			\
			  "  addl $64,%0\n"				\
			  "  addl $64,%1\n"				\
			  "  decl %2\n"					\
			  "  jne 1b\n"					\
			  :"=&D"(to), "=&S"(from), "=&r"(dummy)		\
			  :"0" (to), "1" (from), "2" (lcnt) : "memory");
1897e6fb56fSmrg
1907e6fb56fSmrg
/*
 * PREFETCH_FUNC(prefix, itype, ptype, begin, fence)
 *
 * Expands to
 *     static void prefix##_YUV42X(to, from, dstPitch, w, h, yuv422)
 * which copies image rows with the itype##_CPY block macro (64-byte
 * chunks) plus small_memcpy for the sub-64-byte remainder, while issuing
 * ptype##_PREFETCH prefetches on the source.  `begin` is emitted right
 * after the initial PREFETCH1 and `fence` right before the function ends.
 *
 * `to` and `from` are advanced by the copy macros themselves; the row loop
 * only adds the destination padding (dstPitch - w) after each row.
 *
 * Pass planning:
 *  - yuv422: w is doubled.  If it then equals dstPitch, the image is folded
 *    into one row of w*h bytes and the row loop is skipped (count = 0);
 *    otherwise a single pass of h-1 rows runs.
 *  - not yuv422 and w == dstPitch: folded into one row of h*(w + w/2)
 *    bytes, row loop skipped.
 *  - otherwise two passes: h rows of w bytes, then h-1 rows with w and
 *    dstPitch halved.
 *
 * In every case exactly one row is left for the code after the loop.  That
 * tail copies all but the last five 64-byte chunks with prefetching and
 * the remaining chunks with "#" as the prefetch string, which disables the
 * prefetch lines.
 * NOTE(review): presumably this avoids prefetching past the end of the
 * source buffer -- confirm.
 */
#define PREFETCH_FUNC(prefix, itype, ptype, begin, fence)		\
									\
    static void prefix##_YUV42X(unsigned char *to,			\
				const unsigned char *from,		\
				int dstPitch,				\
				int w,					\
				int h,					\
				int yuv422)				\
    {									\
	int dadd, rest, count, hc, lcnt;				\
	register int dummy;						\
	PREFETCH1(ptype##_PREFETCH, from);				\
	begin;								\
	count = 2;							\
									\
	/* If destination pitch equals width, do it all in one go. */	\
									\
	if (yuv422) {							\
	    w <<= 1;							\
	    if (w == dstPitch) {					\
		w *= h;							\
		h = 1;							\
		dstPitch = w;						\
		count = 0;						\
	    } else {							\
		h -= 1;							\
		count = 1;						\
	    }								\
	} else if (w == dstPitch) {					\
	    w = h*(w + (w >> 1));					\
	    count = 0;							\
	    h = 1;							\
	    dstPitch = w;						\
	}								\
									\
	lcnt = w >> 6;							\
	rest = w & 63;							\
	while (count--) {						\
	    hc = h;							\
	    lcnt = w >> 6;						\
	    rest = w & 63;						\
	    dadd = dstPitch - w;					\
	    while (hc--) {						\
		if (lcnt) {						\
		    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt); \
		}							\
		if (rest) {						\
		    PREFETCH2(ptype##_PREFETCH, from);			\
		    small_memcpy(to, from, rest);			\
		    PREFETCH3(ptype##_PREFETCH, from);			\
		}							\
		to += dadd;						\
	    }								\
	    w >>= 1;							\
	    dstPitch >>= 1;						\
	    h -= 1;							\
	}								\
	if (lcnt > 5) {							\
	    lcnt -= 5;							\
	    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt);	\
	    lcnt = 5;							\
	}								\
	if (lcnt) {							\
	    itype##_CPY("#", from, to, dummy, lcnt);			\
	}								\
	if (rest) small_memcpy(to, from, rest);				\
	fence;								\
    }
2597e6fb56fSmrg
/*
 * NOPREFETCH_FUNC(prefix, itype, begin, fence)
 *
 * Expands to
 *     static void prefix##_YUV42X(to, from, dstPitch, w, h, yuv422)
 * which copies image rows with itype##_CPY (always invoked with "#" as the
 * prefetch string, i.e. with the prefetch lines disabled) plus
 * small_memcpy for the sub-64-byte remainder.  `begin` is emitted first and
 * `fence` last.
 *
 * `to` and `from` are advanced by the copy macros themselves; the row loop
 * only adds the destination padding (dstPitch - w) after each row.
 *
 * Pass planning:
 *  - yuv422: w is doubled and one pass runs; if w then equals dstPitch the
 *    image is folded into a single row of w*h bytes.
 *  - not yuv422 and w == dstPitch: one pass over a single folded row of
 *    h*(w + w/2) bytes.
 *  - otherwise two passes of h rows each, the second with w and dstPitch
 *    halved.
 */
#define NOPREFETCH_FUNC(prefix, itype, begin, fence)			\
    static void prefix##_YUV42X(unsigned char *to,			\
				const unsigned char *from,		\
				int dstPitch,				\
				int w,					\
				int h,					\
				int yuv422)				\
									\
    {									\
	int dadd, rest, count, hc, lcnt;				\
	register int dummy;						\
	begin;								\
	count = 2;							\
									\
	/* If destination pitch equals width, do it all in one go. */	\
									\
	if (yuv422) {							\
	    w <<= 1;							\
	    count = 1;							\
	    if (w == dstPitch) {					\
		w *= h;							\
		h = 1;							\
		dstPitch = w;						\
	    }								\
	} else if (w == dstPitch) {					\
	    w = h*(w + (w >> 1));					\
	    count = 1;							\
	    h = 1;							\
	    dstPitch = w;						\
	}								\
									\
	lcnt = w >> 6;							\
	rest = w & 63;							\
	while (count--) {						\
	    hc = h;							\
	    dadd = dstPitch - w;					\
	    lcnt = w >> 6;						\
	    rest = w & 63;						\
	    while (hc--) {						\
		if (lcnt) {						\
		    itype##_CPY("#", from, to, dummy, lcnt);		\
		}							\
		if (rest) small_memcpy(to, from, rest);			\
		to += dadd;						\
	    }								\
	    w >>= 1;							\
	    dstPitch >>= 1;						\
	}								\
	fence;								\
    }
3107e6fb56fSmrg
3117e6fb56fSmrg
/*
 * Portable YUV copy built on the C library memcpy().
 *
 * dst       destination buffer, rows spaced dstPitch bytes apart
 * src       tightly packed source image
 * dstPitch  destination row pitch in bytes (halved for the chroma rows)
 * w, h      width and height of the luma plane; w is doubled for yuv422
 * yuv422    non-zero for the two-bytes-per-pixel layout (no chroma pass)
 *
 * When the pitch matches the row width the whole image is moved with one
 * memcpy(); otherwise it is copied row by row.
 */
static void
libc_YUV42X(unsigned char *dst, const unsigned char *src,
            int dstPitch, int w, int h, int yuv422)
{
    int rowsLeft;

    if (yuv422)
        w <<= 1;

    if (dstPitch == w) {
        /* No padding between rows: a single bulk copy is enough. */
        int total;

        if (yuv422)
            total = h * w;
        else
            total = h * (w + (w >> 1));

        memcpy(dst, src, total);
        return;
    }

    /* Luma rows (or the complete image when yuv422 is set). */
    for (rowsLeft = h; rowsLeft != 0; --rowsLeft) {
        memcpy(dst, src, w);
        src += w;
        dst += dstPitch;
    }

    if (yuv422)
        return;

    /*
     * Chroma rows: half the width and half the pitch, h rows in total
     * covering the V(Cr) and U(Cb) data.
     */
    w >>= 1;
    dstPitch >>= 1;
    for (rowsLeft = h; rowsLeft != 0; --rowsLeft) {
        memcpy(dst, src, w);
        src += w;
        dst += dstPitch;
    }
}
3497e6fb56fSmrg
3507e6fb56fSmrg#ifdef __i386__
3517e6fb56fSmrg
/*
 * Linux kernel __memcpy.
 *
 * Copies n bytes from `from` to `to` with string instructions: n >> 2
 * dwords via "rep movsl" (ECX preloaded through the "0" constraint), then
 * one word if bit 1 of n is set and one byte if bit 0 is set.
 * d1/d2/d3 exist only to tell the compiler that ECX, EDI and ESI are
 * modified.  Returns `to`.
 */
static __inline void *
__memcpy(void *to, const void *from, size_t n)
{
    int d1, d2, d3;

    __asm__ __volatile__(
                         "rep ; movsl\n\t"
                         "testb $2,%b4\n\t"
                         "je 1f\n\t"
                         "movsw\n"
                         "1:\ttestb $1,%b4\n\t"
                         "je 2f\n\t"
                         "movsb\n"
                         "2:"
                         :"=&c"(d1), "=&D"(d2), "=&S"(d3)
                         :"0"(n >> 2), "q"(n), "1"((long)to), "2"((long)from)
                         :"memory");

    return (to);
}
3737e6fb56fSmrg
3747e6fb56fSmrg
/*
 * YUV copy built on the inline-assembly __memcpy() above.
 *
 * dst       destination buffer, rows spaced dstPitch bytes apart
 * src       tightly packed source image
 * dstPitch  destination row pitch in bytes (halved for the chroma rows)
 * w, h      width and height of the luma plane; w is doubled for yuv422
 * yuv422    non-zero for the two-bytes-per-pixel layout (no chroma pass)
 *
 * When the pitch matches the row width the whole image is moved with one
 * __memcpy(); otherwise it is copied row by row.
 */
static void
kernel_YUV42X(unsigned char *dst, const unsigned char *src,
              int dstPitch, int w, int h, int yuv422)
{
    int rowsLeft;

    if (yuv422)
        w <<= 1;

    if (dstPitch == w) {
        /* No padding between rows: a single bulk copy is enough. */
        int total;

        if (yuv422)
            total = h * w;
        else
            total = h * (w + (w >> 1));

        __memcpy(dst, src, total);
        return;
    }

    /* Luma rows (or the complete image when yuv422 is set). */
    for (rowsLeft = h; rowsLeft != 0; --rowsLeft) {
        __memcpy(dst, src, w);
        src += w;
        dst += dstPitch;
    }

    if (yuv422)
        return;

    /*
     * Chroma rows: half the width and half the pitch, h rows in total
     * covering the V(Cr) and U(Cb) data.
     */
    w >>= 1;
    dstPitch >>= 1;
    for (rowsLeft = h; rowsLeft != 0; --rowsLeft) {
        __memcpy(dst, src, w);
        src += w;
        dst += dstPitch;
    }
}
4137e6fb56fSmrg
/*
 * Instantiate the SIMD copy routines referenced from mcFunctions[]:
 *   sse_YUV42X     SSE_CPY loop,    SSE_PREFETCH, no begin,    FENCE at end
 *   mmxext_YUV42X  MMXEXT_CPY loop, SSE_PREFETCH, EMMS begin,  FENCEMMS at end
 *   now_YUV42X     MMX_CPY loop,    NOW_PREFETCH, FEMMS begin, FEMMS at end
 *   mmx_YUV42X     MMX_CPY loop,    no prefetch,  EMMS begin,  EMMS at end
 */
PREFETCH_FUNC(sse, SSE, SSE,, FENCE)
PREFETCH_FUNC(mmxext, MMXEXT, SSE, EMMS, FENCEMMS)
PREFETCH_FUNC(now, MMX, NOW, FEMMS, FEMMS)
NOPREFETCH_FUNC(mmx, MMX, EMMS, EMMS)
4187e6fb56fSmrg
/*
 * Out-of-line wrapper around the inline __memcpy(); returns `to`.
 */
static void *
kernel_memcpy(void *to, const void *from, size_t len)
{
    void *const copied = __memcpy(to, from, len);

    return copied;
}
4247e6fb56fSmrg
/*
 * Return the low 32 bits of the CPU time-stamp counter.
 *
 * CPUID is executed first with EAX = 0 (the "0"(0) input); it is a
 * serializing instruction, so earlier instructions complete before the
 * counter is read.  The raw bytes 0x0f, 0x31 are the RDTSC opcode, which
 * leaves the low half of the counter in EAX (returned) and the high half
 * in EDX (discarded, listed as clobbered).
 *
 * EBX is overwritten by CPUID and is saved/restored by hand with
 * pushl/popl instead of being listed as a clobber.
 * NOTE(review): presumably because EBX can be reserved by the compiler
 * (e.g. as the PIC register) -- confirm.
 */
static unsigned
fastrdtsc(void)
{
    unsigned eax;

    __asm__ volatile ("\t"
                      "pushl %%ebx\n\t"
                      "cpuid\n\t"
                      ".byte 0x0f, 0x31\n\t"
                      "popl %%ebx\n"
                      :"=a" (eax)
                      :"0"(0)
                      :"ecx", "edx", "cc");

    return eax;
}
4417e6fb56fSmrg
4427e6fb56fSmrg
4437e6fb56fSmrgstatic unsigned
4447e6fb56fSmrgtime_function(vidCopyFunc mf, unsigned char *buf1, unsigned char *buf2)
4457e6fb56fSmrg{
4467e6fb56fSmrg    unsigned t, t2;
4477e6fb56fSmrg
4487e6fb56fSmrg    t = fastrdtsc();
4497e6fb56fSmrg
4507e6fb56fSmrg    (*mf) (buf1, buf2, BSIZA, BSIZW, BSIZH, 0);
4517e6fb56fSmrg
4527e6fb56fSmrg    t2 = fastrdtsc();
4537e6fb56fSmrg    return ((t < t2) ? t2 - t : 0xFFFFFFFFU - (t - t2 - 1));
4547e6fb56fSmrg}
4557e6fb56fSmrg
/*
 * Indices of the copy implementations in mcFunctions[]; the order here
 * must match the initializer order of that table.  totNum is the number
 * of entries.
 */
enum
{ libc = 0, kernel, sse, mmx, now, mmxext, totNum };

/*
 * One benchmark candidate.
 */
typedef struct
{
    vidCopyFunc mFunc;          /* the copy routine itself */
    char *mName, **cpuFlag;     /* log name; 0-terminated list of cpuinfo
                                 * flag strings, any one of which enables
                                 * the routine (see cpuValid()) */
} McFuncData;
4647e6fb56fSmrg
/*
 * CPU flag lists, each terminated by 0.  The strings are searched for as
 * plain substrings of the /proc/cpuinfo text (with newlines turned into
 * spaces), hence the surrounding blanks.  A lone " " is used for the
 * routines that need no particular CPU feature.
 */
static char *libc_cpuflags[] = { " ", 0 };
static char *kernel_cpuflags[] = { " ", 0 };
static char *sse_cpuflags[] = { " sse ", 0 };
static char *mmx_cpuflags[] = { " mmx ", 0 };
static char *now_cpuflags[] = { " 3dnow ", 0 };
/* Either flag is sufficient for the MMX2 routine. */
static char *mmx2_cpuflags[] = { " mmxext ", " sse ", 0 };

/*
 * Benchmark candidates, indexed by the libc..mmxext enum above.
 */
static McFuncData mcFunctions[totNum] = {
{libc_YUV42X, "libc", libc_cpuflags},
{kernel_YUV42X, "kernel", kernel_cpuflags},
{sse_YUV42X, "SSE", sse_cpuflags},
{mmx_YUV42X, "MMX", mmx_cpuflags},
{now_YUV42X, "3DNow!", now_cpuflags},
{mmxext_YUV42X, "MMX2", mmx2_cpuflags}
};
4807e6fb56fSmrg
4817e6fb56fSmrg
/*
 * Check that `flag` is listed for every processor entry in `cpuinfo`.
 *
 * cpuinfo  NUL-terminated /proc/cpuinfo text
 * flag     substring to look for
 *
 * Returns 1 when at least one "processor\t:" entry exists and, for each
 * entry, `flag` occurs before the next entry starts; returns 0 otherwise.
 */
static int
flagValid(const char *cpuinfo, char *flag)
{
    static const char marker[] = "processor\t:";
    const char *entry;
    int sawProcessor = 0;

    for (entry = strstr(cpuinfo, marker);
         entry != NULL;
         entry = strstr(entry, marker)) {
        const char *hit, *following;

        sawProcessor = 1;

        /* Step over the marker itself (11 characters). */
        entry += sizeof(marker) - 1;

        hit = strstr(entry, flag);
        if (hit == NULL)
            return 0;

        /* The flag must belong to this entry, not to a later one. */
        following = strstr(entry, marker);
        if (following != NULL && following < hit)
            return 0;
    }

    return sawProcessor;
}
5027e6fb56fSmrg
5037e6fb56fSmrg
/*
 * Return 1 if any flag in the 0-terminated list `flags` passes
 * flagValid() against `cpuinfo`, 0 if none does.
 */
static int
cpuValid(const char *cpuinfo, char **flags)
{
    char **candidate = flags;

    while (*candidate != 0) {
        if (flagValid(cpuinfo, *candidate) != 0)
            return 1;
        ++candidate;
    }

    return 0;
}
5137e6fb56fSmrg
5147e6fb56fSmrg/*
5157e6fb56fSmrg * Benchmark the video copy routines and choose the fastest.
5167e6fb56fSmrg */
5177e6fb56fSmrgvidCopyFunc
5187e6fb56fSmrgviaVidCopyInit(char *copyType, ScreenPtr pScreen)
5197e6fb56fSmrg{
5208aedb4f6Smrg    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
5217e6fb56fSmrg
5227e6fb56fSmrg    char buf[BSIZ];
5237e6fb56fSmrg    unsigned char *buf1, *buf2, *buf3;
5247e6fb56fSmrg    char *tmpBuf, *endBuf;
5257e6fb56fSmrg    int count, j, bestSoFar;
5267e6fb56fSmrg    unsigned best, tmp, testSize, alignSize, tmp2;
5278aedb4f6Smrg    struct buffer_object *tmpFbBuffer;
5287e6fb56fSmrg    McFuncData *curData;
5297e6fb56fSmrg    FILE *cpuInfoFile;
5307e6fb56fSmrg    double cpuFreq;
5317e6fb56fSmrg
5327e6fb56fSmrg    if (NULL == (cpuInfoFile = fopen("/proc/cpuinfo", "r"))) {
5337e6fb56fSmrg        return libc_YUV42X;
5347e6fb56fSmrg    }
5357e6fb56fSmrg    count = fread(buf, 1, BSIZ, cpuInfoFile);
5367e6fb56fSmrg    if (ferror(cpuInfoFile)) {
5377e6fb56fSmrg        fclose(cpuInfoFile);
5387e6fb56fSmrg        return libc_YUV42X;
5397e6fb56fSmrg    }
5407e6fb56fSmrg    fclose(cpuInfoFile);
5417e6fb56fSmrg    if (BSIZ == count) {
5427e6fb56fSmrg        xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
5437e6fb56fSmrg                   "\"/proc/cpuinfo\" file too long. "
5447e6fb56fSmrg                   "Using Linux kernel memcpy.\n");
5457e6fb56fSmrg        return libc_YUV42X;
5467e6fb56fSmrg    }
5477e6fb56fSmrg    buf[count] = 0;
5487e6fb56fSmrg
5497e6fb56fSmrg    while (count--)
5507e6fb56fSmrg        if ('\n' == buf[count])
5517e6fb56fSmrg            buf[count] = ' ';
5527e6fb56fSmrg
5537e6fb56fSmrg    /* Extract the CPU frequency. */
5547e6fb56fSmrg    cpuFreq = 0.;
5557e6fb56fSmrg    if (NULL != (tmpBuf = strstr(buf, "cpu MHz"))) {
5567e6fb56fSmrg        if (NULL != (tmpBuf = strstr(tmpBuf, ":") + 1)) {
5577e6fb56fSmrg            cpuFreq = strtod(tmpBuf, &endBuf);
5587e6fb56fSmrg            if (endBuf == tmpBuf)
5597e6fb56fSmrg                tmpBuf = NULL;
5607e6fb56fSmrg        }
5617e6fb56fSmrg    }
5627e6fb56fSmrg
5637e6fb56fSmrg    alignSize = BSIZH * (BSIZA + (BSIZA >> 1));
5647e6fb56fSmrg    testSize = BSIZH * (BSIZW + (BSIZW >> 1));
5657e6fb56fSmrg    /*
5667e6fb56fSmrg     * Allocate an area of offscreen FB memory, (buf1), a simulated video
5678aedb4f6Smrg     * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
5687e6fb56fSmrg     */
5698aedb4f6Smrg    tmpFbBuffer = drm_bo_alloc(pScrn, alignSize, 32, TTM_PL_FLAG_VRAM);
5708aedb4f6Smrg    if (!tmpFbBuffer)
5717e6fb56fSmrg        return libc_YUV42X;
5728aedb4f6Smrg    if (NULL == (buf2 = (unsigned char *)malloc(testSize))) {
5738aedb4f6Smrg        drm_bo_free(pScrn, tmpFbBuffer);
5747e6fb56fSmrg        return libc_YUV42X;
5757e6fb56fSmrg    }
5768aedb4f6Smrg    if (NULL == (buf3 = (unsigned char *)malloc(testSize))) {
5778aedb4f6Smrg        free(buf2);
5788aedb4f6Smrg        drm_bo_free(pScrn, tmpFbBuffer);
5797e6fb56fSmrg        return libc_YUV42X;
5807e6fb56fSmrg    }
5818aedb4f6Smrg    buf1 = drm_bo_map(pScrn, tmpFbBuffer);
5827e6fb56fSmrg    bestSoFar = 0;
5837e6fb56fSmrg    best = 0xFFFFFFFFU;
5847e6fb56fSmrg
5857e6fb56fSmrg    /* Make probable that buf1 and buf2 are in memory by referencing them. */
5867e6fb56fSmrg    libc_YUV42X(buf1, buf2, BSIZA, BSIZW, BSIZH, 0);
5877e6fb56fSmrg
5887e6fb56fSmrg    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
5897e6fb56fSmrg               "Benchmarking %s copy.  Less time is better.\n", copyType);
5907e6fb56fSmrg    for (j = 0; j < totNum; ++j) {
5917e6fb56fSmrg        curData = mcFunctions + j;
5927e6fb56fSmrg
5937e6fb56fSmrg        if (cpuValid(buf, curData->cpuFlag)) {
5947e6fb56fSmrg
5957e6fb56fSmrg            /* Simulate setup of the video buffer. */
5967e6fb56fSmrg            kernel_memcpy(buf2, buf3, testSize);
5977e6fb56fSmrg
5987e6fb56fSmrg            /* Copy the video buffer to frame-buffer memory. */
5997e6fb56fSmrg            tmp = time_function(curData->mFunc, buf1, buf2);
6007e6fb56fSmrg
6017e6fb56fSmrg            /* Do it again to avoid context-switch effects. */
6027e6fb56fSmrg            kernel_memcpy(buf2, buf3, testSize);
6037e6fb56fSmrg            tmp2 = time_function(curData->mFunc, buf1, buf2);
6047e6fb56fSmrg            tmp = (tmp2 < tmp) ? tmp2 : tmp;
6057e6fb56fSmrg
6067e6fb56fSmrg            if (NULL == tmpBuf) {
6077e6fb56fSmrg                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
6087e6fb56fSmrg                           "Timed %6s YUV420 copy... %u.\n",
6097e6fb56fSmrg                           curData->mName, tmp);
6107e6fb56fSmrg            } else {
6117e6fb56fSmrg                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
6127e6fb56fSmrg                           "Timed %6s YUV420 copy... %u. "
6137e6fb56fSmrg                           "Throughput: %.1f MiB/s.\n",
6147e6fb56fSmrg                           curData->mName, tmp,
6157e6fb56fSmrg                           cpuFreq * 1.e6 * (double)testSize /
6167e6fb56fSmrg                           ((double)(tmp) * (double)(0x100000)));
6177e6fb56fSmrg            }
6187e6fb56fSmrg            if (tmp < best) {
6197e6fb56fSmrg                best = tmp;
6207e6fb56fSmrg                bestSoFar = j;
6217e6fb56fSmrg            }
6227e6fb56fSmrg        } else {
6237e6fb56fSmrg            xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
6247e6fb56fSmrg                       "Ditching %6s YUV420 copy. Not supported by CPU.\n",
6257e6fb56fSmrg                       curData->mName);
6267e6fb56fSmrg        }
6277e6fb56fSmrg    }
6288aedb4f6Smrg    free(buf3);
6298aedb4f6Smrg    free(buf2);
6308aedb4f6Smrg    drm_bo_unmap(pScrn, tmpFbBuffer);
6318aedb4f6Smrg    drm_bo_free(pScrn, tmpFbBuffer);
6327e6fb56fSmrg    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
6337e6fb56fSmrg               "Using %s YUV42X copy for %s.\n",
6347e6fb56fSmrg               mcFunctions[bestSoFar].mName, copyType);
6357e6fb56fSmrg    return mcFunctions[bestSoFar].mFunc;
6367e6fb56fSmrg}
6377e6fb56fSmrg
6387e6fb56fSmrg#else
6397e6fb56fSmrg
6407e6fb56fSmrgvidCopyFunc
6417e6fb56fSmrgviaVidCopyInit(char *copyType, ScreenPtr pScreen)
6427e6fb56fSmrg{
6438aedb4f6Smrg    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
6447e6fb56fSmrg
6457e6fb56fSmrg    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
6467e6fb56fSmrg               "Using default xfree86 memcpy for video.\n");
6477e6fb56fSmrg    return libc_YUV42X;
6487e6fb56fSmrg}
6497e6fb56fSmrg
6507e6fb56fSmrg#endif /* __i386__ */
651