1/*
2 * Copyright (C) 2004 Thomas Hellström, All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sub license,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the
12 * next paragraph) shall be included in all copies or substantial portions
13 * of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24#ifdef HAVE_CONFIG_H
25#include "config.h"
26#endif
27
28#include "via_driver.h"
29#include "compiler.h"
30
31
#define BSIZ 2048  /* size of /proc/cpuinfo buffer */
#define BSIZW 720  /* typical copy width (YUV420) */
#define BSIZA 736  /* multiple of 32 bytes */
#define BSIZH 576  /* typical copy height */

/* Prefetch mnemonic used by the SSE/MMX2 paths (non-temporal hint). */
#define SSE_PREFETCH "  prefetchnta "
/* Drain pending non-temporal stores before anyone reads the data. */
#define FENCE __asm__ __volatile__ ("sfence":::"memory");
/* sfence + emms: drain NT stores and restore FPU state after MMX use. */
#define FENCEMMS __asm__ __volatile__ ("\t"		\
				       "sfence\n\t"	\
				       "emms\n\t"	\
				       :::"memory");
/* 3DNow!'s fast variant of emms. */
#define FEMMS __asm__ __volatile__("femms":::"memory");
#define EMMS __asm__ __volatile__("emms":::"memory");

/* Prefetch mnemonic used by the 3DNow! path. */
#define NOW_PREFETCH "  prefetch "
47
48
/*
 * Warm the cache before a copy: issue prefetch hints for roughly the
 * first 300 bytes at "from".  NOTE(review): offset 224 is skipped in
 * the sequence below (…192, 256…) — looks like an oversight, but a
 * prefetch is only a hint so this is harmless; confirm upstream before
 * "fixing" it.
 */
#define PREFETCH1(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  "1:  " arch_prefetch "(%0)\n"	\
			  arch_prefetch "32(%0)\n"	\
			  arch_prefetch "64(%0)\n"	\
			  arch_prefetch "96(%0)\n"	\
			  arch_prefetch "128(%0)\n"	\
			  arch_prefetch "160(%0)\n"	\
			  arch_prefetch "192(%0)\n"	\
			  arch_prefetch "256(%0)\n"	\
			  arch_prefetch "288(%0)\n"	\
			  "2:\n"			\
			  : : "r" (from) );

/* Prefetch the line 320 bytes ahead of the current source pointer. */
#define PREFETCH2(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  arch_prefetch "320(%0)\n"	\
			  : : "r" (from) );
/* Prefetch the line 288 bytes ahead of the current source pointer. */
#define PREFETCH3(arch_prefetch,from)			\
    __asm__ __volatile__ (				\
			  arch_prefetch "288(%0)\n"	\
			  : : "r" (from) );
71
72
/*
 * Copy n (< 64) bytes: rep movsl for the dword part, then one movsw
 * and/or movsb for the remaining 2/1 bytes (tested via bits 1 and 0
 * of n).  "to" and "from" are advanced past the copied region by the
 * "=&D"/"=&S" outputs; ecx is clobbered.
 */
#define small_memcpy(to, from, n)					\
    {									\
	__asm__ __volatile__(						\
			     "movl %2,%%ecx\n\t"			\
			     "sarl $2,%%ecx\n\t"			\
			     "rep ; movsl\n\t"				\
			     "testb $2,%b2\n\t"				\
			     "je 1f\n\t"				\
			     "movsw\n"					\
			     "1:\ttestb $1,%b2\n\t"			\
			     "je 2f\n\t"				\
			     "movsb\n"					\
			     "2:"					\
			     :"=&D" (to), "=&S" (from)			\
			     :"q" (n),"0" ((long) to),"1" ((long) from) \
			     : "%ecx","memory");			\
    }
90
91
/*
 * Copy lcnt * 64 bytes with SSE.  An unaligned source is loaded with
 * movups, an aligned one with movaps; stores are non-temporal
 * (movntps), which requires the destination to be 16-byte aligned
 * (assumed here — frame-buffer allocations are 32-byte aligned) and a
 * later sfence.  "prefetch" is either a prefetch mnemonic string or
 * "#" (GAS comment) to disable prefetching near the end of the source
 * buffer.  "to", "from" and "lcnt" (via "dummy") are updated/clobbered.
 */
#define SSE_CPY(prefetch, from, to, dummy, lcnt)			\
    if ((unsigned long) from & 15) {					\
	__asm__ __volatile__ (						\
			      "1:\n"					\
			      prefetch "320(%1)\n"			\
			      "  movups (%1), %%xmm0\n"			\
			      "  movups 16(%1), %%xmm1\n"		\
			      "  movntps %%xmm0, (%0)\n"		\
			      "  movntps %%xmm1, 16(%0)\n"		\
                              prefetch "352(%1)\n"			\
			      "  movups 32(%1), %%xmm2\n"		\
			      "  movups 48(%1), %%xmm3\n"		\
			      "  movntps %%xmm2, 32(%0)\n"		\
			      "  movntps %%xmm3, 48(%0)\n"		\
			      "  addl $64,%0\n"				\
			      "  addl $64,%1\n"				\
			      "  decl %2\n"				\
			      "  jne 1b\n"				\
			      :"=&D"(to), "=&S"(from), "=&r"(dummy)	\
			      :"0" (to), "1" (from), "2" (lcnt): "memory"); \
    } else {								\
	__asm__ __volatile__ (						\
			      "2:\n"					\
			      prefetch "320(%1)\n"			\
			      "  movaps (%1), %%xmm0\n"			\
			      "  movaps 16(%1), %%xmm1\n"		\
			      "  movntps %%xmm0, (%0)\n"		\
			      "  movntps %%xmm1, 16(%0)\n"		\
			      prefetch "352(%1)\n"			\
			      "  movaps 32(%1), %%xmm2\n"		\
			      "  movaps 48(%1), %%xmm3\n"		\
			      "  movntps %%xmm2, 32(%0)\n"		\
			      "  movntps %%xmm3, 48(%0)\n"		\
			      "  addl $64,%0\n"				\
			      "  addl $64,%1\n"				\
			      "  decl %2\n"				\
			      "  jne 2b\n"				\
			      :"=&D"(to), "=&S"(from), "=&r"(dummy)	\
			      :"0" (to), "1" (from), "2" (lcnt): "memory"); \
    }
132
/*
 * Copy lcnt * 64 bytes with plain MMX (cached movq loads and stores).
 * "prefetch" is a prefetch mnemonic string or "#" to disable the
 * hints.  "to", "from" and "lcnt" (via "dummy") are updated/clobbered;
 * the caller must execute emms/femms afterwards.
 */
#define MMX_CPY(prefetch, from, to, dummy, lcnt)			\
    __asm__ __volatile__ (						\
			  "1:\n"					\
			  prefetch "320(%1)\n"				\
			  "2:  movq (%1), %%mm0\n"			\
			  "  movq 8(%1), %%mm1\n"			\
			  "  movq 16(%1), %%mm2\n"			\
			  "  movq 24(%1), %%mm3\n"			\
			  "  movq %%mm0, (%0)\n"			\
			  "  movq %%mm1, 8(%0)\n"			\
			  "  movq %%mm2, 16(%0)\n"			\
			  "  movq %%mm3, 24(%0)\n"			\
			  prefetch "352(%1)\n"				\
			  "  movq 32(%1), %%mm0\n"			\
			  "  movq 40(%1), %%mm1\n"			\
			  "  movq 48(%1), %%mm2\n"			\
			  "  movq 56(%1), %%mm3\n"			\
			  "  movq %%mm0, 32(%0)\n"			\
			  "  movq %%mm1, 40(%0)\n"			\
			  "  movq %%mm2, 48(%0)\n"			\
			  "  movq %%mm3, 56(%0)\n"			\
			  "  addl $64,%0\n"				\
			  "  addl $64,%1\n"				\
			  "  decl %2\n"					\
			  "  jne 1b\n"					\
			  :"=&D"(to), "=&S"(from), "=&r"(dummy)		\
			  :"0" (to), "1" (from), "2" (lcnt) : "memory");
160
/*
 * Copy lcnt * 64 bytes with extended MMX: cached movq loads but
 * non-temporal movntq stores (destination must be 8-byte aligned; a
 * later sfence is required).  "prefetch" is a prefetch mnemonic string
 * or "#" to disable the hints.  "to", "from" and "lcnt" (via "dummy")
 * are updated/clobbered; the caller must execute emms afterwards.
 */
#define MMXEXT_CPY(prefetch, from, to, dummy, lcnt)			\
    __asm__ __volatile__ (						\
			  ".p2align 4,,7\n"				\
			  "1:\n"					\
			  prefetch "320(%1)\n"				\
			  "  movq (%1), %%mm0\n"			\
			  "  movq 8(%1), %%mm1\n"			\
			  "  movq 16(%1), %%mm2\n"			\
			  "  movq 24(%1), %%mm3\n"			\
			  "  movntq %%mm0, (%0)\n"			\
			  "  movntq %%mm1, 8(%0)\n"			\
			  "  movntq %%mm2, 16(%0)\n"			\
			  "  movntq %%mm3, 24(%0)\n"			\
			  prefetch "352(%1)\n"				\
			  "  movq 32(%1), %%mm0\n"			\
			  "  movq 40(%1), %%mm1\n"			\
			  "  movq 48(%1), %%mm2\n"			\
			  "  movq 56(%1), %%mm3\n"			\
			  "  movntq %%mm0, 32(%0)\n"			\
			  "  movntq %%mm1, 40(%0)\n"			\
			  "  movntq %%mm2, 48(%0)\n"			\
			  "  movntq %%mm3, 56(%0)\n"			\
			  "  addl $64,%0\n"				\
			  "  addl $64,%1\n"				\
			  "  decl %2\n"					\
			  "  jne 1b\n"					\
			  :"=&D"(to), "=&S"(from), "=&r"(dummy)		\
			  :"0" (to), "1" (from), "2" (lcnt) : "memory");
189
190
/*
 * Generate "<prefix>_YUV42X()": copy a video image into the frame
 * buffer using the <itype>_CPY 64-byte loop and <ptype> prefetch hints.
 *
 *   to       - destination (frame-buffer memory)
 *   from     - source (packed system-memory video buffer)
 *   dstPitch - destination pitch in bytes
 *   w, h     - copy width (bytes of luma per row) and height (rows)
 *   yuv422   - nonzero: packed YUV422, 2 bytes/pixel;
 *              zero: planar YUV420, a w x h Y plane followed by h
 *              stacked half-width chroma rows
 *
 * "begin"/"fence" are the setup/teardown statements (emms/femms,
 * sfence).  The final row is copied outside the main loop with
 * prefetching reduced — and the last 5 x 64 bytes with prefetching
 * disabled entirely ("#" comments the hint out) — so the routine
 * never prefetches past the end of the source buffer.
 */
#define PREFETCH_FUNC(prefix, itype, ptype, begin, fence)		\
									\
    static void prefix##_YUV42X(unsigned char *to,			\
				const unsigned char *from,		\
				int dstPitch,				\
				int w,					\
				int h,					\
				int yuv422)				\
    {									\
	int dadd, rest, count, hc, lcnt;				\
	register int dummy;						\
	PREFETCH1(ptype##_PREFETCH, from);				\
	begin;								\
	count = 2;							\
									\
	/* If destination pitch equals width, do it all in one go. */	\
									\
	if (yuv422) {							\
	    w <<= 1;							\
	    if (w == dstPitch) {					\
		w *= h;							\
		h = 1;							\
		dstPitch = w;						\
		count = 0;						\
	    } else {							\
		h -= 1;							\
		count = 1;						\
	    }								\
	} else if (w == dstPitch) {					\
	    w = h*(w + (w >> 1));					\
	    count = 0;							\
	    h = 1;							\
	    dstPitch = w;						\
	}								\
									\
	/* 64-byte chunks and byte remainder per row. */		\
	lcnt = w >> 6;							\
	rest = w & 63;							\
	/* Main pass(es): everything except the final row. */		\
	while (count--) {						\
	    hc = h;							\
	    lcnt = w >> 6;						\
	    rest = w & 63;						\
	    dadd = dstPitch - w;					\
	    while (hc--) {						\
		if (lcnt) {						\
		    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt); \
		}							\
		if (rest) {						\
		    PREFETCH2(ptype##_PREFETCH, from);			\
		    small_memcpy(to, from, rest);			\
		    PREFETCH3(ptype##_PREFETCH, from);			\
		}							\
		to += dadd;						\
	    }								\
	    /* Switch to the half-width chroma geometry. */		\
	    w >>= 1;							\
	    dstPitch >>= 1;						\
	    h -= 1;							\
	}								\
	/* Tail: the last row, with prefetch clipped to the buffer. */	\
	if (lcnt > 5) {							\
	    lcnt -= 5;							\
	    itype##_CPY(ptype##_PREFETCH, from, to, dummy, lcnt);	\
	    lcnt = 5;							\
	}								\
	if (lcnt) {							\
	    itype##_CPY("#", from, to, dummy, lcnt);			\
	}								\
	if (rest) small_memcpy(to, from, rest);				\
	fence;								\
    }
259
/*
 * Generate "<prefix>_YUV42X()" without software prefetch.  Same
 * parameters and image layout as PREFETCH_FUNC, but since no prefetch
 * can overrun the source buffer, every row — including the last — is
 * copied inside the main loop and no special tail pass is needed.
 */
#define NOPREFETCH_FUNC(prefix, itype, begin, fence)			\
    static void prefix##_YUV42X(unsigned char *to,			\
				const unsigned char *from,		\
				int dstPitch,				\
				int w,					\
				int h,					\
				int yuv422)				\
									\
    {									\
	int dadd, rest, count, hc, lcnt;				\
	register int dummy;						\
	begin;								\
	count = 2;							\
									\
	/* If destination pitch equals width, do it all in one go. */	\
									\
	if (yuv422) {							\
	    w <<= 1;							\
	    count = 1;							\
	    if (w == dstPitch) {					\
		w *= h;							\
		h = 1;							\
		dstPitch = w;						\
	    }								\
	} else if (w == dstPitch) {					\
	    w = h*(w + (w >> 1));					\
	    count = 1;							\
	    h = 1;							\
	    dstPitch = w;						\
	}								\
									\
	lcnt = w >> 6;							\
	rest = w & 63;							\
	/* One pass for packed/flat images, two for planar Y + UV. */	\
	while (count--) {						\
	    hc = h;							\
	    dadd = dstPitch - w;					\
	    lcnt = w >> 6;						\
	    rest = w & 63;						\
	    while (hc--) {						\
		if (lcnt) {						\
		    itype##_CPY("#", from, to, dummy, lcnt);		\
		}							\
		if (rest) small_memcpy(to, from, rest);			\
		to += dadd;						\
	    }								\
	    /* Switch to the half-width chroma geometry. */		\
	    w >>= 1;							\
	    dstPitch >>= 1;						\
	}								\
	fence;								\
    }
310
311
/*
 * Plain libc fallback copy of a YUV image into frame-buffer memory.
 *
 *   dst      - destination (frame buffer)
 *   src      - packed source buffer
 *   dstPitch - destination pitch in bytes
 *   w, h     - width (pixels) and height (rows)
 *   yuv422   - nonzero for packed YUV422 (2 bytes/pixel); zero for
 *              planar YUV420 (w x h luma plane followed by h stacked
 *              half-width chroma rows)
 */
static void
libc_YUV42X(unsigned char *dst, const unsigned char *src,
            int dstPitch, int w, int h, int yuv422)
{
    int rows;

    if (yuv422)
        w <<= 1;

    if (dstPitch == w) {
        /* Pitch matches width: the whole image is contiguous. */
        memcpy(dst, src, h * (yuv422 ? w : w + (w >> 1)));
        return;
    }

    /* Luma (or packed YUV422) plane, one row at a time. */
    for (rows = h; rows > 0; rows--) {
        memcpy(dst, src, w);
        src += w;
        dst += dstPitch;
    }

    if (yuv422)
        return;

    /* Planar chroma: V(Cr)/U(Cb) rows at half width and half pitch. */
    w >>= 1;
    dstPitch >>= 1;
    for (rows = h; rows > 0; rows--) {
        memcpy(dst, src, w);
        src += w;
        dst += dstPitch;
    }
}
349
350#ifdef __i386__
351
352/* Linux kernel __memcpy. */
/* Linux kernel __memcpy. */
/*
 * Copy n bytes from "from" to "to" the way the 2.x Linux kernel did:
 * rep movsl for the dword part, then one movsw and/or movsb for the
 * remaining 2/1 bytes.  Regions must not overlap.  Returns "to".
 */
static __inline void *
__memcpy(void *to, const void *from, size_t n)
{
    int d1, d2, d3;  /* dummies to receive the clobbered ecx/edi/esi */

    __asm__ __volatile__(
                         "rep ; movsl\n\t"
                         "testb $2,%b4\n\t"
                         "je 1f\n\t"
                         "movsw\n"
                         "1:\ttestb $1,%b4\n\t"
                         "je 2f\n\t"
                         "movsb\n"
                         "2:"
                         :"=&c"(d1), "=&D"(d2), "=&S"(d3)
                         :"0"(n >> 2), "q"(n), "1"((long)to), "2"((long)from)
                         :"memory");

    return (to);
}
373
374
/*
 * YUV42X copy using the inline kernel-style __memcpy above.
 * Same parameters and image layout as libc_YUV42X.
 */
static void
kernel_YUV42X(unsigned char *dst, const unsigned char *src,
              int dstPitch, int w, int h, int yuv422)
{
    int rows;

    if (yuv422)
        w <<= 1;

    if (dstPitch == w) {
        /* Contiguous layout: a single copy covers the whole image. */
        __memcpy(dst, src, h * (yuv422 ? w : w + (w >> 1)));
        return;
    }

    /* Luma (or packed YUV422) rows. */
    for (rows = h; rows > 0; rows--) {
        __memcpy(dst, src, w);
        src += w;
        dst += dstPitch;
    }

    if (yuv422)
        return;

    /* Planar chroma: stacked V/U rows at half width and half pitch. */
    w >>= 1;
    dstPitch >>= 1;
    for (rows = h; rows > 0; rows--) {
        __memcpy(dst, src, w);
        src += w;
        dst += dstPitch;
    }
}
413
/*
 * Instantiate the optimized copy routines:
 *   sse_YUV42X    - SSE loads/NT stores, prefetchnta, sfence
 *   mmxext_YUV42X - MMX loads/movntq stores, prefetchnta, emms + sfence
 *   now_YUV42X    - plain MMX moves, 3DNow! prefetch, femms
 *   mmx_YUV42X    - plain MMX moves, no prefetch, emms
 */
PREFETCH_FUNC(sse, SSE, SSE,, FENCE)
PREFETCH_FUNC(mmxext, MMXEXT, SSE, EMMS, FENCEMMS)
PREFETCH_FUNC(now, MMX, NOW, FEMMS, FEMMS)
NOPREFETCH_FUNC(mmx, MMX, EMMS, EMMS)
418
/*
 * Out-of-line wrapper around the inline __memcpy; used to refill the
 * simulated video buffer between benchmark runs.
 */
static void
*kernel_memcpy(void *to, const void *from, size_t len)
{
    return __memcpy(to, from, len);
}
424
/*
 * Read the low 32 bits of the CPU time-stamp counter.  cpuid is issued
 * first to serialize execution; rdtsc is encoded as raw bytes
 * (0x0f 0x31) for old assemblers.  ebx is saved/restored by hand
 * because cpuid clobbers it and ebx may be the PIC register.
 */
static unsigned
fastrdtsc(void)
{
    unsigned eax;

    __asm__ volatile ("\t"
                      "pushl %%ebx\n\t"
                      "cpuid\n\t"
                      ".byte 0x0f, 0x31\n\t"
                      "popl %%ebx\n"
                      :"=a" (eax)
                      :"0"(0)
                      :"ecx", "edx", "cc");

    return eax;
}
441
442
443static unsigned
444time_function(vidCopyFunc mf, unsigned char *buf1, unsigned char *buf2)
445{
446    unsigned t, t2;
447
448    t = fastrdtsc();
449
450    (*mf) (buf1, buf2, BSIZA, BSIZW, BSIZH, 0);
451
452    t2 = fastrdtsc();
453    return ((t < t2) ? t2 - t : 0xFFFFFFFFU - (t - t2 - 1));
454}
455
/* Indices into mcFunctions[]. */
enum
{ libc = 0, kernel, sse, mmx, now, mmxext, totNum };

/* A copy routine, its display name, and the NULL-terminated list of
 * alternative /proc/cpuinfo flags that enable it. */
typedef struct
{
    vidCopyFunc mFunc;
    char *mName, **cpuFlag;
} McFuncData;

/* Flags carry surrounding spaces so that substring hits cannot match
 * (e.g. " sse " will not match inside "ssse3"); " " always matches. */
static char *libc_cpuflags[] = { " ", 0 };
static char *kernel_cpuflags[] = { " ", 0 };
static char *sse_cpuflags[] = { " sse ", 0 };
static char *mmx_cpuflags[] = { " mmx ", 0 };
static char *now_cpuflags[] = { " 3dnow ", 0 };
static char *mmx2_cpuflags[] = { " mmxext ", " sse ", 0 };

/* Candidate table, indexed by the enum above. */
static McFuncData mcFunctions[totNum] = {
{libc_YUV42X, "libc", libc_cpuflags},
{kernel_YUV42X, "kernel", kernel_cpuflags},
{sse_YUV42X, "SSE", sse_cpuflags},
{mmx_YUV42X, "MMX", mmx_cpuflags},
{now_YUV42X, "3DNow!", now_cpuflags},
{mmxext_YUV42X, "MMX2", mmx2_cpuflags}
};
480
481
/*
 * Return nonzero when "flag" is present in the flag list of EVERY
 * processor section of the flattened /proc/cpuinfo text, zero when it
 * is missing from any section or no processor section exists.
 */
static int
flagValid(const char *cpuinfo, char *flag)
{
    const char *section, *where, *next;
    int found = 0;

    section = strstr(cpuinfo, "processor\t:");
    while (section != NULL) {
        found = 1;
        section += 11;              /* skip the "processor\t:" marker */

        where = strstr(section, flag);
        if (where == NULL)
            return 0;               /* flag absent from this CPU */

        next = strstr(section, "processor\t:");
        if (next != NULL && next < where)
            return 0;               /* hit belongs to a later CPU */

        section = next;
    }
    return found;
}
502
503
/*
 * Return nonzero when at least one of the alternative flags in the
 * NULL-terminated "flags" list is supported by every processor.
 */
static int
cpuValid(const char *cpuinfo, char **flags)
{
    char **f;

    for (f = flags; *f != 0; ++f) {
        if (flagValid(cpuinfo, *f))
            return 1;
    }
    return 0;
}
513
514/*
515 * Benchmark the video copy routines and choose the fastest.
516 */
517vidCopyFunc
518viaVidCopyInit(char *copyType, ScreenPtr pScreen)
519{
520    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
521
522    char buf[BSIZ];
523    unsigned char *buf1, *buf2, *buf3;
524    char *tmpBuf, *endBuf;
525    int count, j, bestSoFar;
526    unsigned best, tmp, testSize, alignSize, tmp2;
527    struct buffer_object *tmpFbBuffer;
528    McFuncData *curData;
529    FILE *cpuInfoFile;
530    double cpuFreq;
531
532    if (NULL == (cpuInfoFile = fopen("/proc/cpuinfo", "r"))) {
533        return libc_YUV42X;
534    }
535    count = fread(buf, 1, BSIZ, cpuInfoFile);
536    if (ferror(cpuInfoFile)) {
537        fclose(cpuInfoFile);
538        return libc_YUV42X;
539    }
540    fclose(cpuInfoFile);
541    if (BSIZ == count) {
542        xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
543                   "\"/proc/cpuinfo\" file too long. "
544                   "Using Linux kernel memcpy.\n");
545        return libc_YUV42X;
546    }
547    buf[count] = 0;
548
549    while (count--)
550        if ('\n' == buf[count])
551            buf[count] = ' ';
552
553    /* Extract the CPU frequency. */
554    cpuFreq = 0.;
555    if (NULL != (tmpBuf = strstr(buf, "cpu MHz"))) {
556        if (NULL != (tmpBuf = strstr(tmpBuf, ":") + 1)) {
557            cpuFreq = strtod(tmpBuf, &endBuf);
558            if (endBuf == tmpBuf)
559                tmpBuf = NULL;
560        }
561    }
562
563    alignSize = BSIZH * (BSIZA + (BSIZA >> 1));
564    testSize = BSIZH * (BSIZW + (BSIZW >> 1));
565    /*
566     * Allocate an area of offscreen FB memory, (buf1), a simulated video
567     * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
568     */
569    tmpFbBuffer = drm_bo_alloc(pScrn, alignSize, 32, TTM_PL_FLAG_VRAM);
570    if (!tmpFbBuffer)
571        return libc_YUV42X;
572    if (NULL == (buf2 = (unsigned char *)malloc(testSize))) {
573        drm_bo_free(pScrn, tmpFbBuffer);
574        return libc_YUV42X;
575    }
576    if (NULL == (buf3 = (unsigned char *)malloc(testSize))) {
577        free(buf2);
578        drm_bo_free(pScrn, tmpFbBuffer);
579        return libc_YUV42X;
580    }
581    buf1 = drm_bo_map(pScrn, tmpFbBuffer);
582    bestSoFar = 0;
583    best = 0xFFFFFFFFU;
584
585    /* Make probable that buf1 and buf2 are in memory by referencing them. */
586    libc_YUV42X(buf1, buf2, BSIZA, BSIZW, BSIZH, 0);
587
588    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
589               "Benchmarking %s copy.  Less time is better.\n", copyType);
590    for (j = 0; j < totNum; ++j) {
591        curData = mcFunctions + j;
592
593        if (cpuValid(buf, curData->cpuFlag)) {
594
595            /* Simulate setup of the video buffer. */
596            kernel_memcpy(buf2, buf3, testSize);
597
598            /* Copy the video buffer to frame-buffer memory. */
599            tmp = time_function(curData->mFunc, buf1, buf2);
600
601            /* Do it again to avoid context-switch effects. */
602            kernel_memcpy(buf2, buf3, testSize);
603            tmp2 = time_function(curData->mFunc, buf1, buf2);
604            tmp = (tmp2 < tmp) ? tmp2 : tmp;
605
606            if (NULL == tmpBuf) {
607                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
608                           "Timed %6s YUV420 copy... %u.\n",
609                           curData->mName, tmp);
610            } else {
611                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
612                           "Timed %6s YUV420 copy... %u. "
613                           "Throughput: %.1f MiB/s.\n",
614                           curData->mName, tmp,
615                           cpuFreq * 1.e6 * (double)testSize /
616                           ((double)(tmp) * (double)(0x100000)));
617            }
618            if (tmp < best) {
619                best = tmp;
620                bestSoFar = j;
621            }
622        } else {
623            xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
624                       "Ditching %6s YUV420 copy. Not supported by CPU.\n",
625                       curData->mName);
626        }
627    }
628    free(buf3);
629    free(buf2);
630    drm_bo_unmap(pScrn, tmpFbBuffer);
631    drm_bo_free(pScrn, tmpFbBuffer);
632    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
633               "Using %s YUV42X copy for %s.\n",
634               mcFunctions[bestSoFar].mName, copyType);
635    return mcFunctions[bestSoFar].mFunc;
636}
637
638#else
639
/*
 * Non-i386 build: the benchmark relies on x86 inline assembly, so
 * always fall back to the plain libc copy routine.
 */
vidCopyFunc
viaVidCopyInit(char *copyType, ScreenPtr pScreen)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
               "Using default xfree86 memcpy for video.\n");
    return libc_YUV42X;
}
649
650#endif /* __i386__ */
651