172b676d7Smrg/*
272b676d7Smrg * SiS memcpy() routines (assembly)
372b676d7Smrg *
472b676d7Smrg * Copyright (C) 2004-2005 Thomas Winischhofer
572b676d7Smrg *
672b676d7Smrg * Idea and some code bits from via_memcpy.c which is
772b676d7Smrg * Copyright (C) 2004 Thomas Hellstroem, All Rights Reserved.
872b676d7Smrg *
972b676d7Smrg * Permission is hereby granted, free of charge, to any person obtaining a
1072b676d7Smrg * copy of this software and associated documentation files (the "Software"),
1172b676d7Smrg * to deal in the Software without restriction, including without limitation
1272b676d7Smrg * the rights to use, copy, modify, merge, publish, distribute, sub license,
1372b676d7Smrg * and/or sell copies of the Software, and to permit persons to whom the
1472b676d7Smrg * Software is furnished to do so, subject to the following conditions:
1572b676d7Smrg *
1672b676d7Smrg * The above copyright notice and this permission notice (including the
1772b676d7Smrg * next paragraph) shall be included in all copies or substantial portions
1872b676d7Smrg * of the Software.
1972b676d7Smrg *
2072b676d7Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2172b676d7Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2272b676d7Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
2372b676d7Smrg * THE CODE SUPPLIER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
2472b676d7Smrg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
2572b676d7Smrg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
2672b676d7Smrg * DEALINGS IN THE SOFTWARE.
2772b676d7Smrg *
2872b676d7Smrg */
2972b676d7Smrg
3072b676d7Smrg#ifdef HAVE_CONFIG_H
3172b676d7Smrg#include "config.h"
3272b676d7Smrg#endif
3372b676d7Smrg
3472b676d7Smrg#include <stdlib.h>
3572b676d7Smrg#include "sis.h"
3672b676d7Smrg
3772b676d7Smrg#if 0			/* Debug */
3872b676d7Smrg#define SISDGBMC
3972b676d7Smrg#endif
4072b676d7Smrg
/* Framebuffer memory allocator entry points; they are only declared here,
 * their definitions are not part of this section of the file.
 */
4172b676d7Smrgextern unsigned int SISAllocateFBMemory(ScrnInfoPtr pScrn, void **handle, int bytesize);
4272b676d7Smrgextern void	    SISFreeFBMemory(ScrnInfoPtr pScrn, void **handle);
4372b676d7Smrg
4472b676d7Smrg#define CPUBUFFERSIZE 2048       /* Size of /proc/cpuinfo buffer */
/* BUFFERSIZE is the number of bytes copied per timed benchmark run and the
 * base size of each benchmark buffer (callers add alignment slack on top).
 */
4572b676d7Smrg#define BUFFERSIZE (576 * 1152)  /* Matches 720x576 YUV420 */
4672b676d7Smrg
4772b676d7Smrg/************************************************************************/
4872b676d7Smrg/*                   arch specific memcpy() routines                    */
4972b676d7Smrg/************************************************************************/
5072b676d7Smrg
5172b676d7Smrg/* i386, AMD64 */
5272b676d7Smrg
/*
 * Statement macros that emit a single inline-asm block each.  Every macro
 * already ends in ';' and declares a "memory" clobber, so the compiler does
 * not move memory accesses across it.
 *
 *   FENCE     - sfence
 *   FENCEMMS  - sfence followed by emms
 *   FEMMS     - femms
 *   EMMS      - emms
 *
 * They are meant to be handed to PREFETCH_FUNC/NOPREFETCH_FUNC as the
 * "begin" and "fence" arguments (the instantiations are not in this part
 * of the file).
 */
5372b676d7Smrg#define FENCE 			\
5472b676d7Smrg     __asm__ __volatile__( 	\
5572b676d7Smrg		  " sfence\n" 	\
5672b676d7Smrg		  :		\
5772b676d7Smrg		  :		\
5872b676d7Smrg		  : "memory");
5972b676d7Smrg
6072b676d7Smrg#define FENCEMMS 		\
6172b676d7Smrg     __asm__ __volatile__ (	\
6272b676d7Smrg		  " sfence\n"	\
6372b676d7Smrg		  " emms\n"	\
6472b676d7Smrg		  :		\
6572b676d7Smrg		  :		\
6672b676d7Smrg		  : "memory");
6772b676d7Smrg
6872b676d7Smrg#define FEMMS 			\
6972b676d7Smrg     __asm__ __volatile__(	\
7072b676d7Smrg		  " femms\n"	\
7172b676d7Smrg		  :		\
7272b676d7Smrg		  :		\
7372b676d7Smrg		  : "memory");
7472b676d7Smrg
7572b676d7Smrg#define EMMS 			\
7672b676d7Smrg     __asm__ __volatile__(	\
7772b676d7Smrg		  " emms\n"	\
7872b676d7Smrg		  :		\
7972b676d7Smrg		  :		\
8072b676d7Smrg		  : "memory");
8172b676d7Smrg
/*
 * Prefetch helpers.
 *
 * SSE_PREFETCH / NOW_PREFETCH are the instruction mnemonics (as string
 * literals) that get pasted in front of an address operand; PREFETCH_FUNC
 * selects one of them via ptype##_PREFETCH.
 *
 * PREFETCH1 issues nine prefetches relative to `from`, at offsets
 *   0, 32, 64, 96, 128, 160, 192, 256 and 288.
 * NOTE(review): offset 224 is not in the list - confirm whether that gap
 * is intentional.
 * PREFETCH2 issues one prefetch at offset 320, PREFETCH3 one at offset 288.
 *
 * Each macro is a complete statement (trailing ';' included) and takes the
 * source pointer in a general register ("r" constraint).
 */
8272b676d7Smrg#define SSE_PREFETCH " prefetchnta "
8372b676d7Smrg#define NOW_PREFETCH " prefetch "
8472b676d7Smrg
8572b676d7Smrg#define PREFETCH1(arch_prefetch,from)		\
8672b676d7Smrg    __asm__ __volatile__ (			\
8772b676d7Smrg		  arch_prefetch "(%0)\n"	\
8872b676d7Smrg		  arch_prefetch "32(%0)\n"	\
8972b676d7Smrg		  arch_prefetch "64(%0)\n"	\
9072b676d7Smrg		  arch_prefetch "96(%0)\n"	\
9172b676d7Smrg		  arch_prefetch "128(%0)\n"	\
9272b676d7Smrg		  arch_prefetch "160(%0)\n"	\
9372b676d7Smrg		  arch_prefetch "192(%0)\n"	\
9472b676d7Smrg		  arch_prefetch "256(%0)\n"	\
9572b676d7Smrg		  arch_prefetch "288(%0)\n"	\
9672b676d7Smrg		  : 				\
9772b676d7Smrg		  : "r" (from) );
9872b676d7Smrg
9972b676d7Smrg#define PREFETCH2(arch_prefetch,from)		\
10072b676d7Smrg    __asm__ __volatile__ (			\
10172b676d7Smrg		  arch_prefetch "320(%0)\n"	\
10272b676d7Smrg		  : 				\
10372b676d7Smrg		  : "r" (from) );
10472b676d7Smrg
10572b676d7Smrg#define PREFETCH3(arch_prefetch,from)		\
10672b676d7Smrg    __asm__ __volatile__ (			\
10772b676d7Smrg		  arch_prefetch "288(%0)\n"	\
10872b676d7Smrg		  : 				\
10972b676d7Smrg		  : "r" (from) );
11072b676d7Smrg
/*
 * small_memcpy_i386(to, from, n)
 *
 * Copy n bytes with the x86 string instructions: one byte if bit 0 of n is
 * set, one 16-bit word if bit 1 is set, then "rep movsl" for the remaining
 * n / 4 dwords.
 *
 * `to` and `from` must be pointer lvalues; both are advanced past the
 * copied data.  `n` is evaluated exactly once and is left untouched.
 *
 * The count lives in a scratch variable that is tied to ecx as an in/out
 * operand: the asm shifts and counts ecx down, and an input-only operand
 * must never be modified by the asm.
 */
#define small_memcpy_i386(to,from,n)					\
    {									\
	unsigned int small_cnt_ = (unsigned int)(n);			\
	__asm__ __volatile__(						\
		  " cld\n"						\
		  " shrl $1, %%ecx\n"					\
		  " jnc 1f\n"						\
		  " movsb\n"						\
		"1: shrl $1, %%ecx\n"					\
		  " jnc 2f\n"						\
		  " movsw\n"						\
		"2: rep ; movsl"					\
		  : "=&D" (to), "=&S" (from), "=&c" (small_cnt_)	\
		  : "0" ((long) to), "1" ((long) from),			\
		    "2" (small_cnt_)					\
		  : "memory", "cc");					\
    }
12672b676d7Smrg
/*
 * small_memcpy_amd64(to, from, n)
 *
 * Copy n bytes with the x86-64 string instructions: one byte if bit 0 of n
 * is set, one 16-bit word if bit 1 is set, one dword if bit 2 is set, then
 * "rep movsq" for the remaining n / 8 quadwords.
 *
 * `to` and `from` must be pointer lvalues; both are advanced past the
 * copied data.  `n` is evaluated exactly once and is left untouched.
 *
 * The count is widened into a 64-bit scratch variable that is tied to rcx
 * as an in/out operand: the asm operates on the full rcx and modifies it,
 * so it must be neither a narrower int nor an input-only operand.
 */
#define small_memcpy_amd64(to,from,n)					\
    {									\
	unsigned long small_cnt_ = (unsigned long)(n);			\
	__asm__ __volatile__(						\
		  " cld\n"						\
		  " shrq $1, %%rcx\n"					\
		  " jnc 1f\n"						\
		  " movsb\n"						\
		"1: shrq $1, %%rcx\n"					\
		  " jnc 2f\n"						\
		  " movsw\n"						\
		"2: shrq $1, %%rcx\n"					\
		  " jnc 3f\n"						\
		  " movsl\n"						\
		"3: rep ; movsq"					\
		  : "=&D" (to), "=&S" (from), "=&c" (small_cnt_)	\
		  : "0" ((long) to), "1" ((long) from),			\
		    "2" (small_cnt_)					\
		  : "memory", "cc");					\
    }
14572b676d7Smrg
/*
 * MMX_CPY(prefetch, from, to, dummy, lcnt)
 *
 * Copy lcnt blocks of 64 bytes from `from` to `to` through the MMX
 * registers mm0-mm3, using plain movq loads and stores.
 *
 *   prefetch - string literal pasted in front of the operands "320(%1)" and
 *              "352(%1)", i.e. a prefetch mnemonic applied to the source
 *   from, to - pointer lvalues; both are advanced by 64 bytes per block
 *   dummy    - scratch int lvalue that receives the running block counter
 *   lcnt     - number of 64-byte blocks; the counter is decremented before
 *              it is tested, so the callers in this file only expand the
 *              macro under if(lcnt)
 *
 * Pointer and counter updates use leal/decl (32-bit operand size).
 */
14672b676d7Smrg#define MMX_CPY(prefetch,from,to,dummy,lcnt)				\
14772b676d7Smrg    __asm__ __volatile__ (						\
14872b676d7Smrg	        "1:\n"							\
14972b676d7Smrg		    prefetch "320(%1)\n"				\
15072b676d7Smrg	          " movq (%1), %%mm0\n"					\
15172b676d7Smrg		  " movq 8(%1), %%mm1\n"				\
15272b676d7Smrg		  " movq 16(%1), %%mm2\n"				\
15372b676d7Smrg		  " movq 24(%1), %%mm3\n"				\
15472b676d7Smrg		  " movq %%mm0, (%0)\n"					\
15572b676d7Smrg		  " movq %%mm1, 8(%0)\n"				\
15672b676d7Smrg		  " movq %%mm2, 16(%0)\n"				\
15772b676d7Smrg		  " movq %%mm3, 24(%0)\n"				\
15872b676d7Smrg		    prefetch "352(%1)\n"				\
15972b676d7Smrg		  " movq 32(%1), %%mm0\n"				\
16072b676d7Smrg		  " movq 40(%1), %%mm1\n"				\
16172b676d7Smrg		  " movq 48(%1), %%mm2\n"				\
16272b676d7Smrg		  " movq 56(%1), %%mm3\n"				\
16372b676d7Smrg		  " leal 64(%1),%1\n"					\
16472b676d7Smrg		  " movq %%mm0, 32(%0)\n"				\
16572b676d7Smrg		  " movq %%mm1, 40(%0)\n"				\
16672b676d7Smrg		  " movq %%mm2, 48(%0)\n"				\
16772b676d7Smrg		  " movq %%mm3, 56(%0)\n"				\
16872b676d7Smrg		  " decl %2\n"						\
16972b676d7Smrg		  " leal 64(%0),%0\n"					\
17072b676d7Smrg		  " jne 1b\n"						\
17172b676d7Smrg		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
17272b676d7Smrg		  : "0" (to), "1" (from), "2" (lcnt) 			\
17372b676d7Smrg		  : "memory", "cc");
17472b676d7Smrg
/*
 * SSE_CPY(prefetch, from, to, dummy, lcnt)
 *
 * Copy lcnt blocks of 64 bytes from `from` to `to` through xmm0-xmm3.
 * The macro expands to an if/else statement:
 *   - source not 16-byte aligned ((ULong)from & 15): loads use movups
 *   - source 16-byte aligned:                        loads use movaps
 * Stores are movntps in both branches.
 * NOTE(review): only the source alignment is tested here; the destination
 * alignment is not checked by this macro.
 *
 *   prefetch - string literal pasted in front of "320(%1)" / "352(%1)"
 *   from, to - pointer lvalues; both are advanced by 64 bytes per block
 *   dummy    - scratch int lvalue that receives the running block counter
 *   lcnt     - number of 64-byte blocks; decremented before being tested,
 *              so the callers in this file guard the expansion with
 *              if(lcnt)
 *
 * Pointer updates use leal (32-bit operand size).
 */
17572b676d7Smrg#define SSE_CPY(prefetch,from,to,dummy,lcnt)				\
17672b676d7Smrg    if((ULong) from & 15) {						\
17772b676d7Smrg	__asm__ __volatile__ (						\
17872b676d7Smrg		"1:\n"							\
17972b676d7Smrg		    prefetch "320(%1)\n"				\
18072b676d7Smrg		  " movups (%1), %%xmm0\n"				\
18172b676d7Smrg		  " movups 16(%1), %%xmm1\n"				\
18272b676d7Smrg		  " movntps %%xmm0, (%0)\n"				\
18372b676d7Smrg		  " movntps %%xmm1, 16(%0)\n"				\
18472b676d7Smrg		    prefetch "352(%1)\n"				\
18572b676d7Smrg		  " movups 32(%1), %%xmm2\n"				\
18672b676d7Smrg		  " movups 48(%1), %%xmm3\n"				\
18772b676d7Smrg		  " leal 64(%1),%1\n"					\
18872b676d7Smrg		  " movntps %%xmm2, 32(%0)\n"				\
18972b676d7Smrg		  " movntps %%xmm3, 48(%0)\n"				\
19072b676d7Smrg		  " decl %2\n"						\
19172b676d7Smrg		  " leal 64(%0),%0\n"					\
19272b676d7Smrg		  " jne 1b\n"						\
19372b676d7Smrg		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
19472b676d7Smrg		  : "0" (to), "1" (from), "2" (lcnt)			\
19572b676d7Smrg		  : "memory", "cc"); 					\
19672b676d7Smrg    } else {								\
19772b676d7Smrg	__asm__ __volatile__ (						\
19872b676d7Smrg		"2:\n"							\
19972b676d7Smrg		    prefetch "320(%1)\n"				\
20072b676d7Smrg		  " movaps (%1), %%xmm0\n"				\
20172b676d7Smrg		  " movaps 16(%1), %%xmm1\n"				\
20272b676d7Smrg		  " movntps %%xmm0, (%0)\n"				\
20372b676d7Smrg		  " movntps %%xmm1, 16(%0)\n"				\
20472b676d7Smrg        	    prefetch "352(%1)\n"				\
20572b676d7Smrg		  " movaps 32(%1), %%xmm2\n"				\
20672b676d7Smrg		  " movaps 48(%1), %%xmm3\n"				\
20772b676d7Smrg		  " leal 64(%1),%1\n"					\
20872b676d7Smrg		  " movntps %%xmm2, 32(%0)\n"				\
20972b676d7Smrg		  " movntps %%xmm3, 48(%0)\n"				\
21072b676d7Smrg		  " decl %2\n"						\
21172b676d7Smrg		  " leal 64(%0),%0\n"					\
21272b676d7Smrg		  " jne 2b\n"						\
21372b676d7Smrg		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
21472b676d7Smrg		  : "0" (to), "1" (from), "2" (lcnt)			\
21572b676d7Smrg		  : "memory", "cc");					\
21672b676d7Smrg    }
21772b676d7Smrg
/*
 * SSE64_CPY(prefetch, from, to, dummy, lcnt)
 *
 * Same copy loop as SSE_CPY, but the pointer updates use leaq (64-bit
 * operand size).  Copies lcnt blocks of 64 bytes through xmm0-xmm3:
 *   - source not 16-byte aligned ((ULong)from & 15): loads use movups
 *   - source 16-byte aligned:                        loads use movaps
 * Stores are movntps in both branches.
 * NOTE(review): only the source alignment is tested here; the destination
 * alignment is not checked by this macro.
 *
 *   prefetch - string literal pasted in front of "320(%1)" / "352(%1)"
 *   from, to - pointer lvalues; both are advanced by 64 bytes per block
 *   dummy    - scratch int lvalue that receives the running block counter
 *              (decremented with decl)
 *   lcnt     - number of 64-byte blocks; decremented before being tested,
 *              so the callers in this file guard the expansion with
 *              if(lcnt)
 */
21872b676d7Smrg#define SSE64_CPY(prefetch,from,to,dummy,lcnt)				\
21972b676d7Smrg    if((ULong) from & 15) {						\
22072b676d7Smrg	__asm__ __volatile__ (						\
22172b676d7Smrg		"1:\n"							\
22272b676d7Smrg		    prefetch "320(%1)\n"				\
22372b676d7Smrg		  " movups (%1), %%xmm0\n"				\
22472b676d7Smrg		  " movups 16(%1), %%xmm1\n"				\
22572b676d7Smrg		  " movntps %%xmm0, (%0)\n"				\
22672b676d7Smrg		  " movntps %%xmm1, 16(%0)\n"				\
22772b676d7Smrg		    prefetch "352(%1)\n"				\
22872b676d7Smrg		  " movups 32(%1), %%xmm2\n"				\
22972b676d7Smrg		  " movups 48(%1), %%xmm3\n"				\
23072b676d7Smrg		  " leaq 64(%1),%1\n"					\
23172b676d7Smrg		  " movntps %%xmm2, 32(%0)\n"				\
23272b676d7Smrg		  " movntps %%xmm3, 48(%0)\n"				\
23372b676d7Smrg		  " decl %2\n"						\
23472b676d7Smrg		  " leaq 64(%0),%0\n"					\
23572b676d7Smrg		  " jne 1b\n"						\
23672b676d7Smrg		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
23772b676d7Smrg		  : "0" (to), "1" (from), "2" (lcnt)			\
23872b676d7Smrg		  : "memory", "cc"); 					\
23972b676d7Smrg    } else {								\
24072b676d7Smrg	__asm__ __volatile__ (						\
24172b676d7Smrg		"2:\n"							\
24272b676d7Smrg		    prefetch "320(%1)\n"				\
24372b676d7Smrg		  " movaps (%1), %%xmm0\n"				\
24472b676d7Smrg		  " movaps 16(%1), %%xmm1\n"				\
24572b676d7Smrg		  " movntps %%xmm0, (%0)\n"				\
24672b676d7Smrg		  " movntps %%xmm1, 16(%0)\n"				\
24772b676d7Smrg        	    prefetch "352(%1)\n"				\
24872b676d7Smrg		  " movaps 32(%1), %%xmm2\n"				\
24972b676d7Smrg		  " movaps 48(%1), %%xmm3\n"				\
25072b676d7Smrg		  " leaq 64(%1),%1\n"					\
25172b676d7Smrg		  " movntps %%xmm2, 32(%0)\n"				\
25272b676d7Smrg		  " movntps %%xmm3, 48(%0)\n"				\
25372b676d7Smrg		  " decl %2\n"						\
25472b676d7Smrg		  " leaq 64(%0),%0\n"					\
25572b676d7Smrg		  " jne 2b\n"						\
25672b676d7Smrg		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
25772b676d7Smrg		  : "0" (to), "1" (from), "2" (lcnt)			\
25872b676d7Smrg		  : "memory", "cc");					\
25972b676d7Smrg    }
26072b676d7Smrg
/*
 * MMXEXT_CPY(prefetch, from, to, dummy, lcnt)
 *
 * Copy lcnt blocks of 64 bytes from `from` to `to` through mm0-mm3.
 * Loads are movq, stores are movntq; the loop head is aligned with
 * ".p2align 4,,7".
 *
 *   prefetch - string literal pasted in front of "320(%1)" / "352(%1)"
 *   from, to - pointer lvalues; both are advanced by 64 bytes per block
 *   dummy    - scratch int lvalue that receives the running block counter
 *   lcnt     - number of 64-byte blocks; decremented before being tested,
 *              so the callers in this file guard the expansion with
 *              if(lcnt)
 *
 * Pointer and counter updates use leal/decl (32-bit operand size).
 */
26172b676d7Smrg#define MMXEXT_CPY(prefetch,from,to,dummy,lcnt)				\
26272b676d7Smrg    __asm__ __volatile__ (						\
26372b676d7Smrg		  ".p2align 4,,7\n"					\
26472b676d7Smrg		 "1:\n"							\
26572b676d7Smrg		    prefetch "320(%1)\n"				\
26672b676d7Smrg		  " movq (%1), %%mm0\n"					\
26772b676d7Smrg		  " movq 8(%1), %%mm1\n"				\
26872b676d7Smrg		  " movq 16(%1), %%mm2\n"				\
26972b676d7Smrg		  " movq 24(%1), %%mm3\n"				\
27072b676d7Smrg		  " movntq %%mm0, (%0)\n"				\
27172b676d7Smrg		  " movntq %%mm1, 8(%0)\n"				\
27272b676d7Smrg		  " movntq %%mm2, 16(%0)\n"				\
27372b676d7Smrg		  " movntq %%mm3, 24(%0)\n"				\
27472b676d7Smrg		    prefetch "352(%1)\n"				\
27572b676d7Smrg		  " movq 32(%1), %%mm0\n"				\
27672b676d7Smrg		  " movq 40(%1), %%mm1\n"				\
27772b676d7Smrg		  " movq 48(%1), %%mm2\n"				\
27872b676d7Smrg		  " movq 56(%1), %%mm3\n"				\
27972b676d7Smrg		  " leal 64(%1),%1\n"					\
28072b676d7Smrg		  " movntq %%mm0, 32(%0)\n"				\
28172b676d7Smrg		  " movntq %%mm1, 40(%0)\n"				\
28272b676d7Smrg		  " movntq %%mm2, 48(%0)\n"				\
28372b676d7Smrg		  " movntq %%mm3, 56(%0)\n"				\
28472b676d7Smrg		  " decl %2\n"						\
28572b676d7Smrg		  " leal 64(%0),%0\n"					\
28672b676d7Smrg		  " jne 1b\n"						\
28772b676d7Smrg		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
28872b676d7Smrg		  : "0" (to), "1" (from), "2" (lcnt) 			\
28972b676d7Smrg		  : "memory", "cc");
29072b676d7Smrg
29172b676d7Smrg
/*
 * PREFETCH_FUNC(prefix, itype, ptype, begin, fence, small)
 *
 * Generates
 *     static void prefix_memcpy(UChar *to, const UChar *from, int size)
 *
 *   itype  - selects the block-copy macro itype##_CPY
 *   ptype  - selects the prefetch mnemonic ptype##_PREFETCH
 *   begin  - statement executed before copying starts
 *   fence  - statement executed after copying has finished
 *   small  - tail-copy macro, invoked as small(to, from, rest)
 *
 * The generated function splits `size` into lcnt = size >> 6 blocks of
 * 64 bytes plus rest = size & 63 tail bytes.  It runs PREFETCH1 on the
 * source, then `begin`, copies the blocks with itype##_CPY when lcnt is
 * non-zero (which also advances to/from), and, when rest is non-zero,
 * copies the tail with `small` bracketed by PREFETCH2 and PREFETCH3.
 * `fence` runs last.
 */
29272b676d7Smrg#define PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small)		\
29372b676d7Smrg									\
29472b676d7Smrg    static void prefix##_memcpy(UChar *to,				\
29572b676d7Smrg				const UChar *from,			\
29672b676d7Smrg				int size)				\
29772b676d7Smrg    {									\
29872b676d7Smrg	int lcnt = size >> 6;						\
29972b676d7Smrg	int rest = size & 63;						\
30072b676d7Smrg	register int dummy;						\
30172b676d7Smrg									\
30272b676d7Smrg	PREFETCH1(ptype##_PREFETCH,from);				\
30372b676d7Smrg									\
30472b676d7Smrg	begin;								\
30572b676d7Smrg	if(lcnt) {							\
30672b676d7Smrg	   itype##_CPY(ptype##_PREFETCH,from,to,dummy,lcnt);		\
30772b676d7Smrg	}								\
30872b676d7Smrg	if(rest) {							\
30972b676d7Smrg	   PREFETCH2(ptype##_PREFETCH,from);				\
31072b676d7Smrg	   small(to, from, rest);					\
31172b676d7Smrg	   PREFETCH3(ptype##_PREFETCH,from);				\
31272b676d7Smrg	}								\
31372b676d7Smrg	fence;								\
31472b676d7Smrg    }
31572b676d7Smrg
/*
 * NOPREFETCH_FUNC(prefix, itype, begin, fence, small)
 *
 * Generates
 *     static void prefix_memcpy(UChar *to, const UChar *from, int size)
 * like PREFETCH_FUNC, but without the PREFETCH1/2/3 calls.
 *
 * The block-copy macro itype##_CPY is handed the string "#" in place of a
 * prefetch mnemonic - presumably so that the assembler treats the pasted
 * "320(%1)" / "352(%1)" operands as a comment; confirm against the
 * assembler in use.
 *
 * The generated function runs `begin`, copies lcnt = size >> 6 blocks of
 * 64 bytes when lcnt is non-zero, copies the rest = size & 63 tail bytes
 * with small(to, from, rest) when rest is non-zero, and finally runs
 * `fence`.
 */
31672b676d7Smrg#define NOPREFETCH_FUNC(prefix,itype,begin,fence,small)			\
31772b676d7Smrg									\
31872b676d7Smrg    static void prefix##_memcpy(UChar *to,				\
31972b676d7Smrg				const UChar *from,			\
32072b676d7Smrg				int size)				\
32172b676d7Smrg    {									\
32272b676d7Smrg	int lcnt = size >> 6;						\
32372b676d7Smrg	int rest = size & 63;						\
32472b676d7Smrg	register int dummy;						\
32572b676d7Smrg									\
32672b676d7Smrg	begin;								\
32772b676d7Smrg	if(lcnt) {							\
32872b676d7Smrg	   itype##_CPY("#",from,to,dummy,lcnt);				\
32972b676d7Smrg	}								\
33072b676d7Smrg	if(rest) {							\
33172b676d7Smrg	   small(to, from, rest);					\
33272b676d7Smrg	}								\
33372b676d7Smrg	fence;								\
33472b676d7Smrg    }
33572b676d7Smrg
33672b676d7Smrg/* Other archs */
33772b676d7Smrg
33872b676d7Smrg/* ... */
33972b676d7Smrg
34072b676d7Smrg
34172b676d7Smrg/* Type for table for benchmark list */
34272b676d7Smrg
/*
 * One entry of a memcpy candidate table.  The tables are walked until an
 * entry with a NULL mFunc is found.
 */
34372b676d7Smrgtypedef struct {
34472b676d7Smrg    vidCopyFunc  mFunc;		/* the copy routine; NULL terminates the table */
34572b676d7Smrg    char         *mName;	/* name printed in the log messages */
34672b676d7Smrg    unsigned int mycpuflag;	/* entry is considered only if (CPU flags & mycpuflag) != 0 */
34772b676d7Smrg    int          grade;		/* static rank used when not copying from video RAM; lower wins */
34872b676d7Smrg    int 	 gradefrom;	/* static rank used when copying from video RAM; lower wins */
34972b676d7Smrg    Bool         reqAlignment;	/* if set, never chosen as the routine for unaligned transfers */
35072b676d7Smrg} SISMCFuncData;
35172b676d7Smrg
35272b676d7Smrg/************************************************************************/
35372b676d7Smrg/*                   libc memcpy() wrapper - generic                    */
35472b676d7Smrg/************************************************************************/
35572b676d7Smrg
35672b676d7Smrgstatic void SiS_libc_memcpy(UChar *dst, const UChar *src, int size)
35772b676d7Smrg{
35872b676d7Smrg    memcpy(dst, src, size);
35972b676d7Smrg}
36072b676d7Smrg
36172b676d7Smrg/************************************************************************/
36272b676d7Smrg/* We only do all that stuff under gcc; no idea what other compilers 	*/
36372b676d7Smrg/* would do with our asm code.  					*/
36472b676d7Smrg/************************************************************************/
36572b676d7Smrg
36672b676d7Smrg#ifndef __GNUC__
36772b676d7Smrg
36872b676d7Smrgunsigned int SiSGetCPUFlags(ScrnInfoPtr pScrn)
36972b676d7Smrg{
37072b676d7Smrg    return 0;
37172b676d7Smrg}
37272b676d7Smrg
37372b676d7SmrgvidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
37472b676d7Smrg{
37572b676d7Smrg    *UMemCpy = SiS_libc_memcpy;
37672b676d7Smrg    return SiS_libc_memcpy;
37772b676d7Smrg}
37872b676d7Smrg
37972b676d7SmrgvidCopyFunc SiSVidCopyGetDefault(void)
38072b676d7Smrg{
38172b676d7Smrg    return SiS_libc_memcpy;
38272b676d7Smrg}
38372b676d7Smrg
38472b676d7Smrg#else /* ! Everything below is gcc specific ! */
38572b676d7Smrg
38672b676d7Smrg/************************************************************************/
38772b676d7Smrg/*                    Definitions for archs and OSes                    */
38872b676d7Smrg/************************************************************************/
38972b676d7Smrg
39072b676d7Smrg#undef SiS_checkosforsse
39172b676d7Smrg#undef SiS_canBenchmark
39272b676d7Smrg#undef SiS_haveProc
39372b676d7Smrg#undef SiS_haveBuiltInMC
39472b676d7Smrg
39572b676d7Smrg#if defined(__i386__) /* ***************************************** i386 */
39672b676d7Smrg
39772b676d7Smrg#define SiS_checkosforsse 	/* Does this cpu support sse and do we need to check os? */
39872b676d7Smrg#define SiS_canBenchmark	/* Can we perform a benchmark? */
39972b676d7Smrg#ifdef SIS_LINUX
40072b676d7Smrg#define SiS_haveProc		/* Do we have /proc/cpuinfo or similar? */
40172b676d7Smrg#endif
40272b676d7Smrg#define SiS_haveBuiltInMC	/* Is there a built-in memcpy for this arch? */
40372b676d7Smrg
40472b676d7Smrg/* Built-in memcpy for i386 */
/*
 * Copy n bytes from `from` to `to` with the string-move instructions and
 * return `to`.
 *
 * The byte count is loaded into ecx and shifted right twice: a carry out
 * of the first shift copies one byte (movsb), a carry out of the second
 * copies one 16-bit word (movsw); the remaining value is the dword count
 * for "rep movsl".
 *
 * d1, d2 and d3 are dummy outputs tied to the ecx/edi/esi inputs so that
 * the compiler knows those registers are modified.
 */
40572b676d7Smrgstatic __inline void * builtin_memcpy(void * to, const void * from, size_t n)
40672b676d7Smrg{
40772b676d7Smrg    int d1,d2,d3;
40872b676d7Smrg
40972b676d7Smrg    __asm__ __volatile__(
41072b676d7Smrg		  " cld\n"
41172b676d7Smrg		  " shrl $1, %%ecx\n"
41272b676d7Smrg		  " jnc 1f\n"
41372b676d7Smrg		  " movsb\n"
41472b676d7Smrg		"1: shrl $1, %%ecx\n"
41572b676d7Smrg		  " jnc 2f\n"
41672b676d7Smrg		  " movsw\n"
41772b676d7Smrg		"2: rep ; movsl\n"
41872b676d7Smrg		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
41972b676d7Smrg		  : "0" (n), "1" ((long) to), "2" ((long) from)
42072b676d7Smrg		  : "memory", "cc");
42172b676d7Smrg
42272b676d7Smrg    return(to);
42372b676d7Smrg}
42472b676d7Smrg
42572b676d7Smrg/* Alternative for 586: Unroll loop, copy 32 bytes at a time */
/*
 * Copy n bytes from `from` to `to` (i386 variant).
 *
 *   n <= 32 : everything is copied by the final "rep movsb" (label 3).
 *   n >  32 : first (-to & 3) bytes are copied with "rep movsb" so that the
 *             destination becomes 4-byte aligned; then the loop at label 1
 *             moves 32 bytes per iteration through eax/edx; the remaining
 *             (< 32) bytes are copied with "rep movsb".
 *
 * Before the loop and at the top of every iteration the code loads from
 * the *destination* (edi) into a register that is overwritten right
 * afterwards - the "read-ahead" trick mentioned in the inline comment.
 *
 * d1, d2 and d3 are dummy outputs tied to the ecx/edi/esi inputs; eax and
 * edx are declared as clobbered.
 */
42672b676d7Smrgstatic void SiS_builtin_memcp2(UChar *to, const UChar *from, int n)
42772b676d7Smrg{
42872b676d7Smrg    int d1,d2,d3;
42972b676d7Smrg
43072b676d7Smrg    __asm__ __volatile__(
43172b676d7Smrg		  " movl %%edi, %%eax\n"
43272b676d7Smrg		  " cmpl $32, %%ecx\n"
43372b676d7Smrg		  " cld\n"
43472b676d7Smrg		  " jbe 3f\n"
43572b676d7Smrg		  " negl %%eax\n"		/* Align dest */
43672b676d7Smrg		  " andl $3, %%eax\n"
43772b676d7Smrg		  " subl %%eax, %%ecx\n"
43872b676d7Smrg		  " xchgl %%eax, %%ecx\n"
43972b676d7Smrg		  " rep ; movsb\n"
44072b676d7Smrg		  " movl %%eax, %%ecx\n"
44172b676d7Smrg		  " subl $32, %%ecx\n"
44272b676d7Smrg		  " js 2f\n"
44372b676d7Smrg		  " movl (%%edi), %%eax\n"
44472b676d7Smrg		"1: movl 28(%%edi), %%edx\n"   	/* Trick: Read-ahead */
44572b676d7Smrg		  " subl $32, %%ecx\n"
44672b676d7Smrg		  " movl (%%esi), %%eax\n"
44772b676d7Smrg		  " movl 4(%%esi), %%edx\n"
44872b676d7Smrg		  " movl %%eax, (%%edi)\n"
44972b676d7Smrg		  " movl %%edx, 4(%%edi)\n"
45072b676d7Smrg		  " movl 8(%%esi), %%eax\n"
45172b676d7Smrg		  " movl 12(%%esi), %%edx\n"
45272b676d7Smrg		  " movl %%eax, 8(%%edi)\n"
45372b676d7Smrg		  " movl %%edx, 12(%%edi)\n"
45472b676d7Smrg		  " movl 16(%%esi), %%eax\n"
45572b676d7Smrg		  " movl 20(%%esi), %%edx\n"
45672b676d7Smrg		  " movl %%eax, 16(%%edi)\n"
45772b676d7Smrg		  " movl %%edx, 20(%%edi)\n"
45872b676d7Smrg		  " movl 24(%%esi), %%eax\n"
45972b676d7Smrg		  " movl 28(%%esi), %%edx\n"
46072b676d7Smrg		  " movl %%eax, 24(%%edi)\n"
46172b676d7Smrg		  " movl %%edx, 28(%%edi)\n"
46272b676d7Smrg		  " leal 32(%%esi), %%esi\n"
46372b676d7Smrg		  " leal 32(%%edi), %%edi\n"
46472b676d7Smrg		  " jns 1b\n"			/* tests the flags of the subl above; mov/lea leave them alone */
46572b676d7Smrg		"2: addl $32, %%ecx\n"		/* undo the last subtraction: ecx = leftover byte count */
46672b676d7Smrg		"3: rep ; movsb"
46772b676d7Smrg		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
46872b676d7Smrg		  : "0" (n), "1" ((long) to), "2" ((long) from)
46972b676d7Smrg		  : "eax", "edx", "memory", "cc");
47072b676d7Smrg
47172b676d7Smrg}
47272b676d7Smrg
/*
 * Return a time stamp for benchmarking (i386 variant).
 *
 * Executes cpuid with eax = 0 followed by rdtsc and returns the value that
 * rdtsc leaves in eax; edx is only listed as clobbered, so the upper half
 * of the counter is discarded.  ebx, which cpuid overwrites, is saved and
 * restored around the sequence with pushl/popl.
 */
47372b676d7Smrgstatic unsigned int taketime(void)	/* get current time (for benchmarking) */
47472b676d7Smrg{
47572b676d7Smrg    unsigned int eax;
47672b676d7Smrg
47772b676d7Smrg    __asm__ volatile (
47872b676d7Smrg		" pushl %%ebx\n"
47972b676d7Smrg		" cpuid\n"
48072b676d7Smrg		" rdtsc\n"
48172b676d7Smrg		" popl %%ebx\n"
48272b676d7Smrg		: "=a" (eax)
48372b676d7Smrg		: "0" (0)
48472b676d7Smrg		: "ecx", "edx", "cc");
48572b676d7Smrg
48672b676d7Smrg    return(eax);
48772b676d7Smrg}
48872b676d7Smrg
48972b676d7Smrg#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /***************** AMD64 */
49072b676d7Smrg
49172b676d7Smrg#define SiS_checkosforsse	/* Does this cpu support sse and do we need to check os? */
49272b676d7Smrg#define SiS_canBenchmark	/* Can we perform a benchmark? */
49372b676d7Smrg#ifdef SIS_LINUX
49472b676d7Smrg#define SiS_haveProc		/* Do we have /proc/cpuinfo or similar? */
49572b676d7Smrg#endif
49672b676d7Smrg#define SiS_haveBuiltInMC	/* Is there a built-in memcpy for this arch? */
49772b676d7Smrg
/*
 * Built-in memcpy for AMD64.
 *
 * Copies n bytes from `from` to `to` and returns `to`: n / 8 quadwords are
 * moved with "rep movsq", the remaining n % 8 bytes with "rep movsb".
 * A count of zero or less copies nothing.
 *
 * d1, d2 and d3 are dummy outputs tied to the rcx/rdi/rsi inputs.  All
 * three are early-clobber: rcx is counted down to zero by "rep movsq"
 * before operand 4 (the tail count) is read, so the compiler must not be
 * allowed to place operand 4 in rcx, rdi or rsi.
 */
static __inline void * builtin_memcpy(void * to, const void * from, int n)
{
    long d1, d2, d3;

    /* A negative int would otherwise be sign-extended into a huge count */
    if(n <= 0) {
       return(to);
    }

    __asm__ __volatile__ (
		" cld\n"
		" rep ; movsq\n"
		" movq %4, %%rcx\n"
		" rep ; movsb"
		: "=&c" (d1), "=&D" (d2), "=&S" (d3)
		: "0" ((unsigned long)(n >> 3)), "q" ((unsigned long)(n & 7)),
		  "1" ((long) to), "2" ((long) from)
		: "memory", "cc");

    return(to);
}
51572b676d7Smrg
51672b676d7Smrg/* Alternative: Unroll loop, copy 32 bytes at a time */
/*
 * Copy n bytes from `from` to `to` (AMD64 variant).
 *
 *   n <= 32 : everything is copied by the final "rep movsb" (label 1).
 *   n >  32 : first (-to & 7) bytes are copied with "rep movsb" so that the
 *             destination becomes 8-byte aligned; then the loop at label 3
 *             moves 32 bytes per iteration through rax/rdx/r8/r9; the
 *             remaining (< 32) bytes are copied with "rep movsb".
 *
 * d1, d2 and d3 are dummy outputs tied to the rcx/rdi/rsi inputs; rax,
 * rdx, r8 and r9 are declared as clobbered.
 */
51772b676d7Smrgstatic void SiS_builtin_memcp2(UChar *to, const UChar *from, int n)
51872b676d7Smrg{
51972b676d7Smrg    long d1,d2,d3;
52072b676d7Smrg
52172b676d7Smrg    __asm__ __volatile__(
52272b676d7Smrg		  " movq %%rdi, %%rax\n"
52372b676d7Smrg		  " cmpq $32, %%rcx\n"
52472b676d7Smrg		  " cld\n"			/* Pipeline; no other flags but DF */
52572b676d7Smrg		  " jbe 1f\n"
52672b676d7Smrg		  " negq %%rax\n"		/* Align dest */
52772b676d7Smrg		  " andq $7, %%rax\n"
52872b676d7Smrg		  " subq %%rax, %%rcx\n"
52972b676d7Smrg		  " xchgq %%rax, %%rcx\n"
53072b676d7Smrg		  " rep ; movsb\n"
53172b676d7Smrg		  " movq %%rax, %%rcx\n"
53272b676d7Smrg		  " subq $32, %%rcx\n"
53372b676d7Smrg		  " js 2f\n"
53472b676d7Smrg		  ".p2align 4\n"
53572b676d7Smrg		"3: subq $32, %%rcx\n"
53672b676d7Smrg		  " movq (%%rsi), %%rax\n"
53772b676d7Smrg		  " movq 8(%%rsi), %%rdx\n"
53872b676d7Smrg		  " movq 16(%%rsi), %%r8\n"
53972b676d7Smrg		  " movq 24(%%rsi), %%r9\n"
54072b676d7Smrg		  " movq %%rax, (%%rdi)\n"
54172b676d7Smrg		  " movq %%rdx, 8(%%rdi)\n"
54272b676d7Smrg		  " movq %%r8, 16(%%rdi)\n"
54372b676d7Smrg		  " movq %%r9, 24(%%rdi)\n"
54472b676d7Smrg		  " leaq 32(%%rsi), %%rsi\n"
54572b676d7Smrg		  " leaq 32(%%rdi), %%rdi\n"
54672b676d7Smrg		  " jns 3b\n"			/* tests the flags of the subq above; mov/lea leave them alone */
54772b676d7Smrg		"2: addq $32, %%rcx\n"		/* undo the last subtraction: rcx = leftover byte count */
54872b676d7Smrg		"1: rep ; movsb"
54972b676d7Smrg		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
55072b676d7Smrg		  :"0" ((ULong) n), "1" ((long) to), "2" ((long) from)
55172b676d7Smrg		  : "rax", "rdx", "r8", "r9", "memory", "cc");
55272b676d7Smrg
55372b676d7Smrg}
55472b676d7Smrg
/*
 * Return a time stamp for benchmarking (AMD64 variant).
 *
 * Executes cpuid with eax = 0 followed by rdtsc and returns the low 32 bits
 * of the counter (the value rdtsc leaves in eax).
 *
 * cpuid overwrites rbx.  Instead of saving it with pushq/popq - which
 * writes below the stack pointer and can therefore corrupt data the
 * compiler keeps in the x86-64 red zone - rbx is simply declared as
 * clobbered, together with rcx and rdx.
 */
static unsigned int taketime(void)
{
    unsigned int eax;

    __asm__ volatile (
		" cpuid\n"
		" rdtsc\n"
		: "=a" (eax)
		: "0" (0)
		: "rbx", "rcx", "rdx", "cc");

    return(eax);
}
57072b676d7Smrg
57172b676d7Smrg#else		/* **************************************** Other archs */
57272b676d7Smrg
57372b676d7Smrg/* 1. Can we do a benchmark?		*/
57472b676d7Smrg/* #define SiS_canBenchmark		*/
57572b676d7Smrg
57672b676d7Smrg/* 2. Do we have /proc filesystem or similar for CPU information? */
57772b676d7Smrg/* #define SiS_haveProc			*/
57872b676d7Smrg
57972b676d7Smrg/* 3. Optional: built-in memcpy()	*/
58072b676d7Smrg/* #define SiS_haveBuiltInMC		*/
58172b676d7Smrg/* static __inline void * builtin_memcpy(void * to, const void * from, int n)
58272b676d7Smrg   {
58372b676d7Smrg   }
58472b676d7Smrg*/
58572b676d7Smrg
58672b676d7Smrg/* 4. Function for getting current time (for benchmarking)  */
58772b676d7Smrg/* static unsigned int taketime(void)
58872b676d7Smrg   {
58972b676d7Smrg   }
59072b676d7Smrg*/
59172b676d7Smrg
59272b676d7Smrg#endif
59372b676d7Smrg
59472b676d7Smrg/************************************************************************/
59572b676d7Smrg/*                   Generic built-in memcpy wrapper                    */
59672b676d7Smrg/************************************************************************/
59772b676d7Smrg
59872b676d7Smrg#ifdef SiS_haveBuiltInMC
59972b676d7Smrgstatic void SiS_builtin_memcpy(UChar *dst, const UChar *src, int size)
60072b676d7Smrg{
60172b676d7Smrg    builtin_memcpy(dst, src, size);
60272b676d7Smrg}
60372b676d7Smrg#endif
60472b676d7Smrg
60572b676d7Smrg/************************************************************************/
60672b676d7Smrg/* Generic routines if Benchmark can be performed (all archs, all OSes) */
60772b676d7Smrg/************************************************************************/
60872b676d7Smrg
60972b676d7Smrg#ifdef SiS_canBenchmark
61072b676d7Smrg
61172b676d7Smrg/* Get time (unsigned int) */
61272b676d7Smrgstatic unsigned int time_function(vidCopyFunc mf, UChar *buf1, UChar *buf2, int size)
61372b676d7Smrg{
61472b676d7Smrg    unsigned int t1, t2;
61572b676d7Smrg
61672b676d7Smrg    t1 = taketime();
61772b676d7Smrg
61872b676d7Smrg    (*mf)(buf1, buf2, size);
61972b676d7Smrg
62072b676d7Smrg    t2 = taketime();
62172b676d7Smrg
62272b676d7Smrg    return((t1 <  t2) ? t2 - t1 : 0xFFFFFFFFU - (t1 - t2 - 1));
62372b676d7Smrg}
62472b676d7Smrg
62572b676d7Smrg/* Allocate an area of offscreen FB memory (buf1), a simulated video
62672b676d7Smrg * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
62772b676d7Smrg */
62872b676d7Smrgstatic void *
62972b676d7SmrgSiS_AllocBuffers(ScrnInfoPtr pScrn, UChar **buf1, UChar **buf2, UChar **buf3)
63072b676d7Smrg{
63172b676d7Smrg    SISPtr pSiS = SISPTR(pScrn);
63272b676d7Smrg    unsigned int offset;
63372b676d7Smrg    void *handle = NULL;
63472b676d7Smrg
63572b676d7Smrg    if(!(offset = SISAllocateFBMemory(pScrn, &handle, BUFFERSIZE + 31))) {
63672b676d7Smrg       return NULL;
63772b676d7Smrg    }
63872b676d7Smrg    (*buf1) = (UChar *)pSiS->FbBase + offset;
63972b676d7Smrg    (*buf1) = (UChar *)(((ULong)(*buf1) + 31) & ~31);
64072b676d7Smrg
64174c14cd6Smrg    if(!((*buf2) = (UChar *)malloc(BUFFERSIZE + 15))) {
64272b676d7Smrg       SISFreeFBMemory(pScrn, &handle);
64372b676d7Smrg       return NULL;
64472b676d7Smrg    }
64572b676d7Smrg
64674c14cd6Smrg    if(!((*buf3) = (UChar *)malloc(BUFFERSIZE + 15))) {
64774c14cd6Smrg       free((*buf2));
64872b676d7Smrg       SISFreeFBMemory(pScrn, &handle);
64972b676d7Smrg       return NULL;
65072b676d7Smrg    }
65172b676d7Smrg
65272b676d7Smrg    return handle;
65372b676d7Smrg}
65472b676d7Smrg
65572b676d7Smrg/* Perform Benchmark */
65672b676d7Smrgstatic int SiS_BenchmarkMemcpy(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions,
65772b676d7Smrg                               unsigned int myCPUflags, UChar *buf1, UChar *buf2,
65872b676d7Smrg			       UChar *buf3, char *frqBuf, double cpuFreq,
65972b676d7Smrg			       vidCopyFunc *UMemCpy, int *best2, Bool from)
66072b676d7Smrg{
66172b676d7Smrg    SISMCFuncData *curData;
66272b676d7Smrg    int j = 0, bestSoFar = 0;
66372b676d7Smrg    unsigned int tmp1, tmp2, best = 0xFFFFFFFFU, sbest = 0xFFFFFFFFU;
66472b676d7Smrg
66572b676d7Smrg    (*best2) = 0;
66672b676d7Smrg
66772b676d7Smrg    /* Make probable buf1 and buf2 are not paged out by referencing them */
66872b676d7Smrg    SiS_libc_memcpy(buf1, buf2, BUFFERSIZE);
66972b676d7Smrg
67072b676d7Smrg    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
67172b676d7Smrg	       "Benchmarking %s RAM to %s RAM memory transfer methods:\n",
67272b676d7Smrg	       from ? "video" : "system",
67372b676d7Smrg	       from ? "system" : "video");
67472b676d7Smrg
67572b676d7Smrg#ifdef TWDEBUG
67672b676d7Smrg    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Benchmark: CPUFlags %x\n", myCPUflags);
67772b676d7Smrg#endif
67872b676d7Smrg
67972b676d7Smrg    j = 0;
68072b676d7Smrg    while(MCFunctions[j].mFunc) {
68172b676d7Smrg
68272b676d7Smrg	curData = MCFunctions + j;
68372b676d7Smrg
68472b676d7Smrg	if(myCPUflags & curData->mycpuflag) {
68572b676d7Smrg
68672b676d7Smrg	   /* Simulate setup of the video buffer and copy result to framebuffer */
68772b676d7Smrg	   /* Do this 4 times to verify results */
68872b676d7Smrg	   if(!from) {
68972b676d7Smrg	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
69072b676d7Smrg	      tmp1 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
69172b676d7Smrg	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
69272b676d7Smrg	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
69372b676d7Smrg	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
69472b676d7Smrg	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
69572b676d7Smrg	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
69672b676d7Smrg	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
69772b676d7Smrg	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
69872b676d7Smrg	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
69972b676d7Smrg	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
70072b676d7Smrg	   } else {
70172b676d7Smrg	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
70272b676d7Smrg	      tmp1 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
70372b676d7Smrg	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
70472b676d7Smrg	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
70572b676d7Smrg	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
70672b676d7Smrg	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
70772b676d7Smrg	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
70872b676d7Smrg	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
70972b676d7Smrg	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
71072b676d7Smrg	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
71172b676d7Smrg	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
71272b676d7Smrg	   }
71372b676d7Smrg
71472b676d7Smrg	   if((!frqBuf) || (tmp1 == 0)) {
71572b676d7Smrg	      xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
71672b676d7Smrg			   "\tChecked %s memcpy()... \t%u\n",curData->mName, tmp1);
71772b676d7Smrg	   } else {
71872b676d7Smrg	      xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
71972b676d7Smrg			   "\tChecked %s memcpy()... \t%.1f MiB/s\n",
72072b676d7Smrg			   curData->mName,
72172b676d7Smrg			   cpuFreq * 1.e6 * (double)BUFFERSIZE / ((double)(tmp1) * (double)(0x100000)));
72272b676d7Smrg	   }
72372b676d7Smrg
72472b676d7Smrg	   if(tmp1 < best) {
72572b676d7Smrg	      best = tmp1;
72672b676d7Smrg	      bestSoFar = j;
72772b676d7Smrg	   }
72872b676d7Smrg
72972b676d7Smrg	   if(!curData->reqAlignment) {
73072b676d7Smrg	      if(tmp1 < sbest) {
73172b676d7Smrg	         sbest = tmp1;
73272b676d7Smrg	         (*best2) = j;
73372b676d7Smrg	      }
73472b676d7Smrg	   }
73572b676d7Smrg
73672b676d7Smrg	}
73772b676d7Smrg
73872b676d7Smrg	j++;
73972b676d7Smrg    }
74072b676d7Smrg
74172b676d7Smrg    return bestSoFar;
74272b676d7Smrg}
74372b676d7Smrg
74472b676d7Smrgstatic vidCopyFunc SiS_GetBestByGrade(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions,
74572b676d7Smrg			unsigned int myCPUflags, vidCopyFunc *UMemCpy, Bool from)
74672b676d7Smrg{
74772b676d7Smrg    int j = 0, best = -1, secondbest = -1, bestSoFar = 10, best2SoFar = 10;
74872b676d7Smrg    int grade;
74972b676d7Smrg
75072b676d7Smrg    *UMemCpy = SiS_libc_memcpy;
75172b676d7Smrg
75272b676d7Smrg    while(MCFunctions[j].mFunc) {
75372b676d7Smrg	if(myCPUflags & MCFunctions[j].mycpuflag) {
75472b676d7Smrg	   grade = from ? MCFunctions[j].gradefrom : MCFunctions[j].grade;
75572b676d7Smrg	   if(grade < bestSoFar) {
75672b676d7Smrg	      best = j;
75772b676d7Smrg	      bestSoFar = grade;
75872b676d7Smrg	   }
75972b676d7Smrg	   if(grade < best2SoFar) {
76072b676d7Smrg	      if(!MCFunctions[j].reqAlignment) {
76172b676d7Smrg	         secondbest = j;
76272b676d7Smrg		 best2SoFar = grade;
76372b676d7Smrg	      }
76472b676d7Smrg	   }
76572b676d7Smrg	}
76672b676d7Smrg	j++;
76772b676d7Smrg    }
76872b676d7Smrg    if(best >= 0) {
76972b676d7Smrg       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
77072b676d7Smrg		"Chose %s method for aligned data transfers %s video RAM\n",
77172b676d7Smrg		MCFunctions[best].mName,
77272b676d7Smrg		from ? "from" : "to");
77372b676d7Smrg       if(secondbest >= 0) {
77472b676d7Smrg          xf86DrvMsg(pScrn->scrnIndex, X_INFO,
77572b676d7Smrg		"Chose %s method for unaligned data transfers %s video RAM\n",
77672b676d7Smrg		   MCFunctions[secondbest].mName,
77772b676d7Smrg		   from ? "from" : "to");
77872b676d7Smrg          *UMemCpy = MCFunctions[secondbest].mFunc;
77972b676d7Smrg       }
78072b676d7Smrg       return MCFunctions[best].mFunc;
78172b676d7Smrg    }
78272b676d7Smrg
78372b676d7Smrg    return SiS_libc_memcpy;
78472b676d7Smrg}
78572b676d7Smrg#endif /* canBenchmark */
78672b676d7Smrg
78772b676d7Smrg/**********************************************************************/
78872b676d7Smrg/*      Generic routines if /proc filesystem is available (Linux)     */
78972b676d7Smrg/**********************************************************************/
79072b676d7Smrg
79172b676d7Smrg#ifdef SiS_haveProc
79272b676d7Smrg/* Linux: Read file (/proc/cpuinfo) into buffer */
79372b676d7Smrgstatic int SiS_ReadProc(char *buf, char *filename)
79472b676d7Smrg{
79572b676d7Smrg    FILE *cpuInfoFile;
79672b676d7Smrg    int count;
79772b676d7Smrg
79872b676d7Smrg    if((cpuInfoFile = fopen(filename, "r")) == NULL) {
79972b676d7Smrg       return 0;
80072b676d7Smrg    }
80172b676d7Smrg
80272b676d7Smrg    count = fread(buf, 1, CPUBUFFERSIZE, cpuInfoFile);
80372b676d7Smrg    if(ferror(cpuInfoFile)) {
80472b676d7Smrg       fclose(cpuInfoFile);
80572b676d7Smrg       return 0;
80672b676d7Smrg    }
80772b676d7Smrg
80872b676d7Smrg    fclose(cpuInfoFile);
80972b676d7Smrg
81072b676d7Smrg    if(count >= CPUBUFFERSIZE - 2) {
81172b676d7Smrg       return 0;
81272b676d7Smrg    }
81372b676d7Smrg
81472b676d7Smrg    buf[count] = 0;
81572b676d7Smrg
81672b676d7Smrg    return count;
81772b676d7Smrg}
81872b676d7Smrg
81972b676d7Smrg/* Linux: Extract CPU speed from /proc/cpuinfo */
82072b676d7Smrgstatic char *SiS_GetCPUFreq(ScrnInfoPtr pScrn, char *buf, double *cpuFreq)
82172b676d7Smrg{
82272b676d7Smrg    char *frqBuf, *endBuf;
82372b676d7Smrg
82472b676d7Smrg    (*cpuFreq) = 0.0;
82572b676d7Smrg
82672b676d7Smrg    if((frqBuf = strstr(buf,"cpu MHz\t\t:"))) {
82772b676d7Smrg       frqBuf += 11;
82872b676d7Smrg       (*cpuFreq) = strtod(frqBuf, &endBuf);
82972b676d7Smrg       if(endBuf == frqBuf) frqBuf = NULL;
83072b676d7Smrg       if((*cpuFreq) < 10.0) frqBuf = NULL; /* sanity check */
83172b676d7Smrg       if(frqBuf) {
83272b676d7Smrg          xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU frequency %.2fMhz\n", (*cpuFreq));
83372b676d7Smrg       }
83472b676d7Smrg    }
83572b676d7Smrg
83672b676d7Smrg    return frqBuf;
83772b676d7Smrg}
83872b676d7Smrg#endif /* haveProc */
83972b676d7Smrg
84072b676d7Smrg/**********************************************************************/
84172b676d7Smrg/*                      Arch-specific routines                        */
84272b676d7Smrg/**********************************************************************/
84372b676d7Smrg
84472b676d7Smrg#ifdef SiS_checkosforsse   /* Common i386, AMD64  */
84572b676d7Smrg
84672b676d7Smrg#ifdef SISCHECKOSSSE
84772b676d7Smrg
84872b676d7Smrg#ifndef XFree86LOADER
84972b676d7Smrg#include <setjmp.h>
85072b676d7Smrg#endif
85172b676d7Smrg
/* Context saved by setjmp() in CheckOSforSSE(); target of the handler below */
static jmp_buf sigill_return;

/*
 * Illegal-instruction hook (registered through xf86InterceptSigIll() in
 * CheckOSforSSE()): resume at the saved setjmp() context, making that
 * setjmp() call return 1.
 */
static void
sigill_handler(void)
{
    longjmp(sigill_return, 1);
}
85872b676d7Smrg#endif
85972b676d7Smrg
86072b676d7Smrgstatic Bool CheckOSforSSE(ScrnInfoPtr pScrn)
86172b676d7Smrg{
86272b676d7Smrg#ifdef SISCHECKOSSSE  /* Check OS for SSE possible: */
86372b676d7Smrg    int signo = -1;
86472b676d7Smrg
86572b676d7Smrg#ifdef SISDGBMC
86672b676d7Smrg    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Checking OS SSE support\n");
86772b676d7Smrg#endif
86872b676d7Smrg
86972b676d7Smrg    xf86InterceptSigIll(&sigill_handler);
87072b676d7Smrg
87172b676d7Smrg    if(setjmp(sigill_return)) {
87272b676d7Smrg       signo = 4;
87372b676d7Smrg    } else {
87472b676d7Smrg       __asm__ __volatile__ (" xorps %xmm0, %xmm0\n");
87572b676d7Smrg       /* __asm__ __volatile__ (" .byte 0xff\n"); */  /* For test */
87672b676d7Smrg    }
87772b676d7Smrg
87872b676d7Smrg    xf86InterceptSigIll(NULL);
87972b676d7Smrg
88072b676d7Smrg#ifdef SISDGBMC
88172b676d7Smrg    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "OS SSE support signal %d\n", signo);
88272b676d7Smrg#endif
88372b676d7Smrg
88472b676d7Smrg    if(signo != -1) {
88572b676d7Smrg       xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
88672b676d7Smrg		"OS does not support SSE instructions\n");
88772b676d7Smrg    }
88872b676d7Smrg
88972b676d7Smrg    return (signo >= 0) ? FALSE : TRUE;
89072b676d7Smrg
89172b676d7Smrg#else  /* no check for SSE possible: */
89272b676d7Smrg
89372b676d7Smrg    SISPtr pSiS = SISPTR(pScrn);
89472b676d7Smrg
89572b676d7Smrg    xf86DrvMsg(pScrn->scrnIndex, pSiS->XvSSEMemcpy ? X_WARNING : X_INFO,
89672b676d7Smrg	"Checking OS for SSE support is not supported in this version of " SISMYSERVERNAME "\n");
89772b676d7Smrg
89872b676d7Smrg    if(pSiS->XvSSEMemcpy) {
89972b676d7Smrg       xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
90072b676d7Smrg	"If you get a signal 4 here, set the option \"UseSSE\" to \"off\".\n");
90172b676d7Smrg       return TRUE;
90272b676d7Smrg    } else {
90372b676d7Smrg       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
90472b676d7Smrg	"If your OS supports SSE, set the option \"UseSSE\" to \"on\".\n");
90572b676d7Smrg       return FALSE;
90672b676d7Smrg    }
90772b676d7Smrg#endif
90872b676d7Smrg}
90972b676d7Smrg
91072b676d7Smrg#endif /* SiS_checkosforsse */
91172b676d7Smrg
91272b676d7Smrg#ifdef __i386__   /* i386 specific *************************************/
91372b676d7Smrg
/*
 * i386 copy routine instantiations. PREFETCH_FUNC and NOPREFETCH_FUNC are
 * macros that are not visible in this part of the file.
 * NOTE(review): presumably each line expands to the SiS_<variant>_memcpy()
 * function that MCFunctions_i386 refers to (SiS_sse_memcpy,
 * SiS_mmxext_memcpy, SiS_now_memcpy, SiS_mmx_memcpy) - confirm against the
 * macro definitions.
 */
91472b676d7SmrgPREFETCH_FUNC(SiS_sse,SSE,SSE,,FENCE,small_memcpy_i386)
91572b676d7SmrgPREFETCH_FUNC(SiS_mmxext,MMXEXT,SSE,EMMS,FENCEMMS,small_memcpy_i386)
91672b676d7SmrgPREFETCH_FUNC(SiS_now,MMX,NOW,FEMMS,FEMMS,small_memcpy_i386)
91772b676d7SmrgNOPREFETCH_FUNC(SiS_mmx,MMX,EMMS,EMMS,small_memcpy_i386)
91872b676d7Smrg
/*
 * Copy routines that can be selected on i386.
 *
 * The table is walked until an entry with a NULL function pointer is
 * found, so the last row is the terminator. The entry index is what the
 * selection code remembers, so the row order must stay as it is.
 * NOTE(review): the columns are presumably mFunc, mName, mycpuflag, grade,
 * gradefrom and reqAlignment, in that order - confirm against the
 * SISMCFuncData declaration. SiS_GetBestByGrade() prefers the lowest
 * grade; the terminator's 10 matches the "nothing found yet" start value
 * used there.
 */
91972b676d7Smrgstatic SISMCFuncData MCFunctions_i386[] = {
92072b676d7Smrg    {SiS_libc_memcpy,   "libc",      SIS_CPUFL_LIBC,  4,  4, FALSE},
92172b676d7Smrg    {SiS_builtin_memcpy,"built-in-1",SIS_CPUFL_BI,    5,  5, FALSE},
92272b676d7Smrg    {SiS_builtin_memcp2,"built-in-2",SIS_CPUFL_BI2,   6,  6, FALSE},
92372b676d7Smrg    {SiS_mmx_memcpy,    "MMX",       SIS_CPUFL_MMX,   3,  3, FALSE},
92472b676d7Smrg    {SiS_sse_memcpy,    "SSE",       SIS_CPUFL_SSE,   1,  0, TRUE},
92572b676d7Smrg    {SiS_now_memcpy,    "3DNow!",    SIS_CPUFL_3DNOW, 2,  2, FALSE},
92672b676d7Smrg    {SiS_mmxext_memcpy, "MMX2",      SIS_CPUFL_MMX2,  0,  1, FALSE},
92772b676d7Smrg    {NULL,              "",          0,              10, 10, FALSE}
92872b676d7Smrg};
92972b676d7Smrg
/* Flags ORed into the detected CPU flags by SiSVidCopyInitGen(), so that
 * these routines are always candidates regardless of CPU features.
 */
93072b676d7Smrg#define Def_FL  (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2)  /* Default methods */
93172b676d7Smrg
/*
 * Execute CPUID with EAX = op and store the resulting EAX, EBX, ECX and
 * EDX values in the lvalues eax, ebx, ecx and edx.
 *
 * %ebx is preserved by hand (pushl/popl) around the instruction, so the
 * EBX result has to leave the asm in a different register. That operand
 * is pinned to %esi ("=S"): with a plain "=r" the compiler may choose
 * %ebx itself (it is a normal allocatable register in non-PIC code), and
 * the final popl would then overwrite the result with the saved value.
 */
#define cpuid(op, eax, ebx, ecx, edx) 		\
    __asm__ __volatile__ (			\
		" pushl %%ebx\n"		\
		" cpuid\n"			\
		" movl %%ebx, %1\n"		\
		" popl %%ebx\n"			\
		: "=a" (eax), "=S" (ebx), 	\
		  "=c" (ecx), "=d" (edx)	\
		: "a" (op)			\
		: "cc")
94272b676d7Smrg
94372b676d7Smrgstatic Bool cpuIDSupported(ScrnInfoPtr pScrn)
94472b676d7Smrg{
94572b676d7Smrg    int eax, ebx, ecx, edx;
94672b676d7Smrg
94772b676d7Smrg    /* Check for cpuid instruction */
94872b676d7Smrg    __asm__ __volatile__ (
94972b676d7Smrg		" pushf\n"
95072b676d7Smrg		" popl %0\n"
95172b676d7Smrg		" movl %0, %1\n"
95272b676d7Smrg		" xorl $0x200000, %0\n"
95372b676d7Smrg		" push %0\n"
95472b676d7Smrg		" popf\n"
95572b676d7Smrg		" pushf\n"
95672b676d7Smrg		" popl %0\n"
95772b676d7Smrg		: "=a" (eax), "=c" (ecx)
95872b676d7Smrg		:
95972b676d7Smrg		: "cc");
96072b676d7Smrg
96172b676d7Smrg    if(eax == ecx) {
96272b676d7Smrg       xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support CPUID instruction\n");
96372b676d7Smrg       return FALSE;
96472b676d7Smrg    }
96572b676d7Smrg
96672b676d7Smrg    /* Check for cpuid level */
96772b676d7Smrg    cpuid(0x00000000, eax, ebx, ecx, edx);
96872b676d7Smrg    if(!eax) {
96972b676d7Smrg       return FALSE;
97072b676d7Smrg    }
97172b676d7Smrg
97272b676d7Smrg    /* Check for RDTSC */
97372b676d7Smrg    cpuid(0x00000001, eax, ebx, ecx, edx);
97472b676d7Smrg
97572b676d7Smrg    if(!(edx & 0x10)) {
97672b676d7Smrg       xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support RDTSC instruction\n");
97772b676d7Smrg       return FALSE;
97872b676d7Smrg    }
97972b676d7Smrg
98072b676d7Smrg    return TRUE;
98172b676d7Smrg}
98272b676d7Smrg
98372b676d7Smrgstatic unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
98472b676d7Smrg{
98572b676d7Smrg    unsigned int flags = 0, eax, ebx, ecx, edx;
98672b676d7Smrg    Bool IsAMD;
98772b676d7Smrg
98872b676d7Smrg    /* Check if cpuid and rdtsc instructions are supported */
98972b676d7Smrg    if(!cpuIDSupported(pScrn)) {
99072b676d7Smrg       return 0;
99172b676d7Smrg    }
99272b676d7Smrg
99372b676d7Smrg    cpuid(0x00000000, eax, ebx, ecx, edx);
99472b676d7Smrg
99572b676d7Smrg    IsAMD = (ebx == 0x68747541) && (edx == 0x69746e65) && (ecx == 0x444d4163);
99672b676d7Smrg
99772b676d7Smrg    cpuid(0x00000001, eax, ebx, ecx, edx);
99872b676d7Smrg    /* MMX */
99972b676d7Smrg    if(edx & 0x00800000) flags |= SIS_CPUFL_MMX;
100072b676d7Smrg    /* SSE, MMXEXT */
100172b676d7Smrg    if(edx & 0x02000000) flags |= (SIS_CPUFL_SSE | SIS_CPUFL_MMX2);
100272b676d7Smrg    /* SSE2 - don't need this one directly, set SSE instead */
100372b676d7Smrg    if(edx & 0x04000000) flags |= (SIS_CPUFL_SSE | SIS_CPUFL_SSE2);
100472b676d7Smrg
100572b676d7Smrg    cpuid(0x80000000, eax, ebx, ecx, edx);
100672b676d7Smrg    if(eax >= 0x80000001) {
100772b676d7Smrg       cpuid(0x80000001, eax, ebx, ecx, edx);
100872b676d7Smrg       /* 3DNow! */
100972b676d7Smrg       if(edx & 0x80000000) flags |= SIS_CPUFL_3DNOW;
101072b676d7Smrg       /* AMD MMXEXT */
101172b676d7Smrg       if(IsAMD && (edx & 0x00400000)) flags |= SIS_CPUFL_MMX2;
101272b676d7Smrg    }
101372b676d7Smrg
101472b676d7Smrg    return flags;
101572b676d7Smrg}
101672b676d7Smrg
101772b676d7Smrg#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /* AMD64 specific ***** */
101872b676d7Smrg
/*
 * AMD64 copy routine instantiation. PREFETCH_FUNC is a macro that is not
 * visible in this part of the file.
 * NOTE(review): presumably this expands to the SiS_sse_memcpy() function
 * used in the table below - confirm against the macro definition.
 */
101972b676d7SmrgPREFETCH_FUNC(SiS_sse,SSE64,SSE,,FENCE,small_memcpy_amd64)
102072b676d7Smrg
/*
 * Copy routines that can be selected on AMD64.
 *
 * The table is walked until an entry with a NULL function pointer is
 * found, so the last row is the terminator. The entry index is what the
 * selection code remembers, so the row order must stay as it is.
 * NOTE(review): the columns are presumably mFunc, mName, mycpuflag, grade,
 * gradefrom and reqAlignment, in that order - confirm against the
 * SISMCFuncData declaration. SiS_GetBestByGrade() prefers the lowest
 * grade.
 */
102172b676d7Smrgstatic SISMCFuncData MCFunctions_AMD64[] = {
102272b676d7Smrg    {SiS_libc_memcpy,   "libc",      SIS_CPUFL_LIBC, 2,  2, FALSE},
102372b676d7Smrg    {SiS_builtin_memcpy,"built-in-1",SIS_CPUFL_BI,   1,  1, FALSE},
102472b676d7Smrg    {SiS_builtin_memcp2,"built-in-2",SIS_CPUFL_BI2,  3,  3, FALSE},
102572b676d7Smrg    {SiS_sse_memcpy,    "SSE",       SIS_CPUFL_SSE,  0,  0, TRUE},
102672b676d7Smrg    {NULL,              "",          0,             10, 10, FALSE}
102772b676d7Smrg};
102872b676d7Smrg
/* Flags ORed into the detected CPU flags by SiSVidCopyInitGen(), so that
 * these routines are always candidates regardless of CPU features.
 */
102972b676d7Smrg#define Def_FL  (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2)
103072b676d7Smrg
103172b676d7Smrgstatic unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
103272b676d7Smrg{
103372b676d7Smrg    return((unsigned int)(SIS_CPUFL_SSE|SIS_CPUFL_SSE2));
103472b676d7Smrg}
103572b676d7Smrg
103672b676d7Smrg#else  /* Specific for other archs ******************************** */
103772b676d7Smrg
103872b676d7Smrg/* Fill in here */
103972b676d7Smrg
104072b676d7Smrg#define Def_FL  (SIS_CPUFL_LIBC)
104172b676d7Smrg
104272b676d7Smrgstatic unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
104372b676d7Smrg{
104472b676d7Smrg    return((unsigned int)(0));
104572b676d7Smrg}
104672b676d7Smrg
104772b676d7Smrg#endif
104872b676d7Smrg
104972b676d7Smrg/**********************************************************************/
105072b676d7Smrg/*     Benchmark the video copy routines and choose the fastest       */
105172b676d7Smrg/**********************************************************************/
105272b676d7Smrg
105372b676d7Smrg#ifdef SiS_canBenchmark
105472b676d7Smrgstatic vidCopyFunc
105572b676d7SmrgSiSVidCopyInitGen(ScreenPtr pScreen, SISMCFuncData *MCFunctions, vidCopyFunc *UMemCpy, Bool from)
105672b676d7Smrg{
105774c14cd6Smrg    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
105872b676d7Smrg    SISPtr pSiS = SISPTR(pScrn);
105972b676d7Smrg    void *fbhandle = NULL;
106072b676d7Smrg    char  *frqBuf = NULL;
106172b676d7Smrg    UChar *buf1, *buf2, *buf3;
106272b676d7Smrg    double cpuFreq = 0.0;
106372b676d7Smrg    unsigned int myCPUflags = pSiS->CPUFlags | Def_FL;
106472b676d7Smrg    int best, secondbest;
106572b676d7Smrg#ifdef SiS_haveProc
106672b676d7Smrg    char buf[CPUBUFFERSIZE];
106772b676d7Smrg#endif
106872b676d7Smrg
106972b676d7Smrg    *UMemCpy = SiS_libc_memcpy;
107072b676d7Smrg
107172b676d7Smrg    /* Bail out if user disabled benchmarking */
107272b676d7Smrg    if(!pSiS->BenchMemCpy) {
107372b676d7Smrg       return SiS_libc_memcpy;
107472b676d7Smrg    }
107572b676d7Smrg
107672b676d7Smrg#ifdef SiS_haveProc
107772b676d7Smrg    /* Read /proc/cpuinfo into buf */
107872b676d7Smrg    if(SiS_ReadProc(buf, "/proc/cpuinfo")) {
107972b676d7Smrg
108072b676d7Smrg       /* Extract CPU frequency */
108172b676d7Smrg       frqBuf = SiS_GetCPUFreq(pScrn, buf, &cpuFreq);
108272b676d7Smrg
108372b676d7Smrg    }
108472b676d7Smrg#endif
108572b676d7Smrg
108672b676d7Smrg    /* Allocate buffers */
108772b676d7Smrg    if(!(fbhandle = SiS_AllocBuffers(pScrn, &buf1, &buf2, &buf3))) {
108872b676d7Smrg       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
108972b676d7Smrg       		"Failed to allocate video RAM for video data transfer benchmark\n");
109072b676d7Smrg       return SiS_GetBestByGrade(pScrn, MCFunctions, myCPUflags, UMemCpy, from);
109172b676d7Smrg    }
109272b676d7Smrg
109372b676d7Smrg    /* Perform Benchmark */
109472b676d7Smrg    best = SiS_BenchmarkMemcpy(pScrn, MCFunctions, myCPUflags, buf1,
109572b676d7Smrg    				(UChar *)(((unsigned long)buf2 + 15) & ~15),
109672b676d7Smrg				(UChar *)(((unsigned long)buf3 + 15) & ~15),
109772b676d7Smrg				frqBuf, cpuFreq, UMemCpy, &secondbest, from);
109872b676d7Smrg
109972b676d7Smrg    /* Free buffers */
110072b676d7Smrg    SISFreeFBMemory(pScrn, &fbhandle);
110174c14cd6Smrg    free(buf2);
110274c14cd6Smrg    free(buf3);
110372b676d7Smrg
110472b676d7Smrg    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
110572b676d7Smrg	       "Using %s method for aligned data transfers %s video RAM\n",
110672b676d7Smrg	       MCFunctions[best].mName,
110772b676d7Smrg	       from ? "from" : "to");
110872b676d7Smrg
110972b676d7Smrg    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
111072b676d7Smrg	       "Using %s method for unaligned data transfers %s video RAM\n",
111172b676d7Smrg	       MCFunctions[secondbest].mName,
111272b676d7Smrg	       from ? "from" : "to");
111372b676d7Smrg
111472b676d7Smrg    return MCFunctions[best].mFunc;
111572b676d7Smrg}
111672b676d7Smrg#endif /* canBenchmark */
111772b676d7Smrg
111872b676d7Smrg/**********************************************************************/
111972b676d7Smrg/* 		       main(): Get CPU capabilities		      */
112072b676d7Smrg/* 			    (called externally)			      */
112172b676d7Smrg/**********************************************************************/
112272b676d7Smrg
112372b676d7Smrgunsigned int
112472b676d7SmrgSiSGetCPUFlags(ScrnInfoPtr pScrn)
112572b676d7Smrg{
112672b676d7Smrg    unsigned int myCPUflags = SiS_GetCpuFeatures(pScrn);
112772b676d7Smrg
112872b676d7Smrg#ifdef SiS_checkosforsse
112972b676d7Smrg    if(myCPUflags & (SIS_CPUFL_SSE | SIS_CPUFL_SSE2)) {
113072b676d7Smrg
113172b676d7Smrg       /* Check if OS supports usage of SSE instructions */
113272b676d7Smrg       if(!(CheckOSforSSE(pScrn))) {
113372b676d7Smrg          myCPUflags &= ~(SIS_CPUFL_SSE | SIS_CPUFL_SSE2);
113472b676d7Smrg       }
113572b676d7Smrg
113672b676d7Smrg    }
113772b676d7Smrg#endif
113872b676d7Smrg
113972b676d7Smrg    return myCPUflags;
114072b676d7Smrg}
114172b676d7Smrg
114272b676d7Smrg/**********************************************************************/
114372b676d7Smrg/*                       main(): SiSVidCopyInit()                     */
114472b676d7Smrg/*			    (called externally)			      */
114572b676d7Smrg/*		(SiSGetCPUFlags must be called before this one)       */
114672b676d7Smrg/**********************************************************************/
114772b676d7Smrg
114872b676d7SmrgvidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
114972b676d7Smrg{
115072b676d7Smrg#if defined(__i386__) && defined(SiS_canBenchmark)
115172b676d7Smrg    return(SiSVidCopyInitGen(pScreen, MCFunctions_i386, UMemCpy, from));
115272b676d7Smrg#elif (defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__)) && defined(SiS_canBenchmark)
115372b676d7Smrg    return(SiSVidCopyInitGen(pScreen, MCFunctions_AMD64, UMemCpy, from));
115472b676d7Smrg#else /* Other cases: Use libc memcpy() */
115572b676d7Smrg    *UMemCpy = SiS_libc_memcpy;
115672b676d7Smrg    return SiS_libc_memcpy;
115772b676d7Smrg#endif
115872b676d7Smrg}
115972b676d7Smrg
116072b676d7SmrgvidCopyFunc SiSVidCopyGetDefault(void)
116172b676d7Smrg{
116272b676d7Smrg    return SiS_libc_memcpy;
116372b676d7Smrg}
116472b676d7Smrg
116572b676d7Smrg#endif /* GNU C */
116672b676d7Smrg
116772b676d7Smrg
1168