/*
 * SiS memcpy() routines (assembly)
 *
 * Copyright (C) 2004-2005 Thomas Winischhofer
 *
 * Idea and some code bits from via_memcpy.c which is
 * Copyright (C) 2004 Thomas Hellstroem, All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE CODE SUPPLIER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdlib.h>
#include "sis.h"

#if 0			/* Debug */
#define SISDGBMC
#endif

extern unsigned int SISAllocateFBMemory(ScrnInfoPtr pScrn, void **handle, int bytesize);
extern void	    SISFreeFBMemory(ScrnInfoPtr pScrn, void **handle);

#define CPUBUFFERSIZE 2048       /* Size of /proc/cpuinfo buffer */
#define BUFFERSIZE (576 * 1152)  /* Large enough for a 720x576 YUV420 frame */

/************************************************************************/
/*                   arch specific memcpy() routines                    */
/************************************************************************/

/* i386, AMD64 */

#define FENCE 			\
     __asm__ __volatile__( 	\
		  " sfence\n" 	\
		  :		\
		  :		\
		  : "memory");

#define FENCEMMS 		\
     __asm__ __volatile__ (	\
		  " sfence\n"	\
		  " emms\n"	\
		  :		\
		  :		\
		  : "memory");

#define FEMMS 			\
     __asm__ __volatile__(	\
		  " femms\n"	\
		  :		\
		  :		\
		  : "memory");

#define EMMS 			\
     __asm__ __volatile__(	\
		  " emms\n"	\
		  :		\
		  :		\
		  : "memory");

#define SSE_PREFETCH " prefetchnta "
#define NOW_PREFETCH " prefetch "

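/* The copy loops below run ~320 bytes ahead of the read pointer:
 * PREFETCH1 pulls in the first ten cache lines before the main loop
 * starts, the loops themselves prefetch at offsets 320/352, and
 * PREFETCH2/PREFETCH3 bracket the tail handled by the small-copy
 * helpers.
 */
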
#define PREFETCH1(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "(%0)\n"	\
		  arch_prefetch "32(%0)\n"	\
		  arch_prefetch "64(%0)\n"	\
		  arch_prefetch "96(%0)\n"	\
		  arch_prefetch "128(%0)\n"	\
		  arch_prefetch "160(%0)\n"	\
		  arch_prefetch "192(%0)\n"	\
		  arch_prefetch "224(%0)\n"	\
		  arch_prefetch "256(%0)\n"	\
		  arch_prefetch "288(%0)\n"	\
		  : 				\
		  : "r" (from) );

#define PREFETCH2(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "320(%0)\n"	\
		  : 				\
		  : "r" (from) );

#define PREFETCH3(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "288(%0)\n"	\
		  : 				\
		  : "r" (from) );

#define small_memcpy_i386(to,from,n)					\
    {									\
	__asm__ __volatile__(						\
		  " cld\n"						\
		  " shrl $1, %%ecx\n"					\
		  " jnc 1f\n"						\
		  " movsb\n"						\
		"1: shrl $1, %%ecx\n"					\
		  " jnc 2f\n"						\
		  " movsw\n"						\
		"2: rep ; movsl"					\
		  : "=&D" (to), "=&S" (from)				\
		  : "c" (n), "0" ((long) to), "1" ((long) from) 	\
		  : "memory", "cc");					\
    }
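
/* The shrl/jnc pairs above peel the low bits off the byte count: each
 * shift moves bit 0 of the count into the carry flag, and a set carry
 * means one odd byte (movsb), then one odd word (movsw), is copied
 * before "rep movsl" transfers the rest as dwords. The AMD64 variant
 * below additionally peels an odd dword before using "rep movsq".
 */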

#define small_memcpy_amd64(to,from,n)					\
    {									\
	__asm__ __volatile__(						\
		  " cld\n"						\
		  " shrq $1, %%rcx\n"					\
		  " jnc 1f\n"						\
		  " movsb\n"						\
		"1: shrq $1, %%rcx\n"					\
		  " jnc 2f\n"						\
		  " movsw\n"						\
		"2: shrq $1, %%rcx\n"					\
		  " jnc 3f\n"						\
		  " movsl\n"						\
		"3: rep ; movsq"					\
		  : "=&D" (to), "=&S" (from)				\
		  : "c" (n), "0" ((long) to), "1" ((long) from) 	\
		  : "memory", "cc");					\
    }

#define MMX_CPY(prefetch,from,to,dummy,lcnt)				\
    __asm__ __volatile__ (						\
		"1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movq (%1), %%mm0\n"					\
		  " movq 8(%1), %%mm1\n"				\
		  " movq 16(%1), %%mm2\n"				\
		  " movq 24(%1), %%mm3\n"				\
		  " movq %%mm0, (%0)\n"					\
		  " movq %%mm1, 8(%0)\n"				\
		  " movq %%mm2, 16(%0)\n"				\
		  " movq %%mm3, 24(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movq 32(%1), %%mm0\n"				\
		  " movq 40(%1), %%mm1\n"				\
		  " movq 48(%1), %%mm2\n"				\
		  " movq 56(%1), %%mm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movq %%mm0, 32(%0)\n"				\
		  " movq %%mm1, 40(%0)\n"				\
		  " movq %%mm2, 48(%0)\n"				\
		  " movq %%mm3, 56(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt) 			\
		  : "memory", "cc");

#define SSE_CPY(prefetch,from,to,dummy,lcnt)				\
    if((ULong) from & 15) {						\
	__asm__ __volatile__ (						\
		"1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movups (%1), %%xmm0\n"				\
		  " movups 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movups 32(%1), %%xmm2\n"				\
		  " movups 48(%1), %%xmm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc"); 					\
    } else {								\
	__asm__ __volatile__ (						\
		"2:\n"							\
		    prefetch "320(%1)\n"				\
		  " movaps (%1), %%xmm0\n"				\
		  " movaps 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movaps 32(%1), %%xmm2\n"				\
		  " movaps 48(%1), %%xmm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 2b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc");					\
    }
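
/* Note that the alignment test above only selects movups vs. movaps for
 * the *source*; the destination is always written with movntps, which
 * faults on a destination that is not 16-byte aligned. That is why the
 * SSE entries in the method tables below set reqAlignment and are never
 * picked for the unaligned-transfer slot.
 */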

#define SSE64_CPY(prefetch,from,to,dummy,lcnt)				\
    if((ULong) from & 15) {						\
	__asm__ __volatile__ (						\
		"1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movups (%1), %%xmm0\n"				\
		  " movups 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movups 32(%1), %%xmm2\n"				\
		  " movups 48(%1), %%xmm3\n"				\
		  " leaq 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leaq 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc"); 					\
    } else {								\
	__asm__ __volatile__ (						\
		"2:\n"							\
		    prefetch "320(%1)\n"				\
		  " movaps (%1), %%xmm0\n"				\
		  " movaps 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movaps 32(%1), %%xmm2\n"				\
		  " movaps 48(%1), %%xmm3\n"				\
		  " leaq 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leaq 64(%0),%0\n"					\
		  " jne 2b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc");					\
    }

#define MMXEXT_CPY(prefetch,from,to,dummy,lcnt)				\
    __asm__ __volatile__ (						\
		  ".p2align 4,,7\n"					\
		 "1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movq (%1), %%mm0\n"					\
		  " movq 8(%1), %%mm1\n"				\
		  " movq 16(%1), %%mm2\n"				\
		  " movq 24(%1), %%mm3\n"				\
		  " movntq %%mm0, (%0)\n"				\
		  " movntq %%mm1, 8(%0)\n"				\
		  " movntq %%mm2, 16(%0)\n"				\
		  " movntq %%mm3, 24(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movq 32(%1), %%mm0\n"				\
		  " movq 40(%1), %%mm1\n"				\
		  " movq 48(%1), %%mm2\n"				\
		  " movq 56(%1), %%mm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntq %%mm0, 32(%0)\n"				\
		  " movntq %%mm1, 40(%0)\n"				\
		  " movntq %%mm2, 48(%0)\n"				\
		  " movntq %%mm3, 56(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt) 			\
		  : "memory", "cc");
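
/* movntq/movntps are non-temporal stores: they bypass the caches via the
 * write-combining buffers, which is what makes them attractive for
 * framebuffer-sized copies. This is also why the functions built from
 * these loops end with FENCE/FENCEMMS: the sfence makes the pending
 * stores globally visible before the copy returns.
 */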


#define PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small)		\
									\
    static void prefix##_memcpy(UChar *to,				\
				const UChar *from,			\
				int size)				\
    {									\
	int lcnt = size >> 6;						\
	int rest = size & 63;						\
	register int dummy;						\
									\
	PREFETCH1(ptype##_PREFETCH,from);				\
									\
	begin;								\
	if(lcnt) {							\
	   itype##_CPY(ptype##_PREFETCH,from,to,dummy,lcnt);		\
	}								\
	if(rest) {							\
	   PREFETCH2(ptype##_PREFETCH,from);				\
	   small(to, from, rest);					\
	   PREFETCH3(ptype##_PREFETCH,from);				\
	}								\
	fence;								\
    }

#define NOPREFETCH_FUNC(prefix,itype,begin,fence,small)			\
									\
    static void prefix##_memcpy(UChar *to,				\
				const UChar *from,			\
				int size)				\
    {									\
	int lcnt = size >> 6;						\
	int rest = size & 63;						\
	register int dummy;						\
									\
	begin;								\
	if(lcnt) {							\
	   itype##_CPY("#",from,to,dummy,lcnt);				\
	}								\
	if(rest) {							\
	   small(to, from, rest);					\
	}								\
	fence;								\
    }
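
/* Example: PREFETCH_FUNC(SiS_sse,SSE,SSE,,FENCE,small_memcpy_i386)
 * expands to a function SiS_sse_memcpy() that prefetches ahead, copies
 * 64-byte chunks with SSE_CPY using prefetchnta, handles the tail with
 * small_memcpy_i386() and finishes with sfence. NOPREFETCH_FUNC passes
 * "#" as the prefetch string, which turns the prefetch lines into gas
 * comments, i.e. disables prefetching entirely.
 */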

/* Other archs */

/* ... */


/* Entry type for the benchmark method tables */

typedef struct {
    vidCopyFunc  mFunc;		/* the copy routine */
    char         *mName;	/* name for the log */
    unsigned int mycpuflag;	/* SIS_CPUFL_* bit required for this routine */
    int          grade;		/* preference for transfers to video RAM (0 = best) */
    int 	 gradefrom;	/* preference for transfers from video RAM (0 = best) */
    Bool         reqAlignment;	/* TRUE if the routine needs aligned buffers */
} SISMCFuncData;

/************************************************************************/
/*                   libc memcpy() wrapper - generic                    */
/************************************************************************/

static void SiS_libc_memcpy(UChar *dst, const UChar *src, int size)
{
    memcpy(dst, src, size);
}

/************************************************************************/
/* We only do all that stuff under gcc; no idea what other compilers 	*/
/* would do with our asm code.  					*/
/************************************************************************/

#ifndef __GNUC__

unsigned int SiSGetCPUFlags(ScrnInfoPtr pScrn)
{
    return 0;
}

vidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
    *UMemCpy = SiS_libc_memcpy;
    return SiS_libc_memcpy;
}

vidCopyFunc SiSVidCopyGetDefault(void)
{
    return SiS_libc_memcpy;
}

#else /* ! Everything below is gcc specific ! */

/************************************************************************/
/*                    Definitions for archs and OSes                    */
/************************************************************************/

#undef SiS_checkosforsse
#undef SiS_canBenchmark
#undef SiS_haveProc
#undef SiS_haveBuiltInMC

#if defined(__i386__) /* ***************************************** i386 */

#define SiS_checkosforsse 	/* Does this cpu support sse and do we need to check os? */
#define SiS_canBenchmark	/* Can we perform a benchmark? */
#ifdef SIS_LINUX
#define SiS_haveProc		/* Do we have /proc/cpuinfo or similar? */
#endif
#define SiS_haveBuiltInMC	/* Is there a built-in memcpy for this arch? */

/* Built-in memcpy for i386 */
static __inline void * builtin_memcpy(void * to, const void * from, size_t n)
{
    int d1,d2,d3;

    __asm__ __volatile__(
		  " cld\n"
		  " shrl $1, %%ecx\n"
		  " jnc 1f\n"
		  " movsb\n"
		"1: shrl $1, %%ecx\n"
		  " jnc 2f\n"
		  " movsw\n"
		"2: rep ; movsl\n"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  : "0" (n), "1" ((long) to), "2" ((long) from)
		  : "memory", "cc");

    return(to);
}

/* Alternative for 586: Unroll loop, copy 32 bytes at a time */
static void SiS_builtin_memcp2(UChar *to, const UChar *from, int n)
{
    int d1,d2,d3;

    __asm__ __volatile__(
		  " movl %%edi, %%eax\n"
		  " cmpl $32, %%ecx\n"
		  " cld\n"
		  " jbe 3f\n"
		  " negl %%eax\n"		/* Align dest */
		  " andl $3, %%eax\n"
		  " subl %%eax, %%ecx\n"
		  " xchgl %%eax, %%ecx\n"
		  " rep ; movsb\n"
		  " movl %%eax, %%ecx\n"
		  " subl $32, %%ecx\n"
		  " js 2f\n"
		  " movl (%%edi), %%eax\n"
		"1: movl 28(%%edi), %%edx\n"	/* Trick: read dest ahead to pre-load the cache line */
		  " subl $32, %%ecx\n"
		  " movl (%%esi), %%eax\n"
		  " movl 4(%%esi), %%edx\n"
		  " movl %%eax, (%%edi)\n"
		  " movl %%edx, 4(%%edi)\n"
		  " movl 8(%%esi), %%eax\n"
		  " movl 12(%%esi), %%edx\n"
		  " movl %%eax, 8(%%edi)\n"
		  " movl %%edx, 12(%%edi)\n"
		  " movl 16(%%esi), %%eax\n"
		  " movl 20(%%esi), %%edx\n"
		  " movl %%eax, 16(%%edi)\n"
		  " movl %%edx, 20(%%edi)\n"
		  " movl 24(%%esi), %%eax\n"
		  " movl 28(%%esi), %%edx\n"
		  " movl %%eax, 24(%%edi)\n"
		  " movl %%edx, 28(%%edi)\n"
		  " leal 32(%%esi), %%esi\n"
		  " leal 32(%%edi), %%edi\n"
		  " jns 1b\n"
		"2: addl $32, %%ecx\n"
		"3: rep ; movsb"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  : "0" (n), "1" ((long) to), "2" ((long) from)
		  : "eax", "edx", "memory", "cc");

}

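/* cpuid is executed as a serializing instruction before rdtsc so the
 * timestamp cannot be reordered around the measured code; ebx is saved
 * and restored by hand because gcc reserves it as the PIC register on
 * i386.
 */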
static unsigned int taketime(void)	/* get current time (for benchmarking) */
{
    unsigned int eax;

    __asm__ volatile (
		" pushl %%ebx\n"
		" cpuid\n"
		" rdtsc\n"
		" popl %%ebx\n"
		: "=a" (eax)
		: "0" (0)
		: "ecx", "edx", "cc");

    return(eax);
}

#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /***************** AMD64 */

#define SiS_checkosforsse	/* Does this cpu support sse and do we need to check os? */
#define SiS_canBenchmark	/* Can we perform a benchmark? */
#ifdef SIS_LINUX
#define SiS_haveProc		/* Do we have /proc/cpuinfo or similar? */
#endif
#define SiS_haveBuiltInMC	/* Is there a built-in memcpy for this arch? */

/* Built-in memcpy for AMD64 */
static __inline void * builtin_memcpy(void * to, const void * from, int n)
{
    long d1, d2, d3;

    __asm__ __volatile__ (
		" cld\n"
		" rep ; movsq\n"
		" movq %4, %%rcx\n"
		" rep ; movsb"
		: "=&c" (d1), "=&D" (d2), "=&S" (d3)
		: "0" ((ULong)(n >> 3)), "q" ((ULong)(n & 7)),
		  "1" ((long) to), "2" ((long) from)
		: "memory");

    return(to);
}

/* Alternative: Unroll loop, copy 32 bytes at a time */
static void SiS_builtin_memcp2(UChar *to, const UChar *from, int n)
{
    long d1,d2,d3;

    __asm__ __volatile__(
		  " movq %%rdi, %%rax\n"
		  " cmpq $32, %%rcx\n"
		  " cld\n"			/* Pipeline; no other flags but DF */
		  " jbe 1f\n"
		  " negq %%rax\n"		/* Align dest */
		  " andq $7, %%rax\n"
		  " subq %%rax, %%rcx\n"
		  " xchgq %%rax, %%rcx\n"
		  " rep ; movsb\n"
		  " movq %%rax, %%rcx\n"
		  " subq $32, %%rcx\n"
		  " js 2f\n"
		  ".p2align 4\n"
		"3: subq $32, %%rcx\n"
		  " movq (%%rsi), %%rax\n"
		  " movq 8(%%rsi), %%rdx\n"
		  " movq 16(%%rsi), %%r8\n"
		  " movq 24(%%rsi), %%r9\n"
		  " movq %%rax, (%%rdi)\n"
		  " movq %%rdx, 8(%%rdi)\n"
		  " movq %%r8, 16(%%rdi)\n"
		  " movq %%r9, 24(%%rdi)\n"
		  " leaq 32(%%rsi), %%rsi\n"
		  " leaq 32(%%rdi), %%rdi\n"
		  " jns 3b\n"
		"2: addq $32, %%rcx\n"
		"1: rep ; movsb"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  : "0" ((ULong) n), "1" ((long) to), "2" ((long) from)
		  : "rax", "rdx", "r8", "r9", "memory", "cc");

}

static unsigned int taketime(void)	/* get current time (for benchmarking) */
{
    unsigned int eax;

    __asm__ volatile (
		" pushq %%rbx\n"
		" cpuid\n"
		" rdtsc\n"
		" popq %%rbx\n"
		: "=a" (eax)
		: "0" (0)
		: "rcx", "rdx", "cc");

    return(eax);
}

#else		/* **************************************** Other archs */

/* 1. Can we do a benchmark?		*/
/* #define SiS_canBenchmark		*/

/* 2. Do we have /proc filesystem or similar for CPU information? */
/* #define SiS_haveProc		*/

/* 3. Optional: built-in memcpy()	*/
/* #define SiS_haveBuiltInMC		*/
/* static __inline void * builtin_memcpy(void * to, const void * from, int n)
   {
   }
*/

/* 4. Function for getting current time (for benchmarking)  */
/* static unsigned int taketime(void)
   {
   }
*/

#endif

/************************************************************************/
/*                   Generic built-in memcpy wrapper                    */
/************************************************************************/

#ifdef SiS_haveBuiltInMC
static void SiS_builtin_memcpy(UChar *dst, const UChar *src, int size)
{
    builtin_memcpy(dst, src, size);
}
#endif

/************************************************************************/
/* Generic routines if Benchmark can be performed (all archs, all OSes) */
/************************************************************************/

#ifdef SiS_canBenchmark

/* Time one copy of size bytes (TSC delta, wraparound-safe) */
static unsigned int time_function(vidCopyFunc mf, UChar *buf1, UChar *buf2, int size)
{
    unsigned int t1, t2;

    t1 = taketime();

    (*mf)(buf1, buf2, size);

    t2 = taketime();

    return((t1 < t2) ? t2 - t1 : 0xFFFFFFFFU - (t1 - t2 - 1));
}
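
/* Example for the wraparound case: with t1 = 0xFFFFFFF0 and
 * t2 = 0x00000010 the counter wrapped, and 0xFFFFFFFFU - (t1 - t2 - 1)
 * = 0xFFFFFFFF - 0xFFFFFFDF = 0x20, i.e. the true elapsed tick count.
 */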

/* Allocate an area of offscreen FB memory (buf1), a simulated video
 * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
 */
static void *
SiS_AllocBuffers(ScrnInfoPtr pScrn, UChar **buf1, UChar **buf2, UChar **buf3)
{
    SISPtr pSiS = SISPTR(pScrn);
    unsigned int offset;
    void *handle = NULL;

    if(!(offset = SISAllocateFBMemory(pScrn, &handle, BUFFERSIZE + 31))) {
       return NULL;
    }
    (*buf1) = (UChar *)pSiS->FbBase + offset;
    (*buf1) = (UChar *)(((ULong)(*buf1) + 31) & ~31);	/* Round up to 32-byte boundary */

    if(!((*buf2) = (UChar *)malloc(BUFFERSIZE + 15))) {
       SISFreeFBMemory(pScrn, &handle);
       return NULL;
    }

    if(!((*buf3) = (UChar *)malloc(BUFFERSIZE + 15))) {
       free((*buf2));
       SISFreeFBMemory(pScrn, &handle);
       return NULL;
    }

    return handle;
}

/* Perform Benchmark */
static int SiS_BenchmarkMemcpy(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions,
                               unsigned int myCPUflags, UChar *buf1, UChar *buf2,
			       UChar *buf3, char *frqBuf, double cpuFreq,
			       vidCopyFunc *UMemCpy, int *best2, Bool from)
{
    SISMCFuncData *curData;
    int j = 0, bestSoFar = 0;
    unsigned int tmp1, tmp2, best = 0xFFFFFFFFU, sbest = 0xFFFFFFFFU;

    (*best2) = 0;

    /* Make it probable that buf1 and buf2 are paged in by touching them first */
    SiS_libc_memcpy(buf1, buf2, BUFFERSIZE);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
	       "Benchmarking %s RAM to %s RAM memory transfer methods:\n",
	       from ? "video" : "system",
	       from ? "system" : "video");

#ifdef TWDEBUG
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Benchmark: CPUFlags %x\n", myCPUflags);
#endif

    j = 0;
    while(MCFunctions[j].mFunc) {

	curData = MCFunctions + j;

	if(myCPUflags & curData->mycpuflag) {

	   /* Simulate setup of the video buffer and copy result to framebuffer. */
	   /* Run four times and keep the fastest result. */
	   if(!from) {
	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      tmp1 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	      SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	   } else {
	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      tmp1 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	      SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	   }

	   if((!frqBuf) || (tmp1 == 0)) {
	      xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
			   "\tChecked %s memcpy()... \t%u\n",curData->mName, tmp1);
	   } else {
	      xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
			   "\tChecked %s memcpy()... \t%.1f MiB/s\n",
			   curData->mName,
			   cpuFreq * 1.e6 * (double)BUFFERSIZE / ((double)(tmp1) * (double)(0x100000)));
	   }

	   if(tmp1 < best) {
	      best = tmp1;
	      bestSoFar = j;
	   }

	   if(!curData->reqAlignment) {
	      if(tmp1 < sbest) {
	         sbest = tmp1;
	         (*best2) = j;
	      }
	   }

	}

	j++;
    }

    return bestSoFar;
}

static vidCopyFunc SiS_GetBestByGrade(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions,
			unsigned int myCPUflags, vidCopyFunc *UMemCpy, Bool from)
{
    int j = 0, best = -1, secondbest = -1, bestSoFar = 10, best2SoFar = 10;
    int grade;

    *UMemCpy = SiS_libc_memcpy;

    while(MCFunctions[j].mFunc) {
	if(myCPUflags & MCFunctions[j].mycpuflag) {
	   grade = from ? MCFunctions[j].gradefrom : MCFunctions[j].grade;
	   if(grade < bestSoFar) {
	      best = j;
	      bestSoFar = grade;
	   }
	   if(grade < best2SoFar) {
	      if(!MCFunctions[j].reqAlignment) {
	         secondbest = j;
		 best2SoFar = grade;
	      }
	   }
	}
	j++;
    }
    if(best >= 0) {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
		"Chose %s method for aligned data transfers %s video RAM\n",
		MCFunctions[best].mName,
		from ? "from" : "to");
       if(secondbest >= 0) {
          xf86DrvMsg(pScrn->scrnIndex, X_INFO,
		"Chose %s method for unaligned data transfers %s video RAM\n",
		   MCFunctions[secondbest].mName,
		   from ? "from" : "to");
          *UMemCpy = MCFunctions[secondbest].mFunc;
       }
       return MCFunctions[best].mFunc;
    }

    return SiS_libc_memcpy;
}
#endif /* canBenchmark */

/**********************************************************************/
/*      Generic routines if /proc filesystem is available (Linux)     */
/**********************************************************************/

#ifdef SiS_haveProc
/* Linux: Read file (/proc/cpuinfo) into buffer */
static int SiS_ReadProc(char *buf, char *filename)
{
    FILE *cpuInfoFile;
    int count;

    if((cpuInfoFile = fopen(filename, "r")) == NULL) {
       return 0;
    }

    count = fread(buf, 1, CPUBUFFERSIZE, cpuInfoFile);
    if(ferror(cpuInfoFile)) {
       fclose(cpuInfoFile);
       return 0;
    }

    fclose(cpuInfoFile);

    if(count >= CPUBUFFERSIZE - 2) {
       return 0;
    }

    buf[count] = 0;

    return count;
}

/* Linux: Extract CPU speed from /proc/cpuinfo */
static char *SiS_GetCPUFreq(ScrnInfoPtr pScrn, char *buf, double *cpuFreq)
{
    char *frqBuf, *endBuf;

    (*cpuFreq) = 0.0;

    if((frqBuf = strstr(buf,"cpu MHz\t\t:"))) {
       frqBuf += 11;
       (*cpuFreq) = strtod(frqBuf, &endBuf);
       if(endBuf == frqBuf) frqBuf = NULL;
       if((*cpuFreq) < 10.0) frqBuf = NULL; /* sanity check */
       if(frqBuf) {
          xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU frequency %.2fMHz\n", (*cpuFreq));
       }
    }

    return frqBuf;
}
#endif /* haveProc */

/**********************************************************************/
/*                      Arch-specific routines                        */
/**********************************************************************/

#ifdef SiS_checkosforsse   /* Common i386, AMD64  */

#ifdef SISCHECKOSSSE

#ifndef XFree86LOADER
#include <setjmp.h>
#endif

static jmp_buf sigill_return;

static void sigill_handler(void)
{
    longjmp(sigill_return, 1);
}
#endif

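/* Using SSE requires OS support: the kernel must save/restore XMM state
 * and set CR4.OSFXSR, otherwise the first SSE instruction raises SIGILL.
 * We probe this by executing xorps under the SIGILL handler installed
 * above.
 */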
static Bool CheckOSforSSE(ScrnInfoPtr pScrn)
{
#ifdef SISCHECKOSSSE  /* Check OS for SSE possible: */
    int signo = -1;

#ifdef SISDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Checking OS SSE support\n");
#endif

    xf86InterceptSigIll(&sigill_handler);

    if(setjmp(sigill_return)) {
       signo = 4;
    } else {
       __asm__ __volatile__ (" xorps %xmm0, %xmm0\n");
       /* __asm__ __volatile__ (" .byte 0xff\n"); */  /* For test */
    }

    xf86InterceptSigIll(NULL);

#ifdef SISDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "OS SSE support signal %d\n", signo);
#endif

    if(signo != -1) {
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
		"OS does not support SSE instructions\n");
    }

    return (signo >= 0) ? FALSE : TRUE;

#else  /* no check for SSE possible: */

    SISPtr pSiS = SISPTR(pScrn);

    xf86DrvMsg(pScrn->scrnIndex, pSiS->XvSSEMemcpy ? X_WARNING : X_INFO,
	"Checking OS for SSE support is not supported in this version of " SISMYSERVERNAME "\n");

    if(pSiS->XvSSEMemcpy) {
       xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
	"If you get a signal 4 here, set the option \"UseSSE\" to \"off\".\n");
       return TRUE;
    } else {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
	"If your OS supports SSE, set the option \"UseSSE\" to \"on\".\n");
       return FALSE;
    }
#endif
}

#endif /* SiS_checkosforsse */

#ifdef __i386__   /* i386 specific *************************************/

PREFETCH_FUNC(SiS_sse,SSE,SSE,,FENCE,small_memcpy_i386)
PREFETCH_FUNC(SiS_mmxext,MMXEXT,SSE,EMMS,FENCEMMS,small_memcpy_i386)
PREFETCH_FUNC(SiS_now,MMX,NOW,FEMMS,FEMMS,small_memcpy_i386)
NOPREFETCH_FUNC(SiS_mmx,MMX,EMMS,EMMS,small_memcpy_i386)

static SISMCFuncData MCFunctions_i386[] = {
    {SiS_libc_memcpy,   "libc",      SIS_CPUFL_LIBC,  4,  4, FALSE},
    {SiS_builtin_memcpy,"built-in-1",SIS_CPUFL_BI,    5,  5, FALSE},
    {SiS_builtin_memcp2,"built-in-2",SIS_CPUFL_BI2,   6,  6, FALSE},
    {SiS_mmx_memcpy,    "MMX",       SIS_CPUFL_MMX,   3,  3, FALSE},
    {SiS_sse_memcpy,    "SSE",       SIS_CPUFL_SSE,   1,  0, TRUE},
    {SiS_now_memcpy,    "3DNow!",    SIS_CPUFL_3DNOW, 2,  2, FALSE},
    {SiS_mmxext_memcpy, "MMX2",      SIS_CPUFL_MMX2,  0,  1, FALSE},
    {NULL,              "",          0,              10, 10, FALSE}
};

#define Def_FL  (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2)  /* Default methods */

#define cpuid(op, eax, ebx, ecx, edx) 		\
    __asm__ __volatile__ (			\
		" pushl %%ebx\n"		\
		" cpuid\n"			\
		" movl %%ebx, %1\n"		\
		" popl %%ebx\n"			\
		: "=a" (eax), "=r" (ebx), 	\
		  "=c" (ecx), "=d" (edx)	\
		: "a" (op)			\
		: "cc")

static Bool cpuIDSupported(ScrnInfoPtr pScrn)
{
    int eax, ebx, ecx, edx;

    /* Check for cpuid instruction */
    __asm__ __volatile__ (
		" pushf\n"
		" popl %0\n"
		" movl %0, %1\n"
		" xorl $0x200000, %0\n"
		" push %0\n"
		" popf\n"
		" pushf\n"
		" popl %0\n"
		: "=a" (eax), "=c" (ecx)
		:
		: "cc");

    if(eax == ecx) {
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support CPUID instruction\n");
       return FALSE;
    }

    /* Check for cpuid level */
    cpuid(0x00000000, eax, ebx, ecx, edx);
    if(!eax) {
       return FALSE;
    }

    /* Check for RDTSC */
    cpuid(0x00000001, eax, ebx, ecx, edx);

    if(!(edx & 0x10)) {
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support RDTSC instruction\n");
       return FALSE;
    }

    return TRUE;
}

static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    unsigned int flags = 0, eax, ebx, ecx, edx;
    Bool IsAMD;

    /* Check if cpuid and rdtsc instructions are supported */
    if(!cpuIDSupported(pScrn)) {
       return 0;
    }

    cpuid(0x00000000, eax, ebx, ecx, edx);

    IsAMD = (ebx == 0x68747541) && (edx == 0x69746e65) && (ecx == 0x444d4163);

    cpuid(0x00000001, eax, ebx, ecx, edx);
    /* MMX */
    if(edx & 0x00800000) flags |= SIS_CPUFL_MMX;
    /* SSE, MMXEXT */
    if(edx & 0x02000000) flags |= (SIS_CPUFL_SSE | SIS_CPUFL_MMX2);
    /* SSE2 - don't need this one directly, set SSE instead */
    if(edx & 0x04000000) flags |= (SIS_CPUFL_SSE | SIS_CPUFL_SSE2);

    cpuid(0x80000000, eax, ebx, ecx, edx);
    if(eax >= 0x80000001) {
       cpuid(0x80000001, eax, ebx, ecx, edx);
       /* 3DNow! */
       if(edx & 0x80000000) flags |= SIS_CPUFL_3DNOW;
       /* AMD MMXEXT */
       if(IsAMD && (edx & 0x00400000)) flags |= SIS_CPUFL_MMX2;
    }

    return flags;
}

#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /* AMD64 specific ***** */

PREFETCH_FUNC(SiS_sse,SSE64,SSE,,FENCE,small_memcpy_amd64)

static SISMCFuncData MCFunctions_AMD64[] = {
    {SiS_libc_memcpy,   "libc",      SIS_CPUFL_LIBC, 2,  2, FALSE},
    {SiS_builtin_memcpy,"built-in-1",SIS_CPUFL_BI,   1,  1, FALSE},
    {SiS_builtin_memcp2,"built-in-2",SIS_CPUFL_BI2,  3,  3, FALSE},
    {SiS_sse_memcpy,    "SSE",       SIS_CPUFL_SSE,  0,  0, TRUE},
    {NULL,              "",          0,             10, 10, FALSE}
};

#define Def_FL  (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2)

static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    /* SSE and SSE2 are part of the AMD64 baseline, no need to probe */
    return((unsigned int)(SIS_CPUFL_SSE|SIS_CPUFL_SSE2));
}

#else  /* Specific for other archs ******************************** */

/* Fill in here */

#define Def_FL  (SIS_CPUFL_LIBC)

static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    return((unsigned int)(0));
}

#endif

/**********************************************************************/
/*     Benchmark the video copy routines and choose the fastest       */
/**********************************************************************/

#ifdef SiS_canBenchmark
static vidCopyFunc
SiSVidCopyInitGen(ScreenPtr pScreen, SISMCFuncData *MCFunctions, vidCopyFunc *UMemCpy, Bool from)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
    SISPtr pSiS = SISPTR(pScrn);
    void *fbhandle = NULL;
    char  *frqBuf = NULL;
    UChar *buf1, *buf2, *buf3;
    double cpuFreq = 0.0;
    unsigned int myCPUflags = pSiS->CPUFlags | Def_FL;
    int best, secondbest;
#ifdef SiS_haveProc
    char buf[CPUBUFFERSIZE];
#endif

    *UMemCpy = SiS_libc_memcpy;

    /* Bail out if user disabled benchmarking */
    if(!pSiS->BenchMemCpy) {
       return SiS_libc_memcpy;
    }

#ifdef SiS_haveProc
    /* Read /proc/cpuinfo into buf */
    if(SiS_ReadProc(buf, "/proc/cpuinfo")) {

       /* Extract CPU frequency */
       frqBuf = SiS_GetCPUFreq(pScrn, buf, &cpuFreq);

    }
#endif

    /* Allocate buffers */
    if(!(fbhandle = SiS_AllocBuffers(pScrn, &buf1, &buf2, &buf3))) {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
       		"Failed to allocate video RAM for video data transfer benchmark\n");
       return SiS_GetBestByGrade(pScrn, MCFunctions, myCPUflags, UMemCpy, from);
    }

    /* Perform Benchmark */
    best = SiS_BenchmarkMemcpy(pScrn, MCFunctions, myCPUflags, buf1,
    				(UChar *)(((unsigned long)buf2 + 15) & ~15),
				(UChar *)(((unsigned long)buf3 + 15) & ~15),
				frqBuf, cpuFreq, UMemCpy, &secondbest, from);

    /* Free buffers */
    SISFreeFBMemory(pScrn, &fbhandle);
    free(buf2);
    free(buf3);

    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
	       "Using %s method for aligned data transfers %s video RAM\n",
	       MCFunctions[best].mName,
	       from ? "from" : "to");

    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
	       "Using %s method for unaligned data transfers %s video RAM\n",
	       MCFunctions[secondbest].mName,
	       from ? "from" : "to");

    return MCFunctions[best].mFunc;
}
#endif /* canBenchmark */

/**********************************************************************/
/* 		       main(): Get CPU capabilities		      */
/* 			    (called externally)			      */
/**********************************************************************/

unsigned int
SiSGetCPUFlags(ScrnInfoPtr pScrn)
{
    unsigned int myCPUflags = SiS_GetCpuFeatures(pScrn);

#ifdef SiS_checkosforsse
    if(myCPUflags & (SIS_CPUFL_SSE | SIS_CPUFL_SSE2)) {

       /* Check if OS supports usage of SSE instructions */
       if(!(CheckOSforSSE(pScrn))) {
          myCPUflags &= ~(SIS_CPUFL_SSE | SIS_CPUFL_SSE2);
       }

    }
#endif

    return myCPUflags;
}

/**********************************************************************/
/*                       main(): SiSVidCopyInit()                     */
/*			    (called externally)			      */
/*		(SiSGetCPUFlags must be called before this one)       */
/**********************************************************************/

vidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
#if defined(__i386__) && defined(SiS_canBenchmark)
    return(SiSVidCopyInitGen(pScreen, MCFunctions_i386, UMemCpy, from));
#elif (defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__)) && defined(SiS_canBenchmark)
    return(SiSVidCopyInitGen(pScreen, MCFunctions_AMD64, UMemCpy, from));
#else /* Other cases: Use libc memcpy() */
    *UMemCpy = SiS_libc_memcpy;
    return SiS_libc_memcpy;
#endif
}
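
#if 0	/* Illustration only (not compiled): a typical call sequence at
	 * screen init time. The pSiS field names used here are assumptions
	 * for the sake of the example, not necessarily the driver's actual
	 * ones. SiSGetCPUFlags() must run before SiSVidCopyInit(). */
static void example_vidcopy_setup(ScreenPtr pScreen, ScrnInfoPtr pScrn)
{
    SISPtr pSiS = SISPTR(pScrn);

    pSiS->CPUFlags = SiSGetCPUFlags(pScrn);	/* probe CPU + OS SSE support */

    /* Fastest routine for aligned transfers to video RAM; the unaligned
     * fallback is returned through the second argument. */
    pSiS->SiSFastVidCopy = SiSVidCopyInit(pScreen, &pSiS->SiSFastMemCopy, FALSE);

    /* Same for transfers from video RAM */
    pSiS->SiSFastVidCopyFrom = SiSVidCopyInit(pScreen, &pSiS->SiSFastMemCopyFrom, TRUE);
}
#endif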

vidCopyFunc SiSVidCopyGetDefault(void)
{
    return SiS_libc_memcpy;
}

#endif /* GNU C */
