1/*
2 * XGI memcpy() routines (assembly)
3 *
4 * Copyright (C) 2004-2005 Thomas Winischhofer
5 *
6 * Idea and some code bits from via_memcpy.c which is
7 * Copyright (C) 2004 Thomas Hellstroem, All Rights Reserved.
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a
10 * copy of this software and associated documentation files (the "Software"),
11 * to deal in the Software without restriction, including without limitation
12 * the rights to use, copy, modify, merge, publish, distribute, sub license,
13 * and/or sell copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice (including the
17 * next paragraph) shall be included in all copies or substantial portions
18 * of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
23 * THE CODE SUPPLIER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
24 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 * DEALINGS IN THE SOFTWARE.
27 *
28 */
29
30#ifdef HAVE_CONFIG_H
31#include "config.h"
32#endif
33
34/* #include "xgi.h" */
35
36/* Jong 07/10/2008; use run-time debug instead except for HW acceleration routines */
37/* extern bool g_bRunTimeDebug;
38#define RUNTIMEDEBUG(p)		if(g_bRunTimeDebug)p; */
39
40/* #define DEBUG */
41#ifdef  DEBUG
42#define PDEBUG(p)     p  /* RUNTIMEDEBUG(p) */
43#else
44#define PDEBUG(p)
45#endif
46
47/* Jong 01/15/2009; CPU flags (for memcpy() etc.) */
48#define XGI_CPUFL_LIBC  0x0001
49#define XGI_CPUFL_BI    0x0002
50#define XGI_CPUFL_SSE   0x0004
51#define XGI_CPUFL_MMX   0x0008
52#define XGI_CPUFL_3DNOW 0x0010
53#define XGI_CPUFL_MMX2  0x0020
54#define XGI_CPUFL_BI2   0x0040
55#define XGI_CPUFL_SSE2  0x0080
56#define XGI_CPUFL_FLAG  0x8000
57
58#if 0			/* Debug */
59#define XGIDGBMC
60#endif
61
62/* Jong 01/07/2009; debug */
63/*
64#ifdef XGI_DEBUG
65#define PDEBUG(p)  p
66#else
67#define PDEBUG(p)
68#endif
69*/
70
71#if 0 /* Jong 01/15/2009; ignore at the moment */
72extern unsigned int XGIAllocateFBMemory(ScrnInfoPtr pScrn, void **handle, int bytesize);
73extern void	    XGIFreeFBMemory(ScrnInfoPtr pScrn, void **handle);
74#endif
75
76#define CPUBUFFERSIZE 2048       /* Size of /proc/cpuinfo buffer */
77#define BUFFERSIZE (576 * 1152)  /* Matches 720x576 YUV420 */
78
79/************************************************************************/
80/*                   arch specific memcpy() routines                    */
81/************************************************************************/
82
83/* i386, AMD64 */
84
/* Store fence: orders all preceding (weakly-ordered/non-temporal) stores
 * before any later stores.  Used after movntps-based copies. */
#define FENCE 			\
     __asm__ __volatile__( 	\
		  " sfence\n" 	\
		  :		\
		  :		\
		  : "memory");

/* Store fence plus "emms": clears MMX state so the x87 FPU is usable
 * again.  Used after MMX copies that end with non-temporal stores. */
#define FENCEMMS 		\
     __asm__ __volatile__ (	\
		  " sfence\n"	\
		  " emms\n"	\
		  :		\
		  :		\
		  : "memory");

/* 3DNow! fast "femms": like emms but cheaper on AMD CPUs. */
#define FEMMS 			\
     __asm__ __volatile__(	\
		  " femms\n"	\
		  :		\
		  :		\
		  : "memory");

/* Leave MMX state (required before any x87 FP code runs again). */
#define EMMS 			\
     __asm__ __volatile__(	\
		  " emms\n"	\
		  :		\
		  :		\
		  : "memory");

/* Prefetch instruction mnemonics spliced into the asm templates below:
 * SSE "prefetchnta" (non-temporal) and 3DNow! "prefetch". */
#define SSE_PREFETCH " prefetchnta "
#define NOW_PREFETCH " prefetch "
116
/* Prime the cache with the first 320 bytes (ten 32-byte chunks) of the
 * source before the main copy loop starts.
 * Fix: the original template skipped the chunk at offset 224, leaving a
 * hole in the otherwise contiguous 0..288 prefetch sequence; the missing
 * line is restored here. */
#define PREFETCH1(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "(%0)\n"	\
		  arch_prefetch "32(%0)\n"	\
		  arch_prefetch "64(%0)\n"	\
		  arch_prefetch "96(%0)\n"	\
		  arch_prefetch "128(%0)\n"	\
		  arch_prefetch "160(%0)\n"	\
		  arch_prefetch "192(%0)\n"	\
		  arch_prefetch "224(%0)\n"	\
		  arch_prefetch "256(%0)\n"	\
		  arch_prefetch "288(%0)\n"	\
		  : 				\
		  : "r" (from) );

/* Prefetch the single chunk at offset 320 (issued once before the
 * small tail copy). */
#define PREFETCH2(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "320(%0)\n"	\
		  : 				\
		  : "r" (from) );

/* Prefetch the chunk at offset 288 (issued after the tail copy). */
#define PREFETCH3(arch_prefetch,from)		\
    __asm__ __volatile__ (			\
		  arch_prefetch "288(%0)\n"	\
		  : 				\
		  : "r" (from) );
142
/* Copy n bytes with string instructions (i386): shift n right twice,
 * moving one byte if bit 0 was set and one word if bit 1 was set, then
 * copy the remaining n/4 dwords with "rep movsl".  Used for the <64-byte
 * tail left over by the 64-byte block copy loops. */
#define small_memcpy_i386(to,from,n)					\
    {									\
	__asm__ __volatile__(						\
		  " cld\n"						\
		  " shrl $1, %%ecx\n"					\
		  " jnc 1f\n"						\
		  " movsb\n"						\
		"1: shrl $1, %%ecx\n"					\
		  " jnc 2f\n"						\
		  " movsw\n"						\
		"2: rep ; movsl"					\
		  : "=&D" (to), "=&S" (from)				\
		  : "c" (n), "0" ((long) to), "1" ((long) from) 	\
		  : "memory", "cc");					\
      PDEBUG(ErrorF("Jong-small_memcpy_i386(to,from,n)...\n"));		\
    }
159
/* AMD64 variant of the tail copy: peel off an odd byte, word and dword
 * via three shift/branch steps, then copy the remaining n/8 qwords with
 * "rep movsq". */
#define small_memcpy_amd64(to,from,n)					\
    {									\
	__asm__ __volatile__(						\
		  " cld\n"						\
		  " shrq $1, %%rcx\n"					\
		  " jnc 1f\n"						\
		  " movsb\n"						\
		"1: shrq $1, %%rcx\n"					\
		  " jnc 2f\n"						\
		  " movsw\n"						\
		"2: shrq $1, %%rcx\n"					\
		  " jnc 3f\n"						\
		  " movsl\n"						\
		"3: rep ; movsq"					\
		  : "=&D" (to), "=&S" (from)				\
		  : "c" (n), "0" ((long) to), "1" ((long) from) 	\
		  : "memory", "cc");					\
      PDEBUG(ErrorF("Jong-small_memcpy_amd64(to,from,n)...\n"));		\
    }
179
/* Copy lcnt 64-byte blocks using four MMX registers (mm0-mm3), two
 * 32-byte bursts per iteration.  `prefetch` is a mnemonic string
 * ("prefetchnta", "prefetch", or "#" to disable) applied 320/352 bytes
 * ahead of the read pointer.  `dummy` just absorbs the clobbered loop
 * counter.  Plain "movq" stores go through the cache (no movntq here). */
#define MMX_CPY(prefetch,from,to,dummy,lcnt)				\
    __asm__ __volatile__ (						\
	        "1:\n"							\
		    prefetch "320(%1)\n"				\
	          " movq (%1), %%mm0\n"					\
		  " movq 8(%1), %%mm1\n"				\
		  " movq 16(%1), %%mm2\n"				\
		  " movq 24(%1), %%mm3\n"				\
		  " movq %%mm0, (%0)\n"					\
		  " movq %%mm1, 8(%0)\n"				\
		  " movq %%mm2, 16(%0)\n"				\
		  " movq %%mm3, 24(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movq 32(%1), %%mm0\n"				\
		  " movq 40(%1), %%mm1\n"				\
		  " movq 48(%1), %%mm2\n"				\
		  " movq 56(%1), %%mm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movq %%mm0, 32(%0)\n"				\
		  " movq %%mm1, 40(%0)\n"				\
		  " movq %%mm2, 48(%0)\n"				\
		  " movq %%mm3, 56(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt) 			\
		  : "memory", "cc");	\
        PDEBUG(ErrorF("Jong-MMX_CPY(prefetch,from,to,dummy,lcnt)...\n"));
209
210
/* Copy lcnt 64-byte blocks with SSE (i386 pointer arithmetic, "leal").
 * Source alignment is tested at runtime: unaligned sources use "movups",
 * 16-byte-aligned sources the faster "movaps".  Stores are always
 * non-temporal "movntps" (bypass the cache; caller issues FENCE after).
 * NOTE(review): movntps requires a 16-byte-aligned destination; the
 * macro only checks `from` — callers must guarantee `to` alignment. */
#define SSE_CPY(prefetch,from,to,dummy,lcnt)				\
    if((ULong) from & 15) {						\
	__asm__ __volatile__ (						\
		"1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movups (%1), %%xmm0\n"				\
		  " movups 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movups 32(%1), %%xmm2\n"				\
		  " movups 48(%1), %%xmm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc"); 					\
	      PDEBUG(ErrorF("Jong-SSE_CPY(prefetch,from,to,dummy,lcnt)-1...\n")); \
    } else {								\
	__asm__ __volatile__ (						\
		"2:\n"							\
		    prefetch "320(%1)\n"				\
		  " movaps (%1), %%xmm0\n"				\
		  " movaps 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
        	    prefetch "352(%1)\n"				\
		  " movaps 32(%1), %%xmm2\n"				\
		  " movaps 48(%1), %%xmm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 2b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc");					\
	      PDEBUG(ErrorF("Jong-SSE_CPY(prefetch,from,to,dummy,lcnt)-2...\n")); \
    }
255
/* AMD64 twin of SSE_CPY: identical logic but 64-bit pointer updates
 * ("leaq" instead of "leal").  Unaligned sources use movups, aligned
 * sources movaps; all stores are non-temporal movntps.
 * NOTE(review): as with SSE_CPY, destination 16-byte alignment is
 * assumed, not checked. */
#define SSE64_CPY(prefetch,from,to,dummy,lcnt)				\
    if((ULong) from & 15) {						\
	__asm__ __volatile__ (						\
		"1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movups (%1), %%xmm0\n"				\
		  " movups 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movups 32(%1), %%xmm2\n"				\
		  " movups 48(%1), %%xmm3\n"				\
		  " leaq 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leaq 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc"); 					\
	      PDEBUG(ErrorF("Jong-SSE64_CPY(prefetch,from,to,dummy,lcnt)-1...\n")); \
    } else {								\
	__asm__ __volatile__ (						\
		"2:\n"							\
		    prefetch "320(%1)\n"				\
		  " movaps (%1), %%xmm0\n"				\
		  " movaps 16(%1), %%xmm1\n"				\
		  " movntps %%xmm0, (%0)\n"				\
		  " movntps %%xmm1, 16(%0)\n"				\
        	    prefetch "352(%1)\n"				\
		  " movaps 32(%1), %%xmm2\n"				\
		  " movaps 48(%1), %%xmm3\n"				\
		  " leaq 64(%1),%1\n"					\
		  " movntps %%xmm2, 32(%0)\n"				\
		  " movntps %%xmm3, 48(%0)\n"				\
		  " decl %2\n"						\
		  " leaq 64(%0),%0\n"					\
		  " jne 2b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt)			\
		  : "memory", "cc");					\
	      PDEBUG(ErrorF("Jong-SSE64_CPY(prefetch,from,to,dummy,lcnt)-2...\n")); \
    }
300
/* Copy lcnt 64-byte blocks with extended MMX (MMX2/SSE integer ops):
 * like MMX_CPY but stores with non-temporal "movntq" (cache-bypassing;
 * caller must fence afterwards).  The ".p2align 4,,7" aligns the loop
 * head for the decoder. */
#define MMXEXT_CPY(prefetch,from,to,dummy,lcnt)				\
    __asm__ __volatile__ (						\
		  ".p2align 4,,7\n"					\
		 "1:\n"							\
		    prefetch "320(%1)\n"				\
		  " movq (%1), %%mm0\n"					\
		  " movq 8(%1), %%mm1\n"				\
		  " movq 16(%1), %%mm2\n"				\
		  " movq 24(%1), %%mm3\n"				\
		  " movntq %%mm0, (%0)\n"				\
		  " movntq %%mm1, 8(%0)\n"				\
		  " movntq %%mm2, 16(%0)\n"				\
		  " movntq %%mm3, 24(%0)\n"				\
		    prefetch "352(%1)\n"				\
		  " movq 32(%1), %%mm0\n"				\
		  " movq 40(%1), %%mm1\n"				\
		  " movq 48(%1), %%mm2\n"				\
		  " movq 56(%1), %%mm3\n"				\
		  " leal 64(%1),%1\n"					\
		  " movntq %%mm0, 32(%0)\n"				\
		  " movntq %%mm1, 40(%0)\n"				\
		  " movntq %%mm2, 48(%0)\n"				\
		  " movntq %%mm3, 56(%0)\n"				\
		  " decl %2\n"						\
		  " leal 64(%0),%0\n"					\
		  " jne 1b\n"						\
		  : "=&D"(to), "=&S"(from), "=&r"(dummy)		\
		  : "0" (to), "1" (from), "2" (lcnt) 			\
		  : "memory", "cc");	\
	      PDEBUG(ErrorF("Jong-MMXEXT_CPY(prefetch,from,to,dummy,lcnt)...\n"));
331
332
/* Generate a prefetching memcpy named prefix##_memcpy:
 *   - prefetch the first 320 source bytes (PREFETCH1)
 *   - run `begin` (e.g. EMMS/FEMMS to claim the MMX unit)
 *   - copy size/64 blocks with itype##_CPY using ptype's prefetch insn
 *   - copy the <64-byte remainder with `small`, bracketed by extra
 *     prefetches
 *   - run `fence` (sfence/emms) to drain non-temporal stores.
 * NOTE(review): the "static" was deliberately commented out here while
 * NOPREFETCH_FUNC below still emits static functions — confirm which
 * linkage the rest of the driver expects. */
#define PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small)		\
									\
/*    static void prefix##_memcpy(UChar *to,		*/		\
    void prefix##_memcpy(UChar *to,				\
				const UChar *from,			\
				int size)				\
    {									\
	int lcnt = size >> 6;						\
	int rest = size & 63;						\
	register int dummy;						\
								\
	PREFETCH1(ptype##_PREFETCH,from);				\
	begin;								\
    PDEBUG(ErrorF("Jong-After-begin()...\n"));	\
	if(lcnt) {							\
	   PDEBUG(ErrorF("Jong-Before-SSE_CPY()...\n"));		\
	   itype##_CPY(ptype##_PREFETCH,from,to,dummy,lcnt);		\
	   PDEBUG(ErrorF("Jong-After-SSE_CPY()...\n"));		\
	}								\
	if(rest) {							\
	   PDEBUG(ErrorF("Jong-Before-PREFETCH2()...\n"));	\
	   PREFETCH2(ptype##_PREFETCH,from);				\
	   PDEBUG(ErrorF("Jong-After-PREFETCH2()...\n"));	\
	   small(to, from, rest);					\
	   PDEBUG(ErrorF("Jong-After-small...\n"));		\
	   PREFETCH3(ptype##_PREFETCH,from);				\
	   PDEBUG(ErrorF("Jong-After-PREFETCH3()...\n"));	\
	}								\
    PDEBUG(ErrorF("Jong-Before-fence...\n"));	\
	fence;								\
    PDEBUG(ErrorF("Jong-PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small)-end...\n"));	\
    }
365
/* Generate a non-prefetching memcpy named prefix##_memcpy.  The literal
 * "#" passed as the prefetch mnemonic turns the prefetch lines inside
 * itype##_CPY into assembler comments, disabling them without a second
 * asm template. */
#define NOPREFETCH_FUNC(prefix,itype,begin,fence,small)			\
									\
    static void prefix##_memcpy(UChar *to,				\
				const UChar *from,			\
				int size)				\
    {									\
	int lcnt = size >> 6;						\
	int rest = size & 63;						\
	register int dummy;						\
									\
	begin;								\
	if(lcnt) {							\
	   itype##_CPY("#",from,to,dummy,lcnt);				\
	}								\
	if(rest) {							\
	   small(to, from, rest);					\
	}								\
	fence;								\
    PDEBUG(ErrorF("Jong-NOPREFETCH_FUNC(prefix,itype,ptype,begin,fence,small)...\n"));	\
    }
386
387/* Other archs */
388
389/* ... */
390
391
392#if 0 /* Jong 01/15/2009; ignore at the moment */
393/* Type for table for benchmark list */
394typedef struct {
395    vidCopyFunc  mFunc;
396    char         *mName;
397    unsigned int mycpuflag;
398    int          grade;
399    int 	 gradefrom;
400    Bool         reqAlignment;
401} XGIMCFuncData;
402#endif
403
404/************************************************************************/
405/*                   libc memcpy() wrapper - generic                    */
406/************************************************************************/
407#define UChar  unsigned char
408
409void XGI_libc_memcpy(UChar *dst, const UChar *src, int size)
410{
411    PDEBUG(ErrorF("Jong-XGI_libc_memcpy()...\n"));
412    memcpy(dst, src, size);
413}
414
415/************************************************************************/
416/* We only do all that stuff under gcc; no idea what other compilers 	*/
417/* would do with our asm code.  					*/
418/************************************************************************/
419
420#ifndef __GNUC__
421
422unsigned int XGIGetCPUFlags(ScrnInfoPtr pScrn)
423{
424    PDEBUG(ErrorF("Jong-XGIGetCPUFlags(ScrnInfoPtr pScrn)...\n"));
425    return 0;
426}
427
428vidCopyFunc XGIVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
429{
430    PDEBUG(ErrorF("Jong-XGIVidCopyInit()...\n"));
431    *UMemCpy = XGI_libc_memcpy;
432    return XGI_libc_memcpy;
433}
434
435vidCopyFunc XGIVidCopyGetDefault(void)
436{
437    PDEBUG(ErrorF("Jong-XGIVidCopyGetDefault()...\n"));
438    return XGI_libc_memcpy;
439}
440
441#else /* ! Everything below is gcc specific ! */
442
443/************************************************************************/
444/*                    Definitions for archs and OSes                    */
445/************************************************************************/
446
447#undef XGI_checkosforsse
448#undef XGI_canBenchmark
449#undef XGI_haveProc
450#undef XGI_haveBuiltInMC
451
452/* Jong Lin */
453#if defined(__arm__)
/* ARM copy routine: moves n words-worth of data in bursts of eight
 * registers (ldmia/stmia of r4-r11), then single words for the
 * remainder.
 *
 * NOTE(review): several red flags in this asm that should be confirmed:
 *  - the template hardcodes r0/r1/r2 but the operands use plain "r"
 *    constraints, so the compiler is free to place to/from/n in other
 *    registers (it only happens to work if they land in r0-r2);
 *  - r0/r1/r2 are modified but not declared as outputs or clobbers;
 *  - the labels (start, blockcopy, ...) are global — duplicated symbols
 *    if this is ever inlined twice;
 *  - n appears to be a word count, not a byte count (lsr #3 = /8 words).
 */
void XGI_builtin_memcpy_arm(UChar *to, const UChar *from, int n)
{
    long d1,d2,d3;  /* NOTE(review): unused — likely copied from the x86 variants */

    PDEBUG(ErrorF("XGI_builtin_memcpy_arm_begin-size=%d...\n", n));

    __asm__  __volatile__(
	" start: \n"
		   /* " stmfd sp!, {r0-r12}\n" */ /* Save some working registers */
	" blockcopy: \n"
		   " movs r3,r2, lsr #3\n" /* Number of eight word multiples */
		   " beq copywords\n" /* Less than eight words to move? */
		   " stmfd sp!, {r4-r11}\n" /* Save some working registers */
	" octcopy: \n"
		   " ldmia r1!, {r4-r11}\n" /* Load 8 words from the source */
		   " stmia r0!, {r4-r11}\n" /* Put them at the destination */
		   " subs r3, r3, #1\n" /* Decrement the counter */
		   " bne octcopy\n" /* ... copy more */
		   " ldmfd sp!, {r4-r11}\n" /* Don't need these now - restore */
	" copywords: \n"
		   " ands r2, r2, #7\n" /* Number of odd words to copy */
		   " beq stop\n" /* No words left to copy? */
	" wordcopy: \n"
		  " ldr r3, [r1], #4\n" /* Load a word from the source */
		  " str r3, [r0], #4\n" /* and store it to the destination */
		  " subs r2, r2, #1\n" /* Decrement the counter */
		  " bne wordcopy\n" /*  ... copy more */
	" stop: \n"
		  /* "ldmfd sp!, {r0-r12}\n" */ /* Don't need these now - restore */
		  :
		  :"r" ((long) to), "r" ((long) from), "r" ((unsigned long) n)
		  : "memory", "r3");

}
488#endif
489
490#if defined(__i386__) /* ***************************************** i386 */
491
492#define XGI_checkosforsse 	/* Does this cpu support sse and do we need to check os? */
493#define XGI_canBenchmark	/* Can we perform a benchmark? */
494#ifdef XGI_LINUX
495#define XGI_haveProc		/* Do we have /proc/cpuinfo or similar? */
496#endif
497#define XGI_haveBuiltInMC	/* Is there a built-in memcpy for this arch? */
498
/* Built-in memcpy for i386 */
/* WARNING(review): this macro silently redefines the standard type name
 * size_t to (signed) int for the rest of this translation unit — any
 * later code using size_t gets int semantics.  Kept as-is for
 * compatibility, but a prime candidate for removal. */
#define size_t	int
/* String-instruction memcpy: one movsb if n is odd, one movsw if n/2 is
 * odd, then "rep movsl" for the remaining dwords.  Returns `to` like
 * libc memcpy.  d1-d3 only receive the clobbered ecx/edi/esi. */
static __inline void * builtin_memcpy(void * to, const void * from, size_t n)
{
    int d1,d2,d3;

    __asm__ __volatile__(
		  " cld\n"
		  " shrl $1, %%ecx\n"
		  " jnc 1f\n"
		  " movsb\n"
		"1: shrl $1, %%ecx\n"
		  " jnc 2f\n"
		  " movsw\n"
		"2: rep ; movsl\n"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  : "0" (n), "1" ((long) to), "2" ((long) from)
		  : "memory", "cc");

    PDEBUG(ErrorF("Jong-__inline builtin_memcpy() for i386...\n"));
    return(to);
}
521
/* Alternative for 586: Unroll loop, copy 32 bytes at a time */
/* For copies > 32 bytes: first movsb up to a 4-byte-aligned destination,
 * then an unrolled loop moving 32 bytes per iteration through eax/edx,
 * then "rep movsb" for the tail.  The loads from (%%edi) before and
 * inside the loop touch the *destination* line ahead of the stores
 * (the "read-ahead" trick inherited from via_memcpy) — presumably to
 * pull the target cache line in early; confirm before changing. */
static void XGI_builtin_memcp2(UChar *to, const UChar *from, int n)
{
    int d1,d2,d3;

    __asm__ __volatile__(
		  " movl %%edi, %%eax\n"
		  " cmpl $32, %%ecx\n"
		  " cld\n"
		  " jbe 3f\n"
		  " negl %%eax\n"		/* Align dest */
		  " andl $3, %%eax\n"
		  " subl %%eax, %%ecx\n"
		  " xchgl %%eax, %%ecx\n"
		  " rep ; movsb\n"
		  " movl %%eax, %%ecx\n"
		  " subl $32, %%ecx\n"
		  " js 2f\n"
		  " movl (%%edi), %%eax\n"
		"1: movl 28(%%edi), %%edx\n"   	/* Trick: Read-ahead */
		  " subl $32, %%ecx\n"
		  " movl (%%esi), %%eax\n"
		  " movl 4(%%esi), %%edx\n"
		  " movl %%eax, (%%edi)\n"
		  " movl %%edx, 4(%%edi)\n"
		  " movl 8(%%esi), %%eax\n"
		  " movl 12(%%esi), %%edx\n"
		  " movl %%eax, 8(%%edi)\n"
		  " movl %%edx, 12(%%edi)\n"
		  " movl 16(%%esi), %%eax\n"
		  " movl 20(%%esi), %%edx\n"
		  " movl %%eax, 16(%%edi)\n"
		  " movl %%edx, 20(%%edi)\n"
		  " movl 24(%%esi), %%eax\n"
		  " movl 28(%%esi), %%edx\n"
		  " movl %%eax, 24(%%edi)\n"
		  " movl %%edx, 28(%%edi)\n"
		  " leal 32(%%esi), %%esi\n"
		  " leal 32(%%edi), %%edi\n"
		  " jns 1b\n"
		"2: addl $32, %%ecx\n"
		"3: rep ; movsb"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  : "0" (n), "1" ((long) to), "2" ((long) from)
		  : "eax", "edx", "memory", "cc");

    PDEBUG(ErrorF("Jong-XGI_builtin_memcp2()-copy 32 bytes at a time-586...\n"));
}
570
/* Read the low 32 bits of the time-stamp counter (rdtsc), serialized by
 * a preceding cpuid so earlier instructions cannot drift past the read.
 * ebx is saved/restored by hand because cpuid clobbers it and ebx is
 * the PIC register on i386. */
static unsigned int taketime(void)	/* get current time (for benchmarking) */
{
    unsigned int eax;

    __asm__ volatile (
		" pushl %%ebx\n"
		" cpuid\n"
		" rdtsc\n"
		" popl %%ebx\n"
		: "=a" (eax)
		: "0" (0)
		: "ecx", "edx", "cc");

    return(eax);
}
586
587#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /***************** AMD64 */
588
589#define XGI_checkosforsse	/* Does this cpu support sse and do we need to check os? */
590#define XGI_canBenchmark	/* Can we perform a benchmark? */
591#ifdef XGI_LINUX
592#define XGI_haveProc		/* Do we have /proc/cpuinfo or similar? */
593#endif
594#define XGI_haveBuiltInMC	/* Is there a built-in memcpy for this arch? */
595
596/* Built-in memcpy for AMD64 */
597static __inline void * builtin_memcpy(void * to, const void * from, int n)
598{
599    long d1, d2, d3;
600
601    __asm__ __volatile__ (
602		" cld\n"
603		" rep ; movsq\n"
604		" movq %4, %%rcx\n"
605		" rep ; movsb"
606		: "=%c" (d1), "=&D" (d2), "=&S" (d3)
607		: "0" ((ULong)(n >> 3)), "q" ((ULong)(n & 7)),
608		  "1" ((long) to), "2" ((long) from)
609		: "memory");
610
611    PDEBUG(ErrorF("Jong-builtin_memcpy() for AMD64...\n"));
612    return(to);
613}
614
/* Alternative: Unroll loop, copy 32 bytes at a time */
/* AMD64 unrolled copy: align the destination to 8 bytes with movsb,
 * move 32 bytes per iteration through rax/rdx/r8/r9, then "rep movsb"
 * for the tail.  Structure mirrors the i386 XGI_builtin_memcp2. */
static void XGI_builtin_memcp2(UChar *to, const UChar *from, int n)
{
    long d1,d2,d3;

    __asm__ __volatile__(
		  " movq %%rdi, %%rax\n"
		  " cmpq $32, %%rcx\n"
		  " cld\n"			/* Pipeline; no other flags but DF */
		  " jbe 1f\n"
		  " negq %%rax\n"		/* Align dest */
		  " andq $7, %%rax\n"
		  " subq %%rax, %%rcx\n"
		  " xchgq %%rax, %%rcx\n"
		  " rep ; movsb\n"
		  " movq %%rax, %%rcx\n"
		  " subq $32, %%rcx\n"
		  " js 2f\n"
		  ".p2align 4\n"
		"3: subq $32, %%rcx\n"
		  " movq (%%rsi), %%rax\n"
		  " movq 8(%%rsi), %%rdx\n"
		  " movq 16(%%rsi), %%r8\n"
		  " movq 24(%%rsi), %%r9\n"
		  " movq %%rax, (%%rdi)\n"
		  " movq %%rdx, 8(%%rdi)\n"
		  " movq %%r8, 16(%%rdi)\n"
		  " movq %%r9, 24(%%rdi)\n"
		  " leaq 32(%%rsi), %%rsi\n"
		  " leaq 32(%%rdi), %%rdi\n"
		  " jns 3b\n"
		"2: addq $32, %%rcx\n"
		"1: rep ; movsb"
		  : "=&c" (d1), "=&D" (d2), "=&S" (d3)
		  :"0" ((ULong) n), "1" ((long) to), "2" ((long) from)
		  : "rax", "rdx", "r8", "r9", "memory", "cc");

    PDEBUG(ErrorF("Jong-XGI_builtin_memcp2()-copy 32 bytes at a time-AMD...\n"));
}
654
/* AMD64 benchmark clock: cpuid serializes, rdtsc reads the TSC; only
 * the low 32 bits (eax) are returned.  rbx is preserved manually since
 * cpuid clobbers it. */
static unsigned int taketime(void)	/* get current time (for benchmarking) */
{
    unsigned int eax;

    __asm__ volatile (
		" pushq %%rbx\n"
		" cpuid\n"
		" rdtsc\n"
		" popq %%rbx\n"
		: "=a" (eax)
		: "0" (0)
		: "rcx", "rdx", "cc");

    return(eax);
}
670
671#else		/* **************************************** Other archs */
672
673/* 1. Can we do a benchmark?		*/
674/* #define XGI_canBenchmark		*/
675
676/* 2. Do we have /proc filesystem or similar for CPU information? */
677/* #define XGI_haveproc			*/
678
679/* 3. Optional: build-in memcpy()	*/
680/* #define XGI_haveBuiltInMC		*/
681/* static __inline void * builtin_memcpy(void * to, const void * from, int n)
682   {
683   }
684*/
685
686/* 4. Function for getting current time (for benchmarking)  */
687/* static unsigned int taketime(void)
688   {
689   }
690*/
691
692#endif
693
694/************************************************************************/
695/*                   Generic built-in memcpy wrapper                    */
696/************************************************************************/
697
698#ifdef XGI_haveBuiltInMC
699static void XGI_builtin_memcpy(UChar *dst, const UChar *src, int size)
700{
701    PDEBUG(ErrorF("Jong-XGI_builtin_memcpy()...\n"));
702    builtin_memcpy(dst, src, size);
703}
704#endif
705
706#if 0 /* Jong 01/15/2009; ignore at the moment */
707/************************************************************************/
708/* Generic routines if Benchmark can be performed (all archs, all OSes) */
709/************************************************************************/
710
711#ifdef XGI_canBenchmark
712
/* Time one invocation of mf copying `size` bytes buf2 -> buf1, in TSC
 * ticks.  The final expression compensates for a single 32-bit counter
 * wraparound between the two taketime() reads. */
static unsigned int time_function(vidCopyFunc mf, UChar *buf1, UChar *buf2, int size)
{
    unsigned int t1, t2;

    t1 = taketime();

    (*mf)(buf1, buf2, size);

    t2 = taketime();

    return((t1 <  t2) ? t2 - t1 : 0xFFFFFFFFU - (t1 - t2 - 1));
}
726
/* Allocate an area of offscreen FB memory (buf1), a simulated video
 * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
 * buf1 is rounded up to a 32-byte boundary inside the FB allocation;
 * returns the FB allocation handle, or NULL if any allocation failed
 * (in which case everything already allocated is released again). */
static void *
XGI_AllocBuffers(ScrnInfoPtr pScrn, UChar **buf1, UChar **buf2, UChar **buf3)
{
    XGIPtr pXGI = XGIPTR(pScrn);
    unsigned int offset;
    void *handle = NULL;

    /* +31 so the 32-byte round-up below stays inside the allocation */
    if(!(offset = XGIAllocateFBMemory(pScrn, &handle, BUFFERSIZE + 31))) {
       return NULL;
    }
    (*buf1) = (UChar *)pXGI->FbBase + offset;
    (*buf1) = (UChar *)(((ULong)(*buf1) + 31) & ~31);

    if(!((*buf2) = (UChar *)xalloc(BUFFERSIZE + 15))) {
       XGIFreeFBMemory(pScrn, &handle);
       return NULL;
    }

    if(!((*buf3) = (UChar *)xalloc(BUFFERSIZE + 15))) {
       xfree((*buf2));
       XGIFreeFBMemory(pScrn, &handle);
       return NULL;
    }

    return handle;
}
756#endif
757
/* Perform Benchmark */
/* Time every copy routine enabled in myCPUflags over BUFFERSIZE-byte
 * transfers between framebuffer (buf1) and system RAM (buf2/buf3), and
 * return the MCFunctions index of the fastest one.  (*best2) receives
 * the index of the fastest routine without an alignment requirement.
 * `from` selects the direction (video->system vs system->video); each
 * candidate is timed four times and the minimum is kept.  If frqBuf is
 * non-NULL the results are also reported in MiB/s using cpuFreq (MHz). */
static int XGI_BenchmarkMemcpy(ScrnInfoPtr pScrn, XGIMCFuncData *MCFunctions,
                               unsigned int myCPUflags, UChar *buf1, UChar *buf2,
			       UChar *buf3, char *frqBuf, double cpuFreq,
			       vidCopyFunc *UMemCpy, int *best2, Bool from)
{
    XGIMCFuncData *curData;
    int j = 0, bestSoFar = 0;
    unsigned int tmp1, tmp2, best = 0xFFFFFFFFU, sbest = 0xFFFFFFFFU;

    (*best2) = 0;

    /* Touch buf1 and buf2 once so it is likely they are paged in while
     * we measure */
    XGI_libc_memcpy(buf1, buf2, BUFFERSIZE);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
	       "Benchmarking %s RAM to %s RAM memory transfer methods:\n",
	       from ? "video" : "system",
	       from ? "system" : "video");

#ifdef TWDEBUG
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Benchmark: CPUFlags %x\n", myCPUflags);
#endif

    j = 0;
    while(MCFunctions[j].mFunc) {
	PDEBUG(ErrorF("Check function[%d]-%s...\n", j, MCFunctions[j].mName));

	curData = MCFunctions + j;

	if(myCPUflags & curData->mycpuflag) {

	   /* Simulate setup of the video buffer and copy result to framebuffer */
	   /* Do this 4 times to verify results */
	   if(!from) {
	      XGI_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      tmp1 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
	      XGI_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	      XGI_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	      XGI_builtin_memcpy(buf2, buf3, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	   } else {
	      XGI_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      tmp1 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
	      XGI_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	      XGI_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	      XGI_builtin_memcpy(buf3, buf2, BUFFERSIZE);
	      tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
	      tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
	   }

	   /* Without a known CPU frequency only raw tick counts can be shown */
	   if((!frqBuf) || (tmp1 == 0)) {
	      xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
			   "\tChecked %s memcpy()... \t%u\n",curData->mName, tmp1);
	   } else {
	      xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
			   "\tChecked %s memcpy()... \t%.1f MiB/s\n",
			   curData->mName,
			   cpuFreq * 1.e6 * (double)BUFFERSIZE / ((double)(tmp1) * (double)(0x100000)));
	   }

	   if(tmp1 < best) {
	      best = tmp1;
	      bestSoFar = j;
		  PDEBUG(ErrorF("* bestSoFar is %d...\n", bestSoFar));
	   }

	   if(!curData->reqAlignment) {
	      if(tmp1 < sbest) {
	         sbest = tmp1;
	         (*best2) = j;
	      }
	   }

	}

	j++;
    }

    PDEBUG(ErrorF("***** The best is func(%d)-%s...\n", bestSoFar, MCFunctions[bestSoFar].mName));

	/* Jong 01/07/2009; test for performance */
	/* bestSoFar = 2; */ /* the worst case; built-in-2 */
    return bestSoFar;
}
852
/* Fallback selection when no benchmark is run: pick the enabled copy
 * routine with the best (lowest) static grade for the requested
 * direction.  Returns the best routine and stores the best routine
 * without alignment requirements in *UMemCpy; both default to the libc
 * wrapper if nothing matches myCPUflags. */
static vidCopyFunc XGI_GetBestByGrade(ScrnInfoPtr pScrn, XGIMCFuncData *MCFunctions,
			unsigned int myCPUflags, vidCopyFunc *UMemCpy, Bool from)
{
    int j = 0, best = -1, secondbest = -1, bestSoFar = 10, best2SoFar = 10;
    int grade;

    *UMemCpy = XGI_libc_memcpy;

    while(MCFunctions[j].mFunc) {
	if(myCPUflags & MCFunctions[j].mycpuflag) {
	   /* each entry carries separate grades for the two directions */
	   grade = from ? MCFunctions[j].gradefrom : MCFunctions[j].grade;
	   if(grade < bestSoFar) {
	      best = j;
	      bestSoFar = grade;
	   }
	   if(grade < best2SoFar) {
	      if(!MCFunctions[j].reqAlignment) {
	         secondbest = j;
		 best2SoFar = grade;
	      }
	   }
	}
	j++;
    }
    if(best >= 0) {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
		"Chose %s method for aligned data transfers %s video RAM\n",
		MCFunctions[best].mName,
		from ? "from" : "to");
       if(secondbest >= 0) {
          xf86DrvMsg(pScrn->scrnIndex, X_INFO,
		"Chose %s method for unaligned data transfers %s video RAM\n",
		   MCFunctions[secondbest].mName,
		   from ? "from" : "to");
          *UMemCpy = MCFunctions[secondbest].mFunc;
       }
       return MCFunctions[best].mFunc;
    }

    return XGI_libc_memcpy;
}
894#endif /* canBenchmark */
895
896/**********************************************************************/
897/*      Generic routines if /proc filesystem is available (Linux)     */
898/**********************************************************************/
899
900#ifdef XGI_haveProc
901/* Linux: Read file (/proc/cpuinfo) into buffer */
902static int XGI_ReadProc(char *buf, char *filename)
903{
904    FILE *cpuInfoFile;
905    int count;
906
907    if((cpuInfoFile = fopen(filename, "r")) == NULL) {
908       return 0;
909    }
910
911    count = fread(buf, 1, CPUBUFFERSIZE, cpuInfoFile);
912    if(ferror(cpuInfoFile)) {
913       fclose(cpuInfoFile);
914       return 0;
915    }
916
917    fclose(cpuInfoFile);
918
919    if(count >= CPUBUFFERSIZE - 2) {
920       return 0;
921    }
922
923    buf[count] = 0;
924
925    return count;
926}
927
/* Linux: Extract CPU speed from /proc/cpuinfo */
/* Scans buf for the "cpu MHz" line, parses the frequency into *cpuFreq
 * and returns a pointer into buf just past the match (NULL on failure).
 * NOTE(review): depends on the exact formatting "cpu MHz\t\t: " — the
 * "+= 11" skips the 10-character match plus the following space; verify
 * against current kernels' /proc/cpuinfo layout. */
static char *XGI_GetCPUFreq(ScrnInfoPtr pScrn, char *buf, double *cpuFreq)
{
    char *frqBuf, *endBuf;

    (*cpuFreq) = 0.0;

    if((frqBuf = strstr(buf,"cpu MHz\t\t:"))) {
       frqBuf += 11;
       (*cpuFreq) = strtod(frqBuf, &endBuf);
       if(endBuf == frqBuf) frqBuf = NULL;     /* nothing parsed */
       if((*cpuFreq) < 10.0) frqBuf = NULL; /* sanity check */
       if(frqBuf) {
          xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU frequency %.2fMhz\n", (*cpuFreq));
       }
    }

    return frqBuf;
}
947#endif /* haveProc */
948
949/**********************************************************************/
950/*                      Arch-specific routines                        */
951/**********************************************************************/
952
953#ifdef XGI_checkosforsse   /* Common i386, AMD64  */
954
955#ifdef XGICHECKOSSSE
956
957#ifndef XFree86LOADER
958#include <setjmp.h>
959#endif
960
static jmp_buf sigill_return;

/* SIGILL handler installed while probing OS SSE support: bail out of
 * the offending instruction via longjmp.
 * NOTE(review): longjmp() out of a signal handler is only safe under
 * narrow conditions (POSIX recommends siglongjmp); acceptable here
 * because the probe is single-threaded and immediate. */
static void sigill_handler(void)
{
    longjmp(sigill_return, 1);
}
967#endif
968
969#if 0 /* Jong 01/15/2009; ignore at the moment */
/* Probe whether the OS saves/restores SSE state: execute "xorps" with a
 * SIGILL handler armed; if the signal fires the OS (or CPU) does not
 * support SSE and FALSE is returned.  Without XGICHECKOSSSE the answer
 * is taken from the user's "UseSSE" option instead. */
static Bool CheckOSforSSE(ScrnInfoPtr pScrn)
{
#ifdef XGICHECKOSSSE  /* Check OS for SSE possible: */
    int signo = -1;

#ifdef XGIDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Checking OS SSE support\n");
#endif

    xf86InterceptSigIll(&sigill_handler);

    if(setjmp(sigill_return)) {
       signo = 4;   /* got SIGILL: SSE unusable */
    } else {
       __asm__ __volatile__ (" xorps %xmm0, %xmm0\n");
       /* __asm__ __volatile__ (" .byte 0xff\n"); */  /* For test */
    }

    xf86InterceptSigIll(NULL);

#ifdef XGIDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "OS SSE support signal %d\n", signo);
#endif

    if(signo != -1) {
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
		"OS does not support SSE instructions\n");
    }

    return (signo >= 0) ? FALSE : TRUE;

#else  /* no check for SSE possible: */

    XGIPtr pXGI = XGIPTR(pScrn);

    xf86DrvMsg(pScrn->scrnIndex, pXGI->XvSSEMemcpy ? X_WARNING : X_INFO,
	"Checking OS for SSE support is not supported in this version of " XGIMYSERVERNAME "\n");

    if(pXGI->XvSSEMemcpy) {
       xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
	"If you get a signal 4 here, set the option \"UseSSE\" to \"off\".\n");
       return TRUE;
    } else {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
	"If your OS supports SSE, set the option \"UseSSE\" to \"on\".\n");
       return FALSE;
    }
#endif
}
1019
1020#endif /* XGI_checkosforsse */
1021#endif
1022
1023#ifdef __i386__   /* i386 specific *************************************/
1024#define ULong	unsigned long
1025
/* Instantiate the candidate copy routines for i386.  PREFETCH_FUNC /
 * NOPREFETCH_FUNC are macros defined earlier in this file that expand
 * into complete memcpy-style functions (XGI_sse_memcpy etc.).
 * NOTE(review): the argument lists appear to select the copy/prefetch
 * instruction set and the prologue/epilogue (EMMS/FEMMS/FENCE) plus the
 * small-copy fallback — confirm against the macro definitions above. */
PREFETCH_FUNC(XGI_sse,SSE,SSE,,FENCE,small_memcpy_i386)
PREFETCH_FUNC(XGI_mmxext,MMXEXT,SSE,EMMS,FENCEMMS,small_memcpy_i386)
PREFETCH_FUNC(XGI_now,MMX,NOW,FEMMS,FEMMS,small_memcpy_i386)
NOPREFETCH_FUNC(XGI_mmx,MMX,EMMS,EMMS,small_memcpy_i386)
1030
1031#if 0 /* Jong 01/15/2009; ignore at the moment */
/* Candidate memcpy() implementations to benchmark on i386, terminated
 * by a NULL function entry.
 * Fields per XGIMCFuncData (declared elsewhere): routine, display name,
 * required XGI_CPUFL_* capability bit, then two small integers and a
 * Bool.  NOTE(review): the integers are presumably fallback grades
 * (aligned / unaligned, lower = preferred) and the Bool marks the SSE
 * entry — confirm against the XGIMCFuncData declaration. */
static XGIMCFuncData MCFunctions_i386[] = {
    {XGI_libc_memcpy,   "libc",      XGI_CPUFL_LIBC,  4,  4, FALSE},
    {XGI_builtin_memcpy,"built-in-1",XGI_CPUFL_BI,    5,  5, FALSE},
    {XGI_builtin_memcp2,"built-in-2",XGI_CPUFL_BI2,   6,  6, FALSE},
    {XGI_mmx_memcpy,    "MMX",       XGI_CPUFL_MMX,   3,  3, FALSE},
    {XGI_sse_memcpy,    "SSE",       XGI_CPUFL_SSE,   1,  0, TRUE},
    {XGI_now_memcpy,    "3DNow!",    XGI_CPUFL_3DNOW, 2,  2, FALSE},
    {XGI_mmxext_memcpy, "MMX2",      XGI_CPUFL_MMX2,  0,  1, FALSE},
    {NULL,              "",          0,              10, 10, FALSE}
};
1042#endif
1043
1044#define Def_FL  (XGI_CPUFL_LIBC | XGI_CPUFL_BI | XGI_CPUFL_BI2)  /* Default methods */
1045
/*
 * cpuid(op, eax, ebx, ecx, edx): execute CPUID with EAX = op and store
 * the four result registers into the given lvalues.
 * EBX is explicitly saved/restored around the instruction and its value
 * returned through a scratch register ("=r"), because on i386 EBX is
 * the PIC/GOT base register and must not be clobbered in
 * position-independent code.
 */
#define cpuid(op, eax, ebx, ecx, edx) 		\
    __asm__ __volatile__ (			\
		" pushl %%ebx\n"		\
		" cpuid\n"			\
		" movl %%ebx, %1\n"		\
		" popl %%ebx\n"			\
		: "=a" (eax), "=r" (ebx), 	\
		  "=c" (ecx), "=d" (edx)	\
		: "a" (op)			\
		: "cc")
1056
1057#if 0 /* Jong 01/15/2009; ignore at the moment */
/*
 * Check that this i386-class CPU supports the CPUID and RDTSC
 * instructions (both are needed before feature detection /
 * benchmarking).  Returns TRUE when both are available.
 */
static Bool cpuIDSupported(ScrnInfoPtr pScrn)
{
    int eax, ebx, ecx, edx;

    /* Check for cpuid instruction */
    /* Classic EFLAGS trick: if the ID bit (0x200000, bit 21) can be
     * toggled, the CPU implements CPUID.  Read EFLAGS into eax, keep a
     * copy in ecx, flip the ID bit, write it back, re-read, compare. */
    __asm__ __volatile__ (
		" pushf\n"
		" popl %0\n"
		" movl %0, %1\n"
		" xorl $0x200000, %0\n"
		" push %0\n"
		" popf\n"
		" pushf\n"
		" popl %0\n"
		: "=a" (eax), "=c" (ecx)
		:
		: "cc");

    if(eax == ecx) {
       /* ID bit did not stick: pre-CPUID CPU (486 or older) */
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support CPUID instruction\n");
       return FALSE;
    }

    /* Check for cpuid level */
    /* Leaf 0 returns the maximum supported standard leaf in EAX;
     * zero means leaf 1 (feature bits) is unavailable. */
    cpuid(0x00000000, eax, ebx, ecx, edx);
    if(!eax) {
       return FALSE;
    }

    /* Check for RDTSC */
    /* Leaf 1, EDX bit 4 (0x10) = TSC present */
    cpuid(0x00000001, eax, ebx, ecx, edx);

    if(!(edx & 0x10)) {
       xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support RDTSC instruction\n");
       return FALSE;
    }

    return TRUE;
}
1097
1098static unsigned int XGI_GetCpuFeatures(ScrnInfoPtr pScrn)
1099{
1100    unsigned int flags = 0, eax, ebx, ecx, edx;
1101    Bool IsAMD;
1102
1103    /* Check if cpuid and rdtsc instructions are supported */
1104    if(!cpuIDSupported(pScrn)) {
1105       return 0;
1106    }
1107
1108    cpuid(0x00000000, eax, ebx, ecx, edx);
1109
1110    IsAMD = (ebx == 0x68747541) && (edx == 0x69746e65) && (ecx == 0x444d4163);
1111
1112    cpuid(0x00000001, eax, ebx, ecx, edx);
1113    /* MMX */
1114    if(edx & 0x00800000) flags |= XGI_CPUFL_MMX;
1115    /* SSE, MMXEXT */
1116    if(edx & 0x02000000) flags |= (XGI_CPUFL_SSE | XGI_CPUFL_MMX2);
1117    /* SSE2 - don't need this one directly, set SSE instead */
1118    if(edx & 0x04000000) flags |= (XGI_CPUFL_SSE | XGI_CPUFL_SSE2);
1119
1120    cpuid(0x80000000, eax, ebx, ecx, edx);
1121    if(eax >= 0x80000001) {
1122       cpuid(0x80000001, eax, ebx, ecx, edx);
1123       /* 3DNow! */
1124       if(edx & 0x80000000) flags |= XGI_CPUFL_3DNOW;
1125       /* AMD MMXEXT */
1126       if(IsAMD && (edx & 0x00400000)) flags |= XGI_CPUFL_MMX2;
1127    }
1128
1129    return flags;
1130}
1131#endif
1132
1133#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /* AMD64 specific ***** */
1134
1135PREFETCH_FUNC(XGI_sse,SSE64,SSE,,FENCE,small_memcpy_amd64)
1136
1137#if 0 /* Jong 01/15/2009; ignore at the moment */
1138
/* Candidate memcpy() implementations to benchmark on AMD64/x86_64,
 * terminated by a NULL function entry.  Same field layout as the i386
 * table: routine, display name, required XGI_CPUFL_* bit, two small
 * integers and a Bool.  NOTE(review): integers are presumably fallback
 * grades (aligned / unaligned) — confirm against XGIMCFuncData. */
static XGIMCFuncData MCFunctions_AMD64[] = {
    {XGI_libc_memcpy,   "libc",      XGI_CPUFL_LIBC, 2,  2, FALSE},
    {XGI_builtin_memcpy,"built-in-1",XGI_CPUFL_BI,   1,  1, FALSE},
    {XGI_builtin_memcp2,"built-in-2",XGI_CPUFL_BI2,  3,  3, FALSE},
    {XGI_sse_memcpy,    "SSE",       XGI_CPUFL_SSE,  0,  0, TRUE},
    {NULL,              "",          0,             10, 10, FALSE}
};
1146
1147#define Def_FL  (XGI_CPUFL_LIBC | XGI_CPUFL_BI | XGI_CPUFL_BI2)
1148
1149static unsigned int XGI_GetCpuFeatures(ScrnInfoPtr pScrn)
1150{
1151    return((unsigned int)(XGI_CPUFL_SSE|XGI_CPUFL_SSE2));
1152}
1153#endif
1154
1155#else  /* Specific for other archs ******************************** */
1156
1157/* Fill in here */
1158#if 0 /* Jong 01/15/2009; ignore at the moment */
1159
1160#define Def_FL  (XGI_CPUFL_LIBC)
1161
1162static unsigned int XGI_GetCpuFeatures(ScrnInfoPtr pScrn)
1163{
1164    return((unsigned int)(0));
1165}
1166
1167#endif
1168
1169#endif
1170
1171/**********************************************************************/
1172/*     Benchmark the video copy routines and choose the fastest       */
1173/**********************************************************************/
1174
1175#if 0 /* Jong 01/15/2009; ignore at the moment */
1176#ifdef XGI_canBenchmark
/*
 * Benchmark the candidate copy routines in MCFunctions and return the
 * fastest one for aligned transfers; *UMemCpy receives the best routine
 * for unaligned transfers.  'from' selects the direction being measured
 * (TRUE = from video RAM, FALSE = to video RAM) and is only used for
 * the log messages here.
 *
 * Falls back to XGI_libc_memcpy when benchmarking is disabled, and to a
 * grade-based choice when the framebuffer test buffers cannot be
 * allocated.
 */
static vidCopyFunc
XGIVidCopyInitGen(ScreenPtr pScreen, XGIMCFuncData *MCFunctions, vidCopyFunc *UMemCpy, Bool from)
{
    ScrnInfoPtr pScrn = xf86Screens[pScreen->myNum];
    XGIPtr pXGI = XGIPTR(pScrn);
    void *fbhandle = NULL;
    char  *frqBuf = NULL;
    UChar *buf1, *buf2, *buf3;
    double cpuFreq = 0.0;
    unsigned int myCPUflags = pXGI->CPUFlags | Def_FL;
    int best, secondbest;
#ifdef XGI_haveProc
    char buf[CPUBUFFERSIZE];
#endif

	PDEBUG(ErrorF("---XGIVidCopyInitGen()...begin\n"));
    *UMemCpy = XGI_libc_memcpy;

    /* Bail out if user disabled benchmarking */
    if(!pXGI->BenchMemCpy) {
	   PDEBUG(ErrorF("---return XGI_libc_memcpy() as vidCopyFunc...\n"));
       return XGI_libc_memcpy;
    }

#ifdef XGI_haveProc
    /* Read /proc/cpuinfo into buf */
    if(XGI_ReadProc(buf, "/proc/cpuinfo")) {

       /* Extract CPU frequency */
       frqBuf = XGI_GetCPUFreq(pScrn, buf, &cpuFreq);

    }
#endif

    /* Allocate buffers; buf1:LFB; buf2,buf3:system memory */
    if(!(fbhandle = XGI_AllocBuffers(pScrn, &buf1, &buf2, &buf3))) {
       xf86DrvMsg(pScrn->scrnIndex, X_INFO,
       		"Failed to allocate video RAM for video data transfer benchmark\n");
       /* No framebuffer buffer: choose by static grade instead of timing */
       return XGI_GetBestByGrade(pScrn, MCFunctions, myCPUflags, UMemCpy, from);
    }

    /* Perform Benchmark */
    /* buf2/buf3 are rounded up to the next 16-byte boundary because the
     * SSE/MMX copy variants require aligned buffers */
    PDEBUG(ErrorF("---Perform XGI_BenchmarkMemcpy()...\n"));
    best = XGI_BenchmarkMemcpy(pScrn, MCFunctions, myCPUflags, buf1,
    				(UChar *)(((unsigned long)buf2 + 15) & ~15),
				(UChar *)(((unsigned long)buf3 + 15) & ~15),
				frqBuf, cpuFreq, UMemCpy, &secondbest, from);

    /* Free buffers */
    XGIFreeFBMemory(pScrn, &fbhandle);
    xfree(buf2);
    xfree(buf3);

    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
	       "Using %s method for aligned data transfers %s video RAM\n",
	       MCFunctions[best].mName,
	       from ? "from" : "to");

    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
	       "Using %s method for unaligned data transfers %s video RAM\n",
	       MCFunctions[secondbest].mName,
	       from ? "from" : "to");

    return MCFunctions[best].mFunc;
}
1242#endif /* canBenchmark */
1243#endif
1244
1245/**********************************************************************/
1246/* 		       main(): Get CPU capabilities		      */
1247/* 			    (called externally)			      */
1248/**********************************************************************/
1249#if 0 /* Jong 01/15/2009; ignore at the moment */
1250
1251unsigned int
1252XGIGetCPUFlags(ScrnInfoPtr pScrn)
1253{
1254    unsigned int myCPUflags = XGI_GetCpuFeatures(pScrn);
1255
1256#ifdef XGI_checkosforsse
1257    if(myCPUflags & (XGI_CPUFL_SSE | XGI_CPUFL_SSE2)) {
1258
1259       /* Check if OS supports usage of SSE instructions */
1260       if(!(CheckOSforSSE(pScrn))) {
1261          myCPUflags &= ~(XGI_CPUFL_SSE | XGI_CPUFL_SSE2);
1262       }
1263
1264    }
1265#endif
1266
1267    return myCPUflags;
1268}
1269
1270#endif
1271
1272/**********************************************************************/
1273/*                       main(): XGIVidCopyInit()                     */
1274/*			    (called externally)			      */
1275/*		(XGIGetCPUFlags must be called before this one)       */
1276/**********************************************************************/
1277
1278#if 0 /* Jong 01/15/2009; ignore at the moment */
/*
 * Externally-called entry point: choose the video-data copy routine for
 * this screen.  On i386 and AMD64/x86_64 (with benchmarking compiled
 * in) this benchmarks the arch-specific candidate table; on all other
 * architectures it falls back to libc memcpy().  *UMemCpy receives the
 * routine to use for unaligned transfers; 'from' selects the transfer
 * direction being tuned.
 */
vidCopyFunc XGIVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
/* Jong 01/08/2009; test for performance */
#if defined(__i386__) && defined(XGI_canBenchmark)
	PDEBUG(ErrorF("XGIVidCopyInit()-i386...\n"));
    return(XGIVidCopyInitGen(pScreen, MCFunctions_i386, UMemCpy, from));
#elif (defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__)) && defined(XGI_canBenchmark)
	PDEBUG(ErrorF("XGIVidCopyInit()-AMD64-x86_64...\n"));
    return(XGIVidCopyInitGen(pScreen, MCFunctions_AMD64, UMemCpy, from));
#else /* Other cases: Use libc memcpy() */
    *UMemCpy = XGI_libc_memcpy;
	PDEBUG(ErrorF("XGIVidCopyInit()-generic...\n"));
    return XGI_libc_memcpy;
#endif
}
1294#endif
1295
1296#if 0 /* Jong 01/15/2009; ignore at the moment */
/* Return the default copy routine (the libc-based implementation),
 * used when no benchmarking/detection has been performed. */
vidCopyFunc XGIVidCopyGetDefault(void)
{
    return XGI_libc_memcpy;
}
1301#endif
1302
1303#endif /* GNU C */
1304
1305
1306