/*
 * SiS memcpy() routines (assembly)
 *
 * Copyright (C) 2004-2005 Thomas Winischhofer
 *
 * Idea and some code bits from via_memcpy.c which is
 * Copyright (C) 2004 Thomas Hellstroem, All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE CODE SUPPLIER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdlib.h>
#include "sis.h"

#if 0 /* Debug */
#define SISDGBMC
#endif

extern unsigned int SISAllocateFBMemory(ScrnInfoPtr pScrn, void **handle, int bytesize);
extern void SISFreeFBMemory(ScrnInfoPtr pScrn, void **handle);

#define CPUBUFFERSIZE 2048           /* Size of /proc/cpuinfo buffer */
#define BUFFERSIZE (576 * 1152)      /* Matches 720x576 YUV420 */

/************************************************************************/
/*                 arch specific memcpy() routines                      */
/************************************************************************/

/* i386, AMD64 */

#define FENCE \
    __asm__ __volatile__( \
        " sfence\n" \
        : \
        : \
        : "memory");

#define FENCEMMS \
    __asm__ __volatile__ ( \
        " sfence\n" \
        " emms\n" \
        : \
        : \
        : "memory");

#define FEMMS \
    __asm__ __volatile__( \
        " femms\n" \
        : \
        : \
        : "memory");

#define EMMS \
    __asm__ __volatile__( \
        " emms\n" \
        : \
        : \
        : "memory");

#define SSE_PREFETCH " prefetchnta "
#define NOW_PREFETCH " prefetch "

#define PREFETCH1(arch_prefetch,from) \
    __asm__ __volatile__ ( \
        arch_prefetch "(%0)\n" \
        arch_prefetch "32(%0)\n" \
        arch_prefetch "64(%0)\n" \
        arch_prefetch "96(%0)\n" \
        arch_prefetch "128(%0)\n" \
        arch_prefetch "160(%0)\n" \
        arch_prefetch "192(%0)\n" \
        arch_prefetch "256(%0)\n" \
        arch_prefetch "288(%0)\n" \
        : \
        : "r" (from) );

#define PREFETCH2(arch_prefetch,from) \
    __asm__ __volatile__ ( \
        arch_prefetch "320(%0)\n" \
        : \
        : "r" (from) );

#define PREFETCH3(arch_prefetch,from) \
    __asm__ __volatile__ ( \
        arch_prefetch "288(%0)\n" \
        : \
        : "r" (from) );

#define small_memcpy_i386(to,from,n) \
    { \
        __asm__ __volatile__( \
            " cld\n" \
            " shrl $1, %%ecx\n" \
            " jnc 1f\n" \
            " movsb\n" \
            "1: shrl $1, %%ecx\n" \
            " jnc 2f\n" \
            " movsw\n" \
            "2: rep ; movsl" \
            : "=&D" (to), "=&S" (from) \
            : "c" (n), "0" ((long) to), "1" ((long) from) \
            : "memory", "cc"); \
    }

#define small_memcpy_amd64(to,from,n) \
    { \
        __asm__ __volatile__( \
            " cld\n" \
            " shrq $1, %%rcx\n" \
            " jnc 1f\n" \
            " movsb\n" \
            "1: shrq $1, %%rcx\n" \
            " jnc 2f\n" \
            " movsw\n" \
            "2: shrq $1, %%rcx\n" \
            " jnc 3f\n" \
            " movsl\n" \
            "3: rep ; movsq" \
            : "=&D" (to), "=&S" (from) \
            : "c" (n), "0" ((long) to), "1" ((long) from) \
            : "memory", "cc"); \
    }

#define MMX_CPY(prefetch,from,to,dummy,lcnt) \
    __asm__ __volatile__ ( \
        "1:\n" \
        prefetch "320(%1)\n" \
        " movq (%1), %%mm0\n" \
        " movq 8(%1), %%mm1\n" \
        " movq 16(%1), %%mm2\n" \
        " movq 24(%1), %%mm3\n" \
        " movq %%mm0, (%0)\n" \
        " movq %%mm1, 8(%0)\n" \
        " movq %%mm2, 16(%0)\n" \
        " movq %%mm3, 24(%0)\n" \
        prefetch "352(%1)\n" \
        " movq 32(%1), %%mm0\n" \
        " movq 40(%1), %%mm1\n" \
        " movq 48(%1), %%mm2\n" \
        " movq 56(%1), %%mm3\n" \
        " leal 64(%1),%1\n" \
        " movq %%mm0, 32(%0)\n" \
        " movq %%mm1, 40(%0)\n" \
        " movq %%mm2, 48(%0)\n" \
        " movq %%mm3, 56(%0)\n" \
        " decl %2\n" \
        " leal 64(%0),%0\n" \
        " jne 1b\n" \
        : "=&D"(to), "=&S"(from), "=&r"(dummy) \
        : "0" (to), "1" (from), "2" (lcnt) \
        : "memory", "cc");

#define SSE_CPY(prefetch,from,to,dummy,lcnt) \
    if((ULong) from & 15) { \
        __asm__ __volatile__ ( \
            "1:\n" \
            prefetch "320(%1)\n" \
            " movups (%1), %%xmm0\n" \
            " movups 16(%1), %%xmm1\n" \
            " movntps %%xmm0, (%0)\n" \
            " movntps %%xmm1, 16(%0)\n" \
            prefetch "352(%1)\n" \
            " movups 32(%1), %%xmm2\n" \
            " movups 48(%1), %%xmm3\n" \
            " leal 64(%1),%1\n" \
            " movntps %%xmm2, 32(%0)\n" \
            " movntps %%xmm3, 48(%0)\n" \
            " decl %2\n" \
            " leal 64(%0),%0\n" \
            " jne 1b\n" \
            : "=&D"(to), "=&S"(from), "=&r"(dummy) \
            : "0" (to), "1" (from), "2" (lcnt) \
            : "memory", "cc"); \
    } else { \
        __asm__ __volatile__ ( \
            "2:\n" \
            prefetch "320(%1)\n" \
            " movaps (%1), %%xmm0\n" \
            " movaps 16(%1), %%xmm1\n" \
            " movntps %%xmm0, (%0)\n" \
            " movntps %%xmm1, 16(%0)\n" \
            prefetch "352(%1)\n" \
            " movaps 32(%1), %%xmm2\n" \
            " movaps 48(%1), %%xmm3\n" \
            " leal 64(%1),%1\n" \
            " movntps %%xmm2, 32(%0)\n" \
            " movntps %%xmm3, 48(%0)\n" \
            " decl %2\n" \
            " leal 64(%0),%0\n" \
            " jne 2b\n" \
            : "=&D"(to), "=&S"(from), "=&r"(dummy) \
            : "0" (to), "1" (from), "2" (lcnt) \
            : "memory", "cc"); \
    }

#define SSE64_CPY(prefetch,from,to,dummy,lcnt) \
    if((ULong) from & 15) { \
        __asm__ __volatile__ ( \
            "1:\n" \
            prefetch "320(%1)\n" \
            " movups (%1), %%xmm0\n" \
            " movups 16(%1), %%xmm1\n" \
            " movntps %%xmm0, (%0)\n" \
            " movntps %%xmm1, 16(%0)\n" \
            prefetch "352(%1)\n" \
            " movups 32(%1), %%xmm2\n" \
            " movups 48(%1), %%xmm3\n" \
            " leaq 64(%1),%1\n" \
            " movntps %%xmm2, 32(%0)\n" \
            " movntps %%xmm3, 48(%0)\n" \
            " decl %2\n" \
            " leaq 64(%0),%0\n" \
            " jne 1b\n" \
            : "=&D"(to), "=&S"(from), "=&r"(dummy) \
            : "0" (to), "1" (from), "2" (lcnt) \
            : "memory", "cc"); \
    } else { \
        __asm__ __volatile__ ( \
            "2:\n" \
            prefetch "320(%1)\n" \
            " movaps (%1), %%xmm0\n" \
            " movaps 16(%1), %%xmm1\n" \
            " movntps %%xmm0, (%0)\n" \
            " movntps %%xmm1, 16(%0)\n" \
            prefetch "352(%1)\n" \
            " movaps 32(%1), %%xmm2\n" \
            " movaps 48(%1), %%xmm3\n" \
            " leaq 64(%1),%1\n" \
            " movntps %%xmm2, 32(%0)\n" \
            " movntps %%xmm3, 48(%0)\n" \
            " decl %2\n" \
            " leaq 64(%0),%0\n" \
            " jne 2b\n" \
            : "=&D"(to), "=&S"(from), "=&r"(dummy) \
            : "0" (to), "1" (from), "2" (lcnt) \
            : "memory", "cc"); \
    }

#define MMXEXT_CPY(prefetch,from,to,dummy,lcnt) \
    __asm__ __volatile__ ( \
        ".p2align 4,,7\n" \
        "1:\n" \
        prefetch "320(%1)\n" \
        " movq (%1), %%mm0\n" \
        " movq 8(%1), %%mm1\n" \
        " movq 16(%1), %%mm2\n" \
        " movq 24(%1), %%mm3\n" \
        " movntq %%mm0, (%0)\n" \
        " movntq %%mm1, 8(%0)\n" \
        " movntq %%mm2, 16(%0)\n" \
        " movntq %%mm3, 24(%0)\n" \
        prefetch "352(%1)\n" \
        " movq 32(%1), %%mm0\n" \
        " movq 40(%1), %%mm1\n" \
        " movq 48(%1), %%mm2\n" \
        " movq 56(%1), %%mm3\n" \
        " leal 64(%1),%1\n" \
        " movntq %%mm0, 32(%0)\n" \
        " movntq %%mm1, 40(%0)\n" \
        " movntq %%mm2, 48(%0)\n" \
        " movntq %%mm3, 56(%0)\n" \
        " decl %2\n" \
        " leal 64(%0),%0\n" \
        " jne 1b\n" \
        : "=&D"(to), "=&S"(from), "=&r"(dummy) \
        : "0" (to), "1" (from), "2" (lcnt) \
        : "memory", "cc");


#define PREFETCH_FUNC(prefix,itype,ptype,begin,fence,small) \
\
    static void prefix##_memcpy(UChar *to, \
                                const UChar *from, \
                                int size) \
    { \
        int lcnt = size >> 6; \
        int rest = size & 63; \
        register int dummy; \
\
        PREFETCH1(ptype##_PREFETCH,from); \
\
        begin; \
        if(lcnt) { \
            itype##_CPY(ptype##_PREFETCH,from,to,dummy,lcnt); \
        } \
        if(rest) { \
            PREFETCH2(ptype##_PREFETCH,from); \
            small(to, from, rest); \
            PREFETCH3(ptype##_PREFETCH,from); \
        } \
        fence; \
    }

#define NOPREFETCH_FUNC(prefix,itype,begin,fence,small) \
\
    static void prefix##_memcpy(UChar *to, \
                                const UChar *from, \
                                int size) \
    { \
        int lcnt = size >> 6; \
        int rest = size & 63; \
        register int dummy; \
\
        begin; \
        if(lcnt) { \
            itype##_CPY("#",from,to,dummy,lcnt); \
        } \
        if(rest) { \
            small(to, from, rest); \
        } \
        fence; \
    }

/* Other archs */

/* ... */


/* Type for table for benchmark list */

typedef struct {
    vidCopyFunc  mFunc;
    char        *mName;
    unsigned int mycpuflag;
    int          grade;
    int          gradefrom;
    Bool         reqAlignment;
} SISMCFuncData;

/************************************************************************/
/*                 libc memcpy() wrapper - generic                      */
/************************************************************************/

static void SiS_libc_memcpy(UChar *dst, const UChar *src, int size)
{
    memcpy(dst, src, size);
}

/************************************************************************/
/* We only do all that stuff under gcc; no idea what other compilers    */
/* would do with our asm code.                                          */
/************************************************************************/

#ifndef __GNUC__

unsigned int SiSGetCPUFlags(ScrnInfoPtr pScrn)
{
    return 0;
}

vidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
    *UMemCpy = SiS_libc_memcpy;
    return SiS_libc_memcpy;
}

vidCopyFunc SiSVidCopyGetDefault(void)
{
    return SiS_libc_memcpy;
}

#else /* ! Everything below is gcc specific ! */

/************************************************************************/
/*                 Definitions for archs and OSes                       */
/************************************************************************/

#undef SiS_checkosforsse
#undef SiS_canBenchmark
#undef SiS_haveProc
#undef SiS_haveBuiltInMC

#if defined(__i386__) /* ***************************************** i386 */

#define SiS_checkosforsse   /* Does this cpu support sse and do we need to check os? */
#define SiS_canBenchmark    /* Can we perform a benchmark? */
#ifdef SIS_LINUX
#define SiS_haveProc        /* Do we have /proc/cpuinfo or similar? */
#endif
#define SiS_haveBuiltInMC   /* Is there a built-in memcpy for this arch? */

/* Built-in memcpy for i386 */
static __inline void * builtin_memcpy(void * to, const void * from, size_t n)
{
    int d1,d2,d3;

    __asm__ __volatile__(
        " cld\n"
        " shrl $1, %%ecx\n"
        " jnc 1f\n"
        " movsb\n"
        "1: shrl $1, %%ecx\n"
        " jnc 2f\n"
        " movsw\n"
        "2: rep ; movsl\n"
        : "=&c" (d1), "=&D" (d2), "=&S" (d3)
        : "0" (n), "1" ((long) to), "2" ((long) from)
        : "memory", "cc");

    return(to);
}

/* Alternative for 586: Unroll loop, copy 32 bytes at a time */
static void SiS_builtin_memcp2(UChar *to, const UChar *from, int n)
{
    int d1,d2,d3;

    __asm__ __volatile__(
        " movl %%edi, %%eax\n"
        " cmpl $32, %%ecx\n"
        " cld\n"
        " jbe 3f\n"
        " negl %%eax\n"              /* Align dest */
        " andl $3, %%eax\n"
        " subl %%eax, %%ecx\n"
        " xchgl %%eax, %%ecx\n"
        " rep ; movsb\n"
        " movl %%eax, %%ecx\n"
        " subl $32, %%ecx\n"
        " js 2f\n"
        " movl (%%edi), %%eax\n"
        "1: movl 28(%%edi), %%edx\n" /* Trick: Read-ahead */
        " subl $32, %%ecx\n"
        " movl (%%esi), %%eax\n"
        " movl 4(%%esi), %%edx\n"
        " movl %%eax, (%%edi)\n"
        " movl %%edx, 4(%%edi)\n"
        " movl 8(%%esi), %%eax\n"
        " movl 12(%%esi), %%edx\n"
        " movl %%eax, 8(%%edi)\n"
        " movl %%edx, 12(%%edi)\n"
        " movl 16(%%esi), %%eax\n"
        " movl 20(%%esi), %%edx\n"
        " movl %%eax, 16(%%edi)\n"
        " movl %%edx, 20(%%edi)\n"
        " movl 24(%%esi), %%eax\n"
        " movl 28(%%esi), %%edx\n"
        " movl %%eax, 24(%%edi)\n"
        " movl %%edx, 28(%%edi)\n"
        " leal 32(%%esi), %%esi\n"
        " leal 32(%%edi), %%edi\n"
        " jns 1b\n"
        "2: addl $32, %%ecx\n"
        "3: rep ; movsb"
        : "=&c" (d1), "=&D" (d2), "=&S" (d3)
        : "0" (n), "1" ((long) to), "2" ((long) from)
        : "eax", "edx", "memory", "cc");

}

static unsigned int taketime(void)   /* get current time (for benchmarking) */
{
    unsigned int eax;

    __asm__ volatile (
        " pushl %%ebx\n"
        " cpuid\n"
        " rdtsc\n"
        " popl %%ebx\n"
        : "=a" (eax)
        : "0" (0)
        : "ecx", "edx", "cc");

    return(eax);
}

#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /***************** AMD64 */

#define SiS_checkosforsse   /* Does this cpu support sse and do we need to check os? */
#define SiS_canBenchmark    /* Can we perform a benchmark? */
#ifdef SIS_LINUX
#define SiS_haveProc        /* Do we have /proc/cpuinfo or similar? */
#endif
#define SiS_haveBuiltInMC   /* Is there a built-in memcpy for this arch? */

/* Built-in memcpy for AMD64 */
static __inline void * builtin_memcpy(void * to, const void * from, int n)
{
    long d1, d2, d3;

    __asm__ __volatile__ (
        " cld\n"
        " rep ; movsq\n"
        " movq %4, %%rcx\n"
        " rep ; movsb"
        : "=%c" (d1), "=&D" (d2), "=&S" (d3)
        : "0" ((ULong)(n >> 3)), "q" ((ULong)(n & 7)),
          "1" ((long) to), "2" ((long) from)
        : "memory");

    return(to);
}

/* Alternative: Unroll loop, copy 32 bytes at a time */
static void SiS_builtin_memcp2(UChar *to, const UChar *from, int n)
{
    long d1,d2,d3;

    __asm__ __volatile__(
        " movq %%rdi, %%rax\n"
        " cmpq $32, %%rcx\n"
        " cld\n"                     /* Pipeline; no other flags but DF */
        " jbe 1f\n"
        " negq %%rax\n"              /* Align dest */
        " andq $7, %%rax\n"
        " subq %%rax, %%rcx\n"
        " xchgq %%rax, %%rcx\n"
        " rep ; movsb\n"
        " movq %%rax, %%rcx\n"
        " subq $32, %%rcx\n"
        " js 2f\n"
        ".p2align 4\n"
        "3: subq $32, %%rcx\n"
        " movq (%%rsi), %%rax\n"
        " movq 8(%%rsi), %%rdx\n"
        " movq 16(%%rsi), %%r8\n"
        " movq 24(%%rsi), %%r9\n"
        " movq %%rax, (%%rdi)\n"
        " movq %%rdx, 8(%%rdi)\n"
        " movq %%r8, 16(%%rdi)\n"
        " movq %%r9, 24(%%rdi)\n"
        " leaq 32(%%rsi), %%rsi\n"
        " leaq 32(%%rdi), %%rdi\n"
        " jns 3b\n"
        "2: addq $32, %%rcx\n"
        "1: rep ; movsb"
        : "=&c" (d1), "=&D" (d2), "=&S" (d3)
        : "0" ((ULong) n), "1" ((long) to), "2" ((long) from)
        : "rax", "rdx", "r8", "r9", "memory", "cc");

}

static unsigned int taketime(void)   /* get current time (for benchmarking) */
{
    unsigned int eax;

    __asm__ volatile (
        " pushq %%rbx\n"
        " cpuid\n"
        " rdtsc\n"
        " popq %%rbx\n"
        : "=a" (eax)
        : "0" (0)
        : "rcx", "rdx", "cc");

    return(eax);
}

#else /* **************************************** Other archs */

/* 1. Can we do a benchmark? */
/* #define SiS_canBenchmark */

/* 2. Do we have /proc filesystem or similar for CPU information? */
/* #define SiS_haveProc */

/* 3. Optional: built-in memcpy() */
/* #define SiS_haveBuiltInMC */
/* static __inline void * builtin_memcpy(void * to, const void * from, int n)
   {
   }
*/

/* 4. Function for getting current time (for benchmarking) */
/* static unsigned int taketime(void)
   {
   }
*/

#endif

/************************************************************************/
/*                 Generic built-in memcpy wrapper                      */
/************************************************************************/

#ifdef SiS_haveBuiltInMC
static void SiS_builtin_memcpy(UChar *dst, const UChar *src, int size)
{
    builtin_memcpy(dst, src, size);
}
#endif

/************************************************************************/
/* Generic routines if Benchmark can be performed (all archs, all OSes) */
/************************************************************************/

#ifdef SiS_canBenchmark

/* Get time (unsigned int) */
static unsigned int time_function(vidCopyFunc mf, UChar *buf1, UChar *buf2, int size)
{
    unsigned int t1, t2;

    t1 = taketime();

    (*mf)(buf1, buf2, size);

    t2 = taketime();

    return((t1 < t2) ? t2 - t1 : 0xFFFFFFFFU - (t1 - t2 - 1));
}
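
/* Illustrative sketch (not part of the driver; kept inside "#if 0" like the
 * debug switch above): how a single copy routine could be timed with
 * time_function(). It assumes two BUFFERSIZE-sized buffers are already set
 * up; the real benchmark below additionally refreshes the source buffer
 * before each run and keeps the best of four results to filter out
 * scheduling noise. time_function() itself already handles a wrap-around
 * of the 32-bit TSC value between the two taketime() calls. The function
 * name is made up for illustration only.
 */
#if 0
static unsigned int example_time_one_copy(vidCopyFunc mf, UChar *dst, UChar *src)
{
    unsigned int best = 0xFFFFFFFFU, t;
    int i;

    for(i = 0; i < 4; i++) {
        t = time_function(mf, dst, src, BUFFERSIZE);
        if(t < best) best = t;   /* keep the fastest run (in TSC ticks) */
    }

    return best;
}
#endif
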
/* Allocate an area of offscreen FB memory (buf1), a simulated video
 * player buffer (buf2) and a pool of uninitialized "video" data (buf3).
 */
static void *
SiS_AllocBuffers(ScrnInfoPtr pScrn, UChar **buf1, UChar **buf2, UChar **buf3)
{
    SISPtr pSiS = SISPTR(pScrn);
    unsigned int offset;
    void *handle = NULL;

    if(!(offset = SISAllocateFBMemory(pScrn, &handle, BUFFERSIZE + 31))) {
        return NULL;
    }
    (*buf1) = (UChar *)pSiS->FbBase + offset;
    (*buf1) = (UChar *)(((ULong)(*buf1) + 31) & ~31);

    if(!((*buf2) = (UChar *)malloc(BUFFERSIZE + 15))) {
        SISFreeFBMemory(pScrn, &handle);
        return NULL;
    }

    if(!((*buf3) = (UChar *)malloc(BUFFERSIZE + 15))) {
        free((*buf2));
        SISFreeFBMemory(pScrn, &handle);
        return NULL;
    }

    return handle;
}

/* Perform Benchmark */
static int SiS_BenchmarkMemcpy(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions,
                               unsigned int myCPUflags, UChar *buf1, UChar *buf2,
                               UChar *buf3, char *frqBuf, double cpuFreq,
                               vidCopyFunc *UMemCpy, int *best2, Bool from)
{
    SISMCFuncData *curData;
    int j = 0, bestSoFar = 0;
    unsigned int tmp1, tmp2, best = 0xFFFFFFFFU, sbest = 0xFFFFFFFFU;

    (*best2) = 0;

    /* Make it probable that buf1 and buf2 are not paged out by referencing them */
    SiS_libc_memcpy(buf1, buf2, BUFFERSIZE);

    xf86DrvMsg(pScrn->scrnIndex, X_INFO,
        "Benchmarking %s RAM to %s RAM memory transfer methods:\n",
        from ? "video" : "system",
        from ? "system" : "video");

#ifdef TWDEBUG
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Benchmark: CPUFlags %x\n", myCPUflags);
#endif

    j = 0;
    while(MCFunctions[j].mFunc) {

        curData = MCFunctions + j;

        if(myCPUflags & curData->mycpuflag) {

            /* Simulate setup of the video buffer and copy result to framebuffer */
            /* Do this 4 times to verify results */
            if(!from) {
                SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
                tmp1 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
                SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
                SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
                SiS_builtin_memcpy(buf2, buf3, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf1, buf2, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
            } else {
                SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
                tmp1 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
                SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
                SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
                SiS_builtin_memcpy(buf3, buf2, BUFFERSIZE);
                tmp2 = time_function(curData->mFunc, buf2, buf1, BUFFERSIZE);
                tmp1 = (tmp2 < tmp1) ? tmp2 : tmp1;
            }

            if((!frqBuf) || (tmp1 == 0)) {
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                        "\tChecked %s memcpy()... \t%u\n", curData->mName, tmp1);
            } else {
                xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                        "\tChecked %s memcpy()... \t%.1f MiB/s\n",
                        curData->mName,
                        cpuFreq * 1.e6 * (double)BUFFERSIZE / ((double)(tmp1) * (double)(0x100000)));
            }

            if(tmp1 < best) {
                best = tmp1;
                bestSoFar = j;
            }

            if(!curData->reqAlignment) {
                if(tmp1 < sbest) {
                    sbest = tmp1;
                    (*best2) = j;
                }
            }

        }

        j++;
    }

    return bestSoFar;
}

static vidCopyFunc SiS_GetBestByGrade(ScrnInfoPtr pScrn, SISMCFuncData *MCFunctions,
                                      unsigned int myCPUflags, vidCopyFunc *UMemCpy, Bool from)
{
    int j = 0, best = -1, secondbest = -1, bestSoFar = 10, best2SoFar = 10;
    int grade;

    *UMemCpy = SiS_libc_memcpy;

    while(MCFunctions[j].mFunc) {
        if(myCPUflags & MCFunctions[j].mycpuflag) {
            grade = from ? MCFunctions[j].gradefrom : MCFunctions[j].grade;
            if(grade < bestSoFar) {
                best = j;
                bestSoFar = grade;
            }
            if(grade < best2SoFar) {
                if(!MCFunctions[j].reqAlignment) {
                    secondbest = j;
                    best2SoFar = grade;
                }
            }
        }
        j++;
    }
    if(best >= 0) {
        xf86DrvMsg(pScrn->scrnIndex, X_INFO,
                "Chose %s method for aligned data transfers %s video RAM\n",
                MCFunctions[best].mName,
                from ? "from" : "to");
        if(secondbest >= 0) {
            xf86DrvMsg(pScrn->scrnIndex, X_INFO,
                "Chose %s method for unaligned data transfers %s video RAM\n",
                MCFunctions[secondbest].mName,
                from ? "from" : "to");
            *UMemCpy = MCFunctions[secondbest].mFunc;
        }
        return MCFunctions[best].mFunc;
    }

    return SiS_libc_memcpy;
}
#endif /* canBenchmark */

/**********************************************************************/
/*     Generic routines if /proc filesystem is available (Linux)      */
/**********************************************************************/

#ifdef SiS_haveProc
/* Linux: Read file (/proc/cpuinfo) into buffer */
static int SiS_ReadProc(char *buf, char *filename)
{
    FILE *cpuInfoFile;
    int count;

    if((cpuInfoFile = fopen(filename, "r")) == NULL) {
        return 0;
    }

    count = fread(buf, 1, CPUBUFFERSIZE, cpuInfoFile);
    if(ferror(cpuInfoFile)) {
        fclose(cpuInfoFile);
        return 0;
    }

    fclose(cpuInfoFile);

    if(count >= CPUBUFFERSIZE - 2) {
        return 0;
    }

    buf[count] = 0;

    return count;
}

/* Linux: Extract CPU speed from /proc/cpuinfo */
static char *SiS_GetCPUFreq(ScrnInfoPtr pScrn, char *buf, double *cpuFreq)
{
    char *frqBuf, *endBuf;

    (*cpuFreq) = 0.0;

    if((frqBuf = strstr(buf,"cpu MHz\t\t:"))) {
        frqBuf += 11;
        (*cpuFreq) = strtod(frqBuf, &endBuf);
        if(endBuf == frqBuf) frqBuf = NULL;
        if((*cpuFreq) < 10.0) frqBuf = NULL;  /* sanity check */
        if(frqBuf) {
            xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU frequency %.2fMHz\n", (*cpuFreq));
        }
    }

    return frqBuf;
}
#endif /* haveProc */

/**********************************************************************/
/*                      Arch-specific routines                        */
/**********************************************************************/

#ifdef SiS_checkosforsse /* Common i386, AMD64 */

#ifdef SISCHECKOSSSE

#ifndef XFree86LOADER
#include <setjmp.h>
#endif

static jmp_buf sigill_return;

static void sigill_handler(void)
{
    longjmp(sigill_return, 1);
}
#endif

static Bool CheckOSforSSE(ScrnInfoPtr pScrn)
{
#ifdef SISCHECKOSSSE /* Check OS for SSE possible: */
    int signo = -1;

#ifdef SISDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "Checking OS SSE support\n");
#endif

    xf86InterceptSigIll(&sigill_handler);

    if(setjmp(sigill_return)) {
        signo = 4;
    } else {
        __asm__ __volatile__ (" xorps %xmm0, %xmm0\n");
        /* __asm__ __volatile__ (" .byte 0xff\n"); */ /* For test */
    }

    xf86InterceptSigIll(NULL);

#ifdef SISDGBMC
    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "OS SSE support signal %d\n", signo);
#endif

    if(signo != -1) {
        xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
                "OS does not support SSE instructions\n");
    }

    return (signo >= 0) ? FALSE : TRUE;

#else /* no check for SSE possible: */

    SISPtr pSiS = SISPTR(pScrn);

    xf86DrvMsg(pScrn->scrnIndex, pSiS->XvSSEMemcpy ? X_WARNING : X_INFO,
        "Checking OS for SSE support is not supported in this version of " SISMYSERVERNAME "\n");

    if(pSiS->XvSSEMemcpy) {
        xf86DrvMsg(pScrn->scrnIndex, X_WARNING,
                "If you get a signal 4 here, set the option \"UseSSE\" to \"off\".\n");
        return TRUE;
    } else {
        xf86DrvMsg(pScrn->scrnIndex, X_INFO,
                "If your OS supports SSE, set the option \"UseSSE\" to \"on\".\n");
        return FALSE;
    }
#endif
}

#endif /* SiS_checkosforsse */

#ifdef __i386__ /* i386 specific *************************************/

PREFETCH_FUNC(SiS_sse,SSE,SSE,,FENCE,small_memcpy_i386)
PREFETCH_FUNC(SiS_mmxext,MMXEXT,SSE,EMMS,FENCEMMS,small_memcpy_i386)
PREFETCH_FUNC(SiS_now,MMX,NOW,FEMMS,FEMMS,small_memcpy_i386)
NOPREFETCH_FUNC(SiS_mmx,MMX,EMMS,EMMS,small_memcpy_i386)

static SISMCFuncData MCFunctions_i386[] = {
    {SiS_libc_memcpy,   "libc",       SIS_CPUFL_LIBC,   4,  4, FALSE},
    {SiS_builtin_memcpy,"built-in-1", SIS_CPUFL_BI,     5,  5, FALSE},
    {SiS_builtin_memcp2,"built-in-2", SIS_CPUFL_BI2,    6,  6, FALSE},
    {SiS_mmx_memcpy,    "MMX",        SIS_CPUFL_MMX,    3,  3, FALSE},
    {SiS_sse_memcpy,    "SSE",        SIS_CPUFL_SSE,    1,  0, TRUE},
    {SiS_now_memcpy,    "3DNow!",     SIS_CPUFL_3DNOW,  2,  2, FALSE},
    {SiS_mmxext_memcpy, "MMX2",       SIS_CPUFL_MMX2,   0,  1, FALSE},
    {NULL,              "",           0,               10, 10, FALSE}
};

#define Def_FL (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2) /* Default methods */

#define cpuid(op, eax, ebx, ecx, edx) \
    __asm__ __volatile__ ( \
        " pushl %%ebx\n" \
        " cpuid\n" \
        " movl %%ebx, %1\n" \
        " popl %%ebx\n" \
        : "=a" (eax), "=r" (ebx), \
          "=c" (ecx), "=d" (edx) \
        : "a" (op) \
        : "cc")
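
/* Illustrative sketch (not part of the driver): how the cpuid macro above
 * maps the 12-byte vendor ID string onto its ebx/edx/ecx outputs. This is
 * the same leaf-0 data that SiS_GetCpuFeatures() below compares against
 * the "AuthenticAMD" magic values; the function name here is made up for
 * illustration only.
 */
#if 0
static void example_show_vendor(ScrnInfoPtr pScrn)
{
    int eax, ebx, ecx, edx;
    char vendor[13];

    cpuid(0x00000000, eax, ebx, ecx, edx);
    memcpy(vendor + 0, &ebx, 4);   /* e.g. "Auth" / "Genu" */
    memcpy(vendor + 4, &edx, 4);   /* e.g. "enti" / "ineI" */
    memcpy(vendor + 8, &ecx, 4);   /* e.g. "cAMD" / "ntel" */
    vendor[12] = 0;

    xf86DrvMsg(pScrn->scrnIndex, X_INFO, "CPU vendor: %s\n", vendor);
}
#endif
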
static Bool cpuIDSupported(ScrnInfoPtr pScrn)
{
    int eax, ebx, ecx, edx;

    /* Check for cpuid instruction */
    __asm__ __volatile__ (
        " pushf\n"
        " popl %0\n"
        " movl %0, %1\n"
        " xorl $0x200000, %0\n"
        " push %0\n"
        " popf\n"
        " pushf\n"
        " popl %0\n"
        : "=a" (eax), "=c" (ecx)
        :
        : "cc");

    if(eax == ecx) {
        xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support CPUID instruction\n");
        return FALSE;
    }

    /* Check for cpuid level */
    cpuid(0x00000000, eax, ebx, ecx, edx);
    if(!eax) {
        return FALSE;
    }

    /* Check for RDTSC */
    cpuid(0x00000001, eax, ebx, ecx, edx);

    if(!(edx & 0x10)) {
        xf86DrvMsg(pScrn->scrnIndex, X_PROBED, "CPU does not support RDTSC instruction\n");
        return FALSE;
    }

    return TRUE;
}

static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    unsigned int flags = 0, eax, ebx, ecx, edx;
    Bool IsAMD;

    /* Check if cpuid and rdtsc instructions are supported */
    if(!cpuIDSupported(pScrn)) {
        return 0;
    }

    cpuid(0x00000000, eax, ebx, ecx, edx);

    IsAMD = (ebx == 0x68747541) && (edx == 0x69746e65) && (ecx == 0x444d4163);

    cpuid(0x00000001, eax, ebx, ecx, edx);
    /* MMX */
    if(edx & 0x00800000) flags |= SIS_CPUFL_MMX;
    /* SSE, MMXEXT */
    if(edx & 0x02000000) flags |= (SIS_CPUFL_SSE | SIS_CPUFL_MMX2);
    /* SSE2 - don't need this one directly, set SSE instead */
    if(edx & 0x04000000) flags |= (SIS_CPUFL_SSE | SIS_CPUFL_SSE2);

    cpuid(0x80000000, eax, ebx, ecx, edx);
    if(eax >= 0x80000001) {
        cpuid(0x80000001, eax, ebx, ecx, edx);
        /* 3DNow! */
        if(edx & 0x80000000) flags |= SIS_CPUFL_3DNOW;
        /* AMD MMXEXT */
        if(IsAMD && (edx & 0x00400000)) flags |= SIS_CPUFL_MMX2;
    }

    return flags;
}

#elif defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__) /* AMD64 specific ***** */

PREFETCH_FUNC(SiS_sse,SSE64,SSE,,FENCE,small_memcpy_amd64)

static SISMCFuncData MCFunctions_AMD64[] = {
    {SiS_libc_memcpy,   "libc",       SIS_CPUFL_LIBC,   2,  2, FALSE},
    {SiS_builtin_memcpy,"built-in-1", SIS_CPUFL_BI,     1,  1, FALSE},
    {SiS_builtin_memcp2,"built-in-2", SIS_CPUFL_BI2,    3,  3, FALSE},
    {SiS_sse_memcpy,    "SSE",        SIS_CPUFL_SSE,    0,  0, TRUE},
    {NULL,              "",           0,               10, 10, FALSE}
};

#define Def_FL (SIS_CPUFL_LIBC | SIS_CPUFL_BI | SIS_CPUFL_BI2)

static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    return((unsigned int)(SIS_CPUFL_SSE|SIS_CPUFL_SSE2));
}

#else /* Specific for other archs ******************************** */

/* Fill in here */

#define Def_FL (SIS_CPUFL_LIBC)

static unsigned int SiS_GetCpuFeatures(ScrnInfoPtr pScrn)
{
    return((unsigned int)(0));
}

#endif

/**********************************************************************/
/*     Benchmark the video copy routines and choose the fastest       */
/**********************************************************************/

#ifdef SiS_canBenchmark
static vidCopyFunc
SiSVidCopyInitGen(ScreenPtr pScreen, SISMCFuncData *MCFunctions, vidCopyFunc *UMemCpy, Bool from)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
    SISPtr pSiS = SISPTR(pScrn);
    void *fbhandle = NULL;
    char *frqBuf = NULL;
    UChar *buf1, *buf2, *buf3;
    double cpuFreq = 0.0;
    unsigned int myCPUflags = pSiS->CPUFlags | Def_FL;
    int best, secondbest;
#ifdef SiS_haveProc
    char buf[CPUBUFFERSIZE];
#endif

    *UMemCpy = SiS_libc_memcpy;

    /* Bail out if user disabled benchmarking */
    if(!pSiS->BenchMemCpy) {
        return SiS_libc_memcpy;
    }

#ifdef SiS_haveProc
    /* Read /proc/cpuinfo into buf */
    if(SiS_ReadProc(buf, "/proc/cpuinfo")) {

        /* Extract CPU frequency */
        frqBuf = SiS_GetCPUFreq(pScrn, buf, &cpuFreq);

    }
#endif

    /* Allocate buffers */
    if(!(fbhandle = SiS_AllocBuffers(pScrn, &buf1, &buf2, &buf3))) {
        xf86DrvMsg(pScrn->scrnIndex, X_INFO,
                "Failed to allocate video RAM for video data transfer benchmark\n");
        return SiS_GetBestByGrade(pScrn, MCFunctions, myCPUflags, UMemCpy, from);
    }

    /* Perform Benchmark */
    best = SiS_BenchmarkMemcpy(pScrn, MCFunctions, myCPUflags, buf1,
                (UChar *)(((unsigned long)buf2 + 15) & ~15),
                (UChar *)(((unsigned long)buf3 + 15) & ~15),
                frqBuf, cpuFreq, UMemCpy, &secondbest, from);

    /* Free buffers */
    SISFreeFBMemory(pScrn, &fbhandle);
    free(buf2);
    free(buf3);

    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
        "Using %s method for aligned data transfers %s video RAM\n",
        MCFunctions[best].mName,
        from ? "from" : "to");

    xf86DrvMsg(pScrn->scrnIndex, X_PROBED,
        "Using %s method for unaligned data transfers %s video RAM\n",
        MCFunctions[secondbest].mName,
        from ? "from" : "to");

    return MCFunctions[best].mFunc;
}
#endif /* canBenchmark */

/**********************************************************************/
/*                  main(): Get CPU capabilities                      */
/*                     (called externally)                            */
/**********************************************************************/

unsigned int
SiSGetCPUFlags(ScrnInfoPtr pScrn)
{
    unsigned int myCPUflags = SiS_GetCpuFeatures(pScrn);

#ifdef SiS_checkosforsse
    if(myCPUflags & (SIS_CPUFL_SSE | SIS_CPUFL_SSE2)) {

        /* Check if OS supports usage of SSE instructions */
        if(!(CheckOSforSSE(pScrn))) {
            myCPUflags &= ~(SIS_CPUFL_SSE | SIS_CPUFL_SSE2);
        }

    }
#endif

    return myCPUflags;
}

/**********************************************************************/
/*                    main(): SiSVidCopyInit()                        */
/*                      (called externally)                           */
/*           (SiSGetCPUFlags must be called before this one)          */
/**********************************************************************/

vidCopyFunc SiSVidCopyInit(ScreenPtr pScreen, vidCopyFunc *UMemCpy, Bool from)
{
#if defined(__i386__) && defined(SiS_canBenchmark)
    return(SiSVidCopyInitGen(pScreen, MCFunctions_i386, UMemCpy, from));
#elif (defined(__AMD64__) || defined(__amd64__) || defined(__x86_64__)) && defined(SiS_canBenchmark)
    return(SiSVidCopyInitGen(pScreen, MCFunctions_AMD64, UMemCpy, from));
#else /* Other cases: Use libc memcpy() */
    *UMemCpy = SiS_libc_memcpy;
    return SiS_libc_memcpy;
#endif
}

vidCopyFunc SiSVidCopyGetDefault(void)
{
    return SiS_libc_memcpy;
}

#endif /* GNU C */
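
/* Illustrative sketch (not part of the driver): the intended calling order
 * of the two external entry points. The pSiS->CPUFlags field is the one
 * read by SiSVidCopyInitGen() above; the SiSFastVidCopy/SiSFastMemCopy
 * member names and the function name are assumptions made for this sketch
 * only (the real fields live in the driver's private record in sis.h).
 */
#if 0
static void example_setup_vidcopy(ScreenPtr pScreen)
{
    ScrnInfoPtr pScrn = xf86ScreenToScrn(pScreen);
    SISPtr pSiS = SISPTR(pScrn);

    /* 1) Detect CPU/OS capabilities; must run before SiSVidCopyInit() */
    pSiS->CPUFlags = SiSGetCPUFlags(pScrn);

    /* 2) Benchmark and select copy routines for system RAM -> video RAM;
     *    the routine for unaligned data is returned through the pointer.
     */
    pSiS->SiSFastVidCopy = SiSVidCopyInit(pScreen, &pSiS->SiSFastMemCopy, FALSE);
}
#endif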