101e04c3fSmrg/************************************************************************** 201e04c3fSmrg * 301e04c3fSmrg * Copyright 2008 VMware, Inc. 401e04c3fSmrg * All Rights Reserved. 501e04c3fSmrg * 601e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 701e04c3fSmrg * copy of this software and associated documentation files (the 801e04c3fSmrg * "Software"), to deal in the Software without restriction, including 901e04c3fSmrg * without limitation the rights to use, copy, modify, merge, publish, 1001e04c3fSmrg * distribute, sub license, and/or sell copies of the Software, and to 1101e04c3fSmrg * permit persons to whom the Software is furnished to do so, subject to 1201e04c3fSmrg * the following conditions: 1301e04c3fSmrg * 1401e04c3fSmrg * The above copyright notice and this permission notice (including the 1501e04c3fSmrg * next paragraph) shall be included in all copies or substantial portions 1601e04c3fSmrg * of the Software. 1701e04c3fSmrg * 1801e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 1901e04c3fSmrg * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2001e04c3fSmrg * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 2101e04c3fSmrg * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 2201e04c3fSmrg * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 2301e04c3fSmrg * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 2401e04c3fSmrg * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 2501e04c3fSmrg * 2601e04c3fSmrg **************************************************************************/ 2701e04c3fSmrg 2801e04c3fSmrg 2901e04c3fSmrg 3001e04c3fSmrg#include "pipe/p_config.h" 3101e04c3fSmrg#include "util/u_math.h" 3201e04c3fSmrg#include "util/u_cpu_detect.h" 3301e04c3fSmrg 3401e04c3fSmrg#if defined(PIPE_ARCH_SSE) 3501e04c3fSmrg#include <xmmintrin.h> 3601e04c3fSmrg/* This is defined in pmmintrin.h, but it can only be included when -msse3 is 3701e04c3fSmrg * used, so just define it here to avoid further. */ 387e995a2eSmrg#ifndef _MM_DENORMALS_ZERO_MASK 3901e04c3fSmrg#define _MM_DENORMALS_ZERO_MASK 0x0040 4001e04c3fSmrg#endif 417e995a2eSmrg#endif 4201e04c3fSmrg 4301e04c3fSmrg 4401e04c3fSmrg/** log2(x), for x in [1.0, 2.0) */ 4501e04c3fSmrgfloat log2_table[LOG2_TABLE_SIZE]; 4601e04c3fSmrg 4701e04c3fSmrg 4801e04c3fSmrgstatic void 4901e04c3fSmrginit_log2_table(void) 5001e04c3fSmrg{ 5101e04c3fSmrg unsigned i; 5201e04c3fSmrg for (i = 0; i < LOG2_TABLE_SIZE; i++) 5301e04c3fSmrg log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE)); 5401e04c3fSmrg} 5501e04c3fSmrg 5601e04c3fSmrg 5701e04c3fSmrg/** 5801e04c3fSmrg * One time init for math utilities. 5901e04c3fSmrg */ 6001e04c3fSmrgvoid 6101e04c3fSmrgutil_init_math(void) 6201e04c3fSmrg{ 631463c08dSmrg static bool initialized = false; 6401e04c3fSmrg if (!initialized) { 6501e04c3fSmrg init_log2_table(); 661463c08dSmrg initialized = true; 6701e04c3fSmrg } 6801e04c3fSmrg} 6901e04c3fSmrg 7001e04c3fSmrg/** 7101e04c3fSmrg * Fetches the contents of the fpstate (mxcsr on x86) register. 7201e04c3fSmrg * 7301e04c3fSmrg * On platforms without support for it just returns 0. 7401e04c3fSmrg */ 7501e04c3fSmrgunsigned 7601e04c3fSmrgutil_fpstate_get(void) 7701e04c3fSmrg{ 7801e04c3fSmrg unsigned mxcsr = 0; 7901e04c3fSmrg 8001e04c3fSmrg#if defined(PIPE_ARCH_SSE) 811463c08dSmrg if (util_get_cpu_caps()->has_sse) { 8201e04c3fSmrg mxcsr = _mm_getcsr(); 8301e04c3fSmrg } 8401e04c3fSmrg#endif 8501e04c3fSmrg 8601e04c3fSmrg return mxcsr; 8701e04c3fSmrg} 8801e04c3fSmrg 8901e04c3fSmrg/** 9001e04c3fSmrg * Make sure that the fp treats the denormalized floating 9101e04c3fSmrg * point numbers as zero. 9201e04c3fSmrg * 9301e04c3fSmrg * This is the behavior required by D3D10. OpenGL doesn't care. 9401e04c3fSmrg */ 9501e04c3fSmrgunsigned 9601e04c3fSmrgutil_fpstate_set_denorms_to_zero(unsigned current_mxcsr) 9701e04c3fSmrg{ 9801e04c3fSmrg#if defined(PIPE_ARCH_SSE) 991463c08dSmrg if (util_get_cpu_caps()->has_sse) { 10001e04c3fSmrg /* Enable flush to zero mode */ 10101e04c3fSmrg current_mxcsr |= _MM_FLUSH_ZERO_MASK; 1021463c08dSmrg if (util_get_cpu_caps()->has_daz) { 10301e04c3fSmrg /* Enable denormals are zero mode */ 10401e04c3fSmrg current_mxcsr |= _MM_DENORMALS_ZERO_MASK; 10501e04c3fSmrg } 10601e04c3fSmrg util_fpstate_set(current_mxcsr); 10701e04c3fSmrg } 10801e04c3fSmrg#endif 10901e04c3fSmrg return current_mxcsr; 11001e04c3fSmrg} 11101e04c3fSmrg 11201e04c3fSmrg/** 11301e04c3fSmrg * Set the state of the fpstate (mxcsr on x86) register. 11401e04c3fSmrg * 11501e04c3fSmrg * On platforms without support for it's a noop. 11601e04c3fSmrg */ 11701e04c3fSmrgvoid 11801e04c3fSmrgutil_fpstate_set(unsigned mxcsr) 11901e04c3fSmrg{ 12001e04c3fSmrg#if defined(PIPE_ARCH_SSE) 1211463c08dSmrg if (util_get_cpu_caps()->has_sse) { 12201e04c3fSmrg _mm_setcsr(mxcsr); 12301e04c3fSmrg } 12401e04c3fSmrg#endif 12501e04c3fSmrg} 1261463c08dSmrg 1271463c08dSmrg/** 1281463c08dSmrg * Compute inverse of 4x4 matrix. 1291463c08dSmrg * 1301463c08dSmrg * \return false if the source matrix is singular. 1311463c08dSmrg * 1321463c08dSmrg * \author 1331463c08dSmrg * Code contributed by Jacques Leroy jle@star.be 1341463c08dSmrg * 1351463c08dSmrg * Calculates the inverse matrix by performing the gaussian matrix reduction 1361463c08dSmrg * with partial pivoting followed by back/substitution with the loops manually 1371463c08dSmrg * unrolled. 1381463c08dSmrg */ 1391463c08dSmrgbool 1401463c08dSmrgutil_invert_mat4x4(float *out, const float *m) 1411463c08dSmrg{ 1421463c08dSmrg float wtmp[4][8]; 1431463c08dSmrg float m0, m1, m2, m3, s; 1441463c08dSmrg float *r0, *r1, *r2, *r3; 1451463c08dSmrg 1461463c08dSmrg#define MAT(m, r, c) (m)[(c)*4 + (r)] 1471463c08dSmrg#define SWAP_ROWS(a, b) \ 1481463c08dSmrg { \ 1491463c08dSmrg float *_tmp = a; \ 1501463c08dSmrg (a) = (b); \ 1511463c08dSmrg (b) = _tmp; \ 1521463c08dSmrg } 1531463c08dSmrg 1541463c08dSmrg r0 = wtmp[0], r1 = wtmp[1], r2 = wtmp[2], r3 = wtmp[3]; 1551463c08dSmrg 1561463c08dSmrg r0[0] = MAT(m, 0, 0), r0[1] = MAT(m, 0, 1), r0[2] = MAT(m, 0, 2), r0[3] = MAT(m, 0, 3), 1571463c08dSmrg r0[4] = 1.0, r0[5] = r0[6] = r0[7] = 0.0, 1581463c08dSmrg 1591463c08dSmrg r1[0] = MAT(m, 1, 0), r1[1] = MAT(m, 1, 1), r1[2] = MAT(m, 1, 2), r1[3] = MAT(m, 1, 3), 1601463c08dSmrg r1[5] = 1.0, r1[4] = r1[6] = r1[7] = 0.0, 1611463c08dSmrg 1621463c08dSmrg r2[0] = MAT(m, 2, 0), r2[1] = MAT(m, 2, 1), r2[2] = MAT(m, 2, 2), r2[3] = MAT(m, 2, 3), 1631463c08dSmrg r2[6] = 1.0, r2[4] = r2[5] = r2[7] = 0.0, 1641463c08dSmrg 1651463c08dSmrg r3[0] = MAT(m, 3, 0), r3[1] = MAT(m, 3, 1), r3[2] = MAT(m, 3, 2), r3[3] = MAT(m, 3, 3), 1661463c08dSmrg r3[7] = 1.0, r3[4] = r3[5] = r3[6] = 0.0; 1671463c08dSmrg 1681463c08dSmrg /* choose pivot - or die */ 1691463c08dSmrg if (fabsf(r3[0]) > fabsf(r2[0])) 1701463c08dSmrg SWAP_ROWS(r3, r2); 1711463c08dSmrg if (fabsf(r2[0]) > fabsf(r1[0])) 1721463c08dSmrg SWAP_ROWS(r2, r1); 1731463c08dSmrg if (fabsf(r1[0]) > fabsf(r0[0])) 1741463c08dSmrg SWAP_ROWS(r1, r0); 1751463c08dSmrg if (0.0F == r0[0]) 1761463c08dSmrg return false; 1771463c08dSmrg 1781463c08dSmrg /* eliminate first variable */ 1791463c08dSmrg m1 = r1[0] / r0[0]; 1801463c08dSmrg m2 = r2[0] / r0[0]; 1811463c08dSmrg m3 = r3[0] / r0[0]; 1821463c08dSmrg s = r0[1]; 1831463c08dSmrg r1[1] -= m1 * s; 1841463c08dSmrg r2[1] -= m2 * s; 1851463c08dSmrg r3[1] -= m3 * s; 1861463c08dSmrg s = r0[2]; 1871463c08dSmrg r1[2] -= m1 * s; 1881463c08dSmrg r2[2] -= m2 * s; 1891463c08dSmrg r3[2] -= m3 * s; 1901463c08dSmrg s = r0[3]; 1911463c08dSmrg r1[3] -= m1 * s; 1921463c08dSmrg r2[3] -= m2 * s; 1931463c08dSmrg r3[3] -= m3 * s; 1941463c08dSmrg s = r0[4]; 1951463c08dSmrg if (s != 0.0F) { 1961463c08dSmrg r1[4] -= m1 * s; 1971463c08dSmrg r2[4] -= m2 * s; 1981463c08dSmrg r3[4] -= m3 * s; 1991463c08dSmrg } 2001463c08dSmrg s = r0[5]; 2011463c08dSmrg if (s != 0.0F) { 2021463c08dSmrg r1[5] -= m1 * s; 2031463c08dSmrg r2[5] -= m2 * s; 2041463c08dSmrg r3[5] -= m3 * s; 2051463c08dSmrg } 2061463c08dSmrg s = r0[6]; 2071463c08dSmrg if (s != 0.0F) { 2081463c08dSmrg r1[6] -= m1 * s; 2091463c08dSmrg r2[6] -= m2 * s; 2101463c08dSmrg r3[6] -= m3 * s; 2111463c08dSmrg } 2121463c08dSmrg s = r0[7]; 2131463c08dSmrg if (s != 0.0F) { 2141463c08dSmrg r1[7] -= m1 * s; 2151463c08dSmrg r2[7] -= m2 * s; 2161463c08dSmrg r3[7] -= m3 * s; 2171463c08dSmrg } 2181463c08dSmrg 2191463c08dSmrg /* choose pivot - or die */ 2201463c08dSmrg if (fabsf(r3[1]) > fabsf(r2[1])) 2211463c08dSmrg SWAP_ROWS(r3, r2); 2221463c08dSmrg if (fabsf(r2[1]) > fabsf(r1[1])) 2231463c08dSmrg SWAP_ROWS(r2, r1); 2241463c08dSmrg if (0.0F == r1[1]) 2251463c08dSmrg return false; 2261463c08dSmrg 2271463c08dSmrg /* eliminate second variable */ 2281463c08dSmrg m2 = r2[1] / r1[1]; 2291463c08dSmrg m3 = r3[1] / r1[1]; 2301463c08dSmrg r2[2] -= m2 * r1[2]; 2311463c08dSmrg r3[2] -= m3 * r1[2]; 2321463c08dSmrg r2[3] -= m2 * r1[3]; 2331463c08dSmrg r3[3] -= m3 * r1[3]; 2341463c08dSmrg s = r1[4]; 2351463c08dSmrg if (0.0F != s) { 2361463c08dSmrg r2[4] -= m2 * s; 2371463c08dSmrg r3[4] -= m3 * s; 2381463c08dSmrg } 2391463c08dSmrg s = r1[5]; 2401463c08dSmrg if (0.0F != s) { 2411463c08dSmrg r2[5] -= m2 * s; 2421463c08dSmrg r3[5] -= m3 * s; 2431463c08dSmrg } 2441463c08dSmrg s = r1[6]; 2451463c08dSmrg if (0.0F != s) { 2461463c08dSmrg r2[6] -= m2 * s; 2471463c08dSmrg r3[6] -= m3 * s; 2481463c08dSmrg } 2491463c08dSmrg s = r1[7]; 2501463c08dSmrg if (0.0F != s) { 2511463c08dSmrg r2[7] -= m2 * s; 2521463c08dSmrg r3[7] -= m3 * s; 2531463c08dSmrg } 2541463c08dSmrg 2551463c08dSmrg /* choose pivot - or die */ 2561463c08dSmrg if (fabsf(r3[2]) > fabsf(r2[2])) 2571463c08dSmrg SWAP_ROWS(r3, r2); 2581463c08dSmrg if (0.0F == r2[2]) 2591463c08dSmrg return false; 2601463c08dSmrg 2611463c08dSmrg /* eliminate third variable */ 2621463c08dSmrg m3 = r3[2] / r2[2]; 2631463c08dSmrg r3[3] -= m3 * r2[3], r3[4] -= m3 * r2[4], r3[5] -= m3 * r2[5], r3[6] -= m3 * r2[6], 2641463c08dSmrg r3[7] -= m3 * r2[7]; 2651463c08dSmrg 2661463c08dSmrg /* last check */ 2671463c08dSmrg if (0.0F == r3[3]) 2681463c08dSmrg return false; 2691463c08dSmrg 2701463c08dSmrg s = 1.0F / r3[3]; /* now back substitute row 3 */ 2711463c08dSmrg r3[4] *= s; 2721463c08dSmrg r3[5] *= s; 2731463c08dSmrg r3[6] *= s; 2741463c08dSmrg r3[7] *= s; 2751463c08dSmrg 2761463c08dSmrg m2 = r2[3]; /* now back substitute row 2 */ 2771463c08dSmrg s = 1.0F / r2[2]; 2781463c08dSmrg r2[4] = s * (r2[4] - r3[4] * m2), r2[5] = s * (r2[5] - r3[5] * m2), 2791463c08dSmrg r2[6] = s * (r2[6] - r3[6] * m2), r2[7] = s * (r2[7] - r3[7] * m2); 2801463c08dSmrg m1 = r1[3]; 2811463c08dSmrg r1[4] -= r3[4] * m1, r1[5] -= r3[5] * m1, r1[6] -= r3[6] * m1, r1[7] -= r3[7] * m1; 2821463c08dSmrg m0 = r0[3]; 2831463c08dSmrg r0[4] -= r3[4] * m0, r0[5] -= r3[5] * m0, r0[6] -= r3[6] * m0, r0[7] -= r3[7] * m0; 2841463c08dSmrg 2851463c08dSmrg m1 = r1[2]; /* now back substitute row 1 */ 2861463c08dSmrg s = 1.0F / r1[1]; 2871463c08dSmrg r1[4] = s * (r1[4] - r2[4] * m1), r1[5] = s * (r1[5] - r2[5] * m1), 2881463c08dSmrg r1[6] = s * (r1[6] - r2[6] * m1), r1[7] = s * (r1[7] - r2[7] * m1); 2891463c08dSmrg m0 = r0[2]; 2901463c08dSmrg r0[4] -= r2[4] * m0, r0[5] -= r2[5] * m0, r0[6] -= r2[6] * m0, r0[7] -= r2[7] * m0; 2911463c08dSmrg 2921463c08dSmrg m0 = r0[1]; /* now back substitute row 0 */ 2931463c08dSmrg s = 1.0F / r0[0]; 2941463c08dSmrg r0[4] = s * (r0[4] - r1[4] * m0), r0[5] = s * (r0[5] - r1[5] * m0), 2951463c08dSmrg r0[6] = s * (r0[6] - r1[6] * m0), r0[7] = s * (r0[7] - r1[7] * m0); 2961463c08dSmrg 2971463c08dSmrg MAT(out, 0, 0) = r0[4]; 2981463c08dSmrg MAT(out, 0, 1) = r0[5], MAT(out, 0, 2) = r0[6]; 2991463c08dSmrg MAT(out, 0, 3) = r0[7], MAT(out, 1, 0) = r1[4]; 3001463c08dSmrg MAT(out, 1, 1) = r1[5], MAT(out, 1, 2) = r1[6]; 3011463c08dSmrg MAT(out, 1, 3) = r1[7], MAT(out, 2, 0) = r2[4]; 3021463c08dSmrg MAT(out, 2, 1) = r2[5], MAT(out, 2, 2) = r2[6]; 3031463c08dSmrg MAT(out, 2, 3) = r2[7], MAT(out, 3, 0) = r3[4]; 3041463c08dSmrg MAT(out, 3, 1) = r3[5], MAT(out, 3, 2) = r3[6]; 3051463c08dSmrg MAT(out, 3, 3) = r3[7]; 3061463c08dSmrg 3071463c08dSmrg#undef MAT 3081463c08dSmrg#undef SWAP_ROWS 3091463c08dSmrg 3101463c08dSmrg return true; 3111463c08dSmrg} 312