1848b8605Smrg/* 2848b8605Smrg * Mesa 3-D graphics library 3848b8605Smrg * 4848b8605Smrg * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. 5848b8605Smrg * 6848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a 7848b8605Smrg * copy of this software and associated documentation files (the "Software"), 8848b8605Smrg * to deal in the Software without restriction, including without limitation 9848b8605Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10848b8605Smrg * and/or sell copies of the Software, and to permit persons to whom the 11848b8605Smrg * Software is furnished to do so, subject to the following conditions: 12848b8605Smrg * 13848b8605Smrg * The above copyright notice and this permission notice shall be included 14848b8605Smrg * in all copies or substantial portions of the Software. 15848b8605Smrg * 16848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 17848b8605Smrg * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19848b8605Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20848b8605Smrg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21848b8605Smrg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22848b8605Smrg * OTHER DEALINGS IN THE SOFTWARE. 23848b8605Smrg * 24848b8605Smrg * Authors: 25848b8605Smrg * Keith Whitwell <keithw@vmware.com> 26848b8605Smrg */ 27848b8605Smrg 28848b8605Smrg#include "main/glheader.h" 29848b8605Smrg#include "main/context.h" 30848b8605Smrg#include "main/imports.h" 31848b8605Smrg#include "main/mtypes.h" 32848b8605Smrg 33848b8605Smrg#include "t_context.h" 34848b8605Smrg#include "t_pipeline.h" 35848b8605Smrg#include "t_vp_build.h" 36848b8605Smrg#include "t_vertex.h" 37848b8605Smrg 38848b8605Smrgvoid _tnl_install_pipeline( struct gl_context *ctx, 39848b8605Smrg const struct tnl_pipeline_stage **stages ) 40848b8605Smrg{ 41848b8605Smrg TNLcontext *tnl = TNL_CONTEXT(ctx); 42848b8605Smrg GLuint i; 43848b8605Smrg 44848b8605Smrg tnl->pipeline.new_state = ~0; 45848b8605Smrg 46848b8605Smrg /* Create a writeable copy of each stage. 47848b8605Smrg */ 48848b8605Smrg for (i = 0 ; i < MAX_PIPELINE_STAGES && stages[i] ; i++) { 49848b8605Smrg struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; 50848b8605Smrg memcpy(s, stages[i], sizeof(*s)); 51848b8605Smrg if (s->create) 52848b8605Smrg s->create(ctx, s); 53848b8605Smrg } 54848b8605Smrg 55848b8605Smrg tnl->pipeline.nr_stages = i; 56848b8605Smrg} 57848b8605Smrg 58848b8605Smrgvoid _tnl_destroy_pipeline( struct gl_context *ctx ) 59848b8605Smrg{ 60848b8605Smrg TNLcontext *tnl = TNL_CONTEXT(ctx); 61848b8605Smrg GLuint i; 62848b8605Smrg 63848b8605Smrg for (i = 0 ; i < tnl->pipeline.nr_stages ; i++) { 64848b8605Smrg struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; 65848b8605Smrg if (s->destroy) 66848b8605Smrg s->destroy(s); 67848b8605Smrg } 68848b8605Smrg 69848b8605Smrg tnl->pipeline.nr_stages = 0; 70848b8605Smrg} 71848b8605Smrg 72848b8605Smrg 73848b8605Smrg 74848b8605Smrgstatic GLuint check_input_changes( struct gl_context *ctx ) 75848b8605Smrg{ 76848b8605Smrg TNLcontext *tnl = TNL_CONTEXT(ctx); 77848b8605Smrg GLuint i; 78848b8605Smrg 79848b8605Smrg for (i = 0; i <= _TNL_LAST_MAT; i++) { 80848b8605Smrg if (tnl->vb.AttribPtr[i]->size != tnl->pipeline.last_attrib_size[i] || 81848b8605Smrg tnl->vb.AttribPtr[i]->stride != tnl->pipeline.last_attrib_stride[i]) { 82848b8605Smrg tnl->pipeline.last_attrib_size[i] = tnl->vb.AttribPtr[i]->size; 83848b8605Smrg tnl->pipeline.last_attrib_stride[i] = tnl->vb.AttribPtr[i]->stride; 84848b8605Smrg tnl->pipeline.input_changes |= 1<<i; 85848b8605Smrg } 86848b8605Smrg } 87848b8605Smrg 88848b8605Smrg return tnl->pipeline.input_changes; 89848b8605Smrg} 90848b8605Smrg 91848b8605Smrg 92848b8605Smrgstatic GLuint check_output_changes( struct gl_context *ctx ) 93848b8605Smrg{ 94848b8605Smrg#if 0 95848b8605Smrg TNLcontext *tnl = TNL_CONTEXT(ctx); 96848b8605Smrg 97848b8605Smrg for (i = 0; i < VARYING_SLOT_MAX; i++) { 98848b8605Smrg if (tnl->vb.ResultPtr[i]->size != tnl->last_result_size[i] || 99848b8605Smrg tnl->vb.ResultPtr[i]->stride != tnl->last_result_stride[i]) { 100848b8605Smrg tnl->last_result_size[i] = tnl->vb.ResultPtr[i]->size; 101848b8605Smrg tnl->last_result_stride[i] = tnl->vb.ResultPtr[i]->stride; 102848b8605Smrg tnl->pipeline.output_changes |= 1<<i; 103848b8605Smrg } 104848b8605Smrg } 105848b8605Smrg 106848b8605Smrg if (tnl->pipeline.output_changes) 107848b8605Smrg tnl->Driver.NotifyOutputChanges( ctx, tnl->pipeline.output_changes ); 108848b8605Smrg 109848b8605Smrg return tnl->pipeline.output_changes; 110848b8605Smrg#else 111848b8605Smrg return ~0; 112848b8605Smrg#endif 113848b8605Smrg} 114848b8605Smrg 115b8e80941Smrg/** 116b8e80941Smrg * START/END_FAST_MATH macros: 117b8e80941Smrg * 118b8e80941Smrg * START_FAST_MATH: Set x86 FPU to faster, 32-bit precision mode (and save 119b8e80941Smrg * original mode to a temporary). 120b8e80941Smrg * END_FAST_MATH: Restore x86 FPU to original mode. 121b8e80941Smrg */ 122b8e80941Smrg#if defined(__GNUC__) && defined(__i386__) 123b8e80941Smrg/* 124b8e80941Smrg * Set the x86 FPU control word to guarentee only 32 bits of precision 125b8e80941Smrg * are stored in registers. Allowing the FPU to store more introduces 126b8e80941Smrg * differences between situations where numbers are pulled out of memory 127b8e80941Smrg * vs. situations where the compiler is able to optimize register usage. 128b8e80941Smrg * 129b8e80941Smrg * In the worst case, we force the compiler to use a memory access to 130b8e80941Smrg * truncate the float, by specifying the 'volatile' keyword. 131b8e80941Smrg */ 132b8e80941Smrg/* Hardware default: All exceptions masked, extended double precision, 133b8e80941Smrg * round to nearest (IEEE compliant): 134b8e80941Smrg */ 135b8e80941Smrg#define DEFAULT_X86_FPU 0x037f 136b8e80941Smrg/* All exceptions masked, single precision, round to nearest: 137b8e80941Smrg */ 138b8e80941Smrg#define FAST_X86_FPU 0x003f 139b8e80941Smrg/* The fldcw instruction will cause any pending FP exceptions to be 140b8e80941Smrg * raised prior to entering the block, and we clear any pending 141b8e80941Smrg * exceptions before exiting the block. Hence, asm code has free 142b8e80941Smrg * reign over the FPU while in the fast math block. 143b8e80941Smrg */ 144b8e80941Smrg#if defined(NO_FAST_MATH) 145b8e80941Smrg#define START_FAST_MATH(x) \ 146b8e80941Smrgdo { \ 147b8e80941Smrg static GLuint mask = DEFAULT_X86_FPU; \ 148b8e80941Smrg __asm__ ( "fnstcw %0" : "=m" (*&(x)) ); \ 149b8e80941Smrg __asm__ ( "fldcw %0" : : "m" (mask) ); \ 150b8e80941Smrg} while (0) 151b8e80941Smrg#else 152b8e80941Smrg#define START_FAST_MATH(x) \ 153b8e80941Smrgdo { \ 154b8e80941Smrg static GLuint mask = FAST_X86_FPU; \ 155b8e80941Smrg __asm__ ( "fnstcw %0" : "=m" (*&(x)) ); \ 156b8e80941Smrg __asm__ ( "fldcw %0" : : "m" (mask) ); \ 157b8e80941Smrg} while (0) 158b8e80941Smrg#endif 159b8e80941Smrg/* Restore original FPU mode, and clear any exceptions that may have 160b8e80941Smrg * occurred in the FAST_MATH block. 161b8e80941Smrg */ 162b8e80941Smrg#define END_FAST_MATH(x) \ 163b8e80941Smrgdo { \ 164b8e80941Smrg __asm__ ( "fnclex ; fldcw %0" : : "m" (*&(x)) ); \ 165b8e80941Smrg} while (0) 166b8e80941Smrg 167b8e80941Smrg#elif defined(_MSC_VER) && defined(_M_IX86) 168b8e80941Smrg#define DEFAULT_X86_FPU 0x037f /* See GCC comments above */ 169b8e80941Smrg#define FAST_X86_FPU 0x003f /* See GCC comments above */ 170b8e80941Smrg#if defined(NO_FAST_MATH) 171b8e80941Smrg#define START_FAST_MATH(x) do {\ 172b8e80941Smrg static GLuint mask = DEFAULT_X86_FPU;\ 173b8e80941Smrg __asm fnstcw word ptr [x]\ 174b8e80941Smrg __asm fldcw word ptr [mask]\ 175b8e80941Smrg} while(0) 176b8e80941Smrg#else 177b8e80941Smrg#define START_FAST_MATH(x) do {\ 178b8e80941Smrg static GLuint mask = FAST_X86_FPU;\ 179b8e80941Smrg __asm fnstcw word ptr [x]\ 180b8e80941Smrg __asm fldcw word ptr [mask]\ 181b8e80941Smrg} while(0) 182b8e80941Smrg#endif 183b8e80941Smrg#define END_FAST_MATH(x) do {\ 184b8e80941Smrg __asm fnclex\ 185b8e80941Smrg __asm fldcw word ptr [x]\ 186b8e80941Smrg} while(0) 187b8e80941Smrg 188b8e80941Smrg#else 189b8e80941Smrg#define START_FAST_MATH(x) x = 0 190b8e80941Smrg#define END_FAST_MATH(x) (void)(x) 191b8e80941Smrg#endif 192b8e80941Smrg 193848b8605Smrg 194848b8605Smrgvoid _tnl_run_pipeline( struct gl_context *ctx ) 195848b8605Smrg{ 196848b8605Smrg TNLcontext *tnl = TNL_CONTEXT(ctx); 197848b8605Smrg unsigned short __tmp; 198848b8605Smrg GLuint i; 199848b8605Smrg 200848b8605Smrg if (!tnl->vb.Count) 201848b8605Smrg return; 202848b8605Smrg 203848b8605Smrg /* Check for changed input sizes or change in stride to/from zero 204848b8605Smrg * (ie const or non-const). 205848b8605Smrg */ 206848b8605Smrg if (check_input_changes( ctx ) || tnl->pipeline.new_state) { 207848b8605Smrg if (ctx->VertexProgram._MaintainTnlProgram) 208848b8605Smrg _tnl_UpdateFixedFunctionProgram( ctx ); 209848b8605Smrg 210848b8605Smrg for (i = 0; i < tnl->pipeline.nr_stages ; i++) { 211848b8605Smrg struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; 212848b8605Smrg if (s->validate) 213848b8605Smrg s->validate( ctx, s ); 214848b8605Smrg } 215848b8605Smrg 216848b8605Smrg tnl->pipeline.new_state = 0; 217848b8605Smrg tnl->pipeline.input_changes = 0; 218848b8605Smrg 219848b8605Smrg /* Pipeline can only change its output in response to either a 220848b8605Smrg * statechange or an input size/stride change. No other changes 221848b8605Smrg * are allowed. 222848b8605Smrg */ 223848b8605Smrg if (check_output_changes( ctx )) 224848b8605Smrg _tnl_notify_pipeline_output_change( ctx ); 225848b8605Smrg } 226848b8605Smrg 227848b8605Smrg#ifndef _OPENMP 228848b8605Smrg /* Don't adjust FPU precision mode in case multiple threads are to be used. 229848b8605Smrg * This would require that the additional threads also changed the FPU mode 230848b8605Smrg * which is quite a mess as this had to be done in all parallelized sections; 231848b8605Smrg * otherwise the master thread and all other threads are running in different 232848b8605Smrg * modes, producing inconsistent results. 233848b8605Smrg * Note that all x64 implementations don't define/use START_FAST_MATH, so 234848b8605Smrg * this is "hack" is only used in i386 mode 235848b8605Smrg */ 236848b8605Smrg START_FAST_MATH(__tmp); 237848b8605Smrg#endif 238848b8605Smrg 239848b8605Smrg for (i = 0; i < tnl->pipeline.nr_stages ; i++) { 240848b8605Smrg struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i]; 241848b8605Smrg if (!s->run( ctx, s )) 242848b8605Smrg break; 243848b8605Smrg } 244848b8605Smrg 245848b8605Smrg#ifndef _OPENMP 246848b8605Smrg END_FAST_MATH(__tmp); 247848b8605Smrg#endif 248848b8605Smrg} 249848b8605Smrg 250848b8605Smrg 251848b8605Smrg 252848b8605Smrg/* The default pipeline. This is useful for software rasterizers, and 253848b8605Smrg * simple hardware rasterizers. For customization, I don't recommend 254848b8605Smrg * tampering with the internals of these stages in the way that 255848b8605Smrg * drivers did in Mesa 3.4. These stages are basically black boxes, 256848b8605Smrg * and should be left intact. 257848b8605Smrg * 258848b8605Smrg * To customize the pipeline, consider: 259848b8605Smrg * 260848b8605Smrg * - removing redundant stages (making sure that the software rasterizer 261848b8605Smrg * can cope with this on fallback paths). An example is fog 262848b8605Smrg * coordinate generation, which is not required in the FX driver. 263848b8605Smrg * 264848b8605Smrg * - replacing general-purpose machine-independent stages with 265848b8605Smrg * general-purpose machine-specific stages. There is no example of 266848b8605Smrg * this to date, though it must be borne in mind that all subsequent 267848b8605Smrg * stages that reference the output of the new stage must cope with 268848b8605Smrg * any machine-specific data introduced. This may not be easy 269848b8605Smrg * unless there are no such stages (ie the new stage is the last in 270848b8605Smrg * the pipe). 271848b8605Smrg * 272848b8605Smrg * - inserting optimized (but specialized) stages ahead of the 273848b8605Smrg * general-purpose fallback implementation. For example, the old 274848b8605Smrg * fastpath mechanism, which only works when the VB->Elts input is 275848b8605Smrg * available, can be duplicated by placing the fastpath stage at the 276848b8605Smrg * head of this pipeline. Such specialized stages are currently 277848b8605Smrg * constrained to have no outputs (ie. they must either finish the * 278848b8605Smrg * pipeline by returning GL_FALSE from run(), or do nothing). 279848b8605Smrg * 280848b8605Smrg * Some work can be done to lift some of the restrictions in the final 281848b8605Smrg * case, if it becomes necessary to do so. 282848b8605Smrg */ 283848b8605Smrgconst struct tnl_pipeline_stage *_tnl_default_pipeline[] = { 284848b8605Smrg &_tnl_vertex_transform_stage, 285848b8605Smrg &_tnl_normal_transform_stage, 286848b8605Smrg &_tnl_lighting_stage, 287848b8605Smrg &_tnl_texgen_stage, 288848b8605Smrg &_tnl_texture_transform_stage, 289848b8605Smrg &_tnl_point_attenuation_stage, 290848b8605Smrg &_tnl_vertex_program_stage, 291848b8605Smrg &_tnl_fog_coordinate_stage, 292848b8605Smrg &_tnl_render_stage, 293848b8605Smrg NULL 294848b8605Smrg}; 295848b8605Smrg 296848b8605Smrgconst struct tnl_pipeline_stage *_tnl_vp_pipeline[] = { 297848b8605Smrg &_tnl_vertex_program_stage, 298848b8605Smrg &_tnl_render_stage, 299848b8605Smrg NULL 300848b8605Smrg}; 301