1848b8605Smrg/*
2848b8605Smrg * Mesa 3-D graphics library
3848b8605Smrg *
4848b8605Smrg * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
5848b8605Smrg *
6848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
7848b8605Smrg * copy of this software and associated documentation files (the "Software"),
8848b8605Smrg * to deal in the Software without restriction, including without limitation
9848b8605Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10848b8605Smrg * and/or sell copies of the Software, and to permit persons to whom the
11848b8605Smrg * Software is furnished to do so, subject to the following conditions:
12848b8605Smrg *
13848b8605Smrg * The above copyright notice and this permission notice shall be included
14848b8605Smrg * in all copies or substantial portions of the Software.
15848b8605Smrg *
16848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17848b8605Smrg * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19848b8605Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20848b8605Smrg * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21848b8605Smrg * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22848b8605Smrg * OTHER DEALINGS IN THE SOFTWARE.
23848b8605Smrg *
24848b8605Smrg * Authors:
25848b8605Smrg *    Keith Whitwell <keithw@vmware.com>
26848b8605Smrg */
27848b8605Smrg
28848b8605Smrg#include "main/glheader.h"
29848b8605Smrg#include "main/context.h"
30848b8605Smrg#include "main/imports.h"
31848b8605Smrg#include "main/mtypes.h"
32848b8605Smrg
33848b8605Smrg#include "t_context.h"
34848b8605Smrg#include "t_pipeline.h"
35848b8605Smrg#include "t_vp_build.h"
36848b8605Smrg#include "t_vertex.h"
37848b8605Smrg
38848b8605Smrgvoid _tnl_install_pipeline( struct gl_context *ctx,
39848b8605Smrg			    const struct tnl_pipeline_stage **stages )
40848b8605Smrg{
41848b8605Smrg   TNLcontext *tnl = TNL_CONTEXT(ctx);
42848b8605Smrg   GLuint i;
43848b8605Smrg
44848b8605Smrg   tnl->pipeline.new_state = ~0;
45848b8605Smrg
46848b8605Smrg   /* Create a writeable copy of each stage.
47848b8605Smrg    */
48848b8605Smrg   for (i = 0 ; i < MAX_PIPELINE_STAGES && stages[i] ; i++) {
49848b8605Smrg      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
50848b8605Smrg      memcpy(s, stages[i], sizeof(*s));
51848b8605Smrg      if (s->create)
52848b8605Smrg	 s->create(ctx, s);
53848b8605Smrg   }
54848b8605Smrg
55848b8605Smrg   tnl->pipeline.nr_stages = i;
56848b8605Smrg}
57848b8605Smrg
58848b8605Smrgvoid _tnl_destroy_pipeline( struct gl_context *ctx )
59848b8605Smrg{
60848b8605Smrg   TNLcontext *tnl = TNL_CONTEXT(ctx);
61848b8605Smrg   GLuint i;
62848b8605Smrg
63848b8605Smrg   for (i = 0 ; i < tnl->pipeline.nr_stages ; i++) {
64848b8605Smrg      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
65848b8605Smrg      if (s->destroy)
66848b8605Smrg	 s->destroy(s);
67848b8605Smrg   }
68848b8605Smrg
69848b8605Smrg   tnl->pipeline.nr_stages = 0;
70848b8605Smrg}
71848b8605Smrg
72848b8605Smrg
73848b8605Smrg
74848b8605Smrgstatic GLuint check_input_changes( struct gl_context *ctx )
75848b8605Smrg{
76848b8605Smrg   TNLcontext *tnl = TNL_CONTEXT(ctx);
77848b8605Smrg   GLuint i;
78848b8605Smrg
79848b8605Smrg   for (i = 0; i <= _TNL_LAST_MAT; i++) {
80848b8605Smrg      if (tnl->vb.AttribPtr[i]->size != tnl->pipeline.last_attrib_size[i] ||
81848b8605Smrg	  tnl->vb.AttribPtr[i]->stride != tnl->pipeline.last_attrib_stride[i]) {
82848b8605Smrg	 tnl->pipeline.last_attrib_size[i] = tnl->vb.AttribPtr[i]->size;
83848b8605Smrg	 tnl->pipeline.last_attrib_stride[i] = tnl->vb.AttribPtr[i]->stride;
84848b8605Smrg	 tnl->pipeline.input_changes |= 1<<i;
85848b8605Smrg      }
86848b8605Smrg   }
87848b8605Smrg
88848b8605Smrg   return tnl->pipeline.input_changes;
89848b8605Smrg}
90848b8605Smrg
91848b8605Smrg
/**
 * Check for changes in the sizes/strides of the pipeline's output
 * (result) arrays.  The real comparison is disabled below, so the
 * function currently always reports that every output changed.
 */
static GLuint check_output_changes( struct gl_context *ctx )
{
#if 0
   TNLcontext *tnl = TNL_CONTEXT(ctx);

   /* NOTE(review): disabled reference code — as written it uses an
    * undeclared loop index 'i' and would need fixing before it could
    * be re-enabled.
    */
   for (i = 0; i < VARYING_SLOT_MAX; i++) {
      if (tnl->vb.ResultPtr[i]->size != tnl->last_result_size[i] ||
	  tnl->vb.ResultPtr[i]->stride != tnl->last_result_stride[i]) {
	 tnl->last_result_size[i] = tnl->vb.ResultPtr[i]->size;
	 tnl->last_result_stride[i] = tnl->vb.ResultPtr[i]->stride;
	 tnl->pipeline.output_changes |= 1<<i;
      }
   }

   if (tnl->pipeline.output_changes)
      tnl->Driver.NotifyOutputChanges( ctx, tnl->pipeline.output_changes );

   return tnl->pipeline.output_changes;
#else
   /* Conservatively claim all outputs changed. */
   return ~0;
#endif
}
114848b8605Smrg
/**
 * START/END_FAST_MATH macros:
 *
 * START_FAST_MATH: Set x86 FPU to faster, 32-bit precision mode (and save
 *                  original mode to a temporary).
 * END_FAST_MATH: Restore x86 FPU to original mode.
 */
#if defined(__GNUC__) && defined(__i386__)
/*
 * Set the x86 FPU control word to guarantee only 32 bits of precision
 * are stored in registers.  Allowing the FPU to store more introduces
 * differences between situations where numbers are pulled out of memory
 * vs. situations where the compiler is able to optimize register usage.
 *
 * In the worst case, we force the compiler to use a memory access to
 * truncate the float, by specifying the 'volatile' keyword.
 */
/* Hardware default: All exceptions masked, extended double precision,
 * round to nearest (IEEE compliant):
 */
#define DEFAULT_X86_FPU		0x037f
/* All exceptions masked, single precision, round to nearest:
 */
#define FAST_X86_FPU		0x003f
/* The fldcw instruction will cause any pending FP exceptions to be
 * raised prior to entering the block, and we clear any pending
 * exceptions before exiting the block.  Hence, asm code has free
 * rein over the FPU while in the fast math block.
 */
#if defined(NO_FAST_MATH)
/* NO_FAST_MATH build: still save/restore the control word, but load the
 * IEEE-compliant default mode instead of the reduced-precision one.
 */
#define START_FAST_MATH(x)						\
do {									\
   static GLuint mask = DEFAULT_X86_FPU;				\
   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );				\
   __asm__ ( "fldcw %0" : : "m" (mask) );				\
} while (0)
#else
#define START_FAST_MATH(x)						\
do {									\
   static GLuint mask = FAST_X86_FPU;					\
   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );				\
   __asm__ ( "fldcw %0" : : "m" (mask) );				\
} while (0)
#endif
/* Restore original FPU mode, and clear any exceptions that may have
 * occurred in the FAST_MATH block.
 */
#define END_FAST_MATH(x)						\
do {									\
   __asm__ ( "fnclex ; fldcw %0" : : "m" (*&(x)) );			\
} while (0)

#elif defined(_MSC_VER) && defined(_M_IX86)
#define DEFAULT_X86_FPU		0x037f /* See GCC comments above */
#define FAST_X86_FPU		0x003f /* See GCC comments above */
#if defined(NO_FAST_MATH)
#define START_FAST_MATH(x) do {\
	static GLuint mask = DEFAULT_X86_FPU;\
	__asm fnstcw word ptr [x]\
	__asm fldcw word ptr [mask]\
} while(0)
#else
#define START_FAST_MATH(x) do {\
	static GLuint mask = FAST_X86_FPU;\
	__asm fnstcw word ptr [x]\
	__asm fldcw word ptr [mask]\
} while(0)
#endif
#define END_FAST_MATH(x) do {\
	__asm fnclex\
	__asm fldcw word ptr [x]\
} while(0)

#else
/* Non-x86 (or other compiler): FPU control-word fiddling is a no-op.
 * The dummy assignment/cast keep the caller's saved-state variable used
 * so no unused-variable warning is emitted.
 */
#define START_FAST_MATH(x)  x = 0
#define END_FAST_MATH(x)  (void)(x)
#endif
193848b8605Smrg
194848b8605Smrgvoid _tnl_run_pipeline( struct gl_context *ctx )
195848b8605Smrg{
196848b8605Smrg   TNLcontext *tnl = TNL_CONTEXT(ctx);
197848b8605Smrg   unsigned short __tmp;
198848b8605Smrg   GLuint i;
199848b8605Smrg
200848b8605Smrg   if (!tnl->vb.Count)
201848b8605Smrg      return;
202848b8605Smrg
203848b8605Smrg   /* Check for changed input sizes or change in stride to/from zero
204848b8605Smrg    * (ie const or non-const).
205848b8605Smrg    */
206848b8605Smrg   if (check_input_changes( ctx ) || tnl->pipeline.new_state) {
207848b8605Smrg      if (ctx->VertexProgram._MaintainTnlProgram)
208848b8605Smrg	 _tnl_UpdateFixedFunctionProgram( ctx );
209848b8605Smrg
210848b8605Smrg      for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
211848b8605Smrg	 struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
212848b8605Smrg	 if (s->validate)
213848b8605Smrg	    s->validate( ctx, s );
214848b8605Smrg      }
215848b8605Smrg
216848b8605Smrg      tnl->pipeline.new_state = 0;
217848b8605Smrg      tnl->pipeline.input_changes = 0;
218848b8605Smrg
219848b8605Smrg      /* Pipeline can only change its output in response to either a
220848b8605Smrg       * statechange or an input size/stride change.  No other changes
221848b8605Smrg       * are allowed.
222848b8605Smrg       */
223848b8605Smrg      if (check_output_changes( ctx ))
224848b8605Smrg	 _tnl_notify_pipeline_output_change( ctx );
225848b8605Smrg   }
226848b8605Smrg
227848b8605Smrg#ifndef _OPENMP
228848b8605Smrg   /* Don't adjust FPU precision mode in case multiple threads are to be used.
229848b8605Smrg    * This would require that the additional threads also changed the FPU mode
230848b8605Smrg    * which is quite a mess as this had to be done in all parallelized sections;
231848b8605Smrg    * otherwise the master thread and all other threads are running in different
232848b8605Smrg    * modes, producing inconsistent results.
233848b8605Smrg    * Note that all x64 implementations don't define/use START_FAST_MATH, so
234848b8605Smrg    * this is "hack" is only used in i386 mode
235848b8605Smrg    */
236848b8605Smrg   START_FAST_MATH(__tmp);
237848b8605Smrg#endif
238848b8605Smrg
239848b8605Smrg   for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
240848b8605Smrg      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
241848b8605Smrg      if (!s->run( ctx, s ))
242848b8605Smrg	 break;
243848b8605Smrg   }
244848b8605Smrg
245848b8605Smrg#ifndef _OPENMP
246848b8605Smrg   END_FAST_MATH(__tmp);
247848b8605Smrg#endif
248848b8605Smrg}
249848b8605Smrg
250848b8605Smrg
251848b8605Smrg
252848b8605Smrg/* The default pipeline.  This is useful for software rasterizers, and
253848b8605Smrg * simple hardware rasterizers.  For customization, I don't recommend
254848b8605Smrg * tampering with the internals of these stages in the way that
255848b8605Smrg * drivers did in Mesa 3.4.  These stages are basically black boxes,
256848b8605Smrg * and should be left intact.
257848b8605Smrg *
258848b8605Smrg * To customize the pipeline, consider:
259848b8605Smrg *
260848b8605Smrg * - removing redundant stages (making sure that the software rasterizer
261848b8605Smrg *   can cope with this on fallback paths).  An example is fog
262848b8605Smrg *   coordinate generation, which is not required in the FX driver.
263848b8605Smrg *
264848b8605Smrg * - replacing general-purpose machine-independent stages with
265848b8605Smrg *   general-purpose machine-specific stages.  There is no example of
266848b8605Smrg *   this to date, though it must be borne in mind that all subsequent
267848b8605Smrg *   stages that reference the output of the new stage must cope with
268848b8605Smrg *   any machine-specific data introduced.  This may not be easy
269848b8605Smrg *   unless there are no such stages (ie the new stage is the last in
270848b8605Smrg *   the pipe).
271848b8605Smrg *
272848b8605Smrg * - inserting optimized (but specialized) stages ahead of the
273848b8605Smrg *   general-purpose fallback implementation.  For example, the old
274848b8605Smrg *   fastpath mechanism, which only works when the VB->Elts input is
275848b8605Smrg *   available, can be duplicated by placing the fastpath stage at the
276848b8605Smrg *   head of this pipeline.  Such specialized stages are currently
 *   constrained to have no outputs (ie. they must either finish the
 *   pipeline by returning GL_FALSE from run(), or do nothing).
279848b8605Smrg *
280848b8605Smrg * Some work can be done to lift some of the restrictions in the final
281848b8605Smrg * case, if it becomes necessary to do so.
282848b8605Smrg */
/* The default pipeline, in execution order: fixed-function transform,
 * lighting and texture stages first, then vertex-program execution,
 * then the render stage.  The list is NULL-terminated.
 */
const struct tnl_pipeline_stage *_tnl_default_pipeline[] = {
   &_tnl_vertex_transform_stage,
   &_tnl_normal_transform_stage,
   &_tnl_lighting_stage,
   &_tnl_texgen_stage,
   &_tnl_texture_transform_stage,
   &_tnl_point_attenuation_stage,
   &_tnl_vertex_program_stage,
   &_tnl_fog_coordinate_stage,
   &_tnl_render_stage,
   NULL
};
295848b8605Smrg
/* Minimal pipeline: just vertex-program execution followed by the
 * render stage, skipping all fixed-function stages.  NULL-terminated.
 */
const struct tnl_pipeline_stage *_tnl_vp_pipeline[] = {
   &_tnl_vertex_program_stage,
   &_tnl_render_stage,
   NULL
};
301