1/*
2 * Mesa 3-D graphics library
3 *
4 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 *    Keith Whitwell <keithw@vmware.com>
26 */
27
28#include "main/glheader.h"
29#include "main/context.h"
30
31#include "main/mtypes.h"
32
33#include "t_context.h"
34#include "t_pipeline.h"
35#include "t_vp_build.h"
36#include "t_vertex.h"
37
38void _tnl_install_pipeline( struct gl_context *ctx,
39			    const struct tnl_pipeline_stage **stages )
40{
41   TNLcontext *tnl = TNL_CONTEXT(ctx);
42   GLuint i;
43
44   tnl->pipeline.new_state = ~0;
45
46   /* Create a writeable copy of each stage.
47    */
48   for (i = 0 ; i < MAX_PIPELINE_STAGES && stages[i] ; i++) {
49      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
50      memcpy(s, stages[i], sizeof(*s));
51      if (s->create)
52	 s->create(ctx, s);
53   }
54
55   tnl->pipeline.nr_stages = i;
56}
57
58void _tnl_destroy_pipeline( struct gl_context *ctx )
59{
60   TNLcontext *tnl = TNL_CONTEXT(ctx);
61   GLuint i;
62
63   for (i = 0 ; i < tnl->pipeline.nr_stages ; i++) {
64      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
65      if (s->destroy)
66	 s->destroy(s);
67   }
68
69   tnl->pipeline.nr_stages = 0;
70}
71
72
73
74static GLuint check_input_changes( struct gl_context *ctx )
75{
76   TNLcontext *tnl = TNL_CONTEXT(ctx);
77   GLuint i;
78
79   for (i = 0; i <= _TNL_LAST_MAT; i++) {
80      if (tnl->vb.AttribPtr[i]->size != tnl->pipeline.last_attrib_size[i] ||
81	  tnl->vb.AttribPtr[i]->stride != tnl->pipeline.last_attrib_stride[i]) {
82	 tnl->pipeline.last_attrib_size[i] = tnl->vb.AttribPtr[i]->size;
83	 tnl->pipeline.last_attrib_stride[i] = tnl->vb.AttribPtr[i]->stride;
84	 tnl->pipeline.input_changes |= 1<<i;
85      }
86   }
87
88   return tnl->pipeline.input_changes;
89}
90
91
/**
 * Report which pipeline outputs have changed.
 *
 * The per-output size/stride tracking below is compiled out ("#if 0"), so
 * the function currently returns ~0 unconditionally and \p ctx is not
 * used.
 *
 * NOTE(review): the disabled branch uses 'i' without declaring it, so it
 * would need at least that declaration before it could be re-enabled.
 *
 * \return ~0 (all bits set) in the active branch.
 */
static GLuint check_output_changes( struct gl_context *ctx )
{
#if 0
   TNLcontext *tnl = TNL_CONTEXT(ctx);

   for (i = 0; i < VARYING_SLOT_MAX; i++) {
      if (tnl->vb.ResultPtr[i]->size != tnl->last_result_size[i] ||
	  tnl->vb.ResultPtr[i]->stride != tnl->last_result_stride[i]) {
	 tnl->last_result_size[i] = tnl->vb.ResultPtr[i]->size;
	 tnl->last_result_stride[i] = tnl->vb.ResultPtr[i]->stride;
	 tnl->pipeline.output_changes |= 1<<i;
      }
   }

   if (tnl->pipeline.output_changes)
      tnl->Driver.NotifyOutputChanges( ctx, tnl->pipeline.output_changes );

   return tnl->pipeline.output_changes;
#else
   /* Tracking disabled: report every output as changed. */
   return ~0;
#endif
}
114
/**
 * START/END_FAST_MATH macros:
 *
 * START_FAST_MATH: Set x86 FPU to faster, 32-bit precision mode (and save
 *                  original mode to a temporary).
 * END_FAST_MATH: Restore x86 FPU to original mode.
 *
 * Both macros take the same argument: the temporary in which the original
 * control word is saved.  Three variants are provided below: GCC-style
 * inline asm for i386, MSVC inline asm for 32-bit x86, and a fallback for
 * every other configuration that does not touch the FPU.
 *
 * When NO_FAST_MATH is defined, START_FAST_MATH loads DEFAULT_X86_FPU
 * instead of FAST_X86_FPU.
 */
#if defined(__GNUC__) && defined(__i386__)
/*
 * Set the x86 FPU control word to guarantee only 32 bits of precision
 * are stored in registers.  Allowing the FPU to store more introduces
 * differences between situations where numbers are pulled out of memory
 * vs. situations where the compiler is able to optimize register usage.
 *
 * In the worst case, we force the compiler to use a memory access to
 * truncate the float, by specifying the 'volatile' keyword.
 */
/* Hardware default: All exceptions masked, extended double precision,
 * round to nearest (IEEE compliant):
 */
#define DEFAULT_X86_FPU		0x037f
/* All exceptions masked, single precision, round to nearest:
 */
#define FAST_X86_FPU		0x003f
/* The fldcw instruction will cause any pending FP exceptions to be
 * raised prior to entering the block, and we clear any pending
 * exceptions before exiting the block.  Hence, asm code has free
 * rein over the FPU while in the fast math block.
 */
#if defined(NO_FAST_MATH)
/* NO_FAST_MATH: save the current control word in x, then load the
 * hardware-default control word.
 */
#define START_FAST_MATH(x)						\
do {									\
   static GLuint mask = DEFAULT_X86_FPU;				\
   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );				\
   __asm__ ( "fldcw %0" : : "m" (mask) );				\
} while (0)
#else
/* Save the current control word in x, then load the single-precision
 * control word.
 */
#define START_FAST_MATH(x)						\
do {									\
   static GLuint mask = FAST_X86_FPU;					\
   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );				\
   __asm__ ( "fldcw %0" : : "m" (mask) );				\
} while (0)
#endif
/* Restore original FPU mode, and clear any exceptions that may have
 * occurred in the FAST_MATH block.
 */
#define END_FAST_MATH(x)						\
do {									\
   __asm__ ( "fnclex ; fldcw %0" : : "m" (*&(x)) );			\
} while (0)

#elif defined(_MSC_VER) && defined(_M_IX86)
/* MSVC inline-asm variant of the macros above; x is referenced by name
 * inside the __asm statements.
 */
#define DEFAULT_X86_FPU		0x037f /* See GCC comments above */
#define FAST_X86_FPU		0x003f /* See GCC comments above */
#if defined(NO_FAST_MATH)
#define START_FAST_MATH(x) do {\
	static GLuint mask = DEFAULT_X86_FPU;\
	__asm fnstcw word ptr [x]\
	__asm fldcw word ptr [mask]\
} while(0)
#else
#define START_FAST_MATH(x) do {\
	static GLuint mask = FAST_X86_FPU;\
	__asm fnstcw word ptr [x]\
	__asm fldcw word ptr [mask]\
} while(0)
#endif
#define END_FAST_MATH(x) do {\
	__asm fnclex\
	__asm fldcw word ptr [x]\
} while(0)

#else
/* Fallback for all other configurations: the FPU mode is left alone.
 * START_FAST_MATH only assigns 0 to x and END_FAST_MATH only evaluates x
 * as a void expression.
 */
#define START_FAST_MATH(x)  x = 0
#define END_FAST_MATH(x)  (void)(x)
#endif
192
193
194void _tnl_run_pipeline( struct gl_context *ctx )
195{
196   TNLcontext *tnl = TNL_CONTEXT(ctx);
197   unsigned short __tmp;
198   GLuint i;
199
200   if (!tnl->vb.Count)
201      return;
202
203   /* Check for changed input sizes or change in stride to/from zero
204    * (ie const or non-const).
205    */
206   if (check_input_changes( ctx ) || tnl->pipeline.new_state) {
207      if (ctx->VertexProgram._MaintainTnlProgram)
208	 _tnl_UpdateFixedFunctionProgram( ctx );
209
210      for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
211	 struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
212	 if (s->validate)
213	    s->validate( ctx, s );
214      }
215
216      tnl->pipeline.new_state = 0;
217      tnl->pipeline.input_changes = 0;
218
219      /* Pipeline can only change its output in response to either a
220       * statechange or an input size/stride change.  No other changes
221       * are allowed.
222       */
223      if (check_output_changes( ctx ))
224	 _tnl_notify_pipeline_output_change( ctx );
225   }
226
227#ifndef _OPENMP
228   /* Don't adjust FPU precision mode in case multiple threads are to be used.
229    * This would require that the additional threads also changed the FPU mode
230    * which is quite a mess as this had to be done in all parallelized sections;
231    * otherwise the master thread and all other threads are running in different
232    * modes, producing inconsistent results.
233    * Note that all x64 implementations don't define/use START_FAST_MATH, so
234    * this is "hack" is only used in i386 mode
235    */
236   START_FAST_MATH(__tmp);
237#endif
238
239   for (i = 0; i < tnl->pipeline.nr_stages ; i++) {
240      struct tnl_pipeline_stage *s = &tnl->pipeline.stages[i];
241      if (!s->run( ctx, s ))
242	 break;
243   }
244
245#ifndef _OPENMP
246   END_FAST_MATH(__tmp);
247#endif
248}
249
250
251
/* The default pipeline.  This is useful for software rasterizers, and
 * simple hardware rasterizers.  For customization, I don't recommend
 * tampering with the internals of these stages in the way that
 * drivers did in Mesa 3.4.  These stages are basically black boxes,
 * and should be left intact.
 *
 * To customize the pipeline, consider:
 *
 * - removing redundant stages (making sure that the software rasterizer
 *   can cope with this on fallback paths).  An example is fog
 *   coordinate generation, which is not required in the FX driver.
 *
 * - replacing general-purpose machine-independent stages with
 *   general-purpose machine-specific stages.  There is no example of
 *   this to date, though it must be borne in mind that all subsequent
 *   stages that reference the output of the new stage must cope with
 *   any machine-specific data introduced.  This may not be easy
 *   unless there are no such stages (ie the new stage is the last in
 *   the pipe).
 *
 * - inserting optimized (but specialized) stages ahead of the
 *   general-purpose fallback implementation.  For example, the old
 *   fastpath mechanism, which only works when the VB->Elts input is
 *   available, can be duplicated by placing the fastpath stage at the
 *   head of this pipeline.  Such specialized stages are currently
 *   constrained to have no outputs (ie. they must either finish the
 *   pipeline by returning GL_FALSE from run(), or do nothing).
 *
 * Some work can be done to lift some of the restrictions in the final
 * case, if it becomes necessary to do so.
 *
 * The array is NULL-terminated; stages are listed in the order in which
 * they appear in the pipeline.
 */
const struct tnl_pipeline_stage *_tnl_default_pipeline[] = {
   &_tnl_vertex_transform_stage,
   &_tnl_normal_transform_stage,
   &_tnl_lighting_stage,
   &_tnl_texgen_stage,
   &_tnl_texture_transform_stage,
   &_tnl_point_attenuation_stage,
   &_tnl_vertex_program_stage,
   &_tnl_fog_coordinate_stage,
   &_tnl_render_stage,
   NULL
};
295
/* A reduced, NULL-terminated pipeline consisting only of the vertex
 * program stage followed by the render stage.
 */
const struct tnl_pipeline_stage *_tnl_vp_pipeline[] = {
   &_tnl_vertex_program_stage,
   &_tnl_render_stage,
   NULL
};
301