tgsi_exec.c revision cdc920a0
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is the bitwise AND of five other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK), which are
 * controlled by the flow control instructions (e.g. IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_memory.h"
62#include "util/u_math.h"
63
64
/*
 * When non-zero, micro_exp2(), micro_lg2() and micro_pow() call the
 * util_fast_*() helpers instead of powf()/logf().
 */
#define FAST_MATH 1

/*
 * Element indices within a quad; micro_ddx() and micro_ddy() use them to
 * pick the elements whose difference forms the derivative.
 */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
71
72static void
73micro_abs(union tgsi_exec_channel *dst,
74          const union tgsi_exec_channel *src)
75{
76   dst->f[0] = fabsf(src->f[0]);
77   dst->f[1] = fabsf(src->f[1]);
78   dst->f[2] = fabsf(src->f[2]);
79   dst->f[3] = fabsf(src->f[3]);
80}
81
82static void
83micro_arl(union tgsi_exec_channel *dst,
84          const union tgsi_exec_channel *src)
85{
86   dst->i[0] = (int)floorf(src->f[0]);
87   dst->i[1] = (int)floorf(src->f[1]);
88   dst->i[2] = (int)floorf(src->f[2]);
89   dst->i[3] = (int)floorf(src->f[3]);
90}
91
92static void
93micro_arr(union tgsi_exec_channel *dst,
94          const union tgsi_exec_channel *src)
95{
96   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
97   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
98   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
99   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
100}
101
102static void
103micro_ceil(union tgsi_exec_channel *dst,
104           const union tgsi_exec_channel *src)
105{
106   dst->f[0] = ceilf(src->f[0]);
107   dst->f[1] = ceilf(src->f[1]);
108   dst->f[2] = ceilf(src->f[2]);
109   dst->f[3] = ceilf(src->f[3]);
110}
111
112static void
113micro_clamp(union tgsi_exec_channel *dst,
114            const union tgsi_exec_channel *src0,
115            const union tgsi_exec_channel *src1,
116            const union tgsi_exec_channel *src2)
117{
118   dst->f[0] = src0->f[0] < src1->f[0] ? src1->f[0] : src0->f[0] > src2->f[0] ? src2->f[0] : src0->f[0];
119   dst->f[1] = src0->f[1] < src1->f[1] ? src1->f[1] : src0->f[1] > src2->f[1] ? src2->f[1] : src0->f[1];
120   dst->f[2] = src0->f[2] < src1->f[2] ? src1->f[2] : src0->f[2] > src2->f[2] ? src2->f[2] : src0->f[2];
121   dst->f[3] = src0->f[3] < src1->f[3] ? src1->f[3] : src0->f[3] > src2->f[3] ? src2->f[3] : src0->f[3];
122}
123
124static void
125micro_cmp(union tgsi_exec_channel *dst,
126          const union tgsi_exec_channel *src0,
127          const union tgsi_exec_channel *src1,
128          const union tgsi_exec_channel *src2)
129{
130   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
131   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
132   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
133   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
134}
135
136static void
137micro_cnd(union tgsi_exec_channel *dst,
138          const union tgsi_exec_channel *src0,
139          const union tgsi_exec_channel *src1,
140          const union tgsi_exec_channel *src2)
141{
142   dst->f[0] = src2->f[0] > 0.5f ? src0->f[0] : src1->f[0];
143   dst->f[1] = src2->f[1] > 0.5f ? src0->f[1] : src1->f[1];
144   dst->f[2] = src2->f[2] > 0.5f ? src0->f[2] : src1->f[2];
145   dst->f[3] = src2->f[3] > 0.5f ? src0->f[3] : src1->f[3];
146}
147
148static void
149micro_cos(union tgsi_exec_channel *dst,
150          const union tgsi_exec_channel *src)
151{
152   dst->f[0] = cosf(src->f[0]);
153   dst->f[1] = cosf(src->f[1]);
154   dst->f[2] = cosf(src->f[2]);
155   dst->f[3] = cosf(src->f[3]);
156}
157
158static void
159micro_ddx(union tgsi_exec_channel *dst,
160          const union tgsi_exec_channel *src)
161{
162   dst->f[0] =
163   dst->f[1] =
164   dst->f[2] =
165   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
166}
167
168static void
169micro_ddy(union tgsi_exec_channel *dst,
170          const union tgsi_exec_channel *src)
171{
172   dst->f[0] =
173   dst->f[1] =
174   dst->f[2] =
175   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
176}
177
178static void
179micro_exp2(union tgsi_exec_channel *dst,
180           const union tgsi_exec_channel *src)
181{
182#if FAST_MATH
183   dst->f[0] = util_fast_exp2(src->f[0]);
184   dst->f[1] = util_fast_exp2(src->f[1]);
185   dst->f[2] = util_fast_exp2(src->f[2]);
186   dst->f[3] = util_fast_exp2(src->f[3]);
187#else
188#if DEBUG
189   /* Inf is okay for this instruction, so clamp it to silence assertions. */
190   uint i;
191   union tgsi_exec_channel clamped;
192
193   for (i = 0; i < 4; i++) {
194      if (src->f[i] > 127.99999f) {
195         clamped.f[i] = 127.99999f;
196      } else if (src->f[i] < -126.99999f) {
197         clamped.f[i] = -126.99999f;
198      } else {
199         clamped.f[i] = src->f[i];
200      }
201   }
202   src = &clamped;
203#endif /* DEBUG */
204
205   dst->f[0] = powf(2.0f, src->f[0]);
206   dst->f[1] = powf(2.0f, src->f[1]);
207   dst->f[2] = powf(2.0f, src->f[2]);
208   dst->f[3] = powf(2.0f, src->f[3]);
209#endif /* FAST_MATH */
210}
211
212static void
213micro_flr(union tgsi_exec_channel *dst,
214          const union tgsi_exec_channel *src)
215{
216   dst->f[0] = floorf(src->f[0]);
217   dst->f[1] = floorf(src->f[1]);
218   dst->f[2] = floorf(src->f[2]);
219   dst->f[3] = floorf(src->f[3]);
220}
221
222static void
223micro_frc(union tgsi_exec_channel *dst,
224          const union tgsi_exec_channel *src)
225{
226   dst->f[0] = src->f[0] - floorf(src->f[0]);
227   dst->f[1] = src->f[1] - floorf(src->f[1]);
228   dst->f[2] = src->f[2] - floorf(src->f[2]);
229   dst->f[3] = src->f[3] - floorf(src->f[3]);
230}
231
232static void
233micro_iabs(union tgsi_exec_channel *dst,
234           const union tgsi_exec_channel *src)
235{
236   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
237   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
238   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
239   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
240}
241
242static void
243micro_ineg(union tgsi_exec_channel *dst,
244           const union tgsi_exec_channel *src)
245{
246   dst->i[0] = -src->i[0];
247   dst->i[1] = -src->i[1];
248   dst->i[2] = -src->i[2];
249   dst->i[3] = -src->i[3];
250}
251
252static void
253micro_lg2(union tgsi_exec_channel *dst,
254          const union tgsi_exec_channel *src)
255{
256#if FAST_MATH
257   dst->f[0] = util_fast_log2(src->f[0]);
258   dst->f[1] = util_fast_log2(src->f[1]);
259   dst->f[2] = util_fast_log2(src->f[2]);
260   dst->f[3] = util_fast_log2(src->f[3]);
261#else
262   dst->f[0] = logf(src->f[0]) * 1.442695f;
263   dst->f[1] = logf(src->f[1]) * 1.442695f;
264   dst->f[2] = logf(src->f[2]) * 1.442695f;
265   dst->f[3] = logf(src->f[3]) * 1.442695f;
266#endif
267}
268
269static void
270micro_lrp(union tgsi_exec_channel *dst,
271          const union tgsi_exec_channel *src0,
272          const union tgsi_exec_channel *src1,
273          const union tgsi_exec_channel *src2)
274{
275   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
276   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
277   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
278   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
279}
280
281static void
282micro_mad(union tgsi_exec_channel *dst,
283          const union tgsi_exec_channel *src0,
284          const union tgsi_exec_channel *src1,
285          const union tgsi_exec_channel *src2)
286{
287   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
288   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
289   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
290   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
291}
292
293static void
294micro_mov(union tgsi_exec_channel *dst,
295          const union tgsi_exec_channel *src)
296{
297   dst->u[0] = src->u[0];
298   dst->u[1] = src->u[1];
299   dst->u[2] = src->u[2];
300   dst->u[3] = src->u[3];
301}
302
303static void
304micro_rcp(union tgsi_exec_channel *dst,
305          const union tgsi_exec_channel *src)
306{
307#if 0 /* for debugging */
308   assert(src->f[0] != 0.0f);
309   assert(src->f[1] != 0.0f);
310   assert(src->f[2] != 0.0f);
311   assert(src->f[3] != 0.0f);
312#endif
313   dst->f[0] = 1.0f / src->f[0];
314   dst->f[1] = 1.0f / src->f[1];
315   dst->f[2] = 1.0f / src->f[2];
316   dst->f[3] = 1.0f / src->f[3];
317}
318
319static void
320micro_rnd(union tgsi_exec_channel *dst,
321          const union tgsi_exec_channel *src)
322{
323   dst->f[0] = floorf(src->f[0] + 0.5f);
324   dst->f[1] = floorf(src->f[1] + 0.5f);
325   dst->f[2] = floorf(src->f[2] + 0.5f);
326   dst->f[3] = floorf(src->f[3] + 0.5f);
327}
328
329static void
330micro_rsq(union tgsi_exec_channel *dst,
331          const union tgsi_exec_channel *src)
332{
333#if 0 /* for debugging */
334   assert(src->f[0] != 0.0f);
335   assert(src->f[1] != 0.0f);
336   assert(src->f[2] != 0.0f);
337   assert(src->f[3] != 0.0f);
338#endif
339   dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
340   dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
341   dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
342   dst->f[3] = 1.0f / sqrtf(fabsf(src->f[3]));
343}
344
345static void
346micro_seq(union tgsi_exec_channel *dst,
347          const union tgsi_exec_channel *src0,
348          const union tgsi_exec_channel *src1)
349{
350   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
351   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
352   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
353   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
354}
355
356static void
357micro_sge(union tgsi_exec_channel *dst,
358          const union tgsi_exec_channel *src0,
359          const union tgsi_exec_channel *src1)
360{
361   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
362   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
363   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
364   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
365}
366
367static void
368micro_sgn(union tgsi_exec_channel *dst,
369          const union tgsi_exec_channel *src)
370{
371   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
372   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
373   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
374   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
375}
376
377static void
378micro_sgt(union tgsi_exec_channel *dst,
379          const union tgsi_exec_channel *src0,
380          const union tgsi_exec_channel *src1)
381{
382   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
383   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
384   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
385   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
386}
387
388static void
389micro_sin(union tgsi_exec_channel *dst,
390          const union tgsi_exec_channel *src)
391{
392   dst->f[0] = sinf(src->f[0]);
393   dst->f[1] = sinf(src->f[1]);
394   dst->f[2] = sinf(src->f[2]);
395   dst->f[3] = sinf(src->f[3]);
396}
397
398static void
399micro_sle(union tgsi_exec_channel *dst,
400          const union tgsi_exec_channel *src0,
401          const union tgsi_exec_channel *src1)
402{
403   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
404   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
405   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
406   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
407}
408
409static void
410micro_slt(union tgsi_exec_channel *dst,
411          const union tgsi_exec_channel *src0,
412          const union tgsi_exec_channel *src1)
413{
414   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
415   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
416   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
417   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
418}
419
420static void
421micro_sne(union tgsi_exec_channel *dst,
422          const union tgsi_exec_channel *src0,
423          const union tgsi_exec_channel *src1)
424{
425   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
426   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
427   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
428   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
429}
430
431static void
432micro_trunc(union tgsi_exec_channel *dst,
433            const union tgsi_exec_channel *src)
434{
435   dst->f[0] = (float)(int)src->f[0];
436   dst->f[1] = (float)(int)src->f[1];
437   dst->f[2] = (float)(int)src->f[2];
438   dst->f[3] = (float)(int)src->f[3];
439}
440
441
/*
 * Channel indices for X, Y, Z and W.
 * NOTE(review): presumably these index the xyzw[] array of a register
 * vector (print_temp() walks xyzw[0..3] in "XYZW" order) -- confirm.
 */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
446
/*
 * Tags for float, signed integer and unsigned integer data.
 * NOTE(review): not referenced in the visible part of this file;
 * presumably selects how channel contents are interpreted -- confirm.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT
};
452
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each pair addresses one channel as mach->Temps[*_I].xyzw[*_C].
 * tgsi_exec_machine_create() pre-loads the constant-valued ones:
 * TEMP_0 (0x00000000), TEMP_7F (0x7FFFFFFF), TEMP_80 (0x80000000),
 * TEMP_FF (0xFFFFFFFF), TEMP_1 (1.0), TEMP_2 (2.0), TEMP_128 (128.0),
 * TEMP_M128 (-128.0), TEMP_3 (3.0) and TEMP_HALF (0.5).
 */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
486
/**
 * Non-zero if channel CHAN is set in the write mask of the instruction's
 * first destination register.
 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))

/** Same test as IS_CHANNEL_ENABLED, for the second destination register. */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the statement that follows once for every channel
 * enabled in the first destination's write mask, with CHAN (an lvalue)
 * taking each enabled channel index in turn.
 *
 * The test is written as "if (!enabled) {} else" so that the macro's own
 * 'if' already has an 'else'; an 'else' written after the loop body then
 * pairs with the caller's 'if' instead of being captured by the macro.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (!IS_CHANNEL_ENABLED( INST, CHAN )) { } else

/** As FOR_EACH_ENABLED_CHANNEL, for the second destination register. */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (!IS_CHANNEL_ENABLED2( INST, CHAN )) { } else
500
501
/**
 * Recompute the execution mask as the bitwise AND of the conditional,
 * loop, continue, switch and function masks.
 * MACH is any expression yielding a pointer to the machine; it is
 * evaluated several times, so it must be free of side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
505
506
507static const union tgsi_exec_channel ZeroVec =
508   { { 0.0, 0.0, 0.0, 0.0 } };
509
510static const union tgsi_exec_channel OneVec = {
511   {1.0f, 1.0f, 1.0f, 1.0f}
512};
513
514
515/**
516 * Assert that none of the float values in 'chan' are infinite or NaN.
517 * NaN and Inf may occur normally during program execution and should
518 * not lead to crashes, etc.  But when debugging, it's helpful to catch
519 * them.
520 */
521static INLINE void
522check_inf_or_nan(const union tgsi_exec_channel *chan)
523{
524   assert(!util_is_inf_or_nan((chan)->f[0]));
525   assert(!util_is_inf_or_nan((chan)->f[1]));
526   assert(!util_is_inf_or_nan((chan)->f[2]));
527   assert(!util_is_inf_or_nan((chan)->f[3]));
528}
529
530
#ifdef DEBUG
/**
 * Debug helper: print the four float elements of 'chan' on one line,
 * prefixed with 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   /* Floats are promoted to double when passed through "..." anyway. */
   const double e0 = chan->f[0];
   const double e1 = chan->f[1];
   const double e2 = chan->f[2];
   const double e3 = chan->f[3];

   debug_printf("%s = {%f, %f, %f, %f}\n", msg, e0, e1, e2, e3);
}
#endif
539
540
#ifdef DEBUG
/**
 * Debug helper: dump temporary register 'index' of 'mach', one line per
 * X/Y/Z/W channel with the four float elements of that channel.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char channelNames[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   unsigned c = 0;

   debug_printf("Temp[%u] =\n", index);
   while (c < 4) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   channelNames[c],
                   vec->xyzw[c].f[0],
                   vec->xyzw[c].f[1],
                   vec->xyzw[c].f[2],
                   vec->xyzw[c].f[3]);
      c++;
   }
}
#endif
558
559
560/**
561 * Check if there's a potential src/dst register data dependency when
562 * using SOA execution.
563 * Example:
564 *   MOV T, T.yxwz;
565 * This would expand into:
566 *   MOV t0, t1;
567 *   MOV t1, t0;
568 *   MOV t2, t3;
569 *   MOV t3, t2;
570 * The second instruction will have the wrong value for t0 if executed as-is.
571 */
572boolean
573tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
574{
575   uint i, chan;
576
577   uint writemask = inst->Dst[0].Register.WriteMask;
578   if (writemask == TGSI_WRITEMASK_X ||
579       writemask == TGSI_WRITEMASK_Y ||
580       writemask == TGSI_WRITEMASK_Z ||
581       writemask == TGSI_WRITEMASK_W ||
582       writemask == TGSI_WRITEMASK_NONE) {
583      /* no chance of data dependency */
584      return FALSE;
585   }
586
587   /* loop over src regs */
588   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
589      if ((inst->Src[i].Register.File ==
590           inst->Dst[0].Register.File) &&
591          (inst->Src[i].Register.Index ==
592           inst->Dst[0].Register.Index)) {
593         /* loop over dest channels */
594         uint channelsWritten = 0x0;
595         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
596            /* check if we're reading a channel that's been written */
597            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
598            if (channelsWritten & (1 << swizzle)) {
599               return TRUE;
600            }
601
602            channelsWritten |= (1 << chan);
603         }
604      }
605   }
606   return FALSE;
607}
608
609
610/**
611 * Initialize machine state by expanding tokens to full instructions,
612 * allocating temporary storage, setting up constants, etc.
613 * After this, we can call tgsi_exec_machine_run() many times.
614 */
615void
616tgsi_exec_machine_bind_shader(
617   struct tgsi_exec_machine *mach,
618   const struct tgsi_token *tokens,
619   uint numSamplers,
620   struct tgsi_sampler **samplers)
621{
622   uint k;
623   struct tgsi_parse_context parse;
624   struct tgsi_exec_labels *labels = &mach->Labels;
625   struct tgsi_full_instruction *instructions;
626   struct tgsi_full_declaration *declarations;
627   uint maxInstructions = 10, numInstructions = 0;
628   uint maxDeclarations = 10, numDeclarations = 0;
629   uint instno = 0;
630
631#if 0
632   tgsi_dump(tokens, 0);
633#endif
634
635   util_init_math();
636
637   mach->Tokens = tokens;
638   mach->Samplers = samplers;
639
640   k = tgsi_parse_init (&parse, mach->Tokens);
641   if (k != TGSI_PARSE_OK) {
642      debug_printf( "Problem parsing!\n" );
643      return;
644   }
645
646   mach->Processor = parse.FullHeader.Processor.Processor;
647   mach->ImmLimit = 0;
648   labels->count = 0;
649
650   declarations = (struct tgsi_full_declaration *)
651      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
652
653   if (!declarations) {
654      return;
655   }
656
657   instructions = (struct tgsi_full_instruction *)
658      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
659
660   if (!instructions) {
661      FREE( declarations );
662      return;
663   }
664
665   while( !tgsi_parse_end_of_tokens( &parse ) ) {
666      uint pointer = parse.Position;
667      uint i;
668
669      tgsi_parse_token( &parse );
670      switch( parse.FullToken.Token.Type ) {
671      case TGSI_TOKEN_TYPE_DECLARATION:
672         /* save expanded declaration */
673         if (numDeclarations == maxDeclarations) {
674            declarations = REALLOC(declarations,
675                                   maxDeclarations
676                                   * sizeof(struct tgsi_full_declaration),
677                                   (maxDeclarations + 10)
678                                   * sizeof(struct tgsi_full_declaration));
679            maxDeclarations += 10;
680         }
681         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
682            unsigned reg;
683            for (reg = parse.FullToken.FullDeclaration.Range.First;
684                 reg <= parse.FullToken.FullDeclaration.Range.Last;
685                 ++reg) {
686               ++mach->NumOutputs;
687            }
688         }
689         memcpy(declarations + numDeclarations,
690                &parse.FullToken.FullDeclaration,
691                sizeof(declarations[0]));
692         numDeclarations++;
693         break;
694
695      case TGSI_TOKEN_TYPE_IMMEDIATE:
696         {
697            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
698            assert( size <= 4 );
699            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
700
701            for( i = 0; i < size; i++ ) {
702               mach->Imms[mach->ImmLimit][i] =
703		  parse.FullToken.FullImmediate.u[i].Float;
704            }
705            mach->ImmLimit += 1;
706         }
707         break;
708
709      case TGSI_TOKEN_TYPE_INSTRUCTION:
710         assert( labels->count < MAX_LABELS );
711
712         labels->labels[labels->count][0] = instno;
713         labels->labels[labels->count][1] = pointer;
714         labels->count++;
715
716         /* save expanded instruction */
717         if (numInstructions == maxInstructions) {
718            instructions = REALLOC(instructions,
719                                   maxInstructions
720                                   * sizeof(struct tgsi_full_instruction),
721                                   (maxInstructions + 10)
722                                   * sizeof(struct tgsi_full_instruction));
723            maxInstructions += 10;
724         }
725
726         memcpy(instructions + numInstructions,
727                &parse.FullToken.FullInstruction,
728                sizeof(instructions[0]));
729
730         numInstructions++;
731         break;
732
733      case TGSI_TOKEN_TYPE_PROPERTY:
734         break;
735
736      default:
737         assert( 0 );
738      }
739   }
740   tgsi_parse_free (&parse);
741
742   if (mach->Declarations) {
743      FREE( mach->Declarations );
744   }
745   mach->Declarations = declarations;
746   mach->NumDeclarations = numDeclarations;
747
748   if (mach->Instructions) {
749      FREE( mach->Instructions );
750   }
751   mach->Instructions = instructions;
752   mach->NumInstructions = numInstructions;
753}
754
755
756struct tgsi_exec_machine *
757tgsi_exec_machine_create( void )
758{
759   struct tgsi_exec_machine *mach;
760   uint i;
761
762   mach = align_malloc( sizeof *mach, 16 );
763   if (!mach)
764      goto fail;
765
766   memset(mach, 0, sizeof(*mach));
767
768   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
769   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
770   mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
771
772   /* Setup constants. */
773   for( i = 0; i < 4; i++ ) {
774      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
775      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
776      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
777      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
778      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
779      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
780      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
781      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
782      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
783      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
784   }
785
786#ifdef DEBUG
787   /* silence warnings */
788   (void) print_chan;
789   (void) print_temp;
790#endif
791
792   return mach;
793
794fail:
795   align_free(mach);
796   return NULL;
797}
798
799
800void
801tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
802{
803   if (mach) {
804      FREE(mach->Instructions);
805      FREE(mach->Declarations);
806   }
807
808   align_free(mach);
809}
810
811static void
812micro_add(union tgsi_exec_channel *dst,
813          const union tgsi_exec_channel *src0,
814          const union tgsi_exec_channel *src1)
815{
816   dst->f[0] = src0->f[0] + src1->f[0];
817   dst->f[1] = src0->f[1] + src1->f[1];
818   dst->f[2] = src0->f[2] + src1->f[2];
819   dst->f[3] = src0->f[3] + src1->f[3];
820}
821
822static void
823micro_div(
824   union tgsi_exec_channel *dst,
825   const union tgsi_exec_channel *src0,
826   const union tgsi_exec_channel *src1 )
827{
828   if (src1->f[0] != 0) {
829      dst->f[0] = src0->f[0] / src1->f[0];
830   }
831   if (src1->f[1] != 0) {
832      dst->f[1] = src0->f[1] / src1->f[1];
833   }
834   if (src1->f[2] != 0) {
835      dst->f[2] = src0->f[2] / src1->f[2];
836   }
837   if (src1->f[3] != 0) {
838      dst->f[3] = src0->f[3] / src1->f[3];
839   }
840}
841
842static void
843micro_float_clamp(union tgsi_exec_channel *dst,
844                  const union tgsi_exec_channel *src)
845{
846   uint i;
847
848   for (i = 0; i < 4; i++) {
849      if (src->f[i] > 0.0f) {
850         if (src->f[i] > 1.884467e+019f)
851            dst->f[i] = 1.884467e+019f;
852         else if (src->f[i] < 5.42101e-020f)
853            dst->f[i] = 5.42101e-020f;
854         else
855            dst->f[i] = src->f[i];
856      }
857      else {
858         if (src->f[i] < -1.884467e+019f)
859            dst->f[i] = -1.884467e+019f;
860         else if (src->f[i] > -5.42101e-020f)
861            dst->f[i] = -5.42101e-020f;
862         else
863            dst->f[i] = src->f[i];
864      }
865   }
866}
867
868static void
869micro_lt(
870   union tgsi_exec_channel *dst,
871   const union tgsi_exec_channel *src0,
872   const union tgsi_exec_channel *src1,
873   const union tgsi_exec_channel *src2,
874   const union tgsi_exec_channel *src3 )
875{
876   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
877   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
878   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
879   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
880}
881
882static void
883micro_max(union tgsi_exec_channel *dst,
884          const union tgsi_exec_channel *src0,
885          const union tgsi_exec_channel *src1)
886{
887   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
888   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
889   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
890   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
891}
892
893static void
894micro_min(union tgsi_exec_channel *dst,
895          const union tgsi_exec_channel *src0,
896          const union tgsi_exec_channel *src1)
897{
898   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
899   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
900   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
901   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
902}
903
904static void
905micro_mul(union tgsi_exec_channel *dst,
906          const union tgsi_exec_channel *src0,
907          const union tgsi_exec_channel *src1)
908{
909   dst->f[0] = src0->f[0] * src1->f[0];
910   dst->f[1] = src0->f[1] * src1->f[1];
911   dst->f[2] = src0->f[2] * src1->f[2];
912   dst->f[3] = src0->f[3] * src1->f[3];
913}
914
#if 0
/**
 * Disabled.  Signed multiply with two result channels: dst1 receives the
 * plain int product of each element pair, dst0 is always set to zero.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->i[lane] = src0->i[lane] * src1->i[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->i[lane] = 0;
   }
}
#endif
933
#if 0
/**
 * Disabled.  Unsigned multiply with two result channels: dst1 receives
 * the plain unsigned product of each element pair, dst0 is always set to
 * zero.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      dst1->u[lane] = src0->u[lane] * src1->u[lane];
   }
   for (lane = 0; lane < 4; lane++) {
      dst0->u[lane] = 0;
   }
}
#endif
952
953
#if 0
/**
 * Disabled.  Conditional move on raw bits:
 * dst.u[n] = src0.u[n] ? src1.u[n] : src2.u[n].
 */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   unsigned lane;

   for (lane = 0; lane < 4; lane++) {
      const unsigned ifNonZero = src1->u[lane];
      const unsigned ifZero = src2->u[lane];

      dst->u[lane] = src0->u[lane] ? ifNonZero : ifZero;
   }
}
#endif
968
969static void
970micro_neg(
971   union tgsi_exec_channel *dst,
972   const union tgsi_exec_channel *src )
973{
974   dst->f[0] = -src->f[0];
975   dst->f[1] = -src->f[1];
976   dst->f[2] = -src->f[2];
977   dst->f[3] = -src->f[3];
978}
979
980static void
981micro_pow(
982   union tgsi_exec_channel *dst,
983   const union tgsi_exec_channel *src0,
984   const union tgsi_exec_channel *src1 )
985{
986#if FAST_MATH
987   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
988   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
989   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
990   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
991#else
992   dst->f[0] = powf( src0->f[0], src1->f[0] );
993   dst->f[1] = powf( src0->f[1], src1->f[1] );
994   dst->f[2] = powf( src0->f[2], src1->f[2] );
995   dst->f[3] = powf( src0->f[3], src1->f[3] );
996#endif
997}
998
999static void
1000micro_sub(union tgsi_exec_channel *dst,
1001          const union tgsi_exec_channel *src0,
1002          const union tgsi_exec_channel *src1)
1003{
1004   dst->f[0] = src0->f[0] - src1->f[0];
1005   dst->f[1] = src0->f[1] - src1->f[1];
1006   dst->f[2] = src0->f[2] - src1->f[2];
1007   dst->f[3] = src0->f[3] - src1->f[3];
1008}
1009
1010static void
1011fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1012                       const uint file,
1013                       const uint swizzle,
1014                       const union tgsi_exec_channel *index,
1015                       const union tgsi_exec_channel *index2D,
1016                       union tgsi_exec_channel *chan)
1017{
1018   uint i;
1019
1020   switch (file) {
1021   case TGSI_FILE_CONSTANT:
1022      for (i = 0; i < QUAD_SIZE; i++) {
1023         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1024         assert(mach->Consts[index2D->i[i]]);
1025
1026         if (index->i[i] < 0) {
1027            chan->u[i] = 0;
1028         } else {
1029            const uint *p = (const uint *)mach->Consts[index2D->i[i]];
1030
1031            chan->u[i] = p[index->i[i] * 4 + swizzle];
1032         }
1033      }
1034      break;
1035
1036   case TGSI_FILE_INPUT:
1037   case TGSI_FILE_SYSTEM_VALUE:
1038      for (i = 0; i < QUAD_SIZE; i++) {
1039         /* XXX: 2D indexing */
1040         chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
1041      }
1042      break;
1043
1044   case TGSI_FILE_TEMPORARY:
1045      for (i = 0; i < QUAD_SIZE; i++) {
1046         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1047         assert(index2D->i[i] == 0);
1048
1049         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1050      }
1051      break;
1052
1053   case TGSI_FILE_IMMEDIATE:
1054      for (i = 0; i < QUAD_SIZE; i++) {
1055         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1056         assert(index2D->i[i] == 0);
1057
1058         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1059      }
1060      break;
1061
1062   case TGSI_FILE_ADDRESS:
1063      for (i = 0; i < QUAD_SIZE; i++) {
1064         assert(index->i[i] >= 0);
1065         assert(index2D->i[i] == 0);
1066
1067         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1068      }
1069      break;
1070
1071   case TGSI_FILE_PREDICATE:
1072      for (i = 0; i < QUAD_SIZE; i++) {
1073         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
1074         assert(index2D->i[i] == 0);
1075
1076         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
1077      }
1078      break;
1079
1080   case TGSI_FILE_OUTPUT:
1081      /* vertex/fragment output vars can be read too */
1082      for (i = 0; i < QUAD_SIZE; i++) {
1083         assert(index->i[i] >= 0);
1084         assert(index2D->i[i] == 0);
1085
1086         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1087      }
1088      break;
1089
1090   default:
1091      assert(0);
1092      for (i = 0; i < QUAD_SIZE; i++) {
1093         chan->u[i] = 0;
1094      }
1095   }
1096}
1097
1098static void
1099fetch_source(const struct tgsi_exec_machine *mach,
1100             union tgsi_exec_channel *chan,
1101             const struct tgsi_full_src_register *reg,
1102             const uint chan_index,
1103             enum tgsi_exec_datatype src_datatype)
1104{
1105   union tgsi_exec_channel index;
1106   union tgsi_exec_channel index2D;
1107   uint swizzle;
1108
1109   /* We start with a direct index into a register file.
1110    *
1111    *    file[1],
1112    *    where:
1113    *       file = Register.File
1114    *       [1] = Register.Index
1115    */
1116   index.i[0] =
1117   index.i[1] =
1118   index.i[2] =
1119   index.i[3] = reg->Register.Index;
1120
1121   /* There is an extra source register that indirectly subscripts
1122    * a register file. The direct index now becomes an offset
1123    * that is being added to the indirect register.
1124    *
1125    *    file[ind[2].x+1],
1126    *    where:
1127    *       ind = Indirect.File
1128    *       [2] = Indirect.Index
1129    *       .x = Indirect.SwizzleX
1130    */
1131   if (reg->Register.Indirect) {
1132      union tgsi_exec_channel index2;
1133      union tgsi_exec_channel indir_index;
1134      const uint execmask = mach->ExecMask;
1135      uint i;
1136
1137      /* which address register (always zero now) */
1138      index2.i[0] =
1139      index2.i[1] =
1140      index2.i[2] =
1141      index2.i[3] = reg->Indirect.Index;
1142
1143      /* get current value of address register[swizzle] */
1144      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1145      fetch_src_file_channel(mach,
1146                             reg->Indirect.File,
1147                             swizzle,
1148                             &index2,
1149                             &ZeroVec,
1150                             &indir_index);
1151
1152      /* add value of address register to the offset */
1153      index.i[0] += indir_index.i[0];
1154      index.i[1] += indir_index.i[1];
1155      index.i[2] += indir_index.i[2];
1156      index.i[3] += indir_index.i[3];
1157
1158      /* for disabled execution channels, zero-out the index to
1159       * avoid using a potential garbage value.
1160       */
1161      for (i = 0; i < QUAD_SIZE; i++) {
1162         if ((execmask & (1 << i)) == 0)
1163            index.i[i] = 0;
1164      }
1165   }
1166
1167   /* There is an extra source register that is a second
1168    * subscript to a register file. Effectively it means that
1169    * the register file is actually a 2D array of registers.
1170    *
1171    *    file[3][1],
1172    *    where:
1173    *       [3] = Dimension.Index
1174    */
1175   if (reg->Register.Dimension) {
1176      index2D.i[0] =
1177      index2D.i[1] =
1178      index2D.i[2] =
1179      index2D.i[3] = reg->Dimension.Index;
1180
1181      /* Again, the second subscript index can be addressed indirectly
1182       * identically to the first one.
1183       * Nothing stops us from indirectly addressing the indirect register,
1184       * but there is no need for that, so we won't exercise it.
1185       *
1186       *    file[ind[4].y+3][1],
1187       *    where:
1188       *       ind = DimIndirect.File
1189       *       [4] = DimIndirect.Index
1190       *       .y = DimIndirect.SwizzleX
1191       */
1192      if (reg->Dimension.Indirect) {
1193         union tgsi_exec_channel index2;
1194         union tgsi_exec_channel indir_index;
1195         const uint execmask = mach->ExecMask;
1196         uint i;
1197
1198         index2.i[0] =
1199         index2.i[1] =
1200         index2.i[2] =
1201         index2.i[3] = reg->DimIndirect.Index;
1202
1203         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
1204         fetch_src_file_channel(mach,
1205                                reg->DimIndirect.File,
1206                                swizzle,
1207                                &index2,
1208                                &ZeroVec,
1209                                &indir_index);
1210
1211         index2D.i[0] += indir_index.i[0];
1212         index2D.i[1] += indir_index.i[1];
1213         index2D.i[2] += indir_index.i[2];
1214         index2D.i[3] += indir_index.i[3];
1215
1216         /* for disabled execution channels, zero-out the index to
1217          * avoid using a potential garbage value.
1218          */
1219         for (i = 0; i < QUAD_SIZE; i++) {
1220            if ((execmask & (1 << i)) == 0) {
1221               index2D.i[i] = 0;
1222            }
1223         }
1224      }
1225
1226      /* If by any chance there was a need for a 3D array of register
1227       * files, we would have to check whether Dimension is followed
1228       * by a dimension register and continue the saga.
1229       */
1230   } else {
1231      index2D.i[0] =
1232      index2D.i[1] =
1233      index2D.i[2] =
1234      index2D.i[3] = 0;
1235   }
1236
1237   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1238   fetch_src_file_channel(mach,
1239                          reg->Register.File,
1240                          swizzle,
1241                          &index,
1242                          &index2D,
1243                          chan);
1244
1245   if (reg->Register.Absolute) {
1246      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1247         micro_abs(chan, chan);
1248      } else {
1249         micro_iabs(chan, chan);
1250      }
1251   }
1252
1253   if (reg->Register.Negate) {
1254      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1255         micro_neg(chan, chan);
1256      } else {
1257         micro_ineg(chan, chan);
1258      }
1259   }
1260}
1261
1262static void
1263store_dest(struct tgsi_exec_machine *mach,
1264           const union tgsi_exec_channel *chan,
1265           const struct tgsi_full_dst_register *reg,
1266           const struct tgsi_full_instruction *inst,
1267           uint chan_index,
1268           enum tgsi_exec_datatype dst_datatype)
1269{
1270   uint i;
1271   union tgsi_exec_channel null;
1272   union tgsi_exec_channel *dst;
1273   uint execmask = mach->ExecMask;
1274   int offset = 0;  /* indirection offset */
1275   int index;
1276
1277   /* for debugging */
1278   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1279      check_inf_or_nan(chan);
1280   }
1281
1282   /* There is an extra source register that indirectly subscripts
1283    * a register file. The direct index now becomes an offset
1284    * that is being added to the indirect register.
1285    *
1286    *    file[ind[2].x+1],
1287    *    where:
1288    *       ind = Indirect.File
1289    *       [2] = Indirect.Index
1290    *       .x = Indirect.SwizzleX
1291    */
1292   if (reg->Register.Indirect) {
1293      union tgsi_exec_channel index;
1294      union tgsi_exec_channel indir_index;
1295      uint swizzle;
1296
1297      /* which address register (always zero for now) */
1298      index.i[0] =
1299      index.i[1] =
1300      index.i[2] =
1301      index.i[3] = reg->Indirect.Index;
1302
1303      /* get current value of address register[swizzle] */
1304      swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
1305
1306      /* fetch values from the address/indirection register */
1307      fetch_src_file_channel(mach,
1308                             reg->Indirect.File,
1309                             swizzle,
1310                             &index,
1311                             &ZeroVec,
1312                             &indir_index);
1313
1314      /* save indirection offset */
1315      offset = indir_index.i[0];
1316   }
1317
1318   switch (reg->Register.File) {
1319   case TGSI_FILE_NULL:
1320      dst = &null;
1321      break;
1322
1323   case TGSI_FILE_OUTPUT:
1324      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1325         + reg->Register.Index;
1326      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1327#if 0
1328      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
1329         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1330         for (i = 0; i < QUAD_SIZE; i++)
1331            if (execmask & (1 << i))
1332               fprintf(stderr, "%f, ", chan->f[i]);
1333         fprintf(stderr, ")\n");
1334      }
1335#endif
1336      break;
1337
1338   case TGSI_FILE_TEMPORARY:
1339      index = reg->Register.Index;
1340      assert( index < TGSI_EXEC_NUM_TEMPS );
1341      dst = &mach->Temps[offset + index].xyzw[chan_index];
1342      break;
1343
1344   case TGSI_FILE_ADDRESS:
1345      index = reg->Register.Index;
1346      dst = &mach->Addrs[index].xyzw[chan_index];
1347      break;
1348
1349   case TGSI_FILE_LOOP:
1350      assert(reg->Register.Index == 0);
1351      assert(mach->LoopCounterStackTop > 0);
1352      assert(chan_index == CHAN_X);
1353      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
1354      break;
1355
1356   case TGSI_FILE_PREDICATE:
1357      index = reg->Register.Index;
1358      assert(index < TGSI_EXEC_NUM_PREDS);
1359      dst = &mach->Predicates[index].xyzw[chan_index];
1360      break;
1361
1362   default:
1363      assert( 0 );
1364      return;
1365   }
1366
1367   if (inst->Instruction.Predicate) {
1368      uint swizzle;
1369      union tgsi_exec_channel *pred;
1370
1371      switch (chan_index) {
1372      case CHAN_X:
1373         swizzle = inst->Predicate.SwizzleX;
1374         break;
1375      case CHAN_Y:
1376         swizzle = inst->Predicate.SwizzleY;
1377         break;
1378      case CHAN_Z:
1379         swizzle = inst->Predicate.SwizzleZ;
1380         break;
1381      case CHAN_W:
1382         swizzle = inst->Predicate.SwizzleW;
1383         break;
1384      default:
1385         assert(0);
1386         return;
1387      }
1388
1389      assert(inst->Predicate.Index == 0);
1390
1391      pred = &mach->Predicates[inst->Predicate.Index].xyzw[swizzle];
1392
1393      if (inst->Predicate.Negate) {
1394         for (i = 0; i < QUAD_SIZE; i++) {
1395            if (pred->u[i]) {
1396               execmask &= ~(1 << i);
1397            }
1398         }
1399      } else {
1400         for (i = 0; i < QUAD_SIZE; i++) {
1401            if (!pred->u[i]) {
1402               execmask &= ~(1 << i);
1403            }
1404         }
1405      }
1406   }
1407
1408   switch (inst->Instruction.Saturate) {
1409   case TGSI_SAT_NONE:
1410      for (i = 0; i < QUAD_SIZE; i++)
1411         if (execmask & (1 << i))
1412            dst->i[i] = chan->i[i];
1413      break;
1414
1415   case TGSI_SAT_ZERO_ONE:
1416      for (i = 0; i < QUAD_SIZE; i++)
1417         if (execmask & (1 << i)) {
1418            if (chan->f[i] < 0.0f)
1419               dst->f[i] = 0.0f;
1420            else if (chan->f[i] > 1.0f)
1421               dst->f[i] = 1.0f;
1422            else
1423               dst->i[i] = chan->i[i];
1424         }
1425      break;
1426
1427   case TGSI_SAT_MINUS_PLUS_ONE:
1428      for (i = 0; i < QUAD_SIZE; i++)
1429         if (execmask & (1 << i)) {
1430            if (chan->f[i] < -1.0f)
1431               dst->f[i] = -1.0f;
1432            else if (chan->f[i] > 1.0f)
1433               dst->f[i] = 1.0f;
1434            else
1435               dst->i[i] = chan->i[i];
1436         }
1437      break;
1438
1439   default:
1440      assert( 0 );
1441   }
1442}
1443
/*
 * Shorthands for float-typed operand access.  Both expect variables named
 * `mach` and `inst` to be in scope where they are used.
 *
 * FETCH: read channel CHAN of source operand INDEX into VAL.
 */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* STORE: write VAL to channel CHAN of destination operand INDEX. */
#define STORE(VAL, INDEX, CHAN) \
   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
1449
1450
1451/**
1452 * Execute ARB-style KIL which is predicated by a src register.
1453 * Kill fragment if any of the four values is less than zero.
1454 */
1455static void
1456exec_kil(struct tgsi_exec_machine *mach,
1457         const struct tgsi_full_instruction *inst)
1458{
1459   uint uniquemask;
1460   uint chan_index;
1461   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1462   union tgsi_exec_channel r[1];
1463
1464   /* This mask stores component bits that were already tested. */
1465   uniquemask = 0;
1466
1467   for (chan_index = 0; chan_index < 4; chan_index++)
1468   {
1469      uint swizzle;
1470      uint i;
1471
1472      /* unswizzle channel */
1473      swizzle = tgsi_util_get_full_src_register_swizzle (
1474                        &inst->Src[0],
1475                        chan_index);
1476
1477      /* check if the component has not been already tested */
1478      if (uniquemask & (1 << swizzle))
1479         continue;
1480      uniquemask |= 1 << swizzle;
1481
1482      FETCH(&r[0], 0, chan_index);
1483      for (i = 0; i < 4; i++)
1484         if (r[0].f[i] < 0.0f)
1485            kilmask |= 1 << i;
1486   }
1487
1488   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1489}
1490
1491/**
1492 * Execute NVIDIA-style KIL which is predicated by a condition code.
1493 * Kill fragment if the condition code is TRUE.
1494 */
1495static void
1496exec_kilp(struct tgsi_exec_machine *mach,
1497          const struct tgsi_full_instruction *inst)
1498{
1499   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1500
1501   /* "unconditional" kil */
1502   kilmask = mach->ExecMask;
1503   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1504}
1505
1506static void
1507emit_vertex(struct tgsi_exec_machine *mach)
1508{
1509   /* FIXME: check for exec mask correctly
1510   unsigned i;
1511   for (i = 0; i < QUAD_SIZE; ++i) {
1512         if ((mach->ExecMask & (1 << i)))
1513   */
1514   if (mach->ExecMask) {
1515      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
1516      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
1517   }
1518}
1519
1520static void
1521emit_primitive(struct tgsi_exec_machine *mach)
1522{
1523   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
1524   /* FIXME: check for exec mask correctly
1525   unsigned i;
1526   for (i = 0; i < QUAD_SIZE; ++i) {
1527         if ((mach->ExecMask & (1 << i)))
1528   */
1529   if (mach->ExecMask) {
1530      ++(*prim_count);
1531      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
1532      mach->Primitives[*prim_count] = 0;
1533   }
1534}
1535
1536/*
1537 * Fetch four texture samples using STR texture coordinates.
1538 */
1539static void
1540fetch_texel( struct tgsi_sampler *sampler,
1541             const union tgsi_exec_channel *s,
1542             const union tgsi_exec_channel *t,
1543             const union tgsi_exec_channel *p,
1544             const union tgsi_exec_channel *c0,
1545             enum tgsi_sampler_control control,
1546             union tgsi_exec_channel *r,
1547             union tgsi_exec_channel *g,
1548             union tgsi_exec_channel *b,
1549             union tgsi_exec_channel *a )
1550{
1551   uint j;
1552   float rgba[NUM_CHANNELS][QUAD_SIZE];
1553
1554   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
1555
1556   for (j = 0; j < 4; j++) {
1557      r->f[j] = rgba[0][j];
1558      g->f[j] = rgba[1][j];
1559      b->f[j] = rgba[2][j];
1560      a->f[j] = rgba[3][j];
1561   }
1562}
1563
1564
/*
 * Variants of the texture instruction handled by exec_tex(); they decide
 * how the W component of the coordinate operand is used.
 */
#define TEX_MODIFIER_NONE         0  /* W is not fetched */
#define TEX_MODIFIER_PROJECTED    1  /* coordinates are divided by W */
#define TEX_MODIFIER_LOD_BIAS     2  /* W is passed to the sampler as lod bias */
#define TEX_MODIFIER_EXPLICIT_LOD 3  /* W is passed to the sampler as explicit lod */
1569
1570
1571static void
1572exec_tex(struct tgsi_exec_machine *mach,
1573         const struct tgsi_full_instruction *inst,
1574         uint modifier)
1575{
1576   const uint unit = inst->Src[1].Register.Index;
1577   union tgsi_exec_channel r[4];
1578   const union tgsi_exec_channel *lod = &ZeroVec;
1579   enum tgsi_sampler_control control;
1580   uint chan_index;
1581
1582   if (modifier != TEX_MODIFIER_NONE) {
1583      FETCH(&r[3], 0, CHAN_W);
1584      if (modifier != TEX_MODIFIER_PROJECTED) {
1585         lod = &r[3];
1586      }
1587   }
1588
1589   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
1590      control = tgsi_sampler_lod_explicit;
1591   } else {
1592      control = tgsi_sampler_lod_bias;
1593   }
1594
1595   switch (inst->Texture.Texture) {
1596   case TGSI_TEXTURE_1D:
1597   case TGSI_TEXTURE_SHADOW1D:
1598      FETCH(&r[0], 0, CHAN_X);
1599
1600      if (modifier == TEX_MODIFIER_PROJECTED) {
1601         micro_div(&r[0], &r[0], &r[3]);
1602      }
1603
1604      fetch_texel(mach->Samplers[unit],
1605                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
1606                  control,
1607                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
1608      break;
1609
1610   case TGSI_TEXTURE_2D:
1611   case TGSI_TEXTURE_RECT:
1612   case TGSI_TEXTURE_SHADOW2D:
1613   case TGSI_TEXTURE_SHADOWRECT:
1614      FETCH(&r[0], 0, CHAN_X);
1615      FETCH(&r[1], 0, CHAN_Y);
1616      FETCH(&r[2], 0, CHAN_Z);
1617
1618      if (modifier == TEX_MODIFIER_PROJECTED) {
1619         micro_div(&r[0], &r[0], &r[3]);
1620         micro_div(&r[1], &r[1], &r[3]);
1621         micro_div(&r[2], &r[2], &r[3]);
1622      }
1623
1624      fetch_texel(mach->Samplers[unit],
1625                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
1626                  control,
1627                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1628      break;
1629
1630   case TGSI_TEXTURE_3D:
1631   case TGSI_TEXTURE_CUBE:
1632      FETCH(&r[0], 0, CHAN_X);
1633      FETCH(&r[1], 0, CHAN_Y);
1634      FETCH(&r[2], 0, CHAN_Z);
1635
1636      if (modifier == TEX_MODIFIER_PROJECTED) {
1637         micro_div(&r[0], &r[0], &r[3]);
1638         micro_div(&r[1], &r[1], &r[3]);
1639         micro_div(&r[2], &r[2], &r[3]);
1640      }
1641
1642      fetch_texel(mach->Samplers[unit],
1643                  &r[0], &r[1], &r[2], lod,
1644                  control,
1645                  &r[0], &r[1], &r[2], &r[3]);
1646      break;
1647
1648   default:
1649      assert(0);
1650   }
1651
1652   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1653      STORE(&r[chan_index], 0, chan_index);
1654   }
1655}
1656
1657static void
1658exec_txd(struct tgsi_exec_machine *mach,
1659         const struct tgsi_full_instruction *inst)
1660{
1661   const uint unit = inst->Src[3].Register.Index;
1662   union tgsi_exec_channel r[4];
1663   uint chan_index;
1664
1665   /*
1666    * XXX: This is fake TXD -- the derivatives are not taken into account, yet.
1667    */
1668
1669   switch (inst->Texture.Texture) {
1670   case TGSI_TEXTURE_1D:
1671   case TGSI_TEXTURE_SHADOW1D:
1672
1673      FETCH(&r[0], 0, CHAN_X);
1674
1675      fetch_texel(mach->Samplers[unit],
1676                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
1677                  tgsi_sampler_lod_bias,
1678                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
1679      break;
1680
1681   case TGSI_TEXTURE_2D:
1682   case TGSI_TEXTURE_RECT:
1683   case TGSI_TEXTURE_SHADOW2D:
1684   case TGSI_TEXTURE_SHADOWRECT:
1685
1686      FETCH(&r[0], 0, CHAN_X);
1687      FETCH(&r[1], 0, CHAN_Y);
1688      FETCH(&r[2], 0, CHAN_Z);
1689
1690      fetch_texel(mach->Samplers[unit],
1691                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
1692                  tgsi_sampler_lod_bias,
1693                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
1694      break;
1695
1696   case TGSI_TEXTURE_3D:
1697   case TGSI_TEXTURE_CUBE:
1698
1699      FETCH(&r[0], 0, CHAN_X);
1700      FETCH(&r[1], 0, CHAN_Y);
1701      FETCH(&r[2], 0, CHAN_Z);
1702
1703      fetch_texel(mach->Samplers[unit],
1704                  &r[0], &r[1], &r[2], &ZeroVec,
1705                  tgsi_sampler_lod_bias,
1706                  &r[0], &r[1], &r[2], &r[3]);
1707      break;
1708
1709   default:
1710      assert(0);
1711   }
1712
1713   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
1714      STORE(&r[chan_index], 0, chan_index);
1715   }
1716}
1717
1718
1719/**
1720 * Evaluate a constant-valued coefficient at the position of the
1721 * current quad.
1722 */
1723static void
1724eval_constant_coef(
1725   struct tgsi_exec_machine *mach,
1726   unsigned attrib,
1727   unsigned chan )
1728{
1729   unsigned i;
1730
1731   for( i = 0; i < QUAD_SIZE; i++ ) {
1732      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1733   }
1734}
1735
1736/**
1737 * Evaluate a linear-valued coefficient at the position of the
1738 * current quad.
1739 */
1740static void
1741eval_linear_coef(
1742   struct tgsi_exec_machine *mach,
1743   unsigned attrib,
1744   unsigned chan )
1745{
1746   const float x = mach->QuadPos.xyzw[0].f[0];
1747   const float y = mach->QuadPos.xyzw[1].f[0];
1748   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1749   const float dady = mach->InterpCoefs[attrib].dady[chan];
1750   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1751   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1752   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1753   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1754   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1755}
1756
1757/**
1758 * Evaluate a perspective-valued coefficient at the position of the
1759 * current quad.
1760 */
1761static void
1762eval_perspective_coef(
1763   struct tgsi_exec_machine *mach,
1764   unsigned attrib,
1765   unsigned chan )
1766{
1767   const float x = mach->QuadPos.xyzw[0].f[0];
1768   const float y = mach->QuadPos.xyzw[1].f[0];
1769   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1770   const float dady = mach->InterpCoefs[attrib].dady[chan];
1771   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1772   const float *w = mach->QuadPos.xyzw[3].f;
1773   /* divide by W here */
1774   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1775   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1776   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1777   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1778}
1779
1780
/**
 * Signature shared by the eval_x_coef() interpolators: fill channel
 * `chan` of input attribute `attrib` for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
1785
1786static void
1787exec_declaration(struct tgsi_exec_machine *mach,
1788                 const struct tgsi_full_declaration *decl)
1789{
1790   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
1791      if (decl->Declaration.File == TGSI_FILE_INPUT ||
1792          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1793         uint first, last, mask;
1794
1795         first = decl->Range.First;
1796         last = decl->Range.Last;
1797         mask = decl->Declaration.UsageMask;
1798
1799         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
1800            uint i;
1801
1802            assert(decl->Semantic.Index == 0);
1803            assert(first == last);
1804
1805            for (i = 0; i < QUAD_SIZE; i++) {
1806               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
1807            }
1808         } else {
1809            eval_coef_func eval;
1810            uint i, j;
1811
1812            switch (decl->Declaration.Interpolate) {
1813            case TGSI_INTERPOLATE_CONSTANT:
1814               eval = eval_constant_coef;
1815               break;
1816
1817            case TGSI_INTERPOLATE_LINEAR:
1818               eval = eval_linear_coef;
1819               break;
1820
1821            case TGSI_INTERPOLATE_PERSPECTIVE:
1822               eval = eval_perspective_coef;
1823               break;
1824
1825            default:
1826               assert(0);
1827               return;
1828            }
1829
1830            for (j = 0; j < NUM_CHANNELS; j++) {
1831               if (mask & (1 << j)) {
1832                  for (i = first; i <= last; i++) {
1833                     eval(mach, i, j);
1834                  }
1835               }
1836            }
1837         }
1838      }
1839   }
1840}
1841
/** Per-channel operation taking one source: dst = op(src). */
typedef void (*micro_unary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src);
1844
1845static void
1846exec_scalar_unary(struct tgsi_exec_machine *mach,
1847                  const struct tgsi_full_instruction *inst,
1848                  micro_unary_op op,
1849                  enum tgsi_exec_datatype dst_datatype,
1850                  enum tgsi_exec_datatype src_datatype)
1851{
1852   unsigned int chan;
1853   union tgsi_exec_channel src;
1854   union tgsi_exec_channel dst;
1855
1856   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
1857   op(&dst, &src);
1858   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1859      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1860         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
1861      }
1862   }
1863}
1864
1865static void
1866exec_vector_unary(struct tgsi_exec_machine *mach,
1867                  const struct tgsi_full_instruction *inst,
1868                  micro_unary_op op,
1869                  enum tgsi_exec_datatype dst_datatype,
1870                  enum tgsi_exec_datatype src_datatype)
1871{
1872   unsigned int chan;
1873   struct tgsi_exec_vector dst;
1874
1875   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1876      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1877         union tgsi_exec_channel src;
1878
1879         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
1880         op(&dst.xyzw[chan], &src);
1881      }
1882   }
1883   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1884      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1885         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1886      }
1887   }
1888}
1889
/** Per-channel operation taking two sources: dst = op(src0, src1). */
typedef void (*micro_binary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1);
1893
1894static void
1895exec_vector_binary(struct tgsi_exec_machine *mach,
1896                   const struct tgsi_full_instruction *inst,
1897                   micro_binary_op op,
1898                   enum tgsi_exec_datatype dst_datatype,
1899                   enum tgsi_exec_datatype src_datatype)
1900{
1901   unsigned int chan;
1902   struct tgsi_exec_vector dst;
1903
1904   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1905      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1906         union tgsi_exec_channel src[2];
1907
1908         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1909         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1910         op(&dst.xyzw[chan], &src[0], &src[1]);
1911      }
1912   }
1913   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1914      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1915         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1916      }
1917   }
1918}
1919
/** Per-channel operation taking three sources: dst = op(src0, src1, src2). */
typedef void (*micro_trinary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2);
1924
1925static void
1926exec_vector_trinary(struct tgsi_exec_machine *mach,
1927                    const struct tgsi_full_instruction *inst,
1928                    micro_trinary_op op,
1929                    enum tgsi_exec_datatype dst_datatype,
1930                    enum tgsi_exec_datatype src_datatype)
1931{
1932   unsigned int chan;
1933   struct tgsi_exec_vector dst;
1934
1935   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1936      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1937         union tgsi_exec_channel src[3];
1938
1939         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
1940         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
1941         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
1942         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
1943      }
1944   }
1945   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1946      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1947         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
1948      }
1949   }
1950}
1951
1952static void
1953exec_dp3(struct tgsi_exec_machine *mach,
1954         const struct tgsi_full_instruction *inst)
1955{
1956   unsigned int chan;
1957   union tgsi_exec_channel arg[3];
1958
1959   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1960   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1961   micro_mul(&arg[2], &arg[0], &arg[1]);
1962
1963   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
1964      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1965      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1966      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
1967   }
1968
1969   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1970      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1971         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1972      }
1973   }
1974}
1975
1976static void
1977exec_dp4(struct tgsi_exec_machine *mach,
1978         const struct tgsi_full_instruction *inst)
1979{
1980   unsigned int chan;
1981   union tgsi_exec_channel arg[3];
1982
1983   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1984   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
1985   micro_mul(&arg[2], &arg[0], &arg[1]);
1986
1987   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
1988      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
1989      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
1990      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
1991   }
1992
1993   for (chan = 0; chan < NUM_CHANNELS; chan++) {
1994      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1995         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
1996      }
1997   }
1998}
1999
2000static void
2001exec_dp2a(struct tgsi_exec_machine *mach,
2002          const struct tgsi_full_instruction *inst)
2003{
2004   unsigned int chan;
2005   union tgsi_exec_channel arg[3];
2006
2007   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2008   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2009   micro_mul(&arg[2], &arg[0], &arg[1]);
2010
2011   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2012   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2013   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2014
2015   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2016   micro_add(&arg[0], &arg[0], &arg[1]);
2017
2018   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2019      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2020         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2021      }
2022   }
2023}
2024
2025static void
2026exec_dph(struct tgsi_exec_machine *mach,
2027         const struct tgsi_full_instruction *inst)
2028{
2029   unsigned int chan;
2030   union tgsi_exec_channel arg[3];
2031
2032   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2033   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2034   micro_mul(&arg[2], &arg[0], &arg[1]);
2035
2036   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2037   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2038   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2039
2040   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2041   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
2042   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);
2043
2044   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
2045   micro_add(&arg[0], &arg[0], &arg[1]);
2046
2047   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2048      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2049         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2050      }
2051   }
2052}
2053
2054static void
2055exec_dp2(struct tgsi_exec_machine *mach,
2056         const struct tgsi_full_instruction *inst)
2057{
2058   unsigned int chan;
2059   union tgsi_exec_channel arg[3];
2060
2061   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2062   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2063   micro_mul(&arg[2], &arg[0], &arg[1]);
2064
2065   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2066   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
2067   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
2068
2069   for (chan = 0; chan < NUM_CHANNELS; chan++) {
2070      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2071         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2072      }
2073   }
2074}
2075
2076static void
2077exec_nrm4(struct tgsi_exec_machine *mach,
2078          const struct tgsi_full_instruction *inst)
2079{
2080   unsigned int chan;
2081   union tgsi_exec_channel arg[4];
2082   union tgsi_exec_channel scale;
2083
2084   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2085   micro_mul(&scale, &arg[0], &arg[0]);
2086
2087   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
2088      union tgsi_exec_channel product;
2089
2090      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2091      micro_mul(&product, &arg[chan], &arg[chan]);
2092      micro_add(&scale, &scale, &product);
2093   }
2094
2095   micro_rsq(&scale, &scale);
2096
2097   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
2098      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2099         micro_mul(&arg[chan], &arg[chan], &scale);
2100         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2101      }
2102   }
2103}
2104
2105static void
2106exec_nrm3(struct tgsi_exec_machine *mach,
2107          const struct tgsi_full_instruction *inst)
2108{
2109   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {
2110      unsigned int chan;
2111      union tgsi_exec_channel arg[3];
2112      union tgsi_exec_channel scale;
2113
2114      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
2115      micro_mul(&scale, &arg[0], &arg[0]);
2116
2117      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
2118         union tgsi_exec_channel product;
2119
2120         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
2121         micro_mul(&product, &arg[chan], &arg[chan]);
2122         micro_add(&scale, &scale, &product);
2123      }
2124
2125      micro_rsq(&scale, &scale);
2126
2127      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
2128         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2129            micro_mul(&arg[chan], &arg[chan], &scale);
2130            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2131         }
2132      }
2133   }
2134
2135   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2136      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
2137   }
2138}
2139
2140static void
2141exec_break(struct tgsi_exec_machine *mach)
2142{
2143   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
2144      /* turn off loop channels for each enabled exec channel */
2145      mach->LoopMask &= ~mach->ExecMask;
2146      /* Todo: if mach->LoopMask == 0, jump to end of loop */
2147      UPDATE_EXEC_MASK(mach);
2148   } else {
2149      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
2150
2151      mach->Switch.mask = 0x0;
2152
2153      UPDATE_EXEC_MASK(mach);
2154   }
2155}
2156
2157static void
2158exec_switch(struct tgsi_exec_machine *mach,
2159            const struct tgsi_full_instruction *inst)
2160{
2161   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
2162   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
2163
2164   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
2165   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2166   mach->Switch.mask = 0x0;
2167   mach->Switch.defaultMask = 0x0;
2168
2169   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
2170   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
2171
2172   UPDATE_EXEC_MASK(mach);
2173}
2174
2175static void
2176exec_case(struct tgsi_exec_machine *mach,
2177          const struct tgsi_full_instruction *inst)
2178{
2179   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2180   union tgsi_exec_channel src;
2181   uint mask = 0;
2182
2183   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
2184
2185   if (mach->Switch.selector.u[0] == src.u[0]) {
2186      mask |= 0x1;
2187   }
2188   if (mach->Switch.selector.u[1] == src.u[1]) {
2189      mask |= 0x2;
2190   }
2191   if (mach->Switch.selector.u[2] == src.u[2]) {
2192      mask |= 0x4;
2193   }
2194   if (mach->Switch.selector.u[3] == src.u[3]) {
2195      mask |= 0x8;
2196   }
2197
2198   mach->Switch.defaultMask |= mask;
2199
2200   mach->Switch.mask |= mask & prevMask;
2201
2202   UPDATE_EXEC_MASK(mach);
2203}
2204
2205static void
2206exec_default(struct tgsi_exec_machine *mach)
2207{
2208   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
2209
2210   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
2211
2212   UPDATE_EXEC_MASK(mach);
2213}
2214
2215static void
2216exec_endswitch(struct tgsi_exec_machine *mach)
2217{
2218   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
2219   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
2220
2221   UPDATE_EXEC_MASK(mach);
2222}
2223
2224static void
2225micro_i2f(union tgsi_exec_channel *dst,
2226          const union tgsi_exec_channel *src)
2227{
2228   dst->f[0] = (float)src->i[0];
2229   dst->f[1] = (float)src->i[1];
2230   dst->f[2] = (float)src->i[2];
2231   dst->f[3] = (float)src->i[3];
2232}
2233
2234static void
2235micro_not(union tgsi_exec_channel *dst,
2236          const union tgsi_exec_channel *src)
2237{
2238   dst->u[0] = ~src->u[0];
2239   dst->u[1] = ~src->u[1];
2240   dst->u[2] = ~src->u[2];
2241   dst->u[3] = ~src->u[3];
2242}
2243
2244static void
2245micro_shl(union tgsi_exec_channel *dst,
2246          const union tgsi_exec_channel *src0,
2247          const union tgsi_exec_channel *src1)
2248{
2249   dst->u[0] = src0->u[0] << src1->u[0];
2250   dst->u[1] = src0->u[1] << src1->u[1];
2251   dst->u[2] = src0->u[2] << src1->u[2];
2252   dst->u[3] = src0->u[3] << src1->u[3];
2253}
2254
2255static void
2256micro_and(union tgsi_exec_channel *dst,
2257          const union tgsi_exec_channel *src0,
2258          const union tgsi_exec_channel *src1)
2259{
2260   dst->u[0] = src0->u[0] & src1->u[0];
2261   dst->u[1] = src0->u[1] & src1->u[1];
2262   dst->u[2] = src0->u[2] & src1->u[2];
2263   dst->u[3] = src0->u[3] & src1->u[3];
2264}
2265
2266static void
2267micro_or(union tgsi_exec_channel *dst,
2268         const union tgsi_exec_channel *src0,
2269         const union tgsi_exec_channel *src1)
2270{
2271   dst->u[0] = src0->u[0] | src1->u[0];
2272   dst->u[1] = src0->u[1] | src1->u[1];
2273   dst->u[2] = src0->u[2] | src1->u[2];
2274   dst->u[3] = src0->u[3] | src1->u[3];
2275}
2276
2277static void
2278micro_xor(union tgsi_exec_channel *dst,
2279          const union tgsi_exec_channel *src0,
2280          const union tgsi_exec_channel *src1)
2281{
2282   dst->u[0] = src0->u[0] ^ src1->u[0];
2283   dst->u[1] = src0->u[1] ^ src1->u[1];
2284   dst->u[2] = src0->u[2] ^ src1->u[2];
2285   dst->u[3] = src0->u[3] ^ src1->u[3];
2286}
2287
2288static void
2289micro_f2i(union tgsi_exec_channel *dst,
2290          const union tgsi_exec_channel *src)
2291{
2292   dst->i[0] = (int)src->f[0];
2293   dst->i[1] = (int)src->f[1];
2294   dst->i[2] = (int)src->f[2];
2295   dst->i[3] = (int)src->f[3];
2296}
2297
2298static void
2299micro_idiv(union tgsi_exec_channel *dst,
2300           const union tgsi_exec_channel *src0,
2301           const union tgsi_exec_channel *src1)
2302{
2303   dst->i[0] = src0->i[0] / src1->i[0];
2304   dst->i[1] = src0->i[1] / src1->i[1];
2305   dst->i[2] = src0->i[2] / src1->i[2];
2306   dst->i[3] = src0->i[3] / src1->i[3];
2307}
2308
2309static void
2310micro_imax(union tgsi_exec_channel *dst,
2311           const union tgsi_exec_channel *src0,
2312           const union tgsi_exec_channel *src1)
2313{
2314   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
2315   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
2316   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
2317   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
2318}
2319
2320static void
2321micro_imin(union tgsi_exec_channel *dst,
2322           const union tgsi_exec_channel *src0,
2323           const union tgsi_exec_channel *src1)
2324{
2325   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
2326   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
2327   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
2328   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
2329}
2330
2331static void
2332micro_isge(union tgsi_exec_channel *dst,
2333           const union tgsi_exec_channel *src0,
2334           const union tgsi_exec_channel *src1)
2335{
2336   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
2337   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
2338   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
2339   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
2340}
2341
2342static void
2343micro_ishr(union tgsi_exec_channel *dst,
2344           const union tgsi_exec_channel *src0,
2345           const union tgsi_exec_channel *src1)
2346{
2347   dst->i[0] = src0->i[0] >> src1->i[0];
2348   dst->i[1] = src0->i[1] >> src1->i[1];
2349   dst->i[2] = src0->i[2] >> src1->i[2];
2350   dst->i[3] = src0->i[3] >> src1->i[3];
2351}
2352
2353static void
2354micro_islt(union tgsi_exec_channel *dst,
2355           const union tgsi_exec_channel *src0,
2356           const union tgsi_exec_channel *src1)
2357{
2358   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
2359   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
2360   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
2361   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
2362}
2363
2364static void
2365micro_f2u(union tgsi_exec_channel *dst,
2366          const union tgsi_exec_channel *src)
2367{
2368   dst->u[0] = (uint)src->f[0];
2369   dst->u[1] = (uint)src->f[1];
2370   dst->u[2] = (uint)src->f[2];
2371   dst->u[3] = (uint)src->f[3];
2372}
2373
2374static void
2375micro_u2f(union tgsi_exec_channel *dst,
2376          const union tgsi_exec_channel *src)
2377{
2378   dst->f[0] = (float)src->u[0];
2379   dst->f[1] = (float)src->u[1];
2380   dst->f[2] = (float)src->u[2];
2381   dst->f[3] = (float)src->u[3];
2382}
2383
2384static void
2385micro_uadd(union tgsi_exec_channel *dst,
2386           const union tgsi_exec_channel *src0,
2387           const union tgsi_exec_channel *src1)
2388{
2389   dst->u[0] = src0->u[0] + src1->u[0];
2390   dst->u[1] = src0->u[1] + src1->u[1];
2391   dst->u[2] = src0->u[2] + src1->u[2];
2392   dst->u[3] = src0->u[3] + src1->u[3];
2393}
2394
2395static void
2396micro_udiv(union tgsi_exec_channel *dst,
2397           const union tgsi_exec_channel *src0,
2398           const union tgsi_exec_channel *src1)
2399{
2400   dst->u[0] = src0->u[0] / src1->u[0];
2401   dst->u[1] = src0->u[1] / src1->u[1];
2402   dst->u[2] = src0->u[2] / src1->u[2];
2403   dst->u[3] = src0->u[3] / src1->u[3];
2404}
2405
2406static void
2407micro_umad(union tgsi_exec_channel *dst,
2408           const union tgsi_exec_channel *src0,
2409           const union tgsi_exec_channel *src1,
2410           const union tgsi_exec_channel *src2)
2411{
2412   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
2413   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
2414   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
2415   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
2416}
2417
2418static void
2419micro_umax(union tgsi_exec_channel *dst,
2420           const union tgsi_exec_channel *src0,
2421           const union tgsi_exec_channel *src1)
2422{
2423   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
2424   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
2425   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
2426   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
2427}
2428
2429static void
2430micro_umin(union tgsi_exec_channel *dst,
2431           const union tgsi_exec_channel *src0,
2432           const union tgsi_exec_channel *src1)
2433{
2434   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
2435   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
2436   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
2437   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
2438}
2439
2440static void
2441micro_umod(union tgsi_exec_channel *dst,
2442           const union tgsi_exec_channel *src0,
2443           const union tgsi_exec_channel *src1)
2444{
2445   dst->u[0] = src0->u[0] % src1->u[0];
2446   dst->u[1] = src0->u[1] % src1->u[1];
2447   dst->u[2] = src0->u[2] % src1->u[2];
2448   dst->u[3] = src0->u[3] % src1->u[3];
2449}
2450
2451static void
2452micro_umul(union tgsi_exec_channel *dst,
2453           const union tgsi_exec_channel *src0,
2454           const union tgsi_exec_channel *src1)
2455{
2456   dst->u[0] = src0->u[0] * src1->u[0];
2457   dst->u[1] = src0->u[1] * src1->u[1];
2458   dst->u[2] = src0->u[2] * src1->u[2];
2459   dst->u[3] = src0->u[3] * src1->u[3];
2460}
2461
2462static void
2463micro_useq(union tgsi_exec_channel *dst,
2464           const union tgsi_exec_channel *src0,
2465           const union tgsi_exec_channel *src1)
2466{
2467   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
2468   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
2469   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
2470   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
2471}
2472
2473static void
2474micro_usge(union tgsi_exec_channel *dst,
2475           const union tgsi_exec_channel *src0,
2476           const union tgsi_exec_channel *src1)
2477{
2478   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
2479   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
2480   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
2481   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
2482}
2483
2484static void
2485micro_ushr(union tgsi_exec_channel *dst,
2486           const union tgsi_exec_channel *src0,
2487           const union tgsi_exec_channel *src1)
2488{
2489   dst->u[0] = src0->u[0] >> src1->u[0];
2490   dst->u[1] = src0->u[1] >> src1->u[1];
2491   dst->u[2] = src0->u[2] >> src1->u[2];
2492   dst->u[3] = src0->u[3] >> src1->u[3];
2493}
2494
2495static void
2496micro_uslt(union tgsi_exec_channel *dst,
2497           const union tgsi_exec_channel *src0,
2498           const union tgsi_exec_channel *src1)
2499{
2500   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
2501   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
2502   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
2503   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
2504}
2505
2506static void
2507micro_usne(union tgsi_exec_channel *dst,
2508           const union tgsi_exec_channel *src0,
2509           const union tgsi_exec_channel *src1)
2510{
2511   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
2512   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
2513   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
2514   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
2515}
2516
2517static void
2518exec_instruction(
2519   struct tgsi_exec_machine *mach,
2520   const struct tgsi_full_instruction *inst,
2521   int *pc )
2522{
2523   uint chan_index;
2524   union tgsi_exec_channel r[10];
2525   union tgsi_exec_channel d[8];
2526
2527   (*pc)++;
2528
2529   switch (inst->Instruction.Opcode) {
2530   case TGSI_OPCODE_ARL:
2531      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
2532      break;
2533
2534   case TGSI_OPCODE_MOV:
2535      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
2536      break;
2537
2538   case TGSI_OPCODE_LIT:
2539      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2540         FETCH( &r[0], 0, CHAN_X );
2541         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2542            micro_max(&d[CHAN_Y], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2543         }
2544
2545         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2546            FETCH( &r[1], 0, CHAN_Y );
2547            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2548
2549            FETCH( &r[2], 0, CHAN_W );
2550            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
2551            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
2552            micro_pow( &r[1], &r[1], &r[2] );
2553            micro_lt(&d[CHAN_Z], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
2554         }
2555
2556         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2557            STORE(&d[CHAN_Y], 0, CHAN_Y);
2558         }
2559         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2560            STORE(&d[CHAN_Z], 0, CHAN_Z);
2561         }
2562      }
2563      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2564         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2565      }
2566      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2567         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2568      }
2569      break;
2570
2571   case TGSI_OPCODE_RCP:
2572      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2573      break;
2574
2575   case TGSI_OPCODE_RSQ:
2576      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2577      break;
2578
2579   case TGSI_OPCODE_EXP:
2580      FETCH( &r[0], 0, CHAN_X );
2581      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
2582      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2583         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
2584         STORE( &r[2], 0, CHAN_X );        /* store r2 */
2585      }
2586      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2587         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
2588         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
2589      }
2590      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2591         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
2592         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
2593      }
2594      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2595         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2596      }
2597      break;
2598
2599   case TGSI_OPCODE_LOG:
2600      FETCH( &r[0], 0, CHAN_X );
2601      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
2602      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
2603      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
2604      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2605         STORE( &r[0], 0, CHAN_X );
2606      }
2607      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2608         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
2609         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
2610         STORE( &r[0], 0, CHAN_Y );
2611      }
2612      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2613         STORE( &r[1], 0, CHAN_Z );
2614      }
2615      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2616         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2617      }
2618      break;
2619
2620   case TGSI_OPCODE_MUL:
2621      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2622      break;
2623
2624   case TGSI_OPCODE_ADD:
2625      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2626      break;
2627
2628   case TGSI_OPCODE_DP3:
2629      exec_dp3(mach, inst);
2630      break;
2631
2632   case TGSI_OPCODE_DP4:
2633      exec_dp4(mach, inst);
2634      break;
2635
2636   case TGSI_OPCODE_DST:
2637      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2638         FETCH( &r[0], 0, CHAN_Y );
2639         FETCH( &r[1], 1, CHAN_Y);
2640         micro_mul(&d[CHAN_Y], &r[0], &r[1]);
2641      }
2642      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2643         FETCH(&d[CHAN_Z], 0, CHAN_Z);
2644      }
2645      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2646         FETCH(&d[CHAN_W], 1, CHAN_W);
2647      }
2648
2649      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2650         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X);
2651      }
2652      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2653         STORE(&d[CHAN_Y], 0, CHAN_Y);
2654      }
2655      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2656         STORE(&d[CHAN_Z], 0, CHAN_Z);
2657      }
2658      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2659         STORE(&d[CHAN_W], 0, CHAN_W);
2660      }
2661      break;
2662
2663   case TGSI_OPCODE_MIN:
2664      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2665      break;
2666
2667   case TGSI_OPCODE_MAX:
2668      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2669      break;
2670
2671   case TGSI_OPCODE_SLT:
2672      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2673      break;
2674
2675   case TGSI_OPCODE_SGE:
2676      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2677      break;
2678
2679   case TGSI_OPCODE_MAD:
2680      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2681      break;
2682
2683   case TGSI_OPCODE_SUB:
2684      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2685      break;
2686
2687   case TGSI_OPCODE_LRP:
2688      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2689      break;
2690
2691   case TGSI_OPCODE_CND:
2692      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2693      break;
2694
2695   case TGSI_OPCODE_DP2A:
2696      exec_dp2a(mach, inst);
2697      break;
2698
2699   case TGSI_OPCODE_FRC:
2700      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2701      break;
2702
2703   case TGSI_OPCODE_CLAMP:
2704      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2705      break;
2706
2707   case TGSI_OPCODE_FLR:
2708      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2709      break;
2710
2711   case TGSI_OPCODE_ROUND:
2712      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2713      break;
2714
2715   case TGSI_OPCODE_EX2:
2716      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2717      break;
2718
2719   case TGSI_OPCODE_LG2:
2720      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2721      break;
2722
2723   case TGSI_OPCODE_POW:
2724      FETCH(&r[0], 0, CHAN_X);
2725      FETCH(&r[1], 1, CHAN_X);
2726
2727      micro_pow( &r[0], &r[0], &r[1] );
2728
2729      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2730         STORE( &r[0], 0, chan_index );
2731      }
2732      break;
2733
2734   case TGSI_OPCODE_XPD:
2735      FETCH(&r[0], 0, CHAN_Y);
2736      FETCH(&r[1], 1, CHAN_Z);
2737
2738      micro_mul( &r[2], &r[0], &r[1] );
2739
2740      FETCH(&r[3], 0, CHAN_Z);
2741      FETCH(&r[4], 1, CHAN_Y);
2742
2743      micro_mul( &r[5], &r[3], &r[4] );
2744      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2745
2746      FETCH(&r[2], 1, CHAN_X);
2747
2748      micro_mul( &r[3], &r[3], &r[2] );
2749
2750      FETCH(&r[5], 0, CHAN_X);
2751
2752      micro_mul( &r[1], &r[1], &r[5] );
2753      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2754
2755      micro_mul( &r[5], &r[5], &r[4] );
2756      micro_mul( &r[0], &r[0], &r[2] );
2757      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2758
2759      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2760         STORE(&d[CHAN_X], 0, CHAN_X);
2761      }
2762      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2763         STORE(&d[CHAN_Y], 0, CHAN_Y);
2764      }
2765      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2766         STORE(&d[CHAN_Z], 0, CHAN_Z);
2767      }
2768      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2769         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2770      }
2771      break;
2772
2773   case TGSI_OPCODE_ABS:
2774      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2775      break;
2776
2777   case TGSI_OPCODE_RCC:
2778      FETCH(&r[0], 0, CHAN_X);
2779      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2780      micro_float_clamp(&r[0], &r[0]);
2781      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2782         STORE(&r[0], 0, chan_index);
2783      }
2784      break;
2785
2786   case TGSI_OPCODE_DPH:
2787      exec_dph(mach, inst);
2788      break;
2789
2790   case TGSI_OPCODE_COS:
2791      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2792      break;
2793
2794   case TGSI_OPCODE_DDX:
2795      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2796      break;
2797
2798   case TGSI_OPCODE_DDY:
2799      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2800      break;
2801
2802   case TGSI_OPCODE_KILP:
2803      exec_kilp (mach, inst);
2804      break;
2805
2806   case TGSI_OPCODE_KIL:
2807      exec_kil (mach, inst);
2808      break;
2809
2810   case TGSI_OPCODE_PK2H:
2811      assert (0);
2812      break;
2813
2814   case TGSI_OPCODE_PK2US:
2815      assert (0);
2816      break;
2817
2818   case TGSI_OPCODE_PK4B:
2819      assert (0);
2820      break;
2821
2822   case TGSI_OPCODE_PK4UB:
2823      assert (0);
2824      break;
2825
2826   case TGSI_OPCODE_RFL:
2827      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2828          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2829          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2830         /* r0 = dp3(src0, src0) */
2831         FETCH(&r[2], 0, CHAN_X);
2832         micro_mul(&r[0], &r[2], &r[2]);
2833         FETCH(&r[4], 0, CHAN_Y);
2834         micro_mul(&r[8], &r[4], &r[4]);
2835         micro_add(&r[0], &r[0], &r[8]);
2836         FETCH(&r[6], 0, CHAN_Z);
2837         micro_mul(&r[8], &r[6], &r[6]);
2838         micro_add(&r[0], &r[0], &r[8]);
2839
2840         /* r1 = dp3(src0, src1) */
2841         FETCH(&r[3], 1, CHAN_X);
2842         micro_mul(&r[1], &r[2], &r[3]);
2843         FETCH(&r[5], 1, CHAN_Y);
2844         micro_mul(&r[8], &r[4], &r[5]);
2845         micro_add(&r[1], &r[1], &r[8]);
2846         FETCH(&r[7], 1, CHAN_Z);
2847         micro_mul(&r[8], &r[6], &r[7]);
2848         micro_add(&r[1], &r[1], &r[8]);
2849
2850         /* r1 = 2 * r1 / r0 */
2851         micro_add(&r[1], &r[1], &r[1]);
2852         micro_div(&r[1], &r[1], &r[0]);
2853
2854         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2855            micro_mul(&r[2], &r[2], &r[1]);
2856            micro_sub(&r[2], &r[2], &r[3]);
2857            STORE(&r[2], 0, CHAN_X);
2858         }
2859         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2860            micro_mul(&r[4], &r[4], &r[1]);
2861            micro_sub(&r[4], &r[4], &r[5]);
2862            STORE(&r[4], 0, CHAN_Y);
2863         }
2864         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2865            micro_mul(&r[6], &r[6], &r[1]);
2866            micro_sub(&r[6], &r[6], &r[7]);
2867            STORE(&r[6], 0, CHAN_Z);
2868         }
2869      }
2870      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2871         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2872      }
2873      break;
2874
2875   case TGSI_OPCODE_SEQ:
2876      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2877      break;
2878
2879   case TGSI_OPCODE_SFL:
2880      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2881         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2882      }
2883      break;
2884
2885   case TGSI_OPCODE_SGT:
2886      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2887      break;
2888
2889   case TGSI_OPCODE_SIN:
2890      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2891      break;
2892
2893   case TGSI_OPCODE_SLE:
2894      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2895      break;
2896
2897   case TGSI_OPCODE_SNE:
2898      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
2899      break;
2900
2901   case TGSI_OPCODE_STR:
2902      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2903         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2904      }
2905      break;
2906
2907   case TGSI_OPCODE_TEX:
2908      /* simple texture lookup */
2909      /* src[0] = texcoord */
2910      /* src[1] = sampler unit */
2911      exec_tex(mach, inst, TEX_MODIFIER_NONE);
2912      break;
2913
2914   case TGSI_OPCODE_TXB:
2915      /* Texture lookup with lod bias */
2916      /* src[0] = texcoord (src[0].w = LOD bias) */
2917      /* src[1] = sampler unit */
2918      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
2919      break;
2920
2921   case TGSI_OPCODE_TXD:
2922      /* Texture lookup with explict partial derivatives */
2923      /* src[0] = texcoord */
2924      /* src[1] = d[strq]/dx */
2925      /* src[2] = d[strq]/dy */
2926      /* src[3] = sampler unit */
2927      exec_txd(mach, inst);
2928      break;
2929
2930   case TGSI_OPCODE_TXL:
2931      /* Texture lookup with explit LOD */
2932      /* src[0] = texcoord (src[0].w = LOD) */
2933      /* src[1] = sampler unit */
2934      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
2935      break;
2936
2937   case TGSI_OPCODE_TXP:
2938      /* Texture lookup with projection */
2939      /* src[0] = texcoord (src[0].w = projection) */
2940      /* src[1] = sampler unit */
2941      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
2942      break;
2943
2944   case TGSI_OPCODE_UP2H:
2945      assert (0);
2946      break;
2947
2948   case TGSI_OPCODE_UP2US:
2949      assert (0);
2950      break;
2951
2952   case TGSI_OPCODE_UP4B:
2953      assert (0);
2954      break;
2955
2956   case TGSI_OPCODE_UP4UB:
2957      assert (0);
2958      break;
2959
2960   case TGSI_OPCODE_X2D:
2961      FETCH(&r[0], 1, CHAN_X);
2962      FETCH(&r[1], 1, CHAN_Y);
2963      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2964          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2965         FETCH(&r[2], 2, CHAN_X);
2966         micro_mul(&r[2], &r[2], &r[0]);
2967         FETCH(&r[3], 2, CHAN_Y);
2968         micro_mul(&r[3], &r[3], &r[1]);
2969         micro_add(&r[2], &r[2], &r[3]);
2970         FETCH(&r[3], 0, CHAN_X);
2971         micro_add(&d[CHAN_X], &r[2], &r[3]);
2972
2973      }
2974      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2975          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2976         FETCH(&r[2], 2, CHAN_Z);
2977         micro_mul(&r[2], &r[2], &r[0]);
2978         FETCH(&r[3], 2, CHAN_W);
2979         micro_mul(&r[3], &r[3], &r[1]);
2980         micro_add(&r[2], &r[2], &r[3]);
2981         FETCH(&r[3], 0, CHAN_Y);
2982         micro_add(&d[CHAN_Y], &r[2], &r[3]);
2983
2984      }
2985      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2986         STORE(&d[CHAN_X], 0, CHAN_X);
2987      }
2988      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2989         STORE(&d[CHAN_Y], 0, CHAN_Y);
2990      }
2991      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2992         STORE(&d[CHAN_X], 0, CHAN_Z);
2993      }
2994      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2995         STORE(&d[CHAN_Y], 0, CHAN_W);
2996      }
2997      break;
2998
2999   case TGSI_OPCODE_ARA:
3000      assert (0);
3001      break;
3002
3003   case TGSI_OPCODE_ARR:
3004      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3005      break;
3006
3007   case TGSI_OPCODE_BRA:
3008      assert (0);
3009      break;
3010
3011   case TGSI_OPCODE_CAL:
3012      /* skip the call if no execution channels are enabled */
3013      if (mach->ExecMask) {
3014         /* do the call */
3015
3016         /* First, record the depths of the execution stacks.
3017          * This is important for deeply nested/looped return statements.
3018          * We have to unwind the stacks by the correct amount.  For a
3019          * real code generator, we could determine the number of entries
3020          * to pop off each stack with simple static analysis and avoid
3021          * implementing this data structure at run time.
3022          */
3023         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
3024         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
3025         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
3026         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
3027         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
3028         /* note that PC was already incremented above */
3029         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
3030
3031         mach->CallStackTop++;
3032
3033         /* Second, push the Cond, Loop, Cont, Func stacks */
3034         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3035         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3036         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3037         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3038         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3039         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
3040
3041         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3042         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3043         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3044         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3045         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3046         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
3047
3048         /* Finally, jump to the subroutine */
3049         *pc = inst->Label.Label;
3050      }
3051      break;
3052
3053   case TGSI_OPCODE_RET:
3054      mach->FuncMask &= ~mach->ExecMask;
3055      UPDATE_EXEC_MASK(mach);
3056
3057      if (mach->FuncMask == 0x0) {
3058         /* really return now (otherwise, keep executing */
3059
3060         if (mach->CallStackTop == 0) {
3061            /* returning from main() */
3062            *pc = -1;
3063            return;
3064         }
3065
3066         assert(mach->CallStackTop > 0);
3067         mach->CallStackTop--;
3068
3069         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3070         mach->CondMask = mach->CondStack[mach->CondStackTop];
3071
3072         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3073         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3074
3075         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3076         mach->ContMask = mach->ContStack[mach->ContStackTop];
3077
3078         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3079         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3080
3081         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3082         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3083
3084         assert(mach->FuncStackTop > 0);
3085         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3086
3087         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3088
3089         UPDATE_EXEC_MASK(mach);
3090      }
3091      break;
3092
3093   case TGSI_OPCODE_SSG:
3094      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3095      break;
3096
3097   case TGSI_OPCODE_CMP:
3098      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3099      break;
3100
3101   case TGSI_OPCODE_SCS:
3102      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
3103         FETCH( &r[0], 0, CHAN_X );
3104         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
3105            micro_cos(&r[1], &r[0]);
3106            STORE(&r[1], 0, CHAN_X);
3107         }
3108         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
3109            micro_sin(&r[1], &r[0]);
3110            STORE(&r[1], 0, CHAN_Y);
3111         }
3112      }
3113      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
3114         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
3115      }
3116      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
3117         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
3118      }
3119      break;
3120
3121   case TGSI_OPCODE_NRM:
3122      exec_nrm3(mach, inst);
3123      break;
3124
3125   case TGSI_OPCODE_NRM4:
3126      exec_nrm4(mach, inst);
3127      break;
3128
3129   case TGSI_OPCODE_DIV:
3130      assert( 0 );
3131      break;
3132
3133   case TGSI_OPCODE_DP2:
3134      exec_dp2(mach, inst);
3135      break;
3136
3137   case TGSI_OPCODE_IF:
3138      /* push CondMask */
3139      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
3140      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
3141      FETCH( &r[0], 0, CHAN_X );
3142      /* update CondMask */
3143      if( ! r[0].u[0] ) {
3144         mach->CondMask &= ~0x1;
3145      }
3146      if( ! r[0].u[1] ) {
3147         mach->CondMask &= ~0x2;
3148      }
3149      if( ! r[0].u[2] ) {
3150         mach->CondMask &= ~0x4;
3151      }
3152      if( ! r[0].u[3] ) {
3153         mach->CondMask &= ~0x8;
3154      }
3155      UPDATE_EXEC_MASK(mach);
3156      /* Todo: If CondMask==0, jump to ELSE */
3157      break;
3158
3159   case TGSI_OPCODE_ELSE:
3160      /* invert CondMask wrt previous mask */
3161      {
3162         uint prevMask;
3163         assert(mach->CondStackTop > 0);
3164         prevMask = mach->CondStack[mach->CondStackTop - 1];
3165         mach->CondMask = ~mach->CondMask & prevMask;
3166         UPDATE_EXEC_MASK(mach);
3167         /* Todo: If CondMask==0, jump to ENDIF */
3168      }
3169      break;
3170
3171   case TGSI_OPCODE_ENDIF:
3172      /* pop CondMask */
3173      assert(mach->CondStackTop > 0);
3174      mach->CondMask = mach->CondStack[--mach->CondStackTop];
3175      UPDATE_EXEC_MASK(mach);
3176      break;
3177
3178   case TGSI_OPCODE_END:
3179      /* halt execution */
3180      *pc = -1;
3181      break;
3182
3183   case TGSI_OPCODE_REP:
3184      assert (0);
3185      break;
3186
3187   case TGSI_OPCODE_ENDREP:
3188       assert (0);
3189       break;
3190
3191   case TGSI_OPCODE_PUSHA:
3192      assert (0);
3193      break;
3194
3195   case TGSI_OPCODE_POPA:
3196      assert (0);
3197      break;
3198
3199   case TGSI_OPCODE_CEIL:
3200      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3201      break;
3202
3203   case TGSI_OPCODE_I2F:
3204      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
3205      break;
3206
3207   case TGSI_OPCODE_NOT:
3208      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3209      break;
3210
3211   case TGSI_OPCODE_TRUNC:
3212      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
3213      break;
3214
3215   case TGSI_OPCODE_SHL:
3216      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3217      break;
3218
3219   case TGSI_OPCODE_AND:
3220      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3221      break;
3222
3223   case TGSI_OPCODE_OR:
3224      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3225      break;
3226
3227   case TGSI_OPCODE_MOD:
3228      assert (0);
3229      break;
3230
3231   case TGSI_OPCODE_XOR:
3232      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3233      break;
3234
3235   case TGSI_OPCODE_SAD:
3236      assert (0);
3237      break;
3238
3239   case TGSI_OPCODE_TXF:
3240      assert (0);
3241      break;
3242
3243   case TGSI_OPCODE_TXQ:
3244      assert (0);
3245      break;
3246
3247   case TGSI_OPCODE_EMIT:
3248      emit_vertex(mach);
3249      break;
3250
3251   case TGSI_OPCODE_ENDPRIM:
3252      emit_primitive(mach);
3253      break;
3254
3255   case TGSI_OPCODE_BGNFOR:
3256      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3257      for (chan_index = 0; chan_index < 3; chan_index++) {
3258         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
3259      }
3260      ++mach->LoopCounterStackTop;
3261      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X], 0, CHAN_X);
3262      /* update LoopMask */
3263      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3264         mach->LoopMask &= ~0x1;
3265      }
3266      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3267         mach->LoopMask &= ~0x2;
3268      }
3269      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3270         mach->LoopMask &= ~0x4;
3271      }
3272      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3273         mach->LoopMask &= ~0x8;
3274      }
3275      /* TODO: if mach->LoopMask == 0, jump to end of loop */
3276      UPDATE_EXEC_MASK(mach);
3277      /* fall-through (for now) */
3278   case TGSI_OPCODE_BGNLOOP:
3279      /* push LoopMask and ContMasks */
3280      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3281      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3282      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
3283      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3284
3285      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
3286      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
3287      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
3288      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3289      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
3290      break;
3291
3292   case TGSI_OPCODE_ENDFOR:
3293      assert(mach->LoopCounterStackTop > 0);
3294      micro_sub(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3295                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3296                &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
3297      /* update LoopMask */
3298      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[0] <= 0.0f) {
3299         mach->LoopMask &= ~0x1;
3300      }
3301      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[1] <= 0.0f) {
3302         mach->LoopMask &= ~0x2;
3303      }
3304      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[2] <= 0.0f) {
3305         mach->LoopMask &= ~0x4;
3306      }
3307      if (mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y].f[3] <= 0.0f) {
3308         mach->LoopMask &= ~0x8;
3309      }
3310      micro_add(&mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3311                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
3312                &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3313      assert(mach->LoopLabelStackTop > 0);
3314      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3315      STORE(&mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_X], 0, CHAN_X);
3316      /* Restore ContMask, but don't pop */
3317      assert(mach->ContStackTop > 0);
3318      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3319      UPDATE_EXEC_MASK(mach);
3320      if (mach->ExecMask) {
3321         /* repeat loop: jump to instruction just past BGNLOOP */
3322         assert(mach->LoopLabelStackTop > 0);
3323         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3324      }
3325      else {
3326         /* exit loop: pop LoopMask */
3327         assert(mach->LoopStackTop > 0);
3328         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3329         /* pop ContMask */
3330         assert(mach->ContStackTop > 0);
3331         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3332         assert(mach->LoopLabelStackTop > 0);
3333         --mach->LoopLabelStackTop;
3334         assert(mach->LoopCounterStackTop > 0);
3335         --mach->LoopCounterStackTop;
3336
3337         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3338      }
3339      UPDATE_EXEC_MASK(mach);
3340      break;
3341
3342   case TGSI_OPCODE_ENDLOOP:
3343      /* Restore ContMask, but don't pop */
3344      assert(mach->ContStackTop > 0);
3345      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3346      UPDATE_EXEC_MASK(mach);
3347      if (mach->ExecMask) {
3348         /* repeat loop: jump to instruction just past BGNLOOP */
3349         assert(mach->LoopLabelStackTop > 0);
3350         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3351      }
3352      else {
3353         /* exit loop: pop LoopMask */
3354         assert(mach->LoopStackTop > 0);
3355         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3356         /* pop ContMask */
3357         assert(mach->ContStackTop > 0);
3358         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3359         assert(mach->LoopLabelStackTop > 0);
3360         --mach->LoopLabelStackTop;
3361
3362         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3363      }
3364      UPDATE_EXEC_MASK(mach);
3365      break;
3366
3367   case TGSI_OPCODE_BRK:
3368      exec_break(mach);
3369      break;
3370
3371   case TGSI_OPCODE_CONT:
3372      /* turn off cont channels for each enabled exec channel */
3373      mach->ContMask &= ~mach->ExecMask;
3374      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3375      UPDATE_EXEC_MASK(mach);
3376      break;
3377
3378   case TGSI_OPCODE_BGNSUB:
3379      /* no-op */
3380      break;
3381
3382   case TGSI_OPCODE_ENDSUB:
3383      /*
3384       * XXX: This really should be a no-op. We should never reach this opcode.
3385       */
3386
3387      assert(mach->CallStackTop > 0);
3388      mach->CallStackTop--;
3389
3390      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
3391      mach->CondMask = mach->CondStack[mach->CondStackTop];
3392
3393      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
3394      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
3395
3396      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
3397      mach->ContMask = mach->ContStack[mach->ContStackTop];
3398
3399      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
3400      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
3401
3402      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
3403      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
3404
3405      assert(mach->FuncStackTop > 0);
3406      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
3407
3408      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
3409
3410      UPDATE_EXEC_MASK(mach);
3411      break;
3412
3413   case TGSI_OPCODE_NOP:
3414      break;
3415
3416   case TGSI_OPCODE_BREAKC:
3417      FETCH(&r[0], 0, CHAN_X);
3418      /* update CondMask */
3419      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
3420         mach->LoopMask &= ~0x1;
3421      }
3422      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
3423         mach->LoopMask &= ~0x2;
3424      }
3425      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
3426         mach->LoopMask &= ~0x4;
3427      }
3428      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
3429         mach->LoopMask &= ~0x8;
3430      }
3431      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3432      UPDATE_EXEC_MASK(mach);
3433      break;
3434
3435   case TGSI_OPCODE_F2I:
3436      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
3437      break;
3438
3439   case TGSI_OPCODE_IDIV:
3440      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3441      break;
3442
3443   case TGSI_OPCODE_IMAX:
3444      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3445      break;
3446
3447   case TGSI_OPCODE_IMIN:
3448      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3449      break;
3450
3451   case TGSI_OPCODE_INEG:
3452      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3453      break;
3454
3455   case TGSI_OPCODE_ISGE:
3456      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3457      break;
3458
3459   case TGSI_OPCODE_ISHR:
3460      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3461      break;
3462
3463   case TGSI_OPCODE_ISLT:
3464      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
3465      break;
3466
3467   case TGSI_OPCODE_F2U:
3468      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
3469      break;
3470
3471   case TGSI_OPCODE_U2F:
3472      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
3473      break;
3474
3475   case TGSI_OPCODE_UADD:
3476      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3477      break;
3478
3479   case TGSI_OPCODE_UDIV:
3480      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3481      break;
3482
3483   case TGSI_OPCODE_UMAD:
3484      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3485      break;
3486
3487   case TGSI_OPCODE_UMAX:
3488      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3489      break;
3490
3491   case TGSI_OPCODE_UMIN:
3492      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3493      break;
3494
3495   case TGSI_OPCODE_UMOD:
3496      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3497      break;
3498
3499   case TGSI_OPCODE_UMUL:
3500      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3501      break;
3502
3503   case TGSI_OPCODE_USEQ:
3504      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3505      break;
3506
3507   case TGSI_OPCODE_USGE:
3508      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3509      break;
3510
3511   case TGSI_OPCODE_USHR:
3512      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3513      break;
3514
3515   case TGSI_OPCODE_USLT:
3516      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3517      break;
3518
3519   case TGSI_OPCODE_USNE:
3520      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
3521      break;
3522
3523   case TGSI_OPCODE_SWITCH:
3524      exec_switch(mach, inst);
3525      break;
3526
3527   case TGSI_OPCODE_CASE:
3528      exec_case(mach, inst);
3529      break;
3530
3531   case TGSI_OPCODE_DEFAULT:
3532      exec_default(mach);
3533      break;
3534
3535   case TGSI_OPCODE_ENDSWITCH:
3536      exec_endswitch(mach);
3537      break;
3538
3539   default:
3540      assert( 0 );
3541   }
3542}
3543
3544
3545#define DEBUG_EXECUTION 0
3546
3547
3548/**
3549 * Run TGSI interpreter.
3550 * \return bitmask of "alive" quad components
3551 */
3552uint
3553tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3554{
3555   uint i;
3556   int pc = 0;
3557
3558   mach->CondMask = 0xf;
3559   mach->LoopMask = 0xf;
3560   mach->ContMask = 0xf;
3561   mach->FuncMask = 0xf;
3562   mach->ExecMask = 0xf;
3563
3564   mach->Switch.mask = 0xf;
3565
3566   assert(mach->CondStackTop == 0);
3567   assert(mach->LoopStackTop == 0);
3568   assert(mach->ContStackTop == 0);
3569   assert(mach->SwitchStackTop == 0);
3570   assert(mach->BreakStackTop == 0);
3571   assert(mach->CallStackTop == 0);
3572
3573   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3574   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3575
3576   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3577      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3578      mach->Primitives[0] = 0;
3579   }
3580
3581   for (i = 0; i < QUAD_SIZE; i++) {
3582      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3583         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3584         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3585         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3586         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3587   }
3588
3589   /* execute declarations (interpolants) */
3590   for (i = 0; i < mach->NumDeclarations; i++) {
3591      exec_declaration( mach, mach->Declarations+i );
3592   }
3593
3594   {
3595#if DEBUG_EXECUTION
3596      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
3597      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
3598      uint inst = 1;
3599
3600      memcpy(temps, mach->Temps, sizeof(temps));
3601      memcpy(outputs, mach->Outputs, sizeof(outputs));
3602#endif
3603
3604      /* execute instructions, until pc is set to -1 */
3605      while (pc != -1) {
3606
3607#if DEBUG_EXECUTION
3608         uint i;
3609
3610         tgsi_dump_instruction(&mach->Instructions[pc], inst++);
3611#endif
3612
3613         assert(pc < (int) mach->NumInstructions);
3614         exec_instruction(mach, mach->Instructions + pc, &pc);
3615
3616#if DEBUG_EXECUTION
3617         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
3618            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
3619               uint j;
3620
3621               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
3622               debug_printf("TEMP[%2u] = ", i);
3623               for (j = 0; j < 4; j++) {
3624                  if (j > 0) {
3625                     debug_printf("           ");
3626                  }
3627                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3628                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
3629                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
3630                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
3631                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
3632               }
3633            }
3634         }
3635         for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
3636            if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
3637               uint j;
3638
3639               memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
3640               debug_printf("OUT[%2u] =  ", i);
3641               for (j = 0; j < 4; j++) {
3642                  if (j > 0) {
3643                     debug_printf("           ");
3644                  }
3645                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3646                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
3647                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
3648                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
3649                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
3650               }
3651            }
3652         }
3653#endif
3654      }
3655   }
3656
3657#if 0
3658   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3659   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3660      /*
3661       * Scale back depth component.
3662       */
3663      for (i = 0; i < 4; i++)
3664         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3665   }
3666#endif
3667
3668   assert(mach->CondStackTop == 0);
3669   assert(mach->LoopStackTop == 0);
3670   assert(mach->ContStackTop == 0);
3671   assert(mach->SwitchStackTop == 0);
3672   assert(mach->BreakStackTop == 0);
3673   assert(mach->CallStackTop == 0);
3674
3675   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3676}
3677