tgsi_exec.c revision 4a49301e
1/**************************************************************************
2 *
3 * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * TGSI interpreter/executor.
30 *
31 * Flow control information:
32 *
33 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
34 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
35 * care since a condition may be true for some quad components but false
36 * for other components.
37 *
38 * We basically execute all statements (even if they're in the part of
39 * an IF/ELSE clause that's "not taken") and use a special mask to
40 * control writing to destination registers.  This is the ExecMask.
41 * See store_dest().
42 *
 * The ExecMask is computed from four other masks (CondMask, LoopMask,
 * ContMask and FuncMask; see UPDATE_EXEC_MASK).  The first three are
 * controlled by the flow control instructions (namely: IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
46 *
47 *
48 * Authors:
49 *   Michal Krol
50 *   Brian Paul
51 */
52
53#include "pipe/p_compiler.h"
54#include "pipe/p_state.h"
55#include "pipe/p_shader_tokens.h"
56#include "tgsi/tgsi_dump.h"
57#include "tgsi/tgsi_parse.h"
58#include "tgsi/tgsi_util.h"
59#include "tgsi_exec.h"
60#include "util/u_memory.h"
61#include "util/u_math.h"
62
/* Non-zero selects the util_fast_* variants in micro_exp2/micro_lg2/micro_pow */
#define FAST_MATH 1

/** Bit stored in tgsi_full_instruction::Flags */
#define SOA_DEPENDENCY_FLAG 0x1

/* Lane index of each tile position (consumed by micro_ddx/micro_ddy) */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3

/* Channel numbers */
#define CHAN_X  0
#define CHAN_Y  1
#define CHAN_Z  2
#define CHAN_W  3
77
/*
 * Short aliases for the utility registers of the executor.
 * Each register is named by a pair: _I is the index into the Temps array,
 * _C is the channel within that temp.
 */

/* Bit-pattern constants (written through .u in tgsi_exec_machine_create) */
#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C

/* Float constants (written through .f in tgsi_exec_machine_create) */
#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_3_I           TGSI_EXEC_TEMP_THREE_I
#define TEMP_3_C           TGSI_EXEC_TEMP_THREE_C
#define TEMP_HALF_I        TGSI_EXEC_TEMP_HALF_I
#define TEMP_HALF_C        TGSI_EXEC_TEMP_HALF_C

/* Other utility slots (NOTE(review): meaning is defined by tgsi_exec.h) */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_CC_I          TGSI_EXEC_TEMP_CC_I
#define TEMP_CC_C          TGSI_EXEC_TEMP_CC_C

/* Aliases without an index/channel split */
#define TEMP_R0            TGSI_EXEC_TEMP_R0
#define TEMP_P0            TGSI_EXEC_TEMP_P0
111
/** Non-zero if channel CHAN is set in the write mask of destination 0 */
#define IS_CHANNEL_ENABLED(INST, CHAN)\
   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

/** Non-zero if channel CHAN is set in the write mask of destination 1 */
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))

/**
 * Loop header: runs the following statement once for every channel that
 * is enabled in the write mask of destination 0.  CHAN must be an lvalue.
 */
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (IS_CHANNEL_ENABLED( INST, CHAN ))

/** Same as FOR_EACH_ENABLED_CHANNEL, but for destination 1 */
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
   for ((CHAN) = 0; (CHAN) < NUM_CHANNELS; (CHAN)++)\
      if (IS_CHANNEL_ENABLED2( INST, CHAN ))


/**
 * The execution mask is the AND of the conditional, loop, continue and
 * function masks.  MACH and the whole expansion are parenthesized so the
 * macro is safe with any pointer expression and inside larger expressions.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->FuncMask)
130
131
132static const union tgsi_exec_channel ZeroVec =
133   { { 0.0, 0.0, 0.0, 0.0 } };
134
135
136static INLINE void
137check_inf_or_nan(const union tgsi_exec_channel *chan)
138{
139   assert(!util_is_inf_or_nan(chan->f[0]));
140   assert(!util_is_inf_or_nan(chan->f[1]));
141   assert(!util_is_inf_or_nan(chan->f[2]));
142   assert(!util_is_inf_or_nan(chan->f[3]));
143}
144
145
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of a channel, prefixed by msg.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *lane = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n",
                msg, lane[0], lane[1], lane[2], lane[3]);
}
#endif
154
155
#ifdef DEBUG
/**
 * Debug helper: print all four channels (X, Y, Z, W) of mach->Temps[index],
 * one line per channel with its four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char channelNames[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int c = 0;

   debug_printf("Temp[%u] =\n", index);
   while (c < 4) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   channelNames[c],
                   vec->xyzw[c].f[0],
                   vec->xyzw[c].f[1],
                   vec->xyzw[c].f[2],
                   vec->xyzw[c].f[3]);
      c++;
   }
}
#endif
173
174
175/**
176 * Check if there's a potential src/dst register data dependency when
177 * using SOA execution.
178 * Example:
179 *   MOV T, T.yxwz;
180 * This would expand into:
181 *   MOV t0, t1;
182 *   MOV t1, t0;
183 *   MOV t2, t3;
184 *   MOV t3, t2;
185 * The second instruction will have the wrong value for t0 if executed as-is.
186 */
187boolean
188tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
189{
190   uint i, chan;
191
192   uint writemask = inst->FullDstRegisters[0].DstRegister.WriteMask;
193   if (writemask == TGSI_WRITEMASK_X ||
194       writemask == TGSI_WRITEMASK_Y ||
195       writemask == TGSI_WRITEMASK_Z ||
196       writemask == TGSI_WRITEMASK_W ||
197       writemask == TGSI_WRITEMASK_NONE) {
198      /* no chance of data dependency */
199      return FALSE;
200   }
201
202   /* loop over src regs */
203   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
204      if ((inst->FullSrcRegisters[i].SrcRegister.File ==
205           inst->FullDstRegisters[0].DstRegister.File) &&
206          (inst->FullSrcRegisters[i].SrcRegister.Index ==
207           inst->FullDstRegisters[0].DstRegister.Index)) {
208         /* loop over dest channels */
209         uint channelsWritten = 0x0;
210         FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
211            /* check if we're reading a channel that's been written */
212            uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->FullSrcRegisters[i], chan);
213            if (channelsWritten & (1 << swizzle)) {
214               return TRUE;
215            }
216
217            channelsWritten |= (1 << chan);
218         }
219      }
220   }
221   return FALSE;
222}
223
224
225/**
226 * Initialize machine state by expanding tokens to full instructions,
227 * allocating temporary storage, setting up constants, etc.
228 * After this, we can call tgsi_exec_machine_run() many times.
229 */
230void
231tgsi_exec_machine_bind_shader(
232   struct tgsi_exec_machine *mach,
233   const struct tgsi_token *tokens,
234   uint numSamplers,
235   struct tgsi_sampler **samplers)
236{
237   uint k;
238   struct tgsi_parse_context parse;
239   struct tgsi_exec_labels *labels = &mach->Labels;
240   struct tgsi_full_instruction *instructions;
241   struct tgsi_full_declaration *declarations;
242   uint maxInstructions = 10, numInstructions = 0;
243   uint maxDeclarations = 10, numDeclarations = 0;
244   uint instno = 0;
245
246#if 0
247   tgsi_dump(tokens, 0);
248#endif
249
250   util_init_math();
251
252   mach->Tokens = tokens;
253   mach->Samplers = samplers;
254
255   k = tgsi_parse_init (&parse, mach->Tokens);
256   if (k != TGSI_PARSE_OK) {
257      debug_printf( "Problem parsing!\n" );
258      return;
259   }
260
261   mach->Processor = parse.FullHeader.Processor.Processor;
262   mach->ImmLimit = 0;
263   labels->count = 0;
264
265   declarations = (struct tgsi_full_declaration *)
266      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
267
268   if (!declarations) {
269      return;
270   }
271
272   instructions = (struct tgsi_full_instruction *)
273      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
274
275   if (!instructions) {
276      FREE( declarations );
277      return;
278   }
279
280   while( !tgsi_parse_end_of_tokens( &parse ) ) {
281      uint pointer = parse.Position;
282      uint i;
283
284      tgsi_parse_token( &parse );
285      switch( parse.FullToken.Token.Type ) {
286      case TGSI_TOKEN_TYPE_DECLARATION:
287         /* save expanded declaration */
288         if (numDeclarations == maxDeclarations) {
289            declarations = REALLOC(declarations,
290                                   maxDeclarations
291                                   * sizeof(struct tgsi_full_declaration),
292                                   (maxDeclarations + 10)
293                                   * sizeof(struct tgsi_full_declaration));
294            maxDeclarations += 10;
295         }
296         memcpy(declarations + numDeclarations,
297                &parse.FullToken.FullDeclaration,
298                sizeof(declarations[0]));
299         numDeclarations++;
300         break;
301
302      case TGSI_TOKEN_TYPE_IMMEDIATE:
303         {
304            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
305            assert( size <= 4 );
306            assert( mach->ImmLimit + 1 <= TGSI_EXEC_NUM_IMMEDIATES );
307
308            for( i = 0; i < size; i++ ) {
309               mach->Imms[mach->ImmLimit][i] =
310		  parse.FullToken.FullImmediate.u[i].Float;
311            }
312            mach->ImmLimit += 1;
313         }
314         break;
315
316      case TGSI_TOKEN_TYPE_INSTRUCTION:
317         assert( labels->count < MAX_LABELS );
318
319         labels->labels[labels->count][0] = instno;
320         labels->labels[labels->count][1] = pointer;
321         labels->count++;
322
323         /* save expanded instruction */
324         if (numInstructions == maxInstructions) {
325            instructions = REALLOC(instructions,
326                                   maxInstructions
327                                   * sizeof(struct tgsi_full_instruction),
328                                   (maxInstructions + 10)
329                                   * sizeof(struct tgsi_full_instruction));
330            maxInstructions += 10;
331         }
332
333         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
334            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
335            parse.FullToken.FullInstruction.Flags = SOA_DEPENDENCY_FLAG;
336            /* XXX we only handle SOA dependencies properly for MOV/SWZ
337             * at this time!
338             */
339            if (opcode != TGSI_OPCODE_MOV &&
340                opcode != TGSI_OPCODE_MUL &&
341                opcode != TGSI_OPCODE_CMP) {
342               debug_printf("Warning: SOA dependency in instruction"
343                            " is not handled:\n");
344               tgsi_dump_instruction(&parse.FullToken.FullInstruction,
345                                     numInstructions);
346            }
347         }
348
349         memcpy(instructions + numInstructions,
350                &parse.FullToken.FullInstruction,
351                sizeof(instructions[0]));
352
353         numInstructions++;
354         break;
355
356      default:
357         assert( 0 );
358      }
359   }
360   tgsi_parse_free (&parse);
361
362   if (mach->Declarations) {
363      FREE( mach->Declarations );
364   }
365   mach->Declarations = declarations;
366   mach->NumDeclarations = numDeclarations;
367
368   if (mach->Instructions) {
369      FREE( mach->Instructions );
370   }
371   mach->Instructions = instructions;
372   mach->NumInstructions = numInstructions;
373}
374
375
376struct tgsi_exec_machine *
377tgsi_exec_machine_create( void )
378{
379   struct tgsi_exec_machine *mach;
380   uint i;
381
382   mach = align_malloc( sizeof *mach, 16 );
383   if (!mach)
384      goto fail;
385
386   memset(mach, 0, sizeof(*mach));
387
388   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
389
390   /* Setup constants. */
391   for( i = 0; i < 4; i++ ) {
392      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
393      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
394      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
395      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
396      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
397      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
398      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
399      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
400      mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f;
401      mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f;
402   }
403
404#ifdef DEBUG
405   /* silence warnings */
406   (void) print_chan;
407   (void) print_temp;
408#endif
409
410   return mach;
411
412fail:
413   align_free(mach);
414   return NULL;
415}
416
417
418void
419tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
420{
421   if (mach) {
422      FREE(mach->Instructions);
423      FREE(mach->Declarations);
424   }
425
426   align_free(mach);
427}
428
429
430static void
431micro_abs(
432   union tgsi_exec_channel *dst,
433   const union tgsi_exec_channel *src )
434{
435   dst->f[0] = fabsf( src->f[0] );
436   dst->f[1] = fabsf( src->f[1] );
437   dst->f[2] = fabsf( src->f[2] );
438   dst->f[3] = fabsf( src->f[3] );
439}
440
441static void
442micro_add(
443   union tgsi_exec_channel *dst,
444   const union tgsi_exec_channel *src0,
445   const union tgsi_exec_channel *src1 )
446{
447   dst->f[0] = src0->f[0] + src1->f[0];
448   dst->f[1] = src0->f[1] + src1->f[1];
449   dst->f[2] = src0->f[2] + src1->f[2];
450   dst->f[3] = src0->f[3] + src1->f[3];
451}
452
#if 0
/** dst = src0 + src1 for each of the four signed integer lanes (disabled) */
static void
micro_iadd(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = src0->i[i] + src1->i[i];
   }
}
#endif
466
467static void
468micro_and(
469   union tgsi_exec_channel *dst,
470   const union tgsi_exec_channel *src0,
471   const union tgsi_exec_channel *src1 )
472{
473   dst->u[0] = src0->u[0] & src1->u[0];
474   dst->u[1] = src0->u[1] & src1->u[1];
475   dst->u[2] = src0->u[2] & src1->u[2];
476   dst->u[3] = src0->u[3] & src1->u[3];
477}
478
479static void
480micro_ceil(
481   union tgsi_exec_channel *dst,
482   const union tgsi_exec_channel *src )
483{
484   dst->f[0] = ceilf( src->f[0] );
485   dst->f[1] = ceilf( src->f[1] );
486   dst->f[2] = ceilf( src->f[2] );
487   dst->f[3] = ceilf( src->f[3] );
488}
489
490static void
491micro_cos(
492   union tgsi_exec_channel *dst,
493   const union tgsi_exec_channel *src )
494{
495   dst->f[0] = cosf( src->f[0] );
496   dst->f[1] = cosf( src->f[1] );
497   dst->f[2] = cosf( src->f[2] );
498   dst->f[3] = cosf( src->f[3] );
499}
500
501static void
502micro_ddx(
503   union tgsi_exec_channel *dst,
504   const union tgsi_exec_channel *src )
505{
506   dst->f[0] =
507   dst->f[1] =
508   dst->f[2] =
509   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
510}
511
512static void
513micro_ddy(
514   union tgsi_exec_channel *dst,
515   const union tgsi_exec_channel *src )
516{
517   dst->f[0] =
518   dst->f[1] =
519   dst->f[2] =
520   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
521}
522
523static void
524micro_div(
525   union tgsi_exec_channel *dst,
526   const union tgsi_exec_channel *src0,
527   const union tgsi_exec_channel *src1 )
528{
529   if (src1->f[0] != 0) {
530      dst->f[0] = src0->f[0] / src1->f[0];
531   }
532   if (src1->f[1] != 0) {
533      dst->f[1] = src0->f[1] / src1->f[1];
534   }
535   if (src1->f[2] != 0) {
536      dst->f[2] = src0->f[2] / src1->f[2];
537   }
538   if (src1->f[3] != 0) {
539      dst->f[3] = src0->f[3] / src1->f[3];
540   }
541}
542
#if 0
/**
 * dst = src0 / src1 for each of the four unsigned lanes (disabled).
 * There is no check for a zero divisor.
 */
static void
micro_udiv(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] / src1->u[i];
   }
}
#endif
556
557static void
558micro_eq(
559   union tgsi_exec_channel *dst,
560   const union tgsi_exec_channel *src0,
561   const union tgsi_exec_channel *src1,
562   const union tgsi_exec_channel *src2,
563   const union tgsi_exec_channel *src3 )
564{
565   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
566   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
567   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
568   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
569}
570
#if 0
/** Per signed integer lane: dst = (src0 == src1) ? src2 : src3 (disabled) */
static void
micro_ieq(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      if (src0->i[i] == src1->i[i]) {
         dst->i[i] = src2->i[i];
      }
      else {
         dst->i[i] = src3->i[i];
      }
   }
}
#endif
586
587static void
588micro_exp2(
589   union tgsi_exec_channel *dst,
590   const union tgsi_exec_channel *src)
591{
592#if FAST_MATH
593   dst->f[0] = util_fast_exp2( src->f[0] );
594   dst->f[1] = util_fast_exp2( src->f[1] );
595   dst->f[2] = util_fast_exp2( src->f[2] );
596   dst->f[3] = util_fast_exp2( src->f[3] );
597#else
598   dst->f[0] = powf( 2.0f, src->f[0] );
599   dst->f[1] = powf( 2.0f, src->f[1] );
600   dst->f[2] = powf( 2.0f, src->f[2] );
601   dst->f[3] = powf( 2.0f, src->f[3] );
602#endif
603}
604
#if 0
/** dst = (uint) src: float lanes converted by C cast to unsigned (disabled) */
static void
micro_f2ut(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = (uint) src->f[i];
   }
}
#endif
617
618static void
619micro_float_clamp(union tgsi_exec_channel *dst,
620                  const union tgsi_exec_channel *src)
621{
622   uint i;
623
624   for (i = 0; i < 4; i++) {
625      if (src->f[i] > 0.0f) {
626         if (src->f[i] > 1.884467e+019f)
627            dst->f[i] = 1.884467e+019f;
628         else if (src->f[i] < 5.42101e-020f)
629            dst->f[i] = 5.42101e-020f;
630         else
631            dst->f[i] = src->f[i];
632      }
633      else {
634         if (src->f[i] < -1.884467e+019f)
635            dst->f[i] = -1.884467e+019f;
636         else if (src->f[i] > -5.42101e-020f)
637            dst->f[i] = -5.42101e-020f;
638         else
639            dst->f[i] = src->f[i];
640      }
641   }
642}
643
644static void
645micro_flr(
646   union tgsi_exec_channel *dst,
647   const union tgsi_exec_channel *src )
648{
649   dst->f[0] = floorf( src->f[0] );
650   dst->f[1] = floorf( src->f[1] );
651   dst->f[2] = floorf( src->f[2] );
652   dst->f[3] = floorf( src->f[3] );
653}
654
655static void
656micro_frc(
657   union tgsi_exec_channel *dst,
658   const union tgsi_exec_channel *src )
659{
660   dst->f[0] = src->f[0] - floorf( src->f[0] );
661   dst->f[1] = src->f[1] - floorf( src->f[1] );
662   dst->f[2] = src->f[2] - floorf( src->f[2] );
663   dst->f[3] = src->f[3] - floorf( src->f[3] );
664}
665
666static void
667micro_i2f(
668   union tgsi_exec_channel *dst,
669   const union tgsi_exec_channel *src )
670{
671   dst->f[0] = (float) src->i[0];
672   dst->f[1] = (float) src->i[1];
673   dst->f[2] = (float) src->i[2];
674   dst->f[3] = (float) src->i[3];
675}
676
677static void
678micro_lg2(
679   union tgsi_exec_channel *dst,
680   const union tgsi_exec_channel *src )
681{
682#if FAST_MATH
683   dst->f[0] = util_fast_log2( src->f[0] );
684   dst->f[1] = util_fast_log2( src->f[1] );
685   dst->f[2] = util_fast_log2( src->f[2] );
686   dst->f[3] = util_fast_log2( src->f[3] );
687#else
688   dst->f[0] = logf( src->f[0] ) * 1.442695f;
689   dst->f[1] = logf( src->f[1] ) * 1.442695f;
690   dst->f[2] = logf( src->f[2] ) * 1.442695f;
691   dst->f[3] = logf( src->f[3] ) * 1.442695f;
692#endif
693}
694
695static void
696micro_le(
697   union tgsi_exec_channel *dst,
698   const union tgsi_exec_channel *src0,
699   const union tgsi_exec_channel *src1,
700   const union tgsi_exec_channel *src2,
701   const union tgsi_exec_channel *src3 )
702{
703   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
704   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
705   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
706   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
707}
708
709static void
710micro_lt(
711   union tgsi_exec_channel *dst,
712   const union tgsi_exec_channel *src0,
713   const union tgsi_exec_channel *src1,
714   const union tgsi_exec_channel *src2,
715   const union tgsi_exec_channel *src3 )
716{
717   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
718   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
719   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
720   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
721}
722
#if 0
/** Per signed integer lane: dst = (src0 < src1) ? src2 : src3 (disabled) */
static void
micro_ilt(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      if (src0->i[i] < src1->i[i]) {
         dst->i[i] = src2->i[i];
      }
      else {
         dst->i[i] = src3->i[i];
      }
   }
}
#endif
738
#if 0
/** Per unsigned lane: dst = (src0 < src1) ? src2 : src3 (disabled) */
static void
micro_ult(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      if (src0->u[i] < src1->u[i]) {
         dst->u[i] = src2->u[i];
      }
      else {
         dst->u[i] = src3->u[i];
      }
   }
}
#endif
754
755static void
756micro_max(
757   union tgsi_exec_channel *dst,
758   const union tgsi_exec_channel *src0,
759   const union tgsi_exec_channel *src1 )
760{
761   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
762   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
763   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
764   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
765}
766
#if 0
/** Per signed integer lane: dst = larger of src0 and src1 (disabled) */
static void
micro_imax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      if (src0->i[i] > src1->i[i]) {
         dst->i[i] = src0->i[i];
      }
      else {
         dst->i[i] = src1->i[i];
      }
   }
}
#endif
780
#if 0
/** Per unsigned lane: dst = larger of src0 and src1 (disabled) */
static void
micro_umax(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      if (src0->u[i] > src1->u[i]) {
         dst->u[i] = src0->u[i];
      }
      else {
         dst->u[i] = src1->u[i];
      }
   }
}
#endif
794
795static void
796micro_min(
797   union tgsi_exec_channel *dst,
798   const union tgsi_exec_channel *src0,
799   const union tgsi_exec_channel *src1 )
800{
801   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
802   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
803   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
804   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
805}
806
#if 0
/** Per signed integer lane: dst = smaller of src0 and src1 (disabled) */
static void
micro_imin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      if (src0->i[i] < src1->i[i]) {
         dst->i[i] = src0->i[i];
      }
      else {
         dst->i[i] = src1->i[i];
      }
   }
}
#endif
820
#if 0
/** Per unsigned lane: dst = smaller of src0 and src1 (disabled) */
static void
micro_umin(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      if (src0->u[i] < src1->u[i]) {
         dst->u[i] = src0->u[i];
      }
      else {
         dst->u[i] = src1->u[i];
      }
   }
}
#endif
834
#if 0
/**
 * dst = src0 % src1 for each of the four unsigned lanes (disabled).
 * There is no check for a zero divisor.
 */
static void
micro_umod(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] % src1->u[i];
   }
}
#endif
848
849static void
850micro_mul(
851   union tgsi_exec_channel *dst,
852   const union tgsi_exec_channel *src0,
853   const union tgsi_exec_channel *src1 )
854{
855   dst->f[0] = src0->f[0] * src1->f[0];
856   dst->f[1] = src0->f[1] * src1->f[1];
857   dst->f[2] = src0->f[2] * src1->f[2];
858   dst->f[3] = src0->f[3] * src1->f[3];
859}
860
#if 0
/** dst = src0 * src1 for each of the four signed integer lanes (disabled) */
static void
micro_imul(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = src0->i[i] * src1->i[i];
   }
}
#endif
874
#if 0
/**
 * Signed multiply with two result channels (disabled).
 * dst1 receives the plain int product of each lane; dst0 is set to zero,
 * i.e. no upper half of a wide product is computed here.
 */
static void
micro_imul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst1->i[i] = src0->i[i] * src1->i[i];
   }
   for (i = 0; i < 4; i++) {
      dst0->i[i] = 0;
   }
}
#endif
893
#if 0
/**
 * Unsigned multiply with two result channels (disabled).
 * dst1 receives the plain unsigned product of each lane; dst0 is set to
 * zero, i.e. no upper half of a wide product is computed here.
 */
static void
micro_umul64(
   union tgsi_exec_channel *dst0,
   union tgsi_exec_channel *dst1,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst1->u[i] = src0->u[i] * src1->u[i];
   }
   for (i = 0; i < 4; i++) {
      dst0->u[i] = 0;
   }
}
#endif
912
913
#if 0
/** Per unsigned lane: dst = src0 is non-zero ? src1 : src2 (disabled) */
static void
micro_movc(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2 )
{
   uint i;

   for (i = 0; i < 4; i++) {
      if (src0->u[i]) {
         dst->u[i] = src1->u[i];
      }
      else {
         dst->u[i] = src2->u[i];
      }
   }
}
#endif
928
929static void
930micro_neg(
931   union tgsi_exec_channel *dst,
932   const union tgsi_exec_channel *src )
933{
934   dst->f[0] = -src->f[0];
935   dst->f[1] = -src->f[1];
936   dst->f[2] = -src->f[2];
937   dst->f[3] = -src->f[3];
938}
939
#if 0
/** dst = -src for each of the four signed integer lanes (disabled) */
static void
micro_ineg(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   uint i;

   for (i = 0; i < 4; i++) {
      dst->i[i] = -src->i[i];
   }
}
#endif
952
953static void
954micro_not(
955   union tgsi_exec_channel *dst,
956   const union tgsi_exec_channel *src )
957{
958   dst->u[0] = ~src->u[0];
959   dst->u[1] = ~src->u[1];
960   dst->u[2] = ~src->u[2];
961   dst->u[3] = ~src->u[3];
962}
963
964static void
965micro_or(
966   union tgsi_exec_channel *dst,
967   const union tgsi_exec_channel *src0,
968   const union tgsi_exec_channel *src1 )
969{
970   dst->u[0] = src0->u[0] | src1->u[0];
971   dst->u[1] = src0->u[1] | src1->u[1];
972   dst->u[2] = src0->u[2] | src1->u[2];
973   dst->u[3] = src0->u[3] | src1->u[3];
974}
975
976static void
977micro_pow(
978   union tgsi_exec_channel *dst,
979   const union tgsi_exec_channel *src0,
980   const union tgsi_exec_channel *src1 )
981{
982#if FAST_MATH
983   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
984   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
985   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
986   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
987#else
988   dst->f[0] = powf( src0->f[0], src1->f[0] );
989   dst->f[1] = powf( src0->f[1], src1->f[1] );
990   dst->f[2] = powf( src0->f[2], src1->f[2] );
991   dst->f[3] = powf( src0->f[3], src1->f[3] );
992#endif
993}
994
995static void
996micro_rnd(
997   union tgsi_exec_channel *dst,
998   const union tgsi_exec_channel *src )
999{
1000   dst->f[0] = floorf( src->f[0] + 0.5f );
1001   dst->f[1] = floorf( src->f[1] + 0.5f );
1002   dst->f[2] = floorf( src->f[2] + 0.5f );
1003   dst->f[3] = floorf( src->f[3] + 0.5f );
1004}
1005
1006static void
1007micro_sgn(
1008   union tgsi_exec_channel *dst,
1009   const union tgsi_exec_channel *src )
1010{
1011   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
1012   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
1013   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
1014   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
1015}
1016
1017static void
1018micro_shl(
1019   union tgsi_exec_channel *dst,
1020   const union tgsi_exec_channel *src0,
1021   const union tgsi_exec_channel *src1 )
1022{
1023   dst->i[0] = src0->i[0] << src1->i[0];
1024   dst->i[1] = src0->i[1] << src1->i[1];
1025   dst->i[2] = src0->i[2] << src1->i[2];
1026   dst->i[3] = src0->i[3] << src1->i[3];
1027}
1028
1029static void
1030micro_ishr(
1031   union tgsi_exec_channel *dst,
1032   const union tgsi_exec_channel *src0,
1033   const union tgsi_exec_channel *src1 )
1034{
1035   dst->i[0] = src0->i[0] >> src1->i[0];
1036   dst->i[1] = src0->i[1] >> src1->i[1];
1037   dst->i[2] = src0->i[2] >> src1->i[2];
1038   dst->i[3] = src0->i[3] >> src1->i[3];
1039}
1040
1041static void
1042micro_trunc(
1043   union tgsi_exec_channel *dst,
1044   const union tgsi_exec_channel *src0 )
1045{
1046   dst->f[0] = (float) (int) src0->f[0];
1047   dst->f[1] = (float) (int) src0->f[1];
1048   dst->f[2] = (float) (int) src0->f[2];
1049   dst->f[3] = (float) (int) src0->f[3];
1050}
1051
#if 0
/**
 * Element-wise unsigned right shift: dst = src0 >> src1.
 * Currently compiled out.
 */
static void
micro_ushr(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1 )
{
   unsigned i;

   for (i = 0; i < 4; i++) {
      dst->u[i] = src0->u[i] >> src1->u[i];
   }
}
#endif
1065
1066static void
1067micro_sin(
1068   union tgsi_exec_channel *dst,
1069   const union tgsi_exec_channel *src )
1070{
1071   dst->f[0] = sinf( src->f[0] );
1072   dst->f[1] = sinf( src->f[1] );
1073   dst->f[2] = sinf( src->f[2] );
1074   dst->f[3] = sinf( src->f[3] );
1075}
1076
1077static void
1078micro_sqrt( union tgsi_exec_channel *dst,
1079            const union tgsi_exec_channel *src )
1080{
1081   dst->f[0] = sqrtf( src->f[0] );
1082   dst->f[1] = sqrtf( src->f[1] );
1083   dst->f[2] = sqrtf( src->f[2] );
1084   dst->f[3] = sqrtf( src->f[3] );
1085}
1086
1087static void
1088micro_sub(
1089   union tgsi_exec_channel *dst,
1090   const union tgsi_exec_channel *src0,
1091   const union tgsi_exec_channel *src1 )
1092{
1093   dst->f[0] = src0->f[0] - src1->f[0];
1094   dst->f[1] = src0->f[1] - src1->f[1];
1095   dst->f[2] = src0->f[2] - src1->f[2];
1096   dst->f[3] = src0->f[3] - src1->f[3];
1097}
1098
#if 0
/**
 * Element-wise conversion of the unsigned view to float.
 * Currently compiled out.
 */
static void
micro_u2f(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src )
{
   unsigned i;

   for (i = 0; i < 4; i++) {
      dst->f[i] = (float) src->u[i];
   }
}
#endif
1111
1112static void
1113micro_xor(
1114   union tgsi_exec_channel *dst,
1115   const union tgsi_exec_channel *src0,
1116   const union tgsi_exec_channel *src1 )
1117{
1118   dst->u[0] = src0->u[0] ^ src1->u[0];
1119   dst->u[1] = src0->u[1] ^ src1->u[1];
1120   dst->u[2] = src0->u[2] ^ src1->u[2];
1121   dst->u[3] = src0->u[3] ^ src1->u[3];
1122}
1123
1124static void
1125fetch_src_file_channel(
1126   const struct tgsi_exec_machine *mach,
1127   const uint file,
1128   const uint swizzle,
1129   const union tgsi_exec_channel *index,
1130   union tgsi_exec_channel *chan )
1131{
1132   switch( swizzle ) {
1133   case TGSI_SWIZZLE_X:
1134   case TGSI_SWIZZLE_Y:
1135   case TGSI_SWIZZLE_Z:
1136   case TGSI_SWIZZLE_W:
1137      switch( file ) {
1138      case TGSI_FILE_CONSTANT:
1139         assert(mach->Consts);
1140         if (index->i[0] < 0)
1141            chan->f[0] = 0.0f;
1142         else
1143            chan->f[0] = mach->Consts[index->i[0]][swizzle];
1144         if (index->i[1] < 0)
1145            chan->f[1] = 0.0f;
1146         else
1147            chan->f[1] = mach->Consts[index->i[1]][swizzle];
1148         if (index->i[2] < 0)
1149            chan->f[2] = 0.0f;
1150         else
1151            chan->f[2] = mach->Consts[index->i[2]][swizzle];
1152         if (index->i[3] < 0)
1153            chan->f[3] = 0.0f;
1154         else
1155            chan->f[3] = mach->Consts[index->i[3]][swizzle];
1156         break;
1157
1158      case TGSI_FILE_INPUT:
1159         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
1160         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
1161         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
1162         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
1163         break;
1164
1165      case TGSI_FILE_TEMPORARY:
1166         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
1167         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
1168         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
1169         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
1170         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
1171         break;
1172
1173      case TGSI_FILE_IMMEDIATE:
1174         assert( index->i[0] < (int) mach->ImmLimit );
1175         chan->f[0] = mach->Imms[index->i[0]][swizzle];
1176         assert( index->i[1] < (int) mach->ImmLimit );
1177         chan->f[1] = mach->Imms[index->i[1]][swizzle];
1178         assert( index->i[2] < (int) mach->ImmLimit );
1179         chan->f[2] = mach->Imms[index->i[2]][swizzle];
1180         assert( index->i[3] < (int) mach->ImmLimit );
1181         chan->f[3] = mach->Imms[index->i[3]][swizzle];
1182         break;
1183
1184      case TGSI_FILE_ADDRESS:
1185         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
1186         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
1187         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
1188         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
1189         break;
1190
1191      case TGSI_FILE_PREDICATE:
1192         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
1193         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
1194         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
1195         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
1196         chan->u[0] = mach->Addrs[0].xyzw[swizzle].u[0];
1197         chan->u[1] = mach->Addrs[0].xyzw[swizzle].u[1];
1198         chan->u[2] = mach->Addrs[0].xyzw[swizzle].u[2];
1199         chan->u[3] = mach->Addrs[0].xyzw[swizzle].u[3];
1200         break;
1201
1202      case TGSI_FILE_OUTPUT:
1203         /* vertex/fragment output vars can be read too */
1204         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
1205         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
1206         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
1207         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
1208         break;
1209
1210      default:
1211         assert( 0 );
1212         chan->u[0] = 0;
1213         chan->u[1] = 0;
1214         chan->u[2] = 0;
1215         chan->u[3] = 0;
1216      }
1217      break;
1218
1219   default:
1220      assert( 0 );
1221      chan->u[0] = 0;
1222      chan->u[1] = 0;
1223      chan->u[2] = 0;
1224      chan->u[3] = 0;
1225   }
1226}
1227
1228static void
1229fetch_source(
1230   const struct tgsi_exec_machine *mach,
1231   union tgsi_exec_channel *chan,
1232   const struct tgsi_full_src_register *reg,
1233   const uint chan_index )
1234{
1235   union tgsi_exec_channel index;
1236   uint swizzle;
1237
1238   /* We start with a direct index into a register file.
1239    *
1240    *    file[1],
1241    *    where:
1242    *       file = SrcRegister.File
1243    *       [1] = SrcRegister.Index
1244    */
1245   index.i[0] =
1246   index.i[1] =
1247   index.i[2] =
1248   index.i[3] = reg->SrcRegister.Index;
1249
1250   /* There is an extra source register that indirectly subscripts
1251    * a register file. The direct index now becomes an offset
1252    * that is being added to the indirect register.
1253    *
1254    *    file[ind[2].x+1],
1255    *    where:
1256    *       ind = SrcRegisterInd.File
1257    *       [2] = SrcRegisterInd.Index
1258    *       .x = SrcRegisterInd.SwizzleX
1259    */
1260   if (reg->SrcRegister.Indirect) {
1261      union tgsi_exec_channel index2;
1262      union tgsi_exec_channel indir_index;
1263      const uint execmask = mach->ExecMask;
1264      uint i;
1265
1266      /* which address register (always zero now) */
1267      index2.i[0] =
1268      index2.i[1] =
1269      index2.i[2] =
1270      index2.i[3] = reg->SrcRegisterInd.Index;
1271
1272      /* get current value of address register[swizzle] */
1273      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
1274      fetch_src_file_channel(
1275         mach,
1276         reg->SrcRegisterInd.File,
1277         swizzle,
1278         &index2,
1279         &indir_index );
1280
1281      /* add value of address register to the offset */
1282      index.i[0] += (int) indir_index.f[0];
1283      index.i[1] += (int) indir_index.f[1];
1284      index.i[2] += (int) indir_index.f[2];
1285      index.i[3] += (int) indir_index.f[3];
1286
1287      /* for disabled execution channels, zero-out the index to
1288       * avoid using a potential garbage value.
1289       */
1290      for (i = 0; i < QUAD_SIZE; i++) {
1291         if ((execmask & (1 << i)) == 0)
1292            index.i[i] = 0;
1293      }
1294   }
1295
1296   /* There is an extra source register that is a second
1297    * subscript to a register file. Effectively it means that
1298    * the register file is actually a 2D array of registers.
1299    *
1300    *    file[1][3] == file[1*sizeof(file[1])+3],
1301    *    where:
1302    *       [3] = SrcRegisterDim.Index
1303    */
1304   if (reg->SrcRegister.Dimension) {
1305      /* The size of the first-order array depends on the register file type.
1306       * We need to multiply the index to the first array to get an effective,
1307       * "flat" index that points to the beginning of the second-order array.
1308       */
1309      switch (reg->SrcRegister.File) {
1310      case TGSI_FILE_INPUT:
1311         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1312         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1313         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1314         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
1315         break;
1316      case TGSI_FILE_CONSTANT:
1317         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
1318         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
1319         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
1320         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
1321         break;
1322      default:
1323         assert( 0 );
1324      }
1325
1326      index.i[0] += reg->SrcRegisterDim.Index;
1327      index.i[1] += reg->SrcRegisterDim.Index;
1328      index.i[2] += reg->SrcRegisterDim.Index;
1329      index.i[3] += reg->SrcRegisterDim.Index;
1330
1331      /* Again, the second subscript index can be addressed indirectly
1332       * identically to the first one.
1333       * Nothing stops us from indirectly addressing the indirect register,
1334       * but there is no need for that, so we won't exercise it.
1335       *
1336       *    file[1][ind[4].y+3],
1337       *    where:
1338       *       ind = SrcRegisterDimInd.File
1339       *       [4] = SrcRegisterDimInd.Index
1340       *       .y = SrcRegisterDimInd.SwizzleX
1341       */
1342      if (reg->SrcRegisterDim.Indirect) {
1343         union tgsi_exec_channel index2;
1344         union tgsi_exec_channel indir_index;
1345         const uint execmask = mach->ExecMask;
1346         uint i;
1347
1348         index2.i[0] =
1349         index2.i[1] =
1350         index2.i[2] =
1351         index2.i[3] = reg->SrcRegisterDimInd.Index;
1352
1353         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
1354         fetch_src_file_channel(
1355            mach,
1356            reg->SrcRegisterDimInd.File,
1357            swizzle,
1358            &index2,
1359            &indir_index );
1360
1361         index.i[0] += (int) indir_index.f[0];
1362         index.i[1] += (int) indir_index.f[1];
1363         index.i[2] += (int) indir_index.f[2];
1364         index.i[3] += (int) indir_index.f[3];
1365
1366         /* for disabled execution channels, zero-out the index to
1367          * avoid using a potential garbage value.
1368          */
1369         for (i = 0; i < QUAD_SIZE; i++) {
1370            if ((execmask & (1 << i)) == 0)
1371               index.i[i] = 0;
1372         }
1373      }
1374
1375      /* If by any chance there was a need for a 3D array of register
1376       * files, we would have to check whether SrcRegisterDim is followed
1377       * by a dimension register and continue the saga.
1378       */
1379   }
1380
1381   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1382   fetch_src_file_channel(
1383      mach,
1384      reg->SrcRegister.File,
1385      swizzle,
1386      &index,
1387      chan );
1388
1389   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
1390   case TGSI_UTIL_SIGN_CLEAR:
1391      micro_abs( chan, chan );
1392      break;
1393
1394   case TGSI_UTIL_SIGN_SET:
1395      micro_abs( chan, chan );
1396      micro_neg( chan, chan );
1397      break;
1398
1399   case TGSI_UTIL_SIGN_TOGGLE:
1400      micro_neg( chan, chan );
1401      break;
1402
1403   case TGSI_UTIL_SIGN_KEEP:
1404      break;
1405   }
1406
1407   if (reg->SrcRegisterExtMod.Complement) {
1408      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
1409   }
1410}
1411
1412static void
1413store_dest(
1414   struct tgsi_exec_machine *mach,
1415   const union tgsi_exec_channel *chan,
1416   const struct tgsi_full_dst_register *reg,
1417   const struct tgsi_full_instruction *inst,
1418   uint chan_index )
1419{
1420   uint i;
1421   union tgsi_exec_channel null;
1422   union tgsi_exec_channel *dst;
1423   uint execmask = mach->ExecMask;
1424   int offset = 0;  /* indirection offset */
1425   int index;
1426
1427   if (0) {
1428      check_inf_or_nan(chan);
1429   }
1430
1431   /* There is an extra source register that indirectly subscripts
1432    * a register file. The direct index now becomes an offset
1433    * that is being added to the indirect register.
1434    *
1435    *    file[ind[2].x+1],
1436    *    where:
1437    *       ind = DstRegisterInd.File
1438    *       [2] = DstRegisterInd.Index
1439    *       .x = DstRegisterInd.SwizzleX
1440    */
1441   if (reg->DstRegister.Indirect) {
1442      union tgsi_exec_channel index;
1443      union tgsi_exec_channel indir_index;
1444      uint swizzle;
1445
1446      /* which address register (always zero for now) */
1447      index.i[0] =
1448      index.i[1] =
1449      index.i[2] =
1450      index.i[3] = reg->DstRegisterInd.Index;
1451
1452      /* get current value of address register[swizzle] */
1453      swizzle = tgsi_util_get_src_register_swizzle( &reg->DstRegisterInd, CHAN_X );
1454
1455      /* fetch values from the address/indirection register */
1456      fetch_src_file_channel(
1457         mach,
1458         reg->DstRegisterInd.File,
1459         swizzle,
1460         &index,
1461         &indir_index );
1462
1463      /* save indirection offset */
1464      offset = (int) indir_index.f[0];
1465   }
1466
1467   switch (reg->DstRegister.File) {
1468   case TGSI_FILE_NULL:
1469      dst = &null;
1470      break;
1471
1472   case TGSI_FILE_OUTPUT:
1473      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1474         + reg->DstRegister.Index;
1475      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1476      break;
1477
1478   case TGSI_FILE_TEMPORARY:
1479      index = reg->DstRegister.Index;
1480      assert( index < TGSI_EXEC_NUM_TEMPS );
1481      dst = &mach->Temps[offset + index].xyzw[chan_index];
1482      break;
1483
1484   case TGSI_FILE_ADDRESS:
1485      index = reg->DstRegister.Index;
1486      dst = &mach->Addrs[index].xyzw[chan_index];
1487      break;
1488
1489   case TGSI_FILE_PREDICATE:
1490      index = reg->DstRegister.Index;
1491      assert(index < TGSI_EXEC_NUM_PREDS);
1492      dst = &mach->Addrs[index].xyzw[chan_index];
1493      break;
1494
1495   default:
1496      assert( 0 );
1497      return;
1498   }
1499
1500   switch (inst->Instruction.Saturate) {
1501   case TGSI_SAT_NONE:
1502      for (i = 0; i < QUAD_SIZE; i++)
1503         if (execmask & (1 << i))
1504            dst->i[i] = chan->i[i];
1505      break;
1506
1507   case TGSI_SAT_ZERO_ONE:
1508      for (i = 0; i < QUAD_SIZE; i++)
1509         if (execmask & (1 << i)) {
1510            if (chan->f[i] < 0.0f)
1511               dst->f[i] = 0.0f;
1512            else if (chan->f[i] > 1.0f)
1513               dst->f[i] = 1.0f;
1514            else
1515               dst->i[i] = chan->i[i];
1516         }
1517      break;
1518
1519   case TGSI_SAT_MINUS_PLUS_ONE:
1520      for (i = 0; i < QUAD_SIZE; i++)
1521         if (execmask & (1 << i)) {
1522            if (chan->f[i] < -1.0f)
1523               dst->f[i] = -1.0f;
1524            else if (chan->f[i] > 1.0f)
1525               dst->f[i] = 1.0f;
1526            else
1527               dst->i[i] = chan->i[i];
1528         }
1529      break;
1530
1531   default:
1532      assert( 0 );
1533   }
1534}
1535
/* Shorthands for reading source operand INDEX and writing destination
 * operand INDEX of the instruction being executed.  Both expect variables
 * named `mach' and `inst' to be in scope where they are expanded.
 */
#define FETCH(VAL,INDEX,CHAN) \
   fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)

#define STORE(VAL,INDEX,CHAN) \
   store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
1541
1542
1543/**
1544 * Execute ARB-style KIL which is predicated by a src register.
1545 * Kill fragment if any of the four values is less than zero.
1546 */
1547static void
1548exec_kil(struct tgsi_exec_machine *mach,
1549         const struct tgsi_full_instruction *inst)
1550{
1551   uint uniquemask;
1552   uint chan_index;
1553   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1554   union tgsi_exec_channel r[1];
1555
1556   /* This mask stores component bits that were already tested. */
1557   uniquemask = 0;
1558
1559   for (chan_index = 0; chan_index < 4; chan_index++)
1560   {
1561      uint swizzle;
1562      uint i;
1563
1564      /* unswizzle channel */
1565      swizzle = tgsi_util_get_full_src_register_swizzle (
1566                        &inst->FullSrcRegisters[0],
1567                        chan_index);
1568
1569      /* check if the component has not been already tested */
1570      if (uniquemask & (1 << swizzle))
1571         continue;
1572      uniquemask |= 1 << swizzle;
1573
1574      FETCH(&r[0], 0, chan_index);
1575      for (i = 0; i < 4; i++)
1576         if (r[0].f[i] < 0.0f)
1577            kilmask |= 1 << i;
1578   }
1579
1580   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1581}
1582
1583/**
1584 * Execute NVIDIA-style KIL which is predicated by a condition code.
1585 * Kill fragment if the condition code is TRUE.
1586 */
1587static void
1588exec_kilp(struct tgsi_exec_machine *mach,
1589          const struct tgsi_full_instruction *inst)
1590{
1591   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1592
1593   /* "unconditional" kil */
1594   kilmask = mach->ExecMask;
1595   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
1596}
1597
1598
1599/*
1600 * Fetch a four texture samples using STR texture coordinates.
1601 */
1602static void
1603fetch_texel( struct tgsi_sampler *sampler,
1604             const union tgsi_exec_channel *s,
1605             const union tgsi_exec_channel *t,
1606             const union tgsi_exec_channel *p,
1607             float lodbias,  /* XXX should be float[4] */
1608             union tgsi_exec_channel *r,
1609             union tgsi_exec_channel *g,
1610             union tgsi_exec_channel *b,
1611             union tgsi_exec_channel *a )
1612{
1613   uint j;
1614   float rgba[NUM_CHANNELS][QUAD_SIZE];
1615
1616   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
1617
1618   for (j = 0; j < 4; j++) {
1619      r->f[j] = rgba[0][j];
1620      g->f[j] = rgba[1][j];
1621      b->f[j] = rgba[2][j];
1622      a->f[j] = rgba[3][j];
1623   }
1624}
1625
1626
1627static void
1628exec_tex(struct tgsi_exec_machine *mach,
1629         const struct tgsi_full_instruction *inst,
1630         boolean biasLod,
1631         boolean projected)
1632{
1633   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
1634   union tgsi_exec_channel r[4];
1635   uint chan_index;
1636   float lodBias;
1637
1638   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
1639
1640   switch (inst->InstructionExtTexture.Texture) {
1641   case TGSI_TEXTURE_1D:
1642   case TGSI_TEXTURE_SHADOW1D:
1643
1644      FETCH(&r[0], 0, CHAN_X);
1645
1646      if (projected) {
1647         FETCH(&r[1], 0, CHAN_W);
1648         micro_div( &r[0], &r[0], &r[1] );
1649      }
1650
1651      if (biasLod) {
1652         FETCH(&r[1], 0, CHAN_W);
1653         lodBias = r[2].f[0];
1654      }
1655      else
1656         lodBias = 0.0;
1657
1658      fetch_texel(mach->Samplers[unit],
1659                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
1660                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
1661      break;
1662
1663   case TGSI_TEXTURE_2D:
1664   case TGSI_TEXTURE_RECT:
1665   case TGSI_TEXTURE_SHADOW2D:
1666   case TGSI_TEXTURE_SHADOWRECT:
1667
1668      FETCH(&r[0], 0, CHAN_X);
1669      FETCH(&r[1], 0, CHAN_Y);
1670      FETCH(&r[2], 0, CHAN_Z);
1671
1672      if (projected) {
1673         FETCH(&r[3], 0, CHAN_W);
1674         micro_div( &r[0], &r[0], &r[3] );
1675         micro_div( &r[1], &r[1], &r[3] );
1676         micro_div( &r[2], &r[2], &r[3] );
1677      }
1678
1679      if (biasLod) {
1680         FETCH(&r[3], 0, CHAN_W);
1681         lodBias = r[3].f[0];
1682      }
1683      else
1684         lodBias = 0.0;
1685
1686      fetch_texel(mach->Samplers[unit],
1687                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
1688                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
1689      break;
1690
1691   case TGSI_TEXTURE_3D:
1692   case TGSI_TEXTURE_CUBE:
1693
1694      FETCH(&r[0], 0, CHAN_X);
1695      FETCH(&r[1], 0, CHAN_Y);
1696      FETCH(&r[2], 0, CHAN_Z);
1697
1698      if (projected) {
1699         FETCH(&r[3], 0, CHAN_W);
1700         micro_div( &r[0], &r[0], &r[3] );
1701         micro_div( &r[1], &r[1], &r[3] );
1702         micro_div( &r[2], &r[2], &r[3] );
1703      }
1704
1705      if (biasLod) {
1706         FETCH(&r[3], 0, CHAN_W);
1707         lodBias = r[3].f[0];
1708      }
1709      else
1710         lodBias = 0.0;
1711
1712      fetch_texel(mach->Samplers[unit],
1713                  &r[0], &r[1], &r[2], lodBias,
1714                  &r[0], &r[1], &r[2], &r[3]);
1715      break;
1716
1717   default:
1718      assert (0);
1719   }
1720
1721   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1722      STORE( &r[chan_index], 0, chan_index );
1723   }
1724}
1725
1726
1727/**
1728 * Evaluate a constant-valued coefficient at the position of the
1729 * current quad.
1730 */
1731static void
1732eval_constant_coef(
1733   struct tgsi_exec_machine *mach,
1734   unsigned attrib,
1735   unsigned chan )
1736{
1737   unsigned i;
1738
1739   for( i = 0; i < QUAD_SIZE; i++ ) {
1740      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
1741   }
1742}
1743
1744/**
1745 * Evaluate a linear-valued coefficient at the position of the
1746 * current quad.
1747 */
1748static void
1749eval_linear_coef(
1750   struct tgsi_exec_machine *mach,
1751   unsigned attrib,
1752   unsigned chan )
1753{
1754   const float x = mach->QuadPos.xyzw[0].f[0];
1755   const float y = mach->QuadPos.xyzw[1].f[0];
1756   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1757   const float dady = mach->InterpCoefs[attrib].dady[chan];
1758   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1759   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
1760   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
1761   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
1762   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
1763}
1764
1765/**
1766 * Evaluate a perspective-valued coefficient at the position of the
1767 * current quad.
1768 */
1769static void
1770eval_perspective_coef(
1771   struct tgsi_exec_machine *mach,
1772   unsigned attrib,
1773   unsigned chan )
1774{
1775   const float x = mach->QuadPos.xyzw[0].f[0];
1776   const float y = mach->QuadPos.xyzw[1].f[0];
1777   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
1778   const float dady = mach->InterpCoefs[attrib].dady[chan];
1779   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
1780   const float *w = mach->QuadPos.xyzw[3].f;
1781   /* divide by W here */
1782   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
1783   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
1784   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
1785   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
1786}
1787
1788
/** Common signature of the eval_*_coef() attribute interpolators. */
typedef void (*eval_coef_func)( struct tgsi_exec_machine *mach,
                                unsigned attrib,
                                unsigned chan );
1793
1794static void
1795exec_declaration(
1796   struct tgsi_exec_machine *mach,
1797   const struct tgsi_full_declaration *decl )
1798{
1799   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
1800      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
1801         unsigned first, last, mask;
1802         eval_coef_func eval;
1803
1804         first = decl->DeclarationRange.First;
1805         last = decl->DeclarationRange.Last;
1806         mask = decl->Declaration.UsageMask;
1807
1808         switch( decl->Declaration.Interpolate ) {
1809         case TGSI_INTERPOLATE_CONSTANT:
1810            eval = eval_constant_coef;
1811            break;
1812
1813         case TGSI_INTERPOLATE_LINEAR:
1814            eval = eval_linear_coef;
1815            break;
1816
1817         case TGSI_INTERPOLATE_PERSPECTIVE:
1818            eval = eval_perspective_coef;
1819            break;
1820
1821         default:
1822            assert( 0 );
1823            return;
1824         }
1825
1826         if( mask == TGSI_WRITEMASK_XYZW ) {
1827            unsigned i, j;
1828
1829            for( i = first; i <= last; i++ ) {
1830               for( j = 0; j < NUM_CHANNELS; j++ ) {
1831                  eval( mach, i, j );
1832               }
1833            }
1834         }
1835         else {
1836            unsigned i, j;
1837
1838            for( j = 0; j < NUM_CHANNELS; j++ ) {
1839               if( mask & (1 << j) ) {
1840                  for( i = first; i <= last; i++ ) {
1841                     eval( mach, i, j );
1842                  }
1843               }
1844            }
1845         }
1846      }
1847   }
1848}
1849
1850static void
1851exec_instruction(
1852   struct tgsi_exec_machine *mach,
1853   const struct tgsi_full_instruction *inst,
1854   int *pc )
1855{
1856   uint chan_index;
1857   union tgsi_exec_channel r[3 * NUM_CHANNELS];
1858   union tgsi_exec_channel d[8];
1859
1860   (*pc)++;
1861
1862   switch (inst->Instruction.Opcode) {
1863   case TGSI_OPCODE_ARL:
1864   case TGSI_OPCODE_FLR:
1865      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1866         FETCH( &r[0], 0, chan_index );
1867         micro_flr( &r[0], &r[0] );
1868         STORE( &r[0], 0, chan_index );
1869      }
1870      break;
1871
1872   case TGSI_OPCODE_MOV:
1873      if (inst->Flags & SOA_DEPENDENCY_FLAG) {
1874         /* Do all fetches into temp regs, then do all stores to avoid
1875          * intermediate/accidental clobbering.  This could be done all the
1876          * time for MOV but for other instructions we'll need more temps...
1877          */
1878         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1879            FETCH( &r[chan_index], 0, chan_index );
1880         }
1881         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1882            STORE( &r[chan_index], 0, chan_index );
1883         }
1884      }
1885      else {
1886         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1887            FETCH( &r[0], 0, chan_index );
1888            STORE( &r[0], 0, chan_index );
1889         }
1890      }
1891      break;
1892
1893   case TGSI_OPCODE_LIT:
1894      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1895         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
1896      }
1897
1898      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1899         FETCH( &r[0], 0, CHAN_X );
1900         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1901            micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1902            STORE( &r[0], 0, CHAN_Y );
1903         }
1904
1905         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1906            FETCH( &r[1], 0, CHAN_Y );
1907            micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1908
1909            FETCH( &r[2], 0, CHAN_W );
1910            micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
1911            micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
1912            micro_pow( &r[1], &r[1], &r[2] );
1913            micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
1914            STORE( &r[0], 0, CHAN_Z );
1915         }
1916      }
1917
1918      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1919         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1920      }
1921      break;
1922
1923   case TGSI_OPCODE_RCP:
1924   /* TGSI_OPCODE_RECIP */
1925      FETCH( &r[0], 0, CHAN_X );
1926      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1927      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1928         STORE( &r[0], 0, chan_index );
1929      }
1930      break;
1931
1932   case TGSI_OPCODE_RSQ:
1933   /* TGSI_OPCODE_RECIPSQRT */
1934      FETCH( &r[0], 0, CHAN_X );
1935      micro_abs( &r[0], &r[0] );
1936      micro_sqrt( &r[0], &r[0] );
1937      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
1938      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
1939         STORE( &r[0], 0, chan_index );
1940      }
1941      break;
1942
1943   case TGSI_OPCODE_EXP:
1944      FETCH( &r[0], 0, CHAN_X );
1945      micro_flr( &r[1], &r[0] );  /* r1 = floor(r0) */
1946      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1947         micro_exp2( &r[2], &r[1] );       /* r2 = 2 ^ r1 */
1948         STORE( &r[2], 0, CHAN_X );        /* store r2 */
1949      }
1950      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1951         micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */
1952         STORE( &r[2], 0, CHAN_Y );        /* store r2 */
1953      }
1954      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1955         micro_exp2( &r[2], &r[0] );       /* r2 = 2 ^ r0 */
1956         STORE( &r[2], 0, CHAN_Z );        /* store r2 */
1957      }
1958      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1959         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1960      }
1961      break;
1962
1963   case TGSI_OPCODE_LOG:
1964      FETCH( &r[0], 0, CHAN_X );
1965      micro_abs( &r[2], &r[0] );  /* r2 = abs(r0) */
1966      micro_lg2( &r[1], &r[2] );  /* r1 = lg2(r2) */
1967      micro_flr( &r[0], &r[1] );  /* r0 = floor(r1) */
1968      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
1969         STORE( &r[0], 0, CHAN_X );
1970      }
1971      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
1972         micro_exp2( &r[0], &r[0] );       /* r0 = 2 ^ r0 */
1973         micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */
1974         STORE( &r[0], 0, CHAN_Y );
1975      }
1976      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
1977         STORE( &r[1], 0, CHAN_Z );
1978      }
1979      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
1980         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
1981      }
1982      break;
1983
1984   case TGSI_OPCODE_MUL:
1985      if (inst->Flags & SOA_DEPENDENCY_FLAG) {
1986         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1987         {
1988            FETCH(&r[chan_index], 0, chan_index);
1989            FETCH(&r[chan_index + NUM_CHANNELS], 1, chan_index);
1990         }
1991         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1992         {
1993            micro_mul( &r[chan_index], &r[chan_index], &r[chan_index + NUM_CHANNELS] );
1994            STORE(&r[chan_index], 0, chan_index);
1995         }
1996      } else {
1997         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
1998         {
1999            FETCH(&r[0], 0, chan_index);
2000            FETCH(&r[1], 1, chan_index);
2001
2002            micro_mul( &r[0], &r[0], &r[1] );
2003
2004            STORE(&r[0], 0, chan_index);
2005         }
2006      }
2007      break;
2008
2009   case TGSI_OPCODE_ADD:
2010      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2011         FETCH( &r[0], 0, chan_index );
2012         FETCH( &r[1], 1, chan_index );
2013         micro_add( &r[0], &r[0], &r[1] );
2014         STORE( &r[0], 0, chan_index );
2015      }
2016      break;
2017
2018   case TGSI_OPCODE_DP3:
2019   /* TGSI_OPCODE_DOT3 */
2020      FETCH( &r[0], 0, CHAN_X );
2021      FETCH( &r[1], 1, CHAN_X );
2022      micro_mul( &r[0], &r[0], &r[1] );
2023
2024      FETCH( &r[1], 0, CHAN_Y );
2025      FETCH( &r[2], 1, CHAN_Y );
2026      micro_mul( &r[1], &r[1], &r[2] );
2027      micro_add( &r[0], &r[0], &r[1] );
2028
2029      FETCH( &r[1], 0, CHAN_Z );
2030      FETCH( &r[2], 1, CHAN_Z );
2031      micro_mul( &r[1], &r[1], &r[2] );
2032      micro_add( &r[0], &r[0], &r[1] );
2033
2034      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2035         STORE( &r[0], 0, chan_index );
2036      }
2037      break;
2038
2039    case TGSI_OPCODE_DP4:
2040    /* TGSI_OPCODE_DOT4 */
2041       FETCH(&r[0], 0, CHAN_X);
2042       FETCH(&r[1], 1, CHAN_X);
2043
2044       micro_mul( &r[0], &r[0], &r[1] );
2045
2046       FETCH(&r[1], 0, CHAN_Y);
2047       FETCH(&r[2], 1, CHAN_Y);
2048
2049       micro_mul( &r[1], &r[1], &r[2] );
2050       micro_add( &r[0], &r[0], &r[1] );
2051
2052       FETCH(&r[1], 0, CHAN_Z);
2053       FETCH(&r[2], 1, CHAN_Z);
2054
2055       micro_mul( &r[1], &r[1], &r[2] );
2056       micro_add( &r[0], &r[0], &r[1] );
2057
2058       FETCH(&r[1], 0, CHAN_W);
2059       FETCH(&r[2], 1, CHAN_W);
2060
2061       micro_mul( &r[1], &r[1], &r[2] );
2062       micro_add( &r[0], &r[0], &r[1] );
2063
2064      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2065         STORE( &r[0], 0, chan_index );
2066      }
2067      break;
2068
2069   case TGSI_OPCODE_DST:
2070      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
2071         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
2072      }
2073
2074      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
2075         FETCH( &r[0], 0, CHAN_Y );
2076         FETCH( &r[1], 1, CHAN_Y);
2077         micro_mul( &r[0], &r[0], &r[1] );
2078         STORE( &r[0], 0, CHAN_Y );
2079      }
2080
2081      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
2082         FETCH( &r[0], 0, CHAN_Z );
2083         STORE( &r[0], 0, CHAN_Z );
2084      }
2085
2086      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2087         FETCH( &r[0], 1, CHAN_W );
2088         STORE( &r[0], 0, CHAN_W );
2089      }
2090      break;
2091
2092   case TGSI_OPCODE_MIN:
2093      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2094         FETCH(&r[0], 0, chan_index);
2095         FETCH(&r[1], 1, chan_index);
2096
2097         /* XXX use micro_min()?? */
2098         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
2099
2100         STORE(&r[0], 0, chan_index);
2101      }
2102      break;
2103
2104   case TGSI_OPCODE_MAX:
2105      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2106         FETCH(&r[0], 0, chan_index);
2107         FETCH(&r[1], 1, chan_index);
2108
2109         /* XXX use micro_max()?? */
2110         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
2111
2112         STORE(&r[0], 0, chan_index );
2113      }
2114      break;
2115
2116   case TGSI_OPCODE_SLT:
2117   /* TGSI_OPCODE_SETLT */
2118      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2119         FETCH( &r[0], 0, chan_index );
2120         FETCH( &r[1], 1, chan_index );
2121         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2122         STORE( &r[0], 0, chan_index );
2123      }
2124      break;
2125
2126   case TGSI_OPCODE_SGE:
2127   /* TGSI_OPCODE_SETGE */
2128      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2129         FETCH( &r[0], 0, chan_index );
2130         FETCH( &r[1], 1, chan_index );
2131         micro_le( &r[0], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2132         STORE( &r[0], 0, chan_index );
2133      }
2134      break;
2135
2136   case TGSI_OPCODE_MAD:
2137   /* TGSI_OPCODE_MADD */
2138      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2139         FETCH( &r[0], 0, chan_index );
2140         FETCH( &r[1], 1, chan_index );
2141         micro_mul( &r[0], &r[0], &r[1] );
2142         FETCH( &r[1], 2, chan_index );
2143         micro_add( &r[0], &r[0], &r[1] );
2144         STORE( &r[0], 0, chan_index );
2145      }
2146      break;
2147
2148   case TGSI_OPCODE_SUB:
2149      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2150         FETCH(&r[0], 0, chan_index);
2151         FETCH(&r[1], 1, chan_index);
2152
2153         micro_sub( &r[0], &r[0], &r[1] );
2154
2155         STORE(&r[0], 0, chan_index);
2156      }
2157      break;
2158
2159   case TGSI_OPCODE_LRP:
2160      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2161         FETCH(&r[0], 0, chan_index);
2162         FETCH(&r[1], 1, chan_index);
2163         FETCH(&r[2], 2, chan_index);
2164
2165         micro_sub( &r[1], &r[1], &r[2] );
2166         micro_mul( &r[0], &r[0], &r[1] );
2167         micro_add( &r[0], &r[0], &r[2] );
2168
2169         STORE(&r[0], 0, chan_index);
2170      }
2171      break;
2172
2173   case TGSI_OPCODE_CND:
2174      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2175         FETCH(&r[0], 0, chan_index);
2176         FETCH(&r[1], 1, chan_index);
2177         FETCH(&r[2], 2, chan_index);
2178         micro_lt(&r[0], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
2179         STORE(&r[0], 0, chan_index);
2180      }
2181      break;
2182
2183   case TGSI_OPCODE_DP2A:
2184      FETCH( &r[0], 0, CHAN_X );
2185      FETCH( &r[1], 1, CHAN_X );
2186      micro_mul( &r[0], &r[0], &r[1] );
2187
2188      FETCH( &r[1], 0, CHAN_Y );
2189      FETCH( &r[2], 1, CHAN_Y );
2190      micro_mul( &r[1], &r[1], &r[2] );
2191      micro_add( &r[0], &r[0], &r[1] );
2192
2193      FETCH( &r[2], 2, CHAN_X );
2194      micro_add( &r[0], &r[0], &r[2] );
2195
2196      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2197         STORE( &r[0], 0, chan_index );
2198      }
2199      break;
2200
2201   case TGSI_OPCODE_FRC:
2202      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2203         FETCH( &r[0], 0, chan_index );
2204         micro_frc( &r[0], &r[0] );
2205         STORE( &r[0], 0, chan_index );
2206      }
2207      break;
2208
2209   case TGSI_OPCODE_CLAMP:
2210      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2211         FETCH(&r[0], 0, chan_index);
2212         FETCH(&r[1], 1, chan_index);
2213         micro_max(&r[0], &r[0], &r[1]);
2214         FETCH(&r[1], 2, chan_index);
2215         micro_min(&r[0], &r[0], &r[1]);
2216         STORE(&r[0], 0, chan_index);
2217      }
2218      break;
2219
2220   case TGSI_OPCODE_ROUND:
2221   case TGSI_OPCODE_ARR:
2222      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2223         FETCH( &r[0], 0, chan_index );
2224         micro_rnd( &r[0], &r[0] );
2225         STORE( &r[0], 0, chan_index );
2226      }
2227      break;
2228
2229   case TGSI_OPCODE_EX2:
2230      FETCH(&r[0], 0, CHAN_X);
2231
2232#if FAST_MATH
2233      micro_exp2( &r[0], &r[0] );
2234#else
2235      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
2236#endif
2237
2238      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2239         STORE( &r[0], 0, chan_index );
2240      }
2241      break;
2242
2243   case TGSI_OPCODE_LG2:
2244      FETCH( &r[0], 0, CHAN_X );
2245      micro_lg2( &r[0], &r[0] );
2246      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2247         STORE( &r[0], 0, chan_index );
2248      }
2249      break;
2250
2251   case TGSI_OPCODE_POW:
2252      FETCH(&r[0], 0, CHAN_X);
2253      FETCH(&r[1], 1, CHAN_X);
2254
2255      micro_pow( &r[0], &r[0], &r[1] );
2256
2257      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2258         STORE( &r[0], 0, chan_index );
2259      }
2260      break;
2261
2262   case TGSI_OPCODE_XPD:
2263      FETCH(&r[0], 0, CHAN_Y);
2264      FETCH(&r[1], 1, CHAN_Z);
2265
2266      micro_mul( &r[2], &r[0], &r[1] );
2267
2268      FETCH(&r[3], 0, CHAN_Z);
2269      FETCH(&r[4], 1, CHAN_Y);
2270
2271      micro_mul( &r[5], &r[3], &r[4] );
2272      micro_sub(&d[CHAN_X], &r[2], &r[5]);
2273
2274      FETCH(&r[2], 1, CHAN_X);
2275
2276      micro_mul( &r[3], &r[3], &r[2] );
2277
2278      FETCH(&r[5], 0, CHAN_X);
2279
2280      micro_mul( &r[1], &r[1], &r[5] );
2281      micro_sub(&d[CHAN_Y], &r[3], &r[1]);
2282
2283      micro_mul( &r[5], &r[5], &r[4] );
2284      micro_mul( &r[0], &r[0], &r[2] );
2285      micro_sub(&d[CHAN_Z], &r[5], &r[0]);
2286
2287      if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2288         STORE(&d[CHAN_X], 0, CHAN_X);
2289      }
2290      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2291         STORE(&d[CHAN_Y], 0, CHAN_Y);
2292      }
2293      if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2294         STORE(&d[CHAN_Z], 0, CHAN_Z);
2295      }
2296      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
2297         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2298      }
2299      break;
2300
2301   case TGSI_OPCODE_ABS:
2302       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2303          FETCH(&r[0], 0, chan_index);
2304
2305          micro_abs( &r[0], &r[0] );
2306
2307          STORE(&r[0], 0, chan_index);
2308       }
2309       break;
2310
2311   case TGSI_OPCODE_RCC:
2312      FETCH(&r[0], 0, CHAN_X);
2313      micro_div(&r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0]);
2314      micro_float_clamp(&r[0], &r[0]);
2315      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2316         STORE(&r[0], 0, chan_index);
2317      }
2318      break;
2319
2320   case TGSI_OPCODE_DPH:
2321      FETCH(&r[0], 0, CHAN_X);
2322      FETCH(&r[1], 1, CHAN_X);
2323
2324      micro_mul( &r[0], &r[0], &r[1] );
2325
2326      FETCH(&r[1], 0, CHAN_Y);
2327      FETCH(&r[2], 1, CHAN_Y);
2328
2329      micro_mul( &r[1], &r[1], &r[2] );
2330      micro_add( &r[0], &r[0], &r[1] );
2331
2332      FETCH(&r[1], 0, CHAN_Z);
2333      FETCH(&r[2], 1, CHAN_Z);
2334
2335      micro_mul( &r[1], &r[1], &r[2] );
2336      micro_add( &r[0], &r[0], &r[1] );
2337
2338      FETCH(&r[1], 1, CHAN_W);
2339
2340      micro_add( &r[0], &r[0], &r[1] );
2341
2342      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2343         STORE( &r[0], 0, chan_index );
2344      }
2345      break;
2346
2347   case TGSI_OPCODE_COS:
2348      FETCH(&r[0], 0, CHAN_X);
2349
2350      micro_cos( &r[0], &r[0] );
2351
2352      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2353         STORE( &r[0], 0, chan_index );
2354      }
2355      break;
2356
2357   case TGSI_OPCODE_DDX:
2358      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2359         FETCH( &r[0], 0, chan_index );
2360         micro_ddx( &r[0], &r[0] );
2361         STORE( &r[0], 0, chan_index );
2362      }
2363      break;
2364
2365   case TGSI_OPCODE_DDY:
2366      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2367         FETCH( &r[0], 0, chan_index );
2368         micro_ddy( &r[0], &r[0] );
2369         STORE( &r[0], 0, chan_index );
2370      }
2371      break;
2372
2373   case TGSI_OPCODE_KILP:
2374      exec_kilp (mach, inst);
2375      break;
2376
2377   case TGSI_OPCODE_KIL:
2378      exec_kil (mach, inst);
2379      break;
2380
2381   case TGSI_OPCODE_PK2H:
2382      assert (0);
2383      break;
2384
2385   case TGSI_OPCODE_PK2US:
2386      assert (0);
2387      break;
2388
2389   case TGSI_OPCODE_PK4B:
2390      assert (0);
2391      break;
2392
2393   case TGSI_OPCODE_PK4UB:
2394      assert (0);
2395      break;
2396
2397   case TGSI_OPCODE_RFL:
2398      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2399          IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2400          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2401         /* r0 = dp3(src0, src0) */
2402         FETCH(&r[2], 0, CHAN_X);
2403         micro_mul(&r[0], &r[2], &r[2]);
2404         FETCH(&r[4], 0, CHAN_Y);
2405         micro_mul(&r[8], &r[4], &r[4]);
2406         micro_add(&r[0], &r[0], &r[8]);
2407         FETCH(&r[6], 0, CHAN_Z);
2408         micro_mul(&r[8], &r[6], &r[6]);
2409         micro_add(&r[0], &r[0], &r[8]);
2410
2411         /* r1 = dp3(src0, src1) */
2412         FETCH(&r[3], 1, CHAN_X);
2413         micro_mul(&r[1], &r[2], &r[3]);
2414         FETCH(&r[5], 1, CHAN_Y);
2415         micro_mul(&r[8], &r[4], &r[5]);
2416         micro_add(&r[1], &r[1], &r[8]);
2417         FETCH(&r[7], 1, CHAN_Z);
2418         micro_mul(&r[8], &r[6], &r[7]);
2419         micro_add(&r[1], &r[1], &r[8]);
2420
2421         /* r1 = 2 * r1 / r0 */
2422         micro_add(&r[1], &r[1], &r[1]);
2423         micro_div(&r[1], &r[1], &r[0]);
2424
2425         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2426            micro_mul(&r[2], &r[2], &r[1]);
2427            micro_sub(&r[2], &r[2], &r[3]);
2428            STORE(&r[2], 0, CHAN_X);
2429         }
2430         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2431            micro_mul(&r[4], &r[4], &r[1]);
2432            micro_sub(&r[4], &r[4], &r[5]);
2433            STORE(&r[4], 0, CHAN_Y);
2434         }
2435         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2436            micro_mul(&r[6], &r[6], &r[1]);
2437            micro_sub(&r[6], &r[6], &r[7]);
2438            STORE(&r[6], 0, CHAN_Z);
2439         }
2440      }
2441      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2442         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2443      }
2444      break;
2445
2446   case TGSI_OPCODE_SEQ:
2447      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2448         FETCH( &r[0], 0, chan_index );
2449         FETCH( &r[1], 1, chan_index );
2450         micro_eq( &r[0], &r[0], &r[1],
2451                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
2452                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2453         STORE( &r[0], 0, chan_index );
2454      }
2455      break;
2456
2457   case TGSI_OPCODE_SFL:
2458      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2459         STORE(&mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, chan_index);
2460      }
2461      break;
2462
2463   case TGSI_OPCODE_SGT:
2464      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2465         FETCH( &r[0], 0, chan_index );
2466         FETCH( &r[1], 1, chan_index );
2467         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2468         STORE( &r[0], 0, chan_index );
2469      }
2470      break;
2471
2472   case TGSI_OPCODE_SIN:
2473      FETCH( &r[0], 0, CHAN_X );
2474      micro_sin( &r[0], &r[0] );
2475      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2476         STORE( &r[0], 0, chan_index );
2477      }
2478      break;
2479
2480   case TGSI_OPCODE_SLE:
2481      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2482         FETCH( &r[0], 0, chan_index );
2483         FETCH( &r[1], 1, chan_index );
2484         micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
2485         STORE( &r[0], 0, chan_index );
2486      }
2487      break;
2488
2489   case TGSI_OPCODE_SNE:
2490      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2491         FETCH( &r[0], 0, chan_index );
2492         FETCH( &r[1], 1, chan_index );
2493         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
2494         STORE( &r[0], 0, chan_index );
2495      }
2496      break;
2497
2498   case TGSI_OPCODE_STR:
2499      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
2500         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, chan_index);
2501      }
2502      break;
2503
2504   case TGSI_OPCODE_TEX:
2505      /* simple texture lookup */
2506      /* src[0] = texcoord */
2507      /* src[1] = sampler unit */
2508      exec_tex(mach, inst, FALSE, FALSE);
2509      break;
2510
2511   case TGSI_OPCODE_TXB:
2512      /* Texture lookup with lod bias */
2513      /* src[0] = texcoord (src[0].w = LOD bias) */
2514      /* src[1] = sampler unit */
2515      exec_tex(mach, inst, TRUE, FALSE);
2516      break;
2517
2518   case TGSI_OPCODE_TXD:
2519      /* Texture lookup with explicit partial derivatives */
2520      /* src[0] = texcoord */
2521      /* src[1] = d[strq]/dx */
2522      /* src[2] = d[strq]/dy */
2523      /* src[3] = sampler unit */
2524      assert (0);
2525      break;
2526
2527   case TGSI_OPCODE_TXL:
2528      /* Texture lookup with explicit LOD */
2529      /* src[0] = texcoord (src[0].w = LOD) */
2530      /* src[1] = sampler unit */
2531      exec_tex(mach, inst, TRUE, FALSE);
2532      break;
2533
2534   case TGSI_OPCODE_TXP:
2535      /* Texture lookup with projection */
2536      /* src[0] = texcoord (src[0].w = projection) */
2537      /* src[1] = sampler unit */
2538      exec_tex(mach, inst, FALSE, TRUE);
2539      break;
2540
2541   case TGSI_OPCODE_UP2H:
2542      assert (0);
2543      break;
2544
2545   case TGSI_OPCODE_UP2US:
2546      assert (0);
2547      break;
2548
2549   case TGSI_OPCODE_UP4B:
2550      assert (0);
2551      break;
2552
2553   case TGSI_OPCODE_UP4UB:
2554      assert (0);
2555      break;
2556
2557   case TGSI_OPCODE_X2D:
2558      FETCH(&r[0], 1, CHAN_X);
2559      FETCH(&r[1], 1, CHAN_Y);
2560      if (IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2561          IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2562         FETCH(&r[2], 2, CHAN_X);
2563         micro_mul(&r[2], &r[2], &r[0]);
2564         FETCH(&r[3], 2, CHAN_Y);
2565         micro_mul(&r[3], &r[3], &r[1]);
2566         micro_add(&r[2], &r[2], &r[3]);
2567         FETCH(&r[3], 0, CHAN_X);
2568         micro_add(&r[2], &r[2], &r[3]);
2569         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2570            STORE(&r[2], 0, CHAN_X);
2571         }
2572         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2573            STORE(&r[2], 0, CHAN_Z);
2574         }
2575      }
2576      if (IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2577          IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2578         FETCH(&r[2], 2, CHAN_Z);
2579         micro_mul(&r[2], &r[2], &r[0]);
2580         FETCH(&r[3], 2, CHAN_W);
2581         micro_mul(&r[3], &r[3], &r[1]);
2582         micro_add(&r[2], &r[2], &r[3]);
2583         FETCH(&r[3], 0, CHAN_Y);
2584         micro_add(&r[2], &r[2], &r[3]);
2585         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2586            STORE(&r[2], 0, CHAN_Y);
2587         }
2588         if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2589            STORE(&r[2], 0, CHAN_W);
2590         }
2591      }
2592      break;
2593
2594   case TGSI_OPCODE_ARA:
2595      assert (0);
2596      break;
2597
2598   case TGSI_OPCODE_BRA:
2599      assert (0);
2600      break;
2601
2602   case TGSI_OPCODE_CAL:
2603      /* skip the call if no execution channels are enabled */
2604      if (mach->ExecMask) {
2605         /* do the call */
2606
2607         /* First, record the depths of the execution stacks.
2608          * This is important for deeply nested/looped return statements.
2609          * We have to unwind the stacks by the correct amount.  For a
2610          * real code generator, we could determine the number of entries
2611          * to pop off each stack with simple static analysis and avoid
2612          * implementing this data structure at run time.
2613          */
2614         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
2615         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
2616         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
2617         /* note that PC was already incremented above */
2618         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
2619
2620         mach->CallStackTop++;
2621
2622         /* Second, push the Cond, Loop, Cont, Func stacks */
2623         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2624         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2625         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2626         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2627         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2628         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2629         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
2630         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
2631
2632         /* Finally, jump to the subroutine */
2633         *pc = inst->InstructionExtLabel.Label;
2634      }
2635      break;
2636
2637   case TGSI_OPCODE_RET:
2638      mach->FuncMask &= ~mach->ExecMask;
2639      UPDATE_EXEC_MASK(mach);
2640
2641      if (mach->FuncMask == 0x0) {
2642         /* really return now (otherwise, keep executing) */
2643
2644         if (mach->CallStackTop == 0) {
2645            /* returning from main() */
2646            *pc = -1;
2647            return;
2648         }
2649
2650         assert(mach->CallStackTop > 0);
2651         mach->CallStackTop--;
2652
2653         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
2654         mach->CondMask = mach->CondStack[mach->CondStackTop];
2655
2656         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
2657         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
2658
2659         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
2660         mach->ContMask = mach->ContStack[mach->ContStackTop];
2661
2662         assert(mach->FuncStackTop > 0);
2663         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
2664
2665         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
2666
2667         UPDATE_EXEC_MASK(mach);
2668      }
2669      break;
2670
2671   case TGSI_OPCODE_SSG:
2672   /* TGSI_OPCODE_SGN */
2673      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2674         FETCH( &r[0], 0, chan_index );
2675         micro_sgn( &r[0], &r[0] );
2676         STORE( &r[0], 0, chan_index );
2677      }
2678      break;
2679
2680   case TGSI_OPCODE_CMP:
2681      if (inst->Flags & SOA_DEPENDENCY_FLAG) {
2682         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2683            FETCH(&r[chan_index], 0, chan_index);
2684            FETCH(&r[chan_index + NUM_CHANNELS], 1, chan_index);
2685            FETCH(&r[chan_index + 2 * NUM_CHANNELS], 2, chan_index);
2686         }
2687         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2688            micro_lt( &r[chan_index], &r[chan_index],
2689                      &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[chan_index + NUM_CHANNELS],
2690                      &r[chan_index + 2*NUM_CHANNELS] );
2691            STORE(&r[chan_index], 0, chan_index);
2692         }
2693      } else {
2694         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2695            FETCH(&r[0], 0, chan_index);
2696            FETCH(&r[1], 1, chan_index);
2697            FETCH(&r[2], 2, chan_index);
2698
2699            micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
2700
2701            STORE(&r[0], 0, chan_index);
2702         }
2703      }
2704      break;
2705
2706   case TGSI_OPCODE_SCS:
2707      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
2708         FETCH( &r[0], 0, CHAN_X );
2709         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2710            micro_cos(&r[1], &r[0]);
2711            STORE(&r[1], 0, CHAN_X);
2712         }
2713         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2714            micro_sin(&r[1], &r[0]);
2715            STORE(&r[1], 0, CHAN_Y);
2716         }
2717      }
2718      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
2719         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
2720      }
2721      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
2722         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
2723      }
2724      break;
2725
2726   case TGSI_OPCODE_NRM:
2727      /* 3-component vector normalize */
2728      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
2729         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
2730         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2731         /* r3 = sqrt(dp3(src0, src0)) */
2732         FETCH(&r[0], 0, CHAN_X);
2733         micro_mul(&r[3], &r[0], &r[0]);
2734         FETCH(&r[1], 0, CHAN_Y);
2735         micro_mul(&r[4], &r[1], &r[1]);
2736         micro_add(&r[3], &r[3], &r[4]);
2737         FETCH(&r[2], 0, CHAN_Z);
2738         micro_mul(&r[4], &r[2], &r[2]);
2739         micro_add(&r[3], &r[3], &r[4]);
2740         micro_sqrt(&r[3], &r[3]);
2741
2742         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
2743            micro_div(&r[0], &r[0], &r[3]);
2744            STORE(&r[0], 0, CHAN_X);
2745         }
2746         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
2747            micro_div(&r[1], &r[1], &r[3]);
2748            STORE(&r[1], 0, CHAN_Y);
2749         }
2750         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
2751            micro_div(&r[2], &r[2], &r[3]);
2752            STORE(&r[2], 0, CHAN_Z);
2753         }
2754      }
2755      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
2756         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
2757      }
2758      break;
2759
2760   case TGSI_OPCODE_NRM4:
2761      /* 4-component vector normalize */
2762      {
2763         union tgsi_exec_channel tmp, dot;
2764
2765         /* tmp = dp4(src0, src0): */
2766         FETCH( &r[0], 0, CHAN_X );
2767         micro_mul( &tmp, &r[0], &r[0] );
2768
2769         FETCH( &r[1], 0, CHAN_Y );
2770         micro_mul( &dot, &r[1], &r[1] );
2771         micro_add( &tmp, &tmp, &dot );
2772
2773         FETCH( &r[2], 0, CHAN_Z );
2774         micro_mul( &dot, &r[2], &r[2] );
2775         micro_add( &tmp, &tmp, &dot );
2776
2777         FETCH( &r[3], 0, CHAN_W );
2778         micro_mul( &dot, &r[3], &r[3] );
2779         micro_add( &tmp, &tmp, &dot );
2780
2781         /* tmp = 1 / sqrt(tmp) */
2782         micro_sqrt( &tmp, &tmp );
2783         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
2784
2785         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2786            /* chan = chan * tmp */
2787            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
2788            STORE( &r[chan_index], 0, chan_index );
2789         }
2790      }
2791      break;
2792
2793   case TGSI_OPCODE_DIV:
2794      assert( 0 );
2795      break;
2796
2797   case TGSI_OPCODE_DP2:
2798      FETCH( &r[0], 0, CHAN_X );
2799      FETCH( &r[1], 1, CHAN_X );
2800      micro_mul( &r[0], &r[0], &r[1] );
2801
2802      FETCH( &r[1], 0, CHAN_Y );
2803      FETCH( &r[2], 1, CHAN_Y );
2804      micro_mul( &r[1], &r[1], &r[2] );
2805      micro_add( &r[0], &r[0], &r[1] );
2806
2807      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2808         STORE( &r[0], 0, chan_index );
2809      }
2810      break;
2811
2812   case TGSI_OPCODE_IF:
2813      /* push CondMask */
2814      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
2815      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
2816      FETCH( &r[0], 0, CHAN_X );
2817      /* update CondMask */
2818      if( ! r[0].u[0] ) {
2819         mach->CondMask &= ~0x1;
2820      }
2821      if( ! r[0].u[1] ) {
2822         mach->CondMask &= ~0x2;
2823      }
2824      if( ! r[0].u[2] ) {
2825         mach->CondMask &= ~0x4;
2826      }
2827      if( ! r[0].u[3] ) {
2828         mach->CondMask &= ~0x8;
2829      }
2830      UPDATE_EXEC_MASK(mach);
2831      /* Todo: If CondMask==0, jump to ELSE */
2832      break;
2833
2834   case TGSI_OPCODE_ELSE:
2835      /* invert CondMask wrt previous mask */
2836      {
2837         uint prevMask;
2838         assert(mach->CondStackTop > 0);
2839         prevMask = mach->CondStack[mach->CondStackTop - 1];
2840         mach->CondMask = ~mach->CondMask & prevMask;
2841         UPDATE_EXEC_MASK(mach);
2842         /* Todo: If CondMask==0, jump to ENDIF */
2843      }
2844      break;
2845
2846   case TGSI_OPCODE_ENDIF:
2847      /* pop CondMask */
2848      assert(mach->CondStackTop > 0);
2849      mach->CondMask = mach->CondStack[--mach->CondStackTop];
2850      UPDATE_EXEC_MASK(mach);
2851      break;
2852
2853   case TGSI_OPCODE_END:
2854      /* halt execution */
2855      *pc = -1;
2856      break;
2857
2858   case TGSI_OPCODE_REP:
2859      assert (0);
2860      break;
2861
2862   case TGSI_OPCODE_ENDREP:
2863       assert (0);
2864       break;
2865
2866   case TGSI_OPCODE_PUSHA:
2867      assert (0);
2868      break;
2869
2870   case TGSI_OPCODE_POPA:
2871      assert (0);
2872      break;
2873
2874   case TGSI_OPCODE_CEIL:
2875      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2876         FETCH( &r[0], 0, chan_index );
2877         micro_ceil( &r[0], &r[0] );
2878         STORE( &r[0], 0, chan_index );
2879      }
2880      break;
2881
2882   case TGSI_OPCODE_I2F:
2883      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2884         FETCH( &r[0], 0, chan_index );
2885         micro_i2f( &r[0], &r[0] );
2886         STORE( &r[0], 0, chan_index );
2887      }
2888      break;
2889
2890   case TGSI_OPCODE_NOT:
2891      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2892         FETCH( &r[0], 0, chan_index );
2893         micro_not( &r[0], &r[0] );
2894         STORE( &r[0], 0, chan_index );
2895      }
2896      break;
2897
2898   case TGSI_OPCODE_TRUNC:
2899      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2900         FETCH( &r[0], 0, chan_index );
2901         micro_trunc( &r[0], &r[0] );
2902         STORE( &r[0], 0, chan_index );
2903      }
2904      break;
2905
2906   case TGSI_OPCODE_SHL:
2907      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2908         FETCH( &r[0], 0, chan_index );
2909         FETCH( &r[1], 1, chan_index );
2910         micro_shl( &r[0], &r[0], &r[1] );
2911         STORE( &r[0], 0, chan_index );
2912      }
2913      break;
2914
2915   case TGSI_OPCODE_SHR:
2916      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2917         FETCH( &r[0], 0, chan_index );
2918         FETCH( &r[1], 1, chan_index );
2919         micro_ishr( &r[0], &r[0], &r[1] );
2920         STORE( &r[0], 0, chan_index );
2921      }
2922      break;
2923
2924   case TGSI_OPCODE_AND:
2925      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2926         FETCH( &r[0], 0, chan_index );
2927         FETCH( &r[1], 1, chan_index );
2928         micro_and( &r[0], &r[0], &r[1] );
2929         STORE( &r[0], 0, chan_index );
2930      }
2931      break;
2932
2933   case TGSI_OPCODE_OR:
2934      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2935         FETCH( &r[0], 0, chan_index );
2936         FETCH( &r[1], 1, chan_index );
2937         micro_or( &r[0], &r[0], &r[1] );
2938         STORE( &r[0], 0, chan_index );
2939      }
2940      break;
2941
2942   case TGSI_OPCODE_MOD:
2943      assert (0);
2944      break;
2945
2946   case TGSI_OPCODE_XOR:
2947      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
2948         FETCH( &r[0], 0, chan_index );
2949         FETCH( &r[1], 1, chan_index );
2950         micro_xor( &r[0], &r[0], &r[1] );
2951         STORE( &r[0], 0, chan_index );
2952      }
2953      break;
2954
2955   case TGSI_OPCODE_SAD:
2956      assert (0);
2957      break;
2958
2959   case TGSI_OPCODE_TXF:
2960      assert (0);
2961      break;
2962
2963   case TGSI_OPCODE_TXQ:
2964      assert (0);
2965      break;
2966
2967   case TGSI_OPCODE_EMIT:
2968      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
2969      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2970      break;
2971
2972   case TGSI_OPCODE_ENDPRIM:
2973      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
2974      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
2975      break;
2976
2977   case TGSI_OPCODE_BGNFOR:
2978      assert(mach->LoopCounterStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2979      for (chan_index = 0; chan_index < 3; chan_index++) {
2980         FETCH( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[chan_index], 0, chan_index );
2981      }
2982      STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
2983      ++mach->LoopCounterStackTop;
2984      /* fall-through (for now) */
2985   case TGSI_OPCODE_BGNLOOP:
2986      /* push LoopMask and ContMasks */
2987      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2988      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
2989      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2990      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
2991      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
2992      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
2993      break;
2994
2995   case TGSI_OPCODE_ENDFOR:
2996      assert(mach->LoopCounterStackTop > 0);
2997      micro_sub( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
2998                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X],
2999                 &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
3000      /* update LoopMask */
3001      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[0] <= 0) {
3002         mach->LoopMask &= ~0x1;
3003      }
3004      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[1] <= 0 ) {
3005         mach->LoopMask &= ~0x2;
3006      }
3007      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[2] <= 0 ) {
3008         mach->LoopMask &= ~0x4;
3009      }
3010      if( mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_X].f[3] <= 0 ) {
3011         mach->LoopMask &= ~0x8;
3012      }
3013      micro_add( &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3014                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Y],
3015                 &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[CHAN_Z]);
3016      assert(mach->LoopLabelStackTop > 0);
3017      inst = mach->Instructions + mach->LoopLabelStack[mach->LoopLabelStackTop - 1];
3018      STORE( &mach->LoopCounterStack[mach->LoopCounterStackTop].xyzw[CHAN_Y], 0, CHAN_X );
3019      /* Restore ContMask, but don't pop */
3020      assert(mach->ContStackTop > 0);
3021      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3022      UPDATE_EXEC_MASK(mach);
3023      if (mach->ExecMask) {
3024         /* repeat loop: jump to instruction just past BGNLOOP */
3025         assert(mach->LoopLabelStackTop > 0);
3026         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3027      }
3028      else {
3029         /* exit loop: pop LoopMask */
3030         assert(mach->LoopStackTop > 0);
3031         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3032         /* pop ContMask */
3033         assert(mach->ContStackTop > 0);
3034         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3035         assert(mach->LoopLabelStackTop > 0);
3036         --mach->LoopLabelStackTop;
3037         assert(mach->LoopCounterStackTop > 0);
3038         --mach->LoopCounterStackTop;
3039      }
3040      UPDATE_EXEC_MASK(mach);
3041      break;
3042
3043   case TGSI_OPCODE_ENDLOOP:
3044      /* Restore ContMask, but don't pop */
3045      assert(mach->ContStackTop > 0);
3046      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
3047      UPDATE_EXEC_MASK(mach);
3048      if (mach->ExecMask) {
3049         /* repeat loop: jump to instruction just past BGNLOOP */
3050         assert(mach->LoopLabelStackTop > 0);
3051         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
3052      }
3053      else {
3054         /* exit loop: pop LoopMask */
3055         assert(mach->LoopStackTop > 0);
3056         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
3057         /* pop ContMask */
3058         assert(mach->ContStackTop > 0);
3059         mach->ContMask = mach->ContStack[--mach->ContStackTop];
3060         assert(mach->LoopLabelStackTop > 0);
3061         --mach->LoopLabelStackTop;
3062      }
3063      UPDATE_EXEC_MASK(mach);
3064      break;
3065
3066   case TGSI_OPCODE_BRK:
3067      /* turn off loop channels for each enabled exec channel */
3068      mach->LoopMask &= ~mach->ExecMask;
3069      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3070      UPDATE_EXEC_MASK(mach);
3071      break;
3072
3073   case TGSI_OPCODE_CONT:
3074      /* turn off cont channels for each enabled exec channel */
3075      mach->ContMask &= ~mach->ExecMask;
3076      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3077      UPDATE_EXEC_MASK(mach);
3078      break;
3079
3080   case TGSI_OPCODE_BGNSUB:
3081      /* no-op */
3082      break;
3083
3084   case TGSI_OPCODE_ENDSUB:
3085      /* no-op */
3086      break;
3087
3088   case TGSI_OPCODE_NOP:
3089      break;
3090
3091   default:
3092      assert( 0 );
3093   }
3094}
3095
3096
3097/**
3098 * Run TGSI interpreter.
3099 * \return bitmask of "alive" quad components
3100 */
3101uint
3102tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
3103{
3104   uint i;
3105   int pc = 0;
3106
3107   mach->CondMask = 0xf;
3108   mach->LoopMask = 0xf;
3109   mach->ContMask = 0xf;
3110   mach->FuncMask = 0xf;
3111   mach->ExecMask = 0xf;
3112
3113   assert(mach->CondStackTop == 0);
3114   assert(mach->LoopStackTop == 0);
3115   assert(mach->ContStackTop == 0);
3116   assert(mach->CallStackTop == 0);
3117
3118   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
3119   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
3120
3121   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
3122      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
3123      mach->Primitives[0] = 0;
3124   }
3125
3126   for (i = 0; i < QUAD_SIZE; i++) {
3127      mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] =
3128         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) |
3129         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) |
3130         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) |
3131         (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT);
3132   }
3133
3134   /* execute declarations (interpolants) */
3135   for (i = 0; i < mach->NumDeclarations; i++) {
3136      exec_declaration( mach, mach->Declarations+i );
3137   }
3138
3139   /* execute instructions, until pc is set to -1 */
3140   while (pc != -1) {
3141      assert(pc < (int) mach->NumInstructions);
3142      exec_instruction( mach, mach->Instructions + pc, &pc );
3143   }
3144
3145#if 0
3146   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
3147   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
3148      /*
3149       * Scale back depth component.
3150       */
3151      for (i = 0; i < 4; i++)
3152         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
3153   }
3154#endif
3155
3156   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3157}
3158