tgsi_exec.c revision 01e04c3f
1/**************************************************************************
2 *
3 * Copyright 2007-2008 VMware, Inc.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is the bitwise AND of several other masks (CondMask,
 * LoopMask, ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK)
 * which are controlled by the flow control instructions (for example
 * IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_half.h"
62#include "util/u_memory.h"
63#include "util/u_math.h"
64#include "util/rounding.h"
65
66
/* Debug toggle; the code it guards is outside this part of the file. */
#define DEBUG_EXECUTION 0


/*
 * When non-zero, micro_exp2() and micro_lg2() call the util_fast_*()
 * helpers instead of the libm routines.
 */
#define FAST_MATH 0

/* Channel indices used by micro_ddx()/micro_ddy() to pick quad members. */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
76
77union tgsi_double_channel {
78   double d[TGSI_QUAD_SIZE];
79   unsigned u[TGSI_QUAD_SIZE][2];
80   uint64_t u64[TGSI_QUAD_SIZE];
81   int64_t i64[TGSI_QUAD_SIZE];
82};
83
84struct tgsi_double_vector {
85   union tgsi_double_channel xy;
86   union tgsi_double_channel zw;
87};
88
89static void
90micro_abs(union tgsi_exec_channel *dst,
91          const union tgsi_exec_channel *src)
92{
93   dst->f[0] = fabsf(src->f[0]);
94   dst->f[1] = fabsf(src->f[1]);
95   dst->f[2] = fabsf(src->f[2]);
96   dst->f[3] = fabsf(src->f[3]);
97}
98
99static void
100micro_arl(union tgsi_exec_channel *dst,
101          const union tgsi_exec_channel *src)
102{
103   dst->i[0] = (int)floorf(src->f[0]);
104   dst->i[1] = (int)floorf(src->f[1]);
105   dst->i[2] = (int)floorf(src->f[2]);
106   dst->i[3] = (int)floorf(src->f[3]);
107}
108
109static void
110micro_arr(union tgsi_exec_channel *dst,
111          const union tgsi_exec_channel *src)
112{
113   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
114   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
115   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
116   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
117}
118
119static void
120micro_ceil(union tgsi_exec_channel *dst,
121           const union tgsi_exec_channel *src)
122{
123   dst->f[0] = ceilf(src->f[0]);
124   dst->f[1] = ceilf(src->f[1]);
125   dst->f[2] = ceilf(src->f[2]);
126   dst->f[3] = ceilf(src->f[3]);
127}
128
129static void
130micro_cmp(union tgsi_exec_channel *dst,
131          const union tgsi_exec_channel *src0,
132          const union tgsi_exec_channel *src1,
133          const union tgsi_exec_channel *src2)
134{
135   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
136   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
137   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
138   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
139}
140
141static void
142micro_cos(union tgsi_exec_channel *dst,
143          const union tgsi_exec_channel *src)
144{
145   dst->f[0] = cosf(src->f[0]);
146   dst->f[1] = cosf(src->f[1]);
147   dst->f[2] = cosf(src->f[2]);
148   dst->f[3] = cosf(src->f[3]);
149}
150
151static void
152micro_d2f(union tgsi_exec_channel *dst,
153          const union tgsi_double_channel *src)
154{
155   dst->f[0] = (float)src->d[0];
156   dst->f[1] = (float)src->d[1];
157   dst->f[2] = (float)src->d[2];
158   dst->f[3] = (float)src->d[3];
159}
160
161static void
162micro_d2i(union tgsi_exec_channel *dst,
163          const union tgsi_double_channel *src)
164{
165   dst->i[0] = (int)src->d[0];
166   dst->i[1] = (int)src->d[1];
167   dst->i[2] = (int)src->d[2];
168   dst->i[3] = (int)src->d[3];
169}
170
171static void
172micro_d2u(union tgsi_exec_channel *dst,
173          const union tgsi_double_channel *src)
174{
175   dst->u[0] = (unsigned)src->d[0];
176   dst->u[1] = (unsigned)src->d[1];
177   dst->u[2] = (unsigned)src->d[2];
178   dst->u[3] = (unsigned)src->d[3];
179}
180static void
181micro_dabs(union tgsi_double_channel *dst,
182           const union tgsi_double_channel *src)
183{
184   dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
185   dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
186   dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
187   dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
188}
189
190static void
191micro_dadd(union tgsi_double_channel *dst,
192          const union tgsi_double_channel *src)
193{
194   dst->d[0] = src[0].d[0] + src[1].d[0];
195   dst->d[1] = src[0].d[1] + src[1].d[1];
196   dst->d[2] = src[0].d[2] + src[1].d[2];
197   dst->d[3] = src[0].d[3] + src[1].d[3];
198}
199
200static void
201micro_ddiv(union tgsi_double_channel *dst,
202          const union tgsi_double_channel *src)
203{
204   dst->d[0] = src[0].d[0] / src[1].d[0];
205   dst->d[1] = src[0].d[1] / src[1].d[1];
206   dst->d[2] = src[0].d[2] / src[1].d[2];
207   dst->d[3] = src[0].d[3] / src[1].d[3];
208}
209
210static void
211micro_ddx(union tgsi_exec_channel *dst,
212          const union tgsi_exec_channel *src)
213{
214   dst->f[0] =
215   dst->f[1] =
216   dst->f[2] =
217   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
218}
219
220static void
221micro_ddy(union tgsi_exec_channel *dst,
222          const union tgsi_exec_channel *src)
223{
224   dst->f[0] =
225   dst->f[1] =
226   dst->f[2] =
227   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
228}
229
230static void
231micro_dmul(union tgsi_double_channel *dst,
232           const union tgsi_double_channel *src)
233{
234   dst->d[0] = src[0].d[0] * src[1].d[0];
235   dst->d[1] = src[0].d[1] * src[1].d[1];
236   dst->d[2] = src[0].d[2] * src[1].d[2];
237   dst->d[3] = src[0].d[3] * src[1].d[3];
238}
239
240static void
241micro_dmax(union tgsi_double_channel *dst,
242           const union tgsi_double_channel *src)
243{
244   dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
245   dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
246   dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
247   dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
248}
249
250static void
251micro_dmin(union tgsi_double_channel *dst,
252           const union tgsi_double_channel *src)
253{
254   dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
255   dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
256   dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
257   dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
258}
259
260static void
261micro_dneg(union tgsi_double_channel *dst,
262           const union tgsi_double_channel *src)
263{
264   dst->d[0] = -src->d[0];
265   dst->d[1] = -src->d[1];
266   dst->d[2] = -src->d[2];
267   dst->d[3] = -src->d[3];
268}
269
270static void
271micro_dslt(union tgsi_double_channel *dst,
272           const union tgsi_double_channel *src)
273{
274   dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
275   dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
276   dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
277   dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
278}
279
280static void
281micro_dsne(union tgsi_double_channel *dst,
282           const union tgsi_double_channel *src)
283{
284   dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
285   dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
286   dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
287   dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
288}
289
290static void
291micro_dsge(union tgsi_double_channel *dst,
292           const union tgsi_double_channel *src)
293{
294   dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
295   dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
296   dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
297   dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
298}
299
300static void
301micro_dseq(union tgsi_double_channel *dst,
302           const union tgsi_double_channel *src)
303{
304   dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
305   dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
306   dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
307   dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
308}
309
310static void
311micro_drcp(union tgsi_double_channel *dst,
312           const union tgsi_double_channel *src)
313{
314   dst->d[0] = 1.0 / src->d[0];
315   dst->d[1] = 1.0 / src->d[1];
316   dst->d[2] = 1.0 / src->d[2];
317   dst->d[3] = 1.0 / src->d[3];
318}
319
320static void
321micro_dsqrt(union tgsi_double_channel *dst,
322            const union tgsi_double_channel *src)
323{
324   dst->d[0] = sqrt(src->d[0]);
325   dst->d[1] = sqrt(src->d[1]);
326   dst->d[2] = sqrt(src->d[2]);
327   dst->d[3] = sqrt(src->d[3]);
328}
329
330static void
331micro_drsq(union tgsi_double_channel *dst,
332          const union tgsi_double_channel *src)
333{
334   dst->d[0] = 1.0 / sqrt(src->d[0]);
335   dst->d[1] = 1.0 / sqrt(src->d[1]);
336   dst->d[2] = 1.0 / sqrt(src->d[2]);
337   dst->d[3] = 1.0 / sqrt(src->d[3]);
338}
339
340static void
341micro_dmad(union tgsi_double_channel *dst,
342           const union tgsi_double_channel *src)
343{
344   dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
345   dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
346   dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
347   dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
348}
349
350static void
351micro_dfrac(union tgsi_double_channel *dst,
352            const union tgsi_double_channel *src)
353{
354   dst->d[0] = src->d[0] - floor(src->d[0]);
355   dst->d[1] = src->d[1] - floor(src->d[1]);
356   dst->d[2] = src->d[2] - floor(src->d[2]);
357   dst->d[3] = src->d[3] - floor(src->d[3]);
358}
359
360static void
361micro_dldexp(union tgsi_double_channel *dst,
362             const union tgsi_double_channel *src0,
363             union tgsi_exec_channel *src1)
364{
365   dst->d[0] = ldexp(src0->d[0], src1->i[0]);
366   dst->d[1] = ldexp(src0->d[1], src1->i[1]);
367   dst->d[2] = ldexp(src0->d[2], src1->i[2]);
368   dst->d[3] = ldexp(src0->d[3], src1->i[3]);
369}
370
371static void
372micro_dfracexp(union tgsi_double_channel *dst,
373               union tgsi_exec_channel *dst_exp,
374               const union tgsi_double_channel *src)
375{
376   dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
377   dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
378   dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
379   dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
380}
381
382static void
383micro_exp2(union tgsi_exec_channel *dst,
384           const union tgsi_exec_channel *src)
385{
386#if FAST_MATH
387   dst->f[0] = util_fast_exp2(src->f[0]);
388   dst->f[1] = util_fast_exp2(src->f[1]);
389   dst->f[2] = util_fast_exp2(src->f[2]);
390   dst->f[3] = util_fast_exp2(src->f[3]);
391#else
392#if DEBUG
393   /* Inf is okay for this instruction, so clamp it to silence assertions. */
394   uint i;
395   union tgsi_exec_channel clamped;
396
397   for (i = 0; i < 4; i++) {
398      if (src->f[i] > 127.99999f) {
399         clamped.f[i] = 127.99999f;
400      } else if (src->f[i] < -126.99999f) {
401         clamped.f[i] = -126.99999f;
402      } else {
403         clamped.f[i] = src->f[i];
404      }
405   }
406   src = &clamped;
407#endif /* DEBUG */
408
409   dst->f[0] = powf(2.0f, src->f[0]);
410   dst->f[1] = powf(2.0f, src->f[1]);
411   dst->f[2] = powf(2.0f, src->f[2]);
412   dst->f[3] = powf(2.0f, src->f[3]);
413#endif /* FAST_MATH */
414}
415
416static void
417micro_f2d(union tgsi_double_channel *dst,
418          const union tgsi_exec_channel *src)
419{
420   dst->d[0] = (double)src->f[0];
421   dst->d[1] = (double)src->f[1];
422   dst->d[2] = (double)src->f[2];
423   dst->d[3] = (double)src->f[3];
424}
425
426static void
427micro_flr(union tgsi_exec_channel *dst,
428          const union tgsi_exec_channel *src)
429{
430   dst->f[0] = floorf(src->f[0]);
431   dst->f[1] = floorf(src->f[1]);
432   dst->f[2] = floorf(src->f[2]);
433   dst->f[3] = floorf(src->f[3]);
434}
435
436static void
437micro_frc(union tgsi_exec_channel *dst,
438          const union tgsi_exec_channel *src)
439{
440   dst->f[0] = src->f[0] - floorf(src->f[0]);
441   dst->f[1] = src->f[1] - floorf(src->f[1]);
442   dst->f[2] = src->f[2] - floorf(src->f[2]);
443   dst->f[3] = src->f[3] - floorf(src->f[3]);
444}
445
446static void
447micro_i2d(union tgsi_double_channel *dst,
448          const union tgsi_exec_channel *src)
449{
450   dst->d[0] = (double)src->i[0];
451   dst->d[1] = (double)src->i[1];
452   dst->d[2] = (double)src->i[2];
453   dst->d[3] = (double)src->i[3];
454}
455
456static void
457micro_iabs(union tgsi_exec_channel *dst,
458           const union tgsi_exec_channel *src)
459{
460   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
461   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
462   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
463   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
464}
465
466static void
467micro_ineg(union tgsi_exec_channel *dst,
468           const union tgsi_exec_channel *src)
469{
470   dst->i[0] = -src->i[0];
471   dst->i[1] = -src->i[1];
472   dst->i[2] = -src->i[2];
473   dst->i[3] = -src->i[3];
474}
475
476static void
477micro_lg2(union tgsi_exec_channel *dst,
478          const union tgsi_exec_channel *src)
479{
480#if FAST_MATH
481   dst->f[0] = util_fast_log2(src->f[0]);
482   dst->f[1] = util_fast_log2(src->f[1]);
483   dst->f[2] = util_fast_log2(src->f[2]);
484   dst->f[3] = util_fast_log2(src->f[3]);
485#else
486   dst->f[0] = logf(src->f[0]) * 1.442695f;
487   dst->f[1] = logf(src->f[1]) * 1.442695f;
488   dst->f[2] = logf(src->f[2]) * 1.442695f;
489   dst->f[3] = logf(src->f[3]) * 1.442695f;
490#endif
491}
492
493static void
494micro_lrp(union tgsi_exec_channel *dst,
495          const union tgsi_exec_channel *src0,
496          const union tgsi_exec_channel *src1,
497          const union tgsi_exec_channel *src2)
498{
499   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
500   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
501   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
502   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
503}
504
505static void
506micro_mad(union tgsi_exec_channel *dst,
507          const union tgsi_exec_channel *src0,
508          const union tgsi_exec_channel *src1,
509          const union tgsi_exec_channel *src2)
510{
511   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
512   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
513   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
514   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
515}
516
517static void
518micro_mov(union tgsi_exec_channel *dst,
519          const union tgsi_exec_channel *src)
520{
521   dst->u[0] = src->u[0];
522   dst->u[1] = src->u[1];
523   dst->u[2] = src->u[2];
524   dst->u[3] = src->u[3];
525}
526
527static void
528micro_rcp(union tgsi_exec_channel *dst,
529          const union tgsi_exec_channel *src)
530{
531#if 0 /* for debugging */
532   assert(src->f[0] != 0.0f);
533   assert(src->f[1] != 0.0f);
534   assert(src->f[2] != 0.0f);
535   assert(src->f[3] != 0.0f);
536#endif
537   dst->f[0] = 1.0f / src->f[0];
538   dst->f[1] = 1.0f / src->f[1];
539   dst->f[2] = 1.0f / src->f[2];
540   dst->f[3] = 1.0f / src->f[3];
541}
542
543static void
544micro_rnd(union tgsi_exec_channel *dst,
545          const union tgsi_exec_channel *src)
546{
547   dst->f[0] = _mesa_roundevenf(src->f[0]);
548   dst->f[1] = _mesa_roundevenf(src->f[1]);
549   dst->f[2] = _mesa_roundevenf(src->f[2]);
550   dst->f[3] = _mesa_roundevenf(src->f[3]);
551}
552
553static void
554micro_rsq(union tgsi_exec_channel *dst,
555          const union tgsi_exec_channel *src)
556{
557#if 0 /* for debugging */
558   assert(src->f[0] != 0.0f);
559   assert(src->f[1] != 0.0f);
560   assert(src->f[2] != 0.0f);
561   assert(src->f[3] != 0.0f);
562#endif
563   dst->f[0] = 1.0f / sqrtf(src->f[0]);
564   dst->f[1] = 1.0f / sqrtf(src->f[1]);
565   dst->f[2] = 1.0f / sqrtf(src->f[2]);
566   dst->f[3] = 1.0f / sqrtf(src->f[3]);
567}
568
569static void
570micro_sqrt(union tgsi_exec_channel *dst,
571           const union tgsi_exec_channel *src)
572{
573   dst->f[0] = sqrtf(src->f[0]);
574   dst->f[1] = sqrtf(src->f[1]);
575   dst->f[2] = sqrtf(src->f[2]);
576   dst->f[3] = sqrtf(src->f[3]);
577}
578
579static void
580micro_seq(union tgsi_exec_channel *dst,
581          const union tgsi_exec_channel *src0,
582          const union tgsi_exec_channel *src1)
583{
584   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
585   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
586   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
587   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
588}
589
590static void
591micro_sge(union tgsi_exec_channel *dst,
592          const union tgsi_exec_channel *src0,
593          const union tgsi_exec_channel *src1)
594{
595   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
596   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
597   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
598   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
599}
600
601static void
602micro_sgn(union tgsi_exec_channel *dst,
603          const union tgsi_exec_channel *src)
604{
605   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
606   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
607   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
608   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
609}
610
611static void
612micro_isgn(union tgsi_exec_channel *dst,
613          const union tgsi_exec_channel *src)
614{
615   dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
616   dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
617   dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
618   dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
619}
620
621static void
622micro_sgt(union tgsi_exec_channel *dst,
623          const union tgsi_exec_channel *src0,
624          const union tgsi_exec_channel *src1)
625{
626   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
627   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
628   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
629   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
630}
631
632static void
633micro_sin(union tgsi_exec_channel *dst,
634          const union tgsi_exec_channel *src)
635{
636   dst->f[0] = sinf(src->f[0]);
637   dst->f[1] = sinf(src->f[1]);
638   dst->f[2] = sinf(src->f[2]);
639   dst->f[3] = sinf(src->f[3]);
640}
641
642static void
643micro_sle(union tgsi_exec_channel *dst,
644          const union tgsi_exec_channel *src0,
645          const union tgsi_exec_channel *src1)
646{
647   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
648   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
649   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
650   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
651}
652
653static void
654micro_slt(union tgsi_exec_channel *dst,
655          const union tgsi_exec_channel *src0,
656          const union tgsi_exec_channel *src1)
657{
658   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
659   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
660   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
661   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
662}
663
664static void
665micro_sne(union tgsi_exec_channel *dst,
666          const union tgsi_exec_channel *src0,
667          const union tgsi_exec_channel *src1)
668{
669   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
670   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
671   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
672   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
673}
674
675static void
676micro_trunc(union tgsi_exec_channel *dst,
677            const union tgsi_exec_channel *src)
678{
679   dst->f[0] = truncf(src->f[0]);
680   dst->f[1] = truncf(src->f[1]);
681   dst->f[2] = truncf(src->f[2]);
682   dst->f[3] = truncf(src->f[3]);
683}
684
685static void
686micro_u2d(union tgsi_double_channel *dst,
687          const union tgsi_exec_channel *src)
688{
689   dst->d[0] = (double)src->u[0];
690   dst->d[1] = (double)src->u[1];
691   dst->d[2] = (double)src->u[2];
692   dst->d[3] = (double)src->u[3];
693}
694
695static void
696micro_i64abs(union tgsi_double_channel *dst,
697             const union tgsi_double_channel *src)
698{
699   dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
700   dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
701   dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
702   dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
703}
704
705static void
706micro_i64sgn(union tgsi_double_channel *dst,
707             const union tgsi_double_channel *src)
708{
709   dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
710   dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
711   dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
712   dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
713}
714
715static void
716micro_i64neg(union tgsi_double_channel *dst,
717             const union tgsi_double_channel *src)
718{
719   dst->i64[0] = -src->i64[0];
720   dst->i64[1] = -src->i64[1];
721   dst->i64[2] = -src->i64[2];
722   dst->i64[3] = -src->i64[3];
723}
724
725static void
726micro_u64seq(union tgsi_double_channel *dst,
727           const union tgsi_double_channel *src)
728{
729   dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
730   dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
731   dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
732   dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
733}
734
735static void
736micro_u64sne(union tgsi_double_channel *dst,
737             const union tgsi_double_channel *src)
738{
739   dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
740   dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
741   dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
742   dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
743}
744
745static void
746micro_i64slt(union tgsi_double_channel *dst,
747             const union tgsi_double_channel *src)
748{
749   dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
750   dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
751   dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
752   dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
753}
754
755static void
756micro_u64slt(union tgsi_double_channel *dst,
757             const union tgsi_double_channel *src)
758{
759   dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
760   dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
761   dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
762   dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
763}
764
765static void
766micro_i64sge(union tgsi_double_channel *dst,
767           const union tgsi_double_channel *src)
768{
769   dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
770   dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
771   dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
772   dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
773}
774
775static void
776micro_u64sge(union tgsi_double_channel *dst,
777             const union tgsi_double_channel *src)
778{
779   dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
780   dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
781   dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
782   dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
783}
784
785static void
786micro_u64max(union tgsi_double_channel *dst,
787             const union tgsi_double_channel *src)
788{
789   dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
790   dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
791   dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
792   dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
793}
794
795static void
796micro_i64max(union tgsi_double_channel *dst,
797             const union tgsi_double_channel *src)
798{
799   dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
800   dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
801   dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
802   dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
803}
804
805static void
806micro_u64min(union tgsi_double_channel *dst,
807             const union tgsi_double_channel *src)
808{
809   dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
810   dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
811   dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
812   dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
813}
814
815static void
816micro_i64min(union tgsi_double_channel *dst,
817             const union tgsi_double_channel *src)
818{
819   dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
820   dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
821   dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
822   dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
823}
824
825static void
826micro_u64add(union tgsi_double_channel *dst,
827             const union tgsi_double_channel *src)
828{
829   dst->u64[0] = src[0].u64[0] + src[1].u64[0];
830   dst->u64[1] = src[0].u64[1] + src[1].u64[1];
831   dst->u64[2] = src[0].u64[2] + src[1].u64[2];
832   dst->u64[3] = src[0].u64[3] + src[1].u64[3];
833}
834
835static void
836micro_u64mul(union tgsi_double_channel *dst,
837             const union tgsi_double_channel *src)
838{
839   dst->u64[0] = src[0].u64[0] * src[1].u64[0];
840   dst->u64[1] = src[0].u64[1] * src[1].u64[1];
841   dst->u64[2] = src[0].u64[2] * src[1].u64[2];
842   dst->u64[3] = src[0].u64[3] * src[1].u64[3];
843}
844
845static void
846micro_u64div(union tgsi_double_channel *dst,
847             const union tgsi_double_channel *src)
848{
849   dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
850   dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
851   dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
852   dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
853}
854
855static void
856micro_i64div(union tgsi_double_channel *dst,
857             const union tgsi_double_channel *src)
858{
859   dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
860   dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
861   dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
862   dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
863}
864
865static void
866micro_u64mod(union tgsi_double_channel *dst,
867             const union tgsi_double_channel *src)
868{
869   dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
870   dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
871   dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
872   dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
873}
874
875static void
876micro_i64mod(union tgsi_double_channel *dst,
877             const union tgsi_double_channel *src)
878{
879   dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
880   dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
881   dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
882   dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
883}
884
885static void
886micro_u64shl(union tgsi_double_channel *dst,
887             const union tgsi_double_channel *src0,
888             union tgsi_exec_channel *src1)
889{
890   unsigned masked_count;
891   masked_count = src1->u[0] & 0x3f;
892   dst->u64[0] = src0->u64[0] << masked_count;
893   masked_count = src1->u[1] & 0x3f;
894   dst->u64[1] = src0->u64[1] << masked_count;
895   masked_count = src1->u[2] & 0x3f;
896   dst->u64[2] = src0->u64[2] << masked_count;
897   masked_count = src1->u[3] & 0x3f;
898   dst->u64[3] = src0->u64[3] << masked_count;
899}
900
901static void
902micro_i64shr(union tgsi_double_channel *dst,
903             const union tgsi_double_channel *src0,
904             union tgsi_exec_channel *src1)
905{
906   unsigned masked_count;
907   masked_count = src1->u[0] & 0x3f;
908   dst->i64[0] = src0->i64[0] >> masked_count;
909   masked_count = src1->u[1] & 0x3f;
910   dst->i64[1] = src0->i64[1] >> masked_count;
911   masked_count = src1->u[2] & 0x3f;
912   dst->i64[2] = src0->i64[2] >> masked_count;
913   masked_count = src1->u[3] & 0x3f;
914   dst->i64[3] = src0->i64[3] >> masked_count;
915}
916
917static void
918micro_u64shr(union tgsi_double_channel *dst,
919             const union tgsi_double_channel *src0,
920             union tgsi_exec_channel *src1)
921{
922   unsigned masked_count;
923   masked_count = src1->u[0] & 0x3f;
924   dst->u64[0] = src0->u64[0] >> masked_count;
925   masked_count = src1->u[1] & 0x3f;
926   dst->u64[1] = src0->u64[1] >> masked_count;
927   masked_count = src1->u[2] & 0x3f;
928   dst->u64[2] = src0->u64[2] >> masked_count;
929   masked_count = src1->u[3] & 0x3f;
930   dst->u64[3] = src0->u64[3] >> masked_count;
931}
932
/**
 * Tags naming the element types handled by the interpreter.
 * NOTE(review): the users of these tags are outside this part of the
 * file -- confirm how they are consumed before relying on the values.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT  = 0,
   TGSI_EXEC_DATA_INT    = 1,
   TGSI_EXEC_DATA_UINT   = 2,
   TGSI_EXEC_DATA_DOUBLE = 3,
   TGSI_EXEC_DATA_INT64  = 4,
   TGSI_EXEC_DATA_UINT64 = 5,
};
941
/*
 * Short aliases for the TGSI_EXEC_TEMP_* utility-register locations
 * (_I = register index, _C = channel).
 */
#define TEMP_KILMASK_I    TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C    TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I     TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C     TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I  TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C  TGSI_EXEC_TEMP_PRIMITIVE_C
951
952
953/** The execution mask depends on the conditional mask and the loop mask */
954#define UPDATE_EXEC_MASK(MACH) \
955      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
956
957
958static const union tgsi_exec_channel ZeroVec =
959   { { 0.0, 0.0, 0.0, 0.0 } };
960
961static const union tgsi_exec_channel OneVec = {
962   {1.0f, 1.0f, 1.0f, 1.0f}
963};
964
965static const union tgsi_exec_channel P128Vec = {
966   {128.0f, 128.0f, 128.0f, 128.0f}
967};
968
969static const union tgsi_exec_channel M128Vec = {
970   {-128.0f, -128.0f, -128.0f, -128.0f}
971};
972
973
974/**
975 * Assert that none of the float values in 'chan' are infinite or NaN.
976 * NaN and Inf may occur normally during program execution and should
977 * not lead to crashes, etc.  But when debugging, it's helpful to catch
978 * them.
979 */
980static inline void
981check_inf_or_nan(const union tgsi_exec_channel *chan)
982{
983   assert(!util_is_inf_or_nan((chan)->f[0]));
984   assert(!util_is_inf_or_nan((chan)->f[1]));
985   assert(!util_is_inf_or_nan((chan)->f[2]));
986   assert(!util_is_inf_or_nan((chan)->f[3]));
987}
988
989
#ifdef DEBUG
/**
 * Debug helper: print the four float lanes of 'chan', prefixed by 'msg'.
 */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float lane0 = chan->f[0];
   const float lane1 = chan->f[1];
   const float lane2 = chan->f[2];
   const float lane3 = chan->f[3];

   debug_printf("%s = {%f, %f, %f, %f}\n", msg, lane0, lane1, lane2, lane3);
}
#endif
998
999
#ifdef DEBUG
/**
 * Debug helper: dump temporary register 'index' of 'mach', one line per
 * X/Y/Z/W component, each line listing the four float lanes.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char component_names[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int comp = 0;

   debug_printf("Temp[%u] =\n", index);
   while (comp < 4) {
      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   component_names[comp],
                   vec->xyzw[comp].f[0],
                   vec->xyzw[comp].f[1],
                   vec->xyzw[comp].f[2],
                   vec->xyzw[comp].f[3]);
      comp++;
   }
}
#endif
1017
1018
1019void
1020tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1021                               unsigned num_bufs,
1022                               const void **bufs,
1023                               const unsigned *buf_sizes)
1024{
1025   unsigned i;
1026
1027   for (i = 0; i < num_bufs; i++) {
1028      mach->Consts[i] = bufs[i];
1029      mach->ConstsSize[i] = buf_sizes[i];
1030   }
1031}
1032
1033
1034/**
1035 * Check if there's a potential src/dst register data dependency when
1036 * using SOA execution.
1037 * Example:
1038 *   MOV T, T.yxwz;
1039 * This would expand into:
1040 *   MOV t0, t1;
1041 *   MOV t1, t0;
1042 *   MOV t2, t3;
1043 *   MOV t3, t2;
1044 * The second instruction will have the wrong value for t0 if executed as-is.
1045 */
1046boolean
1047tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
1048{
1049   uint i, chan;
1050
1051   uint writemask = inst->Dst[0].Register.WriteMask;
1052   if (writemask == TGSI_WRITEMASK_X ||
1053       writemask == TGSI_WRITEMASK_Y ||
1054       writemask == TGSI_WRITEMASK_Z ||
1055       writemask == TGSI_WRITEMASK_W ||
1056       writemask == TGSI_WRITEMASK_NONE) {
1057      /* no chance of data dependency */
1058      return FALSE;
1059   }
1060
1061   /* loop over src regs */
1062   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1063      if ((inst->Src[i].Register.File ==
1064           inst->Dst[0].Register.File) &&
1065          ((inst->Src[i].Register.Index ==
1066            inst->Dst[0].Register.Index) ||
1067           inst->Src[i].Register.Indirect ||
1068           inst->Dst[0].Register.Indirect)) {
1069         /* loop over dest channels */
1070         uint channelsWritten = 0x0;
1071         for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1072            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1073               /* check if we're reading a channel that's been written */
1074               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
1075               if (channelsWritten & (1 << swizzle)) {
1076                  return TRUE;
1077               }
1078
1079               channelsWritten |= (1 << chan);
1080            }
1081         }
1082      }
1083   }
1084   return FALSE;
1085}
1086
1087
1088/**
1089 * Initialize machine state by expanding tokens to full instructions,
1090 * allocating temporary storage, setting up constants, etc.
1091 * After this, we can call tgsi_exec_machine_run() many times.
1092 */
1093void
1094tgsi_exec_machine_bind_shader(
1095   struct tgsi_exec_machine *mach,
1096   const struct tgsi_token *tokens,
1097   struct tgsi_sampler *sampler,
1098   struct tgsi_image *image,
1099   struct tgsi_buffer *buffer)
1100{
1101   uint k;
1102   struct tgsi_parse_context parse;
1103   struct tgsi_full_instruction *instructions;
1104   struct tgsi_full_declaration *declarations;
1105   uint maxInstructions = 10, numInstructions = 0;
1106   uint maxDeclarations = 10, numDeclarations = 0;
1107
1108#if 0
1109   tgsi_dump(tokens, 0);
1110#endif
1111
1112   util_init_math();
1113
1114
1115   mach->Tokens = tokens;
1116   mach->Sampler = sampler;
1117   mach->Image = image;
1118   mach->Buffer = buffer;
1119
1120   if (!tokens) {
1121      /* unbind and free all */
1122      FREE(mach->Declarations);
1123      mach->Declarations = NULL;
1124      mach->NumDeclarations = 0;
1125
1126      FREE(mach->Instructions);
1127      mach->Instructions = NULL;
1128      mach->NumInstructions = 0;
1129
1130      return;
1131   }
1132
1133   k = tgsi_parse_init (&parse, mach->Tokens);
1134   if (k != TGSI_PARSE_OK) {
1135      debug_printf( "Problem parsing!\n" );
1136      return;
1137   }
1138
1139   mach->ImmLimit = 0;
1140   mach->NumOutputs = 0;
1141
1142   for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1143      mach->SysSemanticToIndex[k] = -1;
1144
1145   if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1146       !mach->UsedGeometryShader) {
1147      struct tgsi_exec_vector *inputs;
1148      struct tgsi_exec_vector *outputs;
1149
1150      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1151                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1152                            16);
1153
1154      if (!inputs)
1155         return;
1156
1157      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1158                             TGSI_MAX_TOTAL_VERTICES, 16);
1159
1160      if (!outputs) {
1161         align_free(inputs);
1162         return;
1163      }
1164
1165      align_free(mach->Inputs);
1166      align_free(mach->Outputs);
1167
1168      mach->Inputs = inputs;
1169      mach->Outputs = outputs;
1170      mach->UsedGeometryShader = TRUE;
1171   }
1172
1173   declarations = (struct tgsi_full_declaration *)
1174      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1175
1176   if (!declarations) {
1177      return;
1178   }
1179
1180   instructions = (struct tgsi_full_instruction *)
1181      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1182
1183   if (!instructions) {
1184      FREE( declarations );
1185      return;
1186   }
1187
1188   while( !tgsi_parse_end_of_tokens( &parse ) ) {
1189      uint i;
1190
1191      tgsi_parse_token( &parse );
1192      switch( parse.FullToken.Token.Type ) {
1193      case TGSI_TOKEN_TYPE_DECLARATION:
1194         /* save expanded declaration */
1195         if (numDeclarations == maxDeclarations) {
1196            declarations = REALLOC(declarations,
1197                                   maxDeclarations
1198                                   * sizeof(struct tgsi_full_declaration),
1199                                   (maxDeclarations + 10)
1200                                   * sizeof(struct tgsi_full_declaration));
1201            maxDeclarations += 10;
1202         }
1203         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
1204            unsigned reg;
1205            for (reg = parse.FullToken.FullDeclaration.Range.First;
1206                 reg <= parse.FullToken.FullDeclaration.Range.Last;
1207                 ++reg) {
1208               ++mach->NumOutputs;
1209            }
1210         }
1211         else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1212            const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1213            mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1214         }
1215
1216         memcpy(declarations + numDeclarations,
1217                &parse.FullToken.FullDeclaration,
1218                sizeof(declarations[0]));
1219         numDeclarations++;
1220         break;
1221
1222      case TGSI_TOKEN_TYPE_IMMEDIATE:
1223         {
1224            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1225            assert( size <= 4 );
1226            if (mach->ImmLimit >= mach->ImmsReserved) {
1227               unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
1228               float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
1229               if (imms) {
1230                  mach->ImmsReserved = newReserved;
1231                  mach->Imms = imms;
1232               } else {
1233                  debug_printf("Unable to (re)allocate space for immidiate constants\n");
1234                  break;
1235               }
1236            }
1237
1238            for( i = 0; i < size; i++ ) {
1239               mach->Imms[mach->ImmLimit][i] =
1240		  parse.FullToken.FullImmediate.u[i].Float;
1241            }
1242            mach->ImmLimit += 1;
1243         }
1244         break;
1245
1246      case TGSI_TOKEN_TYPE_INSTRUCTION:
1247
1248         /* save expanded instruction */
1249         if (numInstructions == maxInstructions) {
1250            instructions = REALLOC(instructions,
1251                                   maxInstructions
1252                                   * sizeof(struct tgsi_full_instruction),
1253                                   (maxInstructions + 10)
1254                                   * sizeof(struct tgsi_full_instruction));
1255            maxInstructions += 10;
1256         }
1257
1258         memcpy(instructions + numInstructions,
1259                &parse.FullToken.FullInstruction,
1260                sizeof(instructions[0]));
1261
1262         numInstructions++;
1263         break;
1264
1265      case TGSI_TOKEN_TYPE_PROPERTY:
1266         if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1267            if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1268               mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1269            }
1270         }
1271         break;
1272
1273      default:
1274         assert( 0 );
1275      }
1276   }
1277   tgsi_parse_free (&parse);
1278
1279   FREE(mach->Declarations);
1280   mach->Declarations = declarations;
1281   mach->NumDeclarations = numDeclarations;
1282
1283   FREE(mach->Instructions);
1284   mach->Instructions = instructions;
1285   mach->NumInstructions = numInstructions;
1286}
1287
1288
1289struct tgsi_exec_machine *
1290tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1291{
1292   struct tgsi_exec_machine *mach;
1293   uint i;
1294
1295   mach = align_malloc( sizeof *mach, 16 );
1296   if (!mach)
1297      goto fail;
1298
1299   memset(mach, 0, sizeof(*mach));
1300
1301   mach->ShaderType = shader_type;
1302   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
1303   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
1304
1305   if (shader_type != PIPE_SHADER_COMPUTE) {
1306      mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1307      mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1308      if (!mach->Inputs || !mach->Outputs)
1309         goto fail;
1310   }
1311
1312   /* Setup constants needed by the SSE2 executor. */
1313   for( i = 0; i < 4; i++ ) {
1314      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
1315      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
1316      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
1317      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
1318      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
1319      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
1320      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
1321      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
1322      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
1323      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
1324   }
1325
1326#ifdef DEBUG
1327   /* silence warnings */
1328   (void) print_chan;
1329   (void) print_temp;
1330#endif
1331
1332   return mach;
1333
1334fail:
1335   if (mach) {
1336      align_free(mach->Inputs);
1337      align_free(mach->Outputs);
1338      align_free(mach);
1339   }
1340   return NULL;
1341}
1342
1343
1344void
1345tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1346{
1347   if (mach) {
1348      FREE(mach->Instructions);
1349      FREE(mach->Declarations);
1350      FREE(mach->Imms);
1351
1352      align_free(mach->Inputs);
1353      align_free(mach->Outputs);
1354
1355      align_free(mach);
1356   }
1357}
1358
1359static void
1360micro_add(union tgsi_exec_channel *dst,
1361          const union tgsi_exec_channel *src0,
1362          const union tgsi_exec_channel *src1)
1363{
1364   dst->f[0] = src0->f[0] + src1->f[0];
1365   dst->f[1] = src0->f[1] + src1->f[1];
1366   dst->f[2] = src0->f[2] + src1->f[2];
1367   dst->f[3] = src0->f[3] + src1->f[3];
1368}
1369
1370static void
1371micro_div(
1372   union tgsi_exec_channel *dst,
1373   const union tgsi_exec_channel *src0,
1374   const union tgsi_exec_channel *src1 )
1375{
1376   if (src1->f[0] != 0) {
1377      dst->f[0] = src0->f[0] / src1->f[0];
1378   }
1379   if (src1->f[1] != 0) {
1380      dst->f[1] = src0->f[1] / src1->f[1];
1381   }
1382   if (src1->f[2] != 0) {
1383      dst->f[2] = src0->f[2] / src1->f[2];
1384   }
1385   if (src1->f[3] != 0) {
1386      dst->f[3] = src0->f[3] / src1->f[3];
1387   }
1388}
1389
1390static void
1391micro_lt(
1392   union tgsi_exec_channel *dst,
1393   const union tgsi_exec_channel *src0,
1394   const union tgsi_exec_channel *src1,
1395   const union tgsi_exec_channel *src2,
1396   const union tgsi_exec_channel *src3 )
1397{
1398   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1399   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1400   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1401   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1402}
1403
1404static void
1405micro_max(union tgsi_exec_channel *dst,
1406          const union tgsi_exec_channel *src0,
1407          const union tgsi_exec_channel *src1)
1408{
1409   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
1410   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
1411   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
1412   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
1413}
1414
1415static void
1416micro_min(union tgsi_exec_channel *dst,
1417          const union tgsi_exec_channel *src0,
1418          const union tgsi_exec_channel *src1)
1419{
1420   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
1421   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
1422   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
1423   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
1424}
1425
1426static void
1427micro_mul(union tgsi_exec_channel *dst,
1428          const union tgsi_exec_channel *src0,
1429          const union tgsi_exec_channel *src1)
1430{
1431   dst->f[0] = src0->f[0] * src1->f[0];
1432   dst->f[1] = src0->f[1] * src1->f[1];
1433   dst->f[2] = src0->f[2] * src1->f[2];
1434   dst->f[3] = src0->f[3] * src1->f[3];
1435}
1436
1437static void
1438micro_neg(
1439   union tgsi_exec_channel *dst,
1440   const union tgsi_exec_channel *src )
1441{
1442   dst->f[0] = -src->f[0];
1443   dst->f[1] = -src->f[1];
1444   dst->f[2] = -src->f[2];
1445   dst->f[3] = -src->f[3];
1446}
1447
1448static void
1449micro_pow(
1450   union tgsi_exec_channel *dst,
1451   const union tgsi_exec_channel *src0,
1452   const union tgsi_exec_channel *src1 )
1453{
1454#if FAST_MATH
1455   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1456   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1457   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1458   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1459#else
1460   dst->f[0] = powf( src0->f[0], src1->f[0] );
1461   dst->f[1] = powf( src0->f[1], src1->f[1] );
1462   dst->f[2] = powf( src0->f[2], src1->f[2] );
1463   dst->f[3] = powf( src0->f[3], src1->f[3] );
1464#endif
1465}
1466
1467static void
1468micro_ldexp(union tgsi_exec_channel *dst,
1469            const union tgsi_exec_channel *src0,
1470            const union tgsi_exec_channel *src1)
1471{
1472   dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
1473   dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
1474   dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
1475   dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
1476}
1477
1478static void
1479micro_sub(union tgsi_exec_channel *dst,
1480          const union tgsi_exec_channel *src0,
1481          const union tgsi_exec_channel *src1)
1482{
1483   dst->f[0] = src0->f[0] - src1->f[0];
1484   dst->f[1] = src0->f[1] - src1->f[1];
1485   dst->f[2] = src0->f[2] - src1->f[2];
1486   dst->f[3] = src0->f[3] - src1->f[3];
1487}
1488
1489static void
1490fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1491                       const uint file,
1492                       const uint swizzle,
1493                       const union tgsi_exec_channel *index,
1494                       const union tgsi_exec_channel *index2D,
1495                       union tgsi_exec_channel *chan)
1496{
1497   uint i;
1498
1499   assert(swizzle < 4);
1500
1501   switch (file) {
1502   case TGSI_FILE_CONSTANT:
1503      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1504         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1505         assert(mach->Consts[index2D->i[i]]);
1506
1507         if (index->i[i] < 0) {
1508            chan->u[i] = 0;
1509         } else {
1510            /* NOTE: copying the const value as a uint instead of float */
1511            const uint constbuf = index2D->i[i];
1512            const uint *buf = (const uint *)mach->Consts[constbuf];
1513            const int pos = index->i[i] * 4 + swizzle;
1514            /* const buffer bounds check */
1515            if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
1516               if (0) {
1517                  /* Debug: print warning */
1518                  static int count = 0;
1519                  if (count++ < 100)
1520                     debug_printf("TGSI Exec: const buffer index %d"
1521                                  " out of bounds\n", pos);
1522               }
1523               chan->u[i] = 0;
1524            }
1525            else
1526               chan->u[i] = buf[pos];
1527         }
1528      }
1529      break;
1530
1531   case TGSI_FILE_INPUT:
1532      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1533         /*
1534         if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1535            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1536                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1537                         index2D->i[i], index->i[i]);
1538                         }*/
1539         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1540         assert(pos >= 0);
1541         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1542         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1543      }
1544      break;
1545
1546   case TGSI_FILE_SYSTEM_VALUE:
1547      /* XXX no swizzling at this point.  Will be needed if we put
1548       * gl_FragCoord, for example, in a sys value register.
1549       */
1550      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1551         chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
1552      }
1553      break;
1554
1555   case TGSI_FILE_TEMPORARY:
1556      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1557         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1558         assert(index2D->i[i] == 0);
1559
1560         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1561      }
1562      break;
1563
1564   case TGSI_FILE_IMMEDIATE:
1565      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1566         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1567         assert(index2D->i[i] == 0);
1568
1569         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1570      }
1571      break;
1572
1573   case TGSI_FILE_ADDRESS:
1574      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1575         assert(index->i[i] >= 0);
1576         assert(index2D->i[i] == 0);
1577
1578         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1579      }
1580      break;
1581
1582   case TGSI_FILE_OUTPUT:
1583      /* vertex/fragment output vars can be read too */
1584      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1585         assert(index->i[i] >= 0);
1586         assert(index2D->i[i] == 0);
1587
1588         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1589      }
1590      break;
1591
1592   default:
1593      assert(0);
1594      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1595         chan->u[i] = 0;
1596      }
1597   }
1598}
1599
1600static void
1601fetch_source_d(const struct tgsi_exec_machine *mach,
1602               union tgsi_exec_channel *chan,
1603               const struct tgsi_full_src_register *reg,
1604	       const uint chan_index)
1605{
1606   union tgsi_exec_channel index;
1607   union tgsi_exec_channel index2D;
1608   uint swizzle;
1609
1610   /* We start with a direct index into a register file.
1611    *
1612    *    file[1],
1613    *    where:
1614    *       file = Register.File
1615    *       [1] = Register.Index
1616    */
1617   index.i[0] =
1618   index.i[1] =
1619   index.i[2] =
1620   index.i[3] = reg->Register.Index;
1621
1622   /* There is an extra source register that indirectly subscripts
1623    * a register file. The direct index now becomes an offset
1624    * that is being added to the indirect register.
1625    *
1626    *    file[ind[2].x+1],
1627    *    where:
1628    *       ind = Indirect.File
1629    *       [2] = Indirect.Index
1630    *       .x = Indirect.SwizzleX
1631    */
1632   if (reg->Register.Indirect) {
1633      union tgsi_exec_channel index2;
1634      union tgsi_exec_channel indir_index;
1635      const uint execmask = mach->ExecMask;
1636      uint i;
1637
1638      /* which address register (always zero now) */
1639      index2.i[0] =
1640      index2.i[1] =
1641      index2.i[2] =
1642      index2.i[3] = reg->Indirect.Index;
1643      /* get current value of address register[swizzle] */
1644      swizzle = reg->Indirect.Swizzle;
1645      fetch_src_file_channel(mach,
1646                             reg->Indirect.File,
1647                             swizzle,
1648                             &index2,
1649                             &ZeroVec,
1650                             &indir_index);
1651
1652      /* add value of address register to the offset */
1653      index.i[0] += indir_index.i[0];
1654      index.i[1] += indir_index.i[1];
1655      index.i[2] += indir_index.i[2];
1656      index.i[3] += indir_index.i[3];
1657
1658      /* for disabled execution channels, zero-out the index to
1659       * avoid using a potential garbage value.
1660       */
1661      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1662         if ((execmask & (1 << i)) == 0)
1663            index.i[i] = 0;
1664      }
1665   }
1666
1667   /* There is an extra source register that is a second
1668    * subscript to a register file. Effectively it means that
1669    * the register file is actually a 2D array of registers.
1670    *
1671    *    file[3][1],
1672    *    where:
1673    *       [3] = Dimension.Index
1674    */
1675   if (reg->Register.Dimension) {
1676      index2D.i[0] =
1677      index2D.i[1] =
1678      index2D.i[2] =
1679      index2D.i[3] = reg->Dimension.Index;
1680
1681      /* Again, the second subscript index can be addressed indirectly
1682       * identically to the first one.
1683       * Nothing stops us from indirectly addressing the indirect register,
1684       * but there is no need for that, so we won't exercise it.
1685       *
1686       *    file[ind[4].y+3][1],
1687       *    where:
1688       *       ind = DimIndirect.File
1689       *       [4] = DimIndirect.Index
1690       *       .y = DimIndirect.SwizzleX
1691       */
1692      if (reg->Dimension.Indirect) {
1693         union tgsi_exec_channel index2;
1694         union tgsi_exec_channel indir_index;
1695         const uint execmask = mach->ExecMask;
1696         uint i;
1697
1698         index2.i[0] =
1699         index2.i[1] =
1700         index2.i[2] =
1701         index2.i[3] = reg->DimIndirect.Index;
1702
1703         swizzle = reg->DimIndirect.Swizzle;
1704         fetch_src_file_channel(mach,
1705                                reg->DimIndirect.File,
1706                                swizzle,
1707                                &index2,
1708                                &ZeroVec,
1709                                &indir_index);
1710
1711         index2D.i[0] += indir_index.i[0];
1712         index2D.i[1] += indir_index.i[1];
1713         index2D.i[2] += indir_index.i[2];
1714         index2D.i[3] += indir_index.i[3];
1715
1716         /* for disabled execution channels, zero-out the index to
1717          * avoid using a potential garbage value.
1718          */
1719         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1720            if ((execmask & (1 << i)) == 0) {
1721               index2D.i[i] = 0;
1722            }
1723         }
1724      }
1725
1726      /* If by any chance there was a need for a 3D array of register
1727       * files, we would have to check whether Dimension is followed
1728       * by a dimension register and continue the saga.
1729       */
1730   } else {
1731      index2D.i[0] =
1732      index2D.i[1] =
1733      index2D.i[2] =
1734      index2D.i[3] = 0;
1735   }
1736
1737   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1738   fetch_src_file_channel(mach,
1739                          reg->Register.File,
1740                          swizzle,
1741                          &index,
1742                          &index2D,
1743                          chan);
1744}
1745
1746static void
1747fetch_source(const struct tgsi_exec_machine *mach,
1748             union tgsi_exec_channel *chan,
1749             const struct tgsi_full_src_register *reg,
1750             const uint chan_index,
1751             enum tgsi_exec_datatype src_datatype)
1752{
1753   fetch_source_d(mach, chan, reg, chan_index);
1754
1755   if (reg->Register.Absolute) {
1756      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1757         micro_abs(chan, chan);
1758      } else {
1759         micro_iabs(chan, chan);
1760      }
1761   }
1762
1763   if (reg->Register.Negate) {
1764      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1765         micro_neg(chan, chan);
1766      } else {
1767         micro_ineg(chan, chan);
1768      }
1769   }
1770}
1771
1772static union tgsi_exec_channel *
1773store_dest_dstret(struct tgsi_exec_machine *mach,
1774                 const union tgsi_exec_channel *chan,
1775                 const struct tgsi_full_dst_register *reg,
1776                 uint chan_index,
1777                 enum tgsi_exec_datatype dst_datatype)
1778{
1779   static union tgsi_exec_channel null;
1780   union tgsi_exec_channel *dst;
1781   union tgsi_exec_channel index2D;
1782   int offset = 0;  /* indirection offset */
1783   int index;
1784
1785   /* for debugging */
1786   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1787      check_inf_or_nan(chan);
1788   }
1789
1790   /* There is an extra source register that indirectly subscripts
1791    * a register file. The direct index now becomes an offset
1792    * that is being added to the indirect register.
1793    *
1794    *    file[ind[2].x+1],
1795    *    where:
1796    *       ind = Indirect.File
1797    *       [2] = Indirect.Index
1798    *       .x = Indirect.SwizzleX
1799    */
1800   if (reg->Register.Indirect) {
1801      union tgsi_exec_channel index;
1802      union tgsi_exec_channel indir_index;
1803      uint swizzle;
1804
1805      /* which address register (always zero for now) */
1806      index.i[0] =
1807      index.i[1] =
1808      index.i[2] =
1809      index.i[3] = reg->Indirect.Index;
1810
1811      /* get current value of address register[swizzle] */
1812      swizzle = reg->Indirect.Swizzle;
1813
1814      /* fetch values from the address/indirection register */
1815      fetch_src_file_channel(mach,
1816                             reg->Indirect.File,
1817                             swizzle,
1818                             &index,
1819                             &ZeroVec,
1820                             &indir_index);
1821
1822      /* save indirection offset */
1823      offset = indir_index.i[0];
1824   }
1825
1826   /* There is an extra source register that is a second
1827    * subscript to a register file. Effectively it means that
1828    * the register file is actually a 2D array of registers.
1829    *
1830    *    file[3][1],
1831    *    where:
1832    *       [3] = Dimension.Index
1833    */
1834   if (reg->Register.Dimension) {
1835      index2D.i[0] =
1836      index2D.i[1] =
1837      index2D.i[2] =
1838      index2D.i[3] = reg->Dimension.Index;
1839
1840      /* Again, the second subscript index can be addressed indirectly
1841       * identically to the first one.
1842       * Nothing stops us from indirectly addressing the indirect register,
1843       * but there is no need for that, so we won't exercise it.
1844       *
1845       *    file[ind[4].y+3][1],
1846       *    where:
1847       *       ind = DimIndirect.File
1848       *       [4] = DimIndirect.Index
1849       *       .y = DimIndirect.SwizzleX
1850       */
1851      if (reg->Dimension.Indirect) {
1852         union tgsi_exec_channel index2;
1853         union tgsi_exec_channel indir_index;
1854         const uint execmask = mach->ExecMask;
1855         unsigned swizzle;
1856         uint i;
1857
1858         index2.i[0] =
1859         index2.i[1] =
1860         index2.i[2] =
1861         index2.i[3] = reg->DimIndirect.Index;
1862
1863         swizzle = reg->DimIndirect.Swizzle;
1864         fetch_src_file_channel(mach,
1865                                reg->DimIndirect.File,
1866                                swizzle,
1867                                &index2,
1868                                &ZeroVec,
1869                                &indir_index);
1870
1871         index2D.i[0] += indir_index.i[0];
1872         index2D.i[1] += indir_index.i[1];
1873         index2D.i[2] += indir_index.i[2];
1874         index2D.i[3] += indir_index.i[3];
1875
1876         /* for disabled execution channels, zero-out the index to
1877          * avoid using a potential garbage value.
1878          */
1879         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1880            if ((execmask & (1 << i)) == 0) {
1881               index2D.i[i] = 0;
1882            }
1883         }
1884      }
1885
1886      /* If by any chance there was a need for a 3D array of register
1887       * files, we would have to check whether Dimension is followed
1888       * by a dimension register and continue the saga.
1889       */
1890   } else {
1891      index2D.i[0] =
1892      index2D.i[1] =
1893      index2D.i[2] =
1894      index2D.i[3] = 0;
1895   }
1896
1897   switch (reg->Register.File) {
1898   case TGSI_FILE_NULL:
1899      dst = &null;
1900      break;
1901
1902   case TGSI_FILE_OUTPUT:
1903      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1904         + reg->Register.Index;
1905      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1906#if 0
1907      debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1908                   mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1909                   reg->Register.Index);
1910      if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1911         debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1912         for (i = 0; i < TGSI_QUAD_SIZE; i++)
1913            if (execmask & (1 << i))
1914               debug_printf("%f, ", chan->f[i]);
1915         debug_printf(")\n");
1916      }
1917#endif
1918      break;
1919
1920   case TGSI_FILE_TEMPORARY:
1921      index = reg->Register.Index;
1922      assert( index < TGSI_EXEC_NUM_TEMPS );
1923      dst = &mach->Temps[offset + index].xyzw[chan_index];
1924      break;
1925
1926   case TGSI_FILE_ADDRESS:
1927      index = reg->Register.Index;
1928      dst = &mach->Addrs[index].xyzw[chan_index];
1929      break;
1930
1931   default:
1932      assert( 0 );
1933      return NULL;
1934   }
1935
1936   return dst;
1937}
1938
1939static void
1940store_dest_double(struct tgsi_exec_machine *mach,
1941                 const union tgsi_exec_channel *chan,
1942                 const struct tgsi_full_dst_register *reg,
1943                 uint chan_index,
1944                 enum tgsi_exec_datatype dst_datatype)
1945{
1946   union tgsi_exec_channel *dst;
1947   const uint execmask = mach->ExecMask;
1948   int i;
1949
1950   dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1951   if (!dst)
1952      return;
1953
1954   /* doubles path */
1955   for (i = 0; i < TGSI_QUAD_SIZE; i++)
1956      if (execmask & (1 << i))
1957         dst->i[i] = chan->i[i];
1958}
1959
1960static void
1961store_dest(struct tgsi_exec_machine *mach,
1962           const union tgsi_exec_channel *chan,
1963           const struct tgsi_full_dst_register *reg,
1964           const struct tgsi_full_instruction *inst,
1965           uint chan_index,
1966           enum tgsi_exec_datatype dst_datatype)
1967{
1968   union tgsi_exec_channel *dst;
1969   const uint execmask = mach->ExecMask;
1970   int i;
1971
1972   dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1973   if (!dst)
1974      return;
1975
1976   if (!inst->Instruction.Saturate) {
1977      for (i = 0; i < TGSI_QUAD_SIZE; i++)
1978         if (execmask & (1 << i))
1979            dst->i[i] = chan->i[i];
1980   }
1981   else {
1982      for (i = 0; i < TGSI_QUAD_SIZE; i++)
1983         if (execmask & (1 << i)) {
1984            if (chan->f[i] < 0.0f)
1985               dst->f[i] = 0.0f;
1986            else if (chan->f[i] > 1.0f)
1987               dst->f[i] = 1.0f;
1988            else
1989               dst->i[i] = chan->i[i];
1990         }
1991   }
1992}
1993
/*
 * Shorthand for fetch_source() on source operand INDEX of the current
 * instruction, reading channel CHAN into VAL as float data.
 * Relies on variables named `mach` and `inst` being in scope at the
 * point of expansion.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/*
 * Same as FETCH, but the operand is fetched as integer data.
 */
#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
1999
2000
2001/**
2002 * Execute ARB-style KIL which is predicated by a src register.
2003 * Kill fragment if any of the four values is less than zero.
2004 */
2005static void
2006exec_kill_if(struct tgsi_exec_machine *mach,
2007             const struct tgsi_full_instruction *inst)
2008{
2009   uint uniquemask;
2010   uint chan_index;
2011   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2012   union tgsi_exec_channel r[1];
2013
2014   /* This mask stores component bits that were already tested. */
2015   uniquemask = 0;
2016
2017   for (chan_index = 0; chan_index < 4; chan_index++)
2018   {
2019      uint swizzle;
2020      uint i;
2021
2022      /* unswizzle channel */
2023      swizzle = tgsi_util_get_full_src_register_swizzle (
2024                        &inst->Src[0],
2025                        chan_index);
2026
2027      /* check if the component has not been already tested */
2028      if (uniquemask & (1 << swizzle))
2029         continue;
2030      uniquemask |= 1 << swizzle;
2031
2032      FETCH(&r[0], 0, chan_index);
2033      for (i = 0; i < 4; i++)
2034         if (r[0].f[i] < 0.0f)
2035            kilmask |= 1 << i;
2036   }
2037
2038   /* restrict to fragments currently executing */
2039   kilmask &= mach->ExecMask;
2040
2041   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2042}
2043
2044/**
2045 * Unconditional fragment kill/discard.
2046 */
2047static void
2048exec_kill(struct tgsi_exec_machine *mach)
2049{
2050   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2051
2052   /* kill fragment for all fragments currently executing */
2053   kilmask = mach->ExecMask;
2054   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2055}
2056
2057static void
2058emit_vertex(struct tgsi_exec_machine *mach)
2059{
2060   /* FIXME: check for exec mask correctly
2061   unsigned i;
2062   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2063         if ((mach->ExecMask & (1 << i)))
2064   */
2065   if (mach->ExecMask) {
2066      if (mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] >= mach->MaxOutputVertices)
2067         return;
2068
2069      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
2070      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
2071   }
2072}
2073
2074static void
2075emit_primitive(struct tgsi_exec_machine *mach)
2076{
2077   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
2078   /* FIXME: check for exec mask correctly
2079   unsigned i;
2080   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2081         if ((mach->ExecMask & (1 << i)))
2082   */
2083   if (mach->ExecMask) {
2084      ++(*prim_count);
2085      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
2086      mach->Primitives[*prim_count] = 0;
2087   }
2088}
2089
2090static void
2091conditional_emit_primitive(struct tgsi_exec_machine *mach)
2092{
2093   if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
2094      int emitted_verts =
2095         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
2096      if (emitted_verts) {
2097         emit_primitive(mach);
2098      }
2099   }
2100}
2101
2102
2103/*
2104 * Fetch four texture samples using STR texture coordinates.
2105 */
2106static void
2107fetch_texel( struct tgsi_sampler *sampler,
2108             const unsigned sview_idx,
2109             const unsigned sampler_idx,
2110             const union tgsi_exec_channel *s,
2111             const union tgsi_exec_channel *t,
2112             const union tgsi_exec_channel *p,
2113             const union tgsi_exec_channel *c0,
2114             const union tgsi_exec_channel *c1,
2115             float derivs[3][2][TGSI_QUAD_SIZE],
2116             const int8_t offset[3],
2117             enum tgsi_sampler_control control,
2118             union tgsi_exec_channel *r,
2119             union tgsi_exec_channel *g,
2120             union tgsi_exec_channel *b,
2121             union tgsi_exec_channel *a )
2122{
2123   uint j;
2124   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2125
2126   /* FIXME: handle explicit derivs, offsets */
2127   sampler->get_samples(sampler, sview_idx, sampler_idx,
2128                        s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
2129
2130   for (j = 0; j < 4; j++) {
2131      r->f[j] = rgba[0][j];
2132      g->f[j] = rgba[1][j];
2133      b->f[j] = rgba[2][j];
2134      a->f[j] = rgba[3][j];
2135   }
2136}
2137
2138
/*
 * Values for the `modifier` argument of exec_tex() and exec_sample().
 *
 *   NONE          - no extra modifier operand is fetched
 *   PROJECTED     - coordinates are divided by the fetched modifier value
 *                   (exec_tex only; exec_sample asserts against it)
 *   LOD_BIAS      - sampled with TGSI_SAMPLER_LOD_BIAS
 *   EXPLICIT_LOD  - sampled with TGSI_SAMPLER_LOD_EXPLICIT
 *   LEVEL_ZERO    - sampled with TGSI_SAMPLER_LOD_ZERO
 *                   (exec_sample only; exec_tex asserts against it)
 *   GATHER        - sampled with TGSI_SAMPLER_GATHER
 */
#define TEX_MODIFIER_NONE           0
#define TEX_MODIFIER_PROJECTED      1
#define TEX_MODIFIER_LOD_BIAS       2
#define TEX_MODIFIER_EXPLICIT_LOD   3
#define TEX_MODIFIER_LEVEL_ZERO     4
#define TEX_MODIFIER_GATHER         5
2145
2146/*
2147 * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
2148 */
2149static void
2150fetch_texel_offsets(struct tgsi_exec_machine *mach,
2151                    const struct tgsi_full_instruction *inst,
2152                    int8_t offsets[3])
2153{
2154   if (inst->Texture.NumOffsets == 1) {
2155      union tgsi_exec_channel index;
2156      union tgsi_exec_channel offset[3];
2157      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2158      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2159                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2160      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2161                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2162      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2163                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2164     offsets[0] = offset[0].i[0];
2165     offsets[1] = offset[1].i[0];
2166     offsets[2] = offset[2].i[0];
2167   } else {
2168     assert(inst->Texture.NumOffsets == 0);
2169     offsets[0] = offsets[1] = offsets[2] = 0;
2170   }
2171}
2172
2173
2174/*
2175 * Fetch dx and dy values for one channel (s, t or r).
2176 * Put dx values into one float array, dy values into another.
2177 */
2178static void
2179fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
2180                           const struct tgsi_full_instruction *inst,
2181                           unsigned regdsrcx,
2182                           unsigned chan,
2183                           float derivs[2][TGSI_QUAD_SIZE])
2184{
2185   union tgsi_exec_channel d;
2186   FETCH(&d, regdsrcx, chan);
2187   derivs[0][0] = d.f[0];
2188   derivs[0][1] = d.f[1];
2189   derivs[0][2] = d.f[2];
2190   derivs[0][3] = d.f[3];
2191   FETCH(&d, regdsrcx + 1, chan);
2192   derivs[1][0] = d.f[0];
2193   derivs[1][1] = d.f[1];
2194   derivs[1][2] = d.f[2];
2195   derivs[1][3] = d.f[3];
2196}
2197
2198static uint
2199fetch_sampler_unit(struct tgsi_exec_machine *mach,
2200                   const struct tgsi_full_instruction *inst,
2201                   uint sampler)
2202{
2203   uint unit = 0;
2204   int i;
2205   if (inst->Src[sampler].Register.Indirect) {
2206      const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2207      union tgsi_exec_channel indir_index, index2;
2208      const uint execmask = mach->ExecMask;
2209      index2.i[0] =
2210      index2.i[1] =
2211      index2.i[2] =
2212      index2.i[3] = reg->Indirect.Index;
2213
2214      fetch_src_file_channel(mach,
2215                             reg->Indirect.File,
2216                             reg->Indirect.Swizzle,
2217                             &index2,
2218                             &ZeroVec,
2219                             &indir_index);
2220      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2221         if (execmask & (1 << i)) {
2222            unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2223            break;
2224         }
2225      }
2226
2227   } else {
2228      unit = inst->Src[sampler].Register.Index;
2229   }
2230   return unit;
2231}
2232
2233/*
2234 * execute a texture instruction.
2235 *
2236 * modifier is used to control the channel routing for the
2237 * instruction variants like proj, lod, and texture with lod bias.
2238 * sampler indicates which src register the sampler is contained in.
2239 */
2240static void
2241exec_tex(struct tgsi_exec_machine *mach,
2242         const struct tgsi_full_instruction *inst,
2243         uint modifier, uint sampler)
2244{
2245   const union tgsi_exec_channel *args[5], *proj = NULL;
2246   union tgsi_exec_channel r[5];
2247   enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2248   uint chan;
2249   uint unit;
2250   int8_t offsets[3];
2251   int dim, shadow_ref, i;
2252
2253   unit = fetch_sampler_unit(mach, inst, sampler);
2254   /* always fetch all 3 offsets, overkill but keeps code simple */
2255   fetch_texel_offsets(mach, inst, offsets);
2256
2257   assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2258   assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2259
2260   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2261   shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2262
2263   assert(dim <= 4);
2264   if (shadow_ref >= 0)
2265      assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
2266
2267   /* fetch modifier to the last argument */
2268   if (modifier != TEX_MODIFIER_NONE) {
2269      const int last = ARRAY_SIZE(args) - 1;
2270
2271      /* fetch modifier from src0.w or src1.x */
2272      if (sampler == 1) {
2273         assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2274         FETCH(&r[last], 0, TGSI_CHAN_W);
2275      }
2276      else {
2277         assert(shadow_ref != 4);
2278         FETCH(&r[last], 1, TGSI_CHAN_X);
2279      }
2280
2281      if (modifier != TEX_MODIFIER_PROJECTED) {
2282         args[last] = &r[last];
2283      }
2284      else {
2285         proj = &r[last];
2286         args[last] = &ZeroVec;
2287      }
2288
2289      /* point unused arguments to zero vector */
2290      for (i = dim; i < last; i++)
2291         args[i] = &ZeroVec;
2292
2293      if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2294         control = TGSI_SAMPLER_LOD_EXPLICIT;
2295      else if (modifier == TEX_MODIFIER_LOD_BIAS)
2296         control = TGSI_SAMPLER_LOD_BIAS;
2297      else if (modifier == TEX_MODIFIER_GATHER)
2298         control = TGSI_SAMPLER_GATHER;
2299   }
2300   else {
2301      for (i = dim; i < (int)ARRAY_SIZE(args); i++)
2302         args[i] = &ZeroVec;
2303   }
2304
2305   /* fetch coordinates */
2306   for (i = 0; i < dim; i++) {
2307      FETCH(&r[i], 0, TGSI_CHAN_X + i);
2308
2309      if (proj)
2310         micro_div(&r[i], &r[i], proj);
2311
2312      args[i] = &r[i];
2313   }
2314
2315   /* fetch reference value */
2316   if (shadow_ref >= 0) {
2317      FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2318
2319      if (proj)
2320         micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2321
2322      args[shadow_ref] = &r[shadow_ref];
2323   }
2324
2325   fetch_texel(mach->Sampler, unit, unit,
2326         args[0], args[1], args[2], args[3], args[4],
2327         NULL, offsets, control,
2328         &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2329
2330#if 0
2331   debug_printf("fetch r: %g %g %g %g\n",
2332         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2333   debug_printf("fetch g: %g %g %g %g\n",
2334         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2335   debug_printf("fetch b: %g %g %g %g\n",
2336         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2337   debug_printf("fetch a: %g %g %g %g\n",
2338         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2339#endif
2340
2341   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2342      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2343         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2344      }
2345   }
2346}
2347
2348static void
2349exec_lodq(struct tgsi_exec_machine *mach,
2350          const struct tgsi_full_instruction *inst)
2351{
2352   uint resource_unit, sampler_unit;
2353   unsigned dim;
2354   unsigned i;
2355   union tgsi_exec_channel coords[4];
2356   const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2357   union tgsi_exec_channel r[2];
2358
2359   resource_unit = fetch_sampler_unit(mach, inst, 1);
2360   if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2361      uint target = mach->SamplerViews[resource_unit].Resource;
2362      dim = tgsi_util_get_texture_coord_dim(target);
2363      sampler_unit = fetch_sampler_unit(mach, inst, 2);
2364   } else {
2365      dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2366      sampler_unit = resource_unit;
2367   }
2368   assert(dim <= ARRAY_SIZE(coords));
2369   /* fetch coordinates */
2370   for (i = 0; i < dim; i++) {
2371      FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2372      args[i] = &coords[i];
2373   }
2374   for (i = dim; i < ARRAY_SIZE(coords); i++) {
2375      args[i] = &ZeroVec;
2376   }
2377   mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
2378                            args[0]->f,
2379                            args[1]->f,
2380                            args[2]->f,
2381                            args[3]->f,
2382                            TGSI_SAMPLER_LOD_NONE,
2383                            r[0].f,
2384                            r[1].f);
2385
2386   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2387      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2388                 TGSI_EXEC_DATA_FLOAT);
2389   }
2390   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2391      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2392                 TGSI_EXEC_DATA_FLOAT);
2393   }
2394   if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2395      unsigned char swizzles[4];
2396      unsigned chan;
2397      swizzles[0] = inst->Src[1].Register.SwizzleX;
2398      swizzles[1] = inst->Src[1].Register.SwizzleY;
2399      swizzles[2] = inst->Src[1].Register.SwizzleZ;
2400      swizzles[3] = inst->Src[1].Register.SwizzleW;
2401
2402      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2403         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2404            if (swizzles[chan] >= 2) {
2405               store_dest(mach, &ZeroVec,
2406                          &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2407            } else {
2408               store_dest(mach, &r[swizzles[chan]],
2409                          &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2410            }
2411         }
2412      }
2413   } else {
2414      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2415         store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2416                    TGSI_EXEC_DATA_FLOAT);
2417      }
2418      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2419         store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2420                    TGSI_EXEC_DATA_FLOAT);
2421      }
2422   }
2423}
2424
2425static void
2426exec_txd(struct tgsi_exec_machine *mach,
2427         const struct tgsi_full_instruction *inst)
2428{
2429   union tgsi_exec_channel r[4];
2430   float derivs[3][2][TGSI_QUAD_SIZE];
2431   uint chan;
2432   uint unit;
2433   int8_t offsets[3];
2434
2435   unit = fetch_sampler_unit(mach, inst, 3);
2436   /* always fetch all 3 offsets, overkill but keeps code simple */
2437   fetch_texel_offsets(mach, inst, offsets);
2438
2439   switch (inst->Texture.Texture) {
2440   case TGSI_TEXTURE_1D:
2441      FETCH(&r[0], 0, TGSI_CHAN_X);
2442
2443      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2444
2445      fetch_texel(mach->Sampler, unit, unit,
2446                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2447                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2448                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2449      break;
2450
2451   case TGSI_TEXTURE_SHADOW1D:
2452   case TGSI_TEXTURE_1D_ARRAY:
2453   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2454      /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2455      FETCH(&r[0], 0, TGSI_CHAN_X);
2456      FETCH(&r[1], 0, TGSI_CHAN_Y);
2457      FETCH(&r[2], 0, TGSI_CHAN_Z);
2458
2459      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2460
2461      fetch_texel(mach->Sampler, unit, unit,
2462                  &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2463                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2464                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2465      break;
2466
2467   case TGSI_TEXTURE_2D:
2468   case TGSI_TEXTURE_RECT:
2469      FETCH(&r[0], 0, TGSI_CHAN_X);
2470      FETCH(&r[1], 0, TGSI_CHAN_Y);
2471
2472      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2473      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2474
2475      fetch_texel(mach->Sampler, unit, unit,
2476                  &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2477                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2478                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2479      break;
2480
2481
2482   case TGSI_TEXTURE_SHADOW2D:
2483   case TGSI_TEXTURE_SHADOWRECT:
2484   case TGSI_TEXTURE_2D_ARRAY:
2485   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2486      /* only SHADOW2D_ARRAY actually needs W */
2487      FETCH(&r[0], 0, TGSI_CHAN_X);
2488      FETCH(&r[1], 0, TGSI_CHAN_Y);
2489      FETCH(&r[2], 0, TGSI_CHAN_Z);
2490      FETCH(&r[3], 0, TGSI_CHAN_W);
2491
2492      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2493      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2494
2495      fetch_texel(mach->Sampler, unit, unit,
2496                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2497                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2498                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2499      break;
2500
2501   case TGSI_TEXTURE_3D:
2502   case TGSI_TEXTURE_CUBE:
2503   case TGSI_TEXTURE_CUBE_ARRAY:
2504   case TGSI_TEXTURE_SHADOWCUBE:
2505      /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2506      FETCH(&r[0], 0, TGSI_CHAN_X);
2507      FETCH(&r[1], 0, TGSI_CHAN_Y);
2508      FETCH(&r[2], 0, TGSI_CHAN_Z);
2509      FETCH(&r[3], 0, TGSI_CHAN_W);
2510
2511      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2512      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2513      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2514
2515      fetch_texel(mach->Sampler, unit, unit,
2516                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2517                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2518                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2519      break;
2520
2521   default:
2522      assert(0);
2523   }
2524
2525   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2526      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2527         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2528      }
2529   }
2530}
2531
2532
2533static void
2534exec_txf(struct tgsi_exec_machine *mach,
2535         const struct tgsi_full_instruction *inst)
2536{
2537   union tgsi_exec_channel r[4];
2538   uint chan;
2539   uint unit;
2540   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2541   int j;
2542   int8_t offsets[3];
2543   unsigned target;
2544
2545   unit = fetch_sampler_unit(mach, inst, 1);
2546   /* always fetch all 3 offsets, overkill but keeps code simple */
2547   fetch_texel_offsets(mach, inst, offsets);
2548
2549   IFETCH(&r[3], 0, TGSI_CHAN_W);
2550
2551   if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2552       inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2553      target = mach->SamplerViews[unit].Resource;
2554   }
2555   else {
2556      target = inst->Texture.Texture;
2557   }
2558   switch(target) {
2559   case TGSI_TEXTURE_3D:
2560   case TGSI_TEXTURE_2D_ARRAY:
2561   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2562   case TGSI_TEXTURE_2D_ARRAY_MSAA:
2563      IFETCH(&r[2], 0, TGSI_CHAN_Z);
2564      /* fallthrough */
2565   case TGSI_TEXTURE_2D:
2566   case TGSI_TEXTURE_RECT:
2567   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2568   case TGSI_TEXTURE_SHADOW2D:
2569   case TGSI_TEXTURE_SHADOWRECT:
2570   case TGSI_TEXTURE_1D_ARRAY:
2571   case TGSI_TEXTURE_2D_MSAA:
2572      IFETCH(&r[1], 0, TGSI_CHAN_Y);
2573      /* fallthrough */
2574   case TGSI_TEXTURE_BUFFER:
2575   case TGSI_TEXTURE_1D:
2576   case TGSI_TEXTURE_SHADOW1D:
2577      IFETCH(&r[0], 0, TGSI_CHAN_X);
2578      break;
2579   default:
2580      assert(0);
2581      break;
2582   }
2583
2584   mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2585                            offsets, rgba);
2586
2587   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2588      r[0].f[j] = rgba[0][j];
2589      r[1].f[j] = rgba[1][j];
2590      r[2].f[j] = rgba[2][j];
2591      r[3].f[j] = rgba[3][j];
2592   }
2593
2594   if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2595       inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2596      unsigned char swizzles[4];
2597      swizzles[0] = inst->Src[1].Register.SwizzleX;
2598      swizzles[1] = inst->Src[1].Register.SwizzleY;
2599      swizzles[2] = inst->Src[1].Register.SwizzleZ;
2600      swizzles[3] = inst->Src[1].Register.SwizzleW;
2601
2602      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2603         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2604            store_dest(mach, &r[swizzles[chan]],
2605                       &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2606         }
2607      }
2608   }
2609   else {
2610      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2611         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2612            store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2613         }
2614      }
2615   }
2616}
2617
2618static void
2619exec_txq(struct tgsi_exec_machine *mach,
2620         const struct tgsi_full_instruction *inst)
2621{
2622   int result[4];
2623   union tgsi_exec_channel r[4], src;
2624   uint chan;
2625   uint unit;
2626   int i,j;
2627
2628   unit = fetch_sampler_unit(mach, inst, 1);
2629
2630   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2631
2632   /* XXX: This interface can't return per-pixel values */
2633   mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2634
2635   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2636      for (j = 0; j < 4; j++) {
2637         r[j].i[i] = result[j];
2638      }
2639   }
2640
2641   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2642      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2643         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2644                    TGSI_EXEC_DATA_INT);
2645      }
2646   }
2647}
2648
2649static void
2650exec_sample(struct tgsi_exec_machine *mach,
2651            const struct tgsi_full_instruction *inst,
2652            uint modifier, boolean compare)
2653{
2654   const uint resource_unit = inst->Src[1].Register.Index;
2655   const uint sampler_unit = inst->Src[2].Register.Index;
2656   union tgsi_exec_channel r[5], c1;
2657   const union tgsi_exec_channel *lod = &ZeroVec;
2658   enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2659   uint chan;
2660   unsigned char swizzles[4];
2661   int8_t offsets[3];
2662
2663   /* always fetch all 3 offsets, overkill but keeps code simple */
2664   fetch_texel_offsets(mach, inst, offsets);
2665
2666   assert(modifier != TEX_MODIFIER_PROJECTED);
2667
2668   if (modifier != TEX_MODIFIER_NONE) {
2669      if (modifier == TEX_MODIFIER_LOD_BIAS) {
2670         FETCH(&c1, 3, TGSI_CHAN_X);
2671         lod = &c1;
2672         control = TGSI_SAMPLER_LOD_BIAS;
2673      }
2674      else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2675         FETCH(&c1, 3, TGSI_CHAN_X);
2676         lod = &c1;
2677         control = TGSI_SAMPLER_LOD_EXPLICIT;
2678      }
2679      else if (modifier == TEX_MODIFIER_GATHER) {
2680         control = TGSI_SAMPLER_GATHER;
2681      }
2682      else {
2683         assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2684         control = TGSI_SAMPLER_LOD_ZERO;
2685      }
2686   }
2687
2688   FETCH(&r[0], 0, TGSI_CHAN_X);
2689
2690   switch (mach->SamplerViews[resource_unit].Resource) {
2691   case TGSI_TEXTURE_1D:
2692      if (compare) {
2693         FETCH(&r[2], 3, TGSI_CHAN_X);
2694         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2695                     &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2696                     NULL, offsets, control,
2697                     &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2698      }
2699      else {
2700         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2701                     &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2702                     NULL, offsets, control,
2703                     &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2704      }
2705      break;
2706
2707   case TGSI_TEXTURE_1D_ARRAY:
2708   case TGSI_TEXTURE_2D:
2709   case TGSI_TEXTURE_RECT:
2710      FETCH(&r[1], 0, TGSI_CHAN_Y);
2711      if (compare) {
2712         FETCH(&r[2], 3, TGSI_CHAN_X);
2713         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2714                     &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2715                     NULL, offsets, control,
2716                     &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2717      }
2718      else {
2719         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2720                     &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2721                     NULL, offsets, control,
2722                     &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2723      }
2724      break;
2725
2726   case TGSI_TEXTURE_2D_ARRAY:
2727   case TGSI_TEXTURE_3D:
2728   case TGSI_TEXTURE_CUBE:
2729      FETCH(&r[1], 0, TGSI_CHAN_Y);
2730      FETCH(&r[2], 0, TGSI_CHAN_Z);
2731      if(compare) {
2732         FETCH(&r[3], 3, TGSI_CHAN_X);
2733         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2734                     &r[0], &r[1], &r[2], &r[3], lod,
2735                     NULL, offsets, control,
2736                     &r[0], &r[1], &r[2], &r[3]);
2737      }
2738      else {
2739         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2740                     &r[0], &r[1], &r[2], &ZeroVec, lod,
2741                     NULL, offsets, control,
2742                     &r[0], &r[1], &r[2], &r[3]);
2743      }
2744      break;
2745
2746   case TGSI_TEXTURE_CUBE_ARRAY:
2747      FETCH(&r[1], 0, TGSI_CHAN_Y);
2748      FETCH(&r[2], 0, TGSI_CHAN_Z);
2749      FETCH(&r[3], 0, TGSI_CHAN_W);
2750      if(compare) {
2751         FETCH(&r[4], 3, TGSI_CHAN_X);
2752         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2753                     &r[0], &r[1], &r[2], &r[3], &r[4],
2754                     NULL, offsets, control,
2755                     &r[0], &r[1], &r[2], &r[3]);
2756      }
2757      else {
2758         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2759                     &r[0], &r[1], &r[2], &r[3], lod,
2760                     NULL, offsets, control,
2761                     &r[0], &r[1], &r[2], &r[3]);
2762      }
2763      break;
2764
2765
2766   default:
2767      assert(0);
2768   }
2769
2770   swizzles[0] = inst->Src[1].Register.SwizzleX;
2771   swizzles[1] = inst->Src[1].Register.SwizzleY;
2772   swizzles[2] = inst->Src[1].Register.SwizzleZ;
2773   swizzles[3] = inst->Src[1].Register.SwizzleW;
2774
2775   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2776      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2777         store_dest(mach, &r[swizzles[chan]],
2778                    &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2779      }
2780   }
2781}
2782
2783static void
2784exec_sample_d(struct tgsi_exec_machine *mach,
2785              const struct tgsi_full_instruction *inst)
2786{
2787   const uint resource_unit = inst->Src[1].Register.Index;
2788   const uint sampler_unit = inst->Src[2].Register.Index;
2789   union tgsi_exec_channel r[4];
2790   float derivs[3][2][TGSI_QUAD_SIZE];
2791   uint chan;
2792   unsigned char swizzles[4];
2793   int8_t offsets[3];
2794
2795   /* always fetch all 3 offsets, overkill but keeps code simple */
2796   fetch_texel_offsets(mach, inst, offsets);
2797
2798   FETCH(&r[0], 0, TGSI_CHAN_X);
2799
2800   switch (mach->SamplerViews[resource_unit].Resource) {
2801   case TGSI_TEXTURE_1D:
2802   case TGSI_TEXTURE_1D_ARRAY:
2803      /* only 1D array actually needs Y */
2804      FETCH(&r[1], 0, TGSI_CHAN_Y);
2805
2806      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2807
2808      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2809                  &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2810                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2811                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2812      break;
2813
2814   case TGSI_TEXTURE_2D:
2815   case TGSI_TEXTURE_RECT:
2816   case TGSI_TEXTURE_2D_ARRAY:
2817      /* only 2D array actually needs Z */
2818      FETCH(&r[1], 0, TGSI_CHAN_Y);
2819      FETCH(&r[2], 0, TGSI_CHAN_Z);
2820
2821      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2822      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2823
2824      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2825                  &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2826                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2827                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2828      break;
2829
2830   case TGSI_TEXTURE_3D:
2831   case TGSI_TEXTURE_CUBE:
2832   case TGSI_TEXTURE_CUBE_ARRAY:
2833      /* only cube array actually needs W */
2834      FETCH(&r[1], 0, TGSI_CHAN_Y);
2835      FETCH(&r[2], 0, TGSI_CHAN_Z);
2836      FETCH(&r[3], 0, TGSI_CHAN_W);
2837
2838      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2839      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2840      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2841
2842      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2843                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2844                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2845                  &r[0], &r[1], &r[2], &r[3]);
2846      break;
2847
2848   default:
2849      assert(0);
2850   }
2851
2852   swizzles[0] = inst->Src[1].Register.SwizzleX;
2853   swizzles[1] = inst->Src[1].Register.SwizzleY;
2854   swizzles[2] = inst->Src[1].Register.SwizzleZ;
2855   swizzles[3] = inst->Src[1].Register.SwizzleW;
2856
2857   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2858      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2859         store_dest(mach, &r[swizzles[chan]],
2860                    &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2861      }
2862   }
2863}
2864
2865
2866/**
2867 * Evaluate a constant-valued coefficient at the position of the
2868 * current quad.
2869 */
2870static void
2871eval_constant_coef(
2872   struct tgsi_exec_machine *mach,
2873   unsigned attrib,
2874   unsigned chan )
2875{
2876   unsigned i;
2877
2878   for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2879      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2880   }
2881}
2882
2883/**
2884 * Evaluate a linear-valued coefficient at the position of the
2885 * current quad.
2886 */
2887static void
2888eval_linear_coef(
2889   struct tgsi_exec_machine *mach,
2890   unsigned attrib,
2891   unsigned chan )
2892{
2893   const float x = mach->QuadPos.xyzw[0].f[0];
2894   const float y = mach->QuadPos.xyzw[1].f[0];
2895   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2896   const float dady = mach->InterpCoefs[attrib].dady[chan];
2897   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2898   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2899   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2900   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2901   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2902}
2903
2904/**
2905 * Evaluate a perspective-valued coefficient at the position of the
2906 * current quad.
2907 */
2908static void
2909eval_perspective_coef(
2910   struct tgsi_exec_machine *mach,
2911   unsigned attrib,
2912   unsigned chan )
2913{
2914   const float x = mach->QuadPos.xyzw[0].f[0];
2915   const float y = mach->QuadPos.xyzw[1].f[0];
2916   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2917   const float dady = mach->InterpCoefs[attrib].dady[chan];
2918   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2919   const float *w = mach->QuadPos.xyzw[3].f;
2920   /* divide by W here */
2921   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2922   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2923   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2924   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2925}
2926
2927
/**
 * Signature shared by the eval_*_coef() functions: compute channel
 * 'chan' of input attribute 'attrib' for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2932
2933static void
2934exec_declaration(struct tgsi_exec_machine *mach,
2935                 const struct tgsi_full_declaration *decl)
2936{
2937   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2938      mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2939      return;
2940   }
2941
2942   if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
2943      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2944         uint first, last, mask;
2945
2946         first = decl->Range.First;
2947         last = decl->Range.Last;
2948         mask = decl->Declaration.UsageMask;
2949
2950         /* XXX we could remove this special-case code since
2951          * mach->InterpCoefs[first].a0 should already have the
2952          * front/back-face value.  But we should first update the
2953          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2954          * Then, we could remove the tgsi_exec_machine::Face field.
2955          */
2956         /* XXX make FACE a system value */
2957         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2958            uint i;
2959
2960            assert(decl->Semantic.Index == 0);
2961            assert(first == last);
2962
2963            for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2964               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2965            }
2966         } else {
2967            eval_coef_func eval;
2968            uint i, j;
2969
2970            switch (decl->Interp.Interpolate) {
2971            case TGSI_INTERPOLATE_CONSTANT:
2972               eval = eval_constant_coef;
2973               break;
2974
2975            case TGSI_INTERPOLATE_LINEAR:
2976               eval = eval_linear_coef;
2977               break;
2978
2979            case TGSI_INTERPOLATE_PERSPECTIVE:
2980               eval = eval_perspective_coef;
2981               break;
2982
2983            case TGSI_INTERPOLATE_COLOR:
2984               eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
2985               break;
2986
2987            default:
2988               assert(0);
2989               return;
2990            }
2991
2992            for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
2993               if (mask & (1 << j)) {
2994                  for (i = first; i <= last; i++) {
2995                     eval(mach, i, j);
2996                  }
2997               }
2998            }
2999         }
3000
3001         if (DEBUG_EXECUTION) {
3002            uint i, j;
3003            for (i = first; i <= last; ++i) {
3004               debug_printf("IN[%2u] = ", i);
3005               for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3006                  if (j > 0) {
3007                     debug_printf("         ");
3008                  }
3009                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3010                               mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
3011                               mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
3012                               mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
3013                               mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
3014               }
3015            }
3016         }
3017      }
3018   }
3019
3020}
3021
/**
 * Micro-op that produces one result channel from one source channel.
 * This is the operation type accepted by exec_scalar_unary() and
 * exec_vector_unary().
 */
typedef void (*micro_unary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src);
3024
3025static void
3026exec_scalar_unary(struct tgsi_exec_machine *mach,
3027                  const struct tgsi_full_instruction *inst,
3028                  micro_unary_op op,
3029                  enum tgsi_exec_datatype dst_datatype,
3030                  enum tgsi_exec_datatype src_datatype)
3031{
3032   unsigned int chan;
3033   union tgsi_exec_channel src;
3034   union tgsi_exec_channel dst;
3035
3036   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
3037   op(&dst, &src);
3038   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3039      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3040         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3041      }
3042   }
3043}
3044
3045static void
3046exec_vector_unary(struct tgsi_exec_machine *mach,
3047                  const struct tgsi_full_instruction *inst,
3048                  micro_unary_op op,
3049                  enum tgsi_exec_datatype dst_datatype,
3050                  enum tgsi_exec_datatype src_datatype)
3051{
3052   unsigned int chan;
3053   struct tgsi_exec_vector dst;
3054
3055   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3056      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3057         union tgsi_exec_channel src;
3058
3059         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
3060         op(&dst.xyzw[chan], &src);
3061      }
3062   }
3063   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3064      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3065         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3066      }
3067   }
3068}
3069
/**
 * Micro-op that produces one result channel from two source channels.
 * This is the operation type accepted by exec_scalar_binary() and
 * exec_vector_binary().
 */
typedef void (*micro_binary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1);
3073
3074static void
3075exec_scalar_binary(struct tgsi_exec_machine *mach,
3076                   const struct tgsi_full_instruction *inst,
3077                   micro_binary_op op,
3078                   enum tgsi_exec_datatype dst_datatype,
3079                   enum tgsi_exec_datatype src_datatype)
3080{
3081   unsigned int chan;
3082   union tgsi_exec_channel src[2];
3083   union tgsi_exec_channel dst;
3084
3085   fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
3086   fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
3087   op(&dst, &src[0], &src[1]);
3088   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3089      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3090         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3091      }
3092   }
3093}
3094
3095static void
3096exec_vector_binary(struct tgsi_exec_machine *mach,
3097                   const struct tgsi_full_instruction *inst,
3098                   micro_binary_op op,
3099                   enum tgsi_exec_datatype dst_datatype,
3100                   enum tgsi_exec_datatype src_datatype)
3101{
3102   unsigned int chan;
3103   struct tgsi_exec_vector dst;
3104
3105   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3106      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3107         union tgsi_exec_channel src[2];
3108
3109         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3110         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3111         op(&dst.xyzw[chan], &src[0], &src[1]);
3112      }
3113   }
3114   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3115      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3116         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3117      }
3118   }
3119}
3120
/**
 * Micro-op that produces one result channel from three source channels.
 * This is the operation type accepted by exec_vector_trinary().
 */
typedef void (*micro_trinary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2);
3125
3126static void
3127exec_vector_trinary(struct tgsi_exec_machine *mach,
3128                    const struct tgsi_full_instruction *inst,
3129                    micro_trinary_op op,
3130                    enum tgsi_exec_datatype dst_datatype,
3131                    enum tgsi_exec_datatype src_datatype)
3132{
3133   unsigned int chan;
3134   struct tgsi_exec_vector dst;
3135
3136   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3137      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3138         union tgsi_exec_channel src[3];
3139
3140         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3141         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3142         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3143         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3144      }
3145   }
3146   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3147      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3148         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3149      }
3150   }
3151}
3152
/**
 * Micro-op that produces one result channel from four source channels.
 * This is the operation type accepted by exec_vector_quaternary().
 */
typedef void (*micro_quaternary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3);
3158
3159static void
3160exec_vector_quaternary(struct tgsi_exec_machine *mach,
3161                       const struct tgsi_full_instruction *inst,
3162                       micro_quaternary_op op,
3163                       enum tgsi_exec_datatype dst_datatype,
3164                       enum tgsi_exec_datatype src_datatype)
3165{
3166   unsigned int chan;
3167   struct tgsi_exec_vector dst;
3168
3169   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3170      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3171         union tgsi_exec_channel src[4];
3172
3173         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3174         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3175         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3176         fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
3177         op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
3178      }
3179   }
3180   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3181      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3182         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3183      }
3184   }
3185}
3186
3187static void
3188exec_dp3(struct tgsi_exec_machine *mach,
3189         const struct tgsi_full_instruction *inst)
3190{
3191   unsigned int chan;
3192   union tgsi_exec_channel arg[3];
3193
3194   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3195   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3196   micro_mul(&arg[2], &arg[0], &arg[1]);
3197
3198   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
3199      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3200      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3201      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3202   }
3203
3204   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3205      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3206         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3207      }
3208   }
3209}
3210
3211static void
3212exec_dp4(struct tgsi_exec_machine *mach,
3213         const struct tgsi_full_instruction *inst)
3214{
3215   unsigned int chan;
3216   union tgsi_exec_channel arg[3];
3217
3218   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3219   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3220   micro_mul(&arg[2], &arg[0], &arg[1]);
3221
3222   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
3223      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3224      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3225      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3226   }
3227
3228   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3229      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3230         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3231      }
3232   }
3233}
3234
3235static void
3236exec_dp2(struct tgsi_exec_machine *mach,
3237         const struct tgsi_full_instruction *inst)
3238{
3239   unsigned int chan;
3240   union tgsi_exec_channel arg[3];
3241
3242   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3243   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3244   micro_mul(&arg[2], &arg[0], &arg[1]);
3245
3246   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3247   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3248   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3249
3250   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3251      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3252         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3253      }
3254   }
3255}
3256
3257static void
3258exec_pk2h(struct tgsi_exec_machine *mach,
3259          const struct tgsi_full_instruction *inst)
3260{
3261   unsigned chan;
3262   union tgsi_exec_channel arg[2], dst;
3263
3264   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3265   fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3266   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3267      dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
3268         (util_float_to_half(arg[1].f[chan]) << 16);
3269   }
3270   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3271      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3272         store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
3273      }
3274   }
3275}
3276
3277static void
3278exec_up2h(struct tgsi_exec_machine *mach,
3279          const struct tgsi_full_instruction *inst)
3280{
3281   unsigned chan;
3282   union tgsi_exec_channel arg, dst[2];
3283
3284   fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3285   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3286      dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
3287      dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
3288   }
3289   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3290      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3291         store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3292      }
3293   }
3294}
3295
3296static void
3297micro_ucmp(union tgsi_exec_channel *dst,
3298           const union tgsi_exec_channel *src0,
3299           const union tgsi_exec_channel *src1,
3300           const union tgsi_exec_channel *src2)
3301{
3302   dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
3303   dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
3304   dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
3305   dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
3306}
3307
3308static void
3309exec_ucmp(struct tgsi_exec_machine *mach,
3310          const struct tgsi_full_instruction *inst)
3311{
3312   unsigned int chan;
3313   struct tgsi_exec_vector dst;
3314
3315   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3316      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3317         union tgsi_exec_channel src[3];
3318
3319         fetch_source(mach, &src[0], &inst->Src[0], chan,
3320                      TGSI_EXEC_DATA_UINT);
3321         fetch_source(mach, &src[1], &inst->Src[1], chan,
3322                      TGSI_EXEC_DATA_FLOAT);
3323         fetch_source(mach, &src[2], &inst->Src[2], chan,
3324                      TGSI_EXEC_DATA_FLOAT);
3325         micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3326      }
3327   }
3328   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3329      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3330         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan,
3331                    TGSI_EXEC_DATA_FLOAT);
3332      }
3333   }
3334}
3335
3336static void
3337exec_dst(struct tgsi_exec_machine *mach,
3338         const struct tgsi_full_instruction *inst)
3339{
3340   union tgsi_exec_channel r[2];
3341   union tgsi_exec_channel d[4];
3342
3343   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3344      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3345      fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3346      micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3347   }
3348   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3349      fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3350   }
3351   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3352      fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3353   }
3354
3355   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3356      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3357   }
3358   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3359      store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3360   }
3361   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3362      store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3363   }
3364   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3365      store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3366   }
3367}
3368
3369static void
3370exec_log(struct tgsi_exec_machine *mach,
3371         const struct tgsi_full_instruction *inst)
3372{
3373   union tgsi_exec_channel r[3];
3374
3375   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3376   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3377   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3378   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3379   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3380      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3381   }
3382   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3383      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3384      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3385      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3386   }
3387   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3388      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3389   }
3390   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3391      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3392   }
3393}
3394
3395static void
3396exec_exp(struct tgsi_exec_machine *mach,
3397         const struct tgsi_full_instruction *inst)
3398{
3399   union tgsi_exec_channel r[3];
3400
3401   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3402   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3403   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3404      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3405      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3406   }
3407   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3408      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3409      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3410   }
3411   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3412      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3413      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3414   }
3415   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3416      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3417   }
3418}
3419
3420static void
3421exec_lit(struct tgsi_exec_machine *mach,
3422         const struct tgsi_full_instruction *inst)
3423{
3424   union tgsi_exec_channel r[3];
3425   union tgsi_exec_channel d[3];
3426
3427   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3428      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3429      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3430         fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3431         micro_max(&r[1], &r[1], &ZeroVec);
3432
3433         fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3434         micro_min(&r[2], &r[2], &P128Vec);
3435         micro_max(&r[2], &r[2], &M128Vec);
3436         micro_pow(&r[1], &r[1], &r[2]);
3437         micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3438         store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3439      }
3440      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3441         micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3442         store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3443      }
3444   }
3445   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3446      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3447   }
3448
3449   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3450      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3451   }
3452}
3453
3454static void
3455exec_break(struct tgsi_exec_machine *mach)
3456{
3457   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3458      /* turn off loop channels for each enabled exec channel */
3459      mach->LoopMask &= ~mach->ExecMask;
3460      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3461      UPDATE_EXEC_MASK(mach);
3462   } else {
3463      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3464
3465      mach->Switch.mask = 0x0;
3466
3467      UPDATE_EXEC_MASK(mach);
3468   }
3469}
3470
3471static void
3472exec_switch(struct tgsi_exec_machine *mach,
3473            const struct tgsi_full_instruction *inst)
3474{
3475   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3476   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3477
3478   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3479   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3480   mach->Switch.mask = 0x0;
3481   mach->Switch.defaultMask = 0x0;
3482
3483   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3484   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3485
3486   UPDATE_EXEC_MASK(mach);
3487}
3488
3489static void
3490exec_case(struct tgsi_exec_machine *mach,
3491          const struct tgsi_full_instruction *inst)
3492{
3493   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3494   union tgsi_exec_channel src;
3495   uint mask = 0;
3496
3497   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3498
3499   if (mach->Switch.selector.u[0] == src.u[0]) {
3500      mask |= 0x1;
3501   }
3502   if (mach->Switch.selector.u[1] == src.u[1]) {
3503      mask |= 0x2;
3504   }
3505   if (mach->Switch.selector.u[2] == src.u[2]) {
3506      mask |= 0x4;
3507   }
3508   if (mach->Switch.selector.u[3] == src.u[3]) {
3509      mask |= 0x8;
3510   }
3511
3512   mach->Switch.defaultMask |= mask;
3513
3514   mach->Switch.mask |= mask & prevMask;
3515
3516   UPDATE_EXEC_MASK(mach);
3517}
3518
3519/* FIXME: this will only work if default is last */
3520static void
3521exec_default(struct tgsi_exec_machine *mach)
3522{
3523   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3524
3525   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3526
3527   UPDATE_EXEC_MASK(mach);
3528}
3529
3530static void
3531exec_endswitch(struct tgsi_exec_machine *mach)
3532{
3533   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3534   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3535
3536   UPDATE_EXEC_MASK(mach);
3537}
3538
/** Micro-op with a double-channel result and double-channel source. */
typedef void (*micro_dop)(
   union tgsi_double_channel *dst,
   const union tgsi_double_channel *src);

/**
 * Micro-op with a double-channel result, a double-channel first source
 * and a (non-const) 32-bit channel as second source.
 */
typedef void (*micro_dop_sop)(
   union tgsi_double_channel *dst,
   const union tgsi_double_channel *src0,
   union tgsi_exec_channel *src1);

/** Micro-op with a double-channel result and a 32-bit channel source. */
typedef void (*micro_dop_s)(
   union tgsi_double_channel *dst,
   const union tgsi_exec_channel *src);

/** Micro-op with a 32-bit channel result and a double-channel source. */
typedef void (*micro_sop_d)(
   union tgsi_exec_channel *dst,
   const union tgsi_double_channel *src);
3551
3552static void
3553fetch_double_channel(struct tgsi_exec_machine *mach,
3554                     union tgsi_double_channel *chan,
3555                     const struct tgsi_full_src_register *reg,
3556                     uint chan_0,
3557                     uint chan_1)
3558{
3559   union tgsi_exec_channel src[2];
3560   uint i;
3561
3562   fetch_source_d(mach, &src[0], reg, chan_0);
3563   fetch_source_d(mach, &src[1], reg, chan_1);
3564
3565   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3566      chan->u[i][0] = src[0].u[i];
3567      chan->u[i][1] = src[1].u[i];
3568   }
3569   if (reg->Register.Absolute) {
3570      micro_dabs(chan, chan);
3571   }
3572   if (reg->Register.Negate) {
3573      micro_dneg(chan, chan);
3574   }
3575}
3576
3577static void
3578store_double_channel(struct tgsi_exec_machine *mach,
3579                     const union tgsi_double_channel *chan,
3580                     const struct tgsi_full_dst_register *reg,
3581                     const struct tgsi_full_instruction *inst,
3582                     uint chan_0,
3583                     uint chan_1)
3584{
3585   union tgsi_exec_channel dst[2];
3586   uint i;
3587   union tgsi_double_channel temp;
3588   const uint execmask = mach->ExecMask;
3589
3590   if (!inst->Instruction.Saturate) {
3591      for (i = 0; i < TGSI_QUAD_SIZE; i++)
3592         if (execmask & (1 << i)) {
3593            dst[0].u[i] = chan->u[i][0];
3594            dst[1].u[i] = chan->u[i][1];
3595         }
3596   }
3597   else {
3598      for (i = 0; i < TGSI_QUAD_SIZE; i++)
3599         if (execmask & (1 << i)) {
3600            if (chan->d[i] < 0.0)
3601               temp.d[i] = 0.0;
3602            else if (chan->d[i] > 1.0)
3603               temp.d[i] = 1.0;
3604            else
3605               temp.d[i] = chan->d[i];
3606
3607            dst[0].u[i] = temp.u[i][0];
3608            dst[1].u[i] = temp.u[i][1];
3609         }
3610   }
3611
3612   store_dest_double(mach, &dst[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
3613   if (chan_1 != (unsigned)-1)
3614      store_dest_double(mach, &dst[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
3615}
3616
3617static void
3618exec_double_unary(struct tgsi_exec_machine *mach,
3619                  const struct tgsi_full_instruction *inst,
3620                  micro_dop op)
3621{
3622   union tgsi_double_channel src;
3623   union tgsi_double_channel dst;
3624
3625   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3626      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3627      op(&dst, &src);
3628      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3629   }
3630   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3631      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3632      op(&dst, &src);
3633      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3634   }
3635}
3636
3637static void
3638exec_double_binary(struct tgsi_exec_machine *mach,
3639                   const struct tgsi_full_instruction *inst,
3640                   micro_dop op,
3641                   enum tgsi_exec_datatype dst_datatype)
3642{
3643   union tgsi_double_channel src[2];
3644   union tgsi_double_channel dst;
3645   int first_dest_chan, second_dest_chan;
3646   int wmask;
3647
3648   wmask = inst->Dst[0].Register.WriteMask;
3649   /* these are & because of the way DSLT etc store their destinations */
3650   if (wmask & TGSI_WRITEMASK_XY) {
3651      first_dest_chan = TGSI_CHAN_X;
3652      second_dest_chan = TGSI_CHAN_Y;
3653      if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3654         first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3655         second_dest_chan = -1;
3656      }
3657
3658      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3659      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3660      op(&dst, src);
3661      store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3662   }
3663
3664   if (wmask & TGSI_WRITEMASK_ZW) {
3665      first_dest_chan = TGSI_CHAN_Z;
3666      second_dest_chan = TGSI_CHAN_W;
3667      if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3668         first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3669         second_dest_chan = -1;
3670      }
3671
3672      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3673      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3674      op(&dst, src);
3675      store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3676   }
3677}
3678
3679static void
3680exec_double_trinary(struct tgsi_exec_machine *mach,
3681                    const struct tgsi_full_instruction *inst,
3682                    micro_dop op)
3683{
3684   union tgsi_double_channel src[3];
3685   union tgsi_double_channel dst;
3686
3687   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3688      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3689      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3690      fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3691      op(&dst, src);
3692      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3693   }
3694   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3695      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3696      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3697      fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3698      op(&dst, src);
3699      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3700   }
3701}
3702
3703static void
3704exec_dldexp(struct tgsi_exec_machine *mach,
3705            const struct tgsi_full_instruction *inst)
3706{
3707   union tgsi_double_channel src0;
3708   union tgsi_exec_channel src1;
3709   union tgsi_double_channel dst;
3710   int wmask;
3711
3712   wmask = inst->Dst[0].Register.WriteMask;
3713   if (wmask & TGSI_WRITEMASK_XY) {
3714      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3715      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3716      micro_dldexp(&dst, &src0, &src1);
3717      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3718   }
3719
3720   if (wmask & TGSI_WRITEMASK_ZW) {
3721      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3722      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3723      micro_dldexp(&dst, &src0, &src1);
3724      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3725   }
3726}
3727
3728static void
3729exec_dfracexp(struct tgsi_exec_machine *mach,
3730              const struct tgsi_full_instruction *inst)
3731{
3732   union tgsi_double_channel src;
3733   union tgsi_double_channel dst;
3734   union tgsi_exec_channel dst_exp;
3735
3736   fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3737   micro_dfracexp(&dst, &dst_exp, &src);
3738   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
3739      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3740   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
3741      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3742   for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3743      if (inst->Dst[1].Register.WriteMask & (1 << chan))
3744         store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan, TGSI_EXEC_DATA_INT);
3745   }
3746}
3747
3748static void
3749exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
3750            const struct tgsi_full_instruction *inst,
3751            micro_dop_sop op)
3752{
3753   union tgsi_double_channel src0;
3754   union tgsi_exec_channel src1;
3755   union tgsi_double_channel dst;
3756   int wmask;
3757
3758   wmask = inst->Dst[0].Register.WriteMask;
3759   if (wmask & TGSI_WRITEMASK_XY) {
3760      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3761      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3762      op(&dst, &src0, &src1);
3763      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3764   }
3765
3766   if (wmask & TGSI_WRITEMASK_ZW) {
3767      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3768      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3769      op(&dst, &src0, &src1);
3770      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3771   }
3772}
3773
3774static int
3775get_image_coord_dim(unsigned tgsi_tex)
3776{
3777   int dim;
3778   switch (tgsi_tex) {
3779   case TGSI_TEXTURE_BUFFER:
3780   case TGSI_TEXTURE_1D:
3781      dim = 1;
3782      break;
3783   case TGSI_TEXTURE_2D:
3784   case TGSI_TEXTURE_RECT:
3785   case TGSI_TEXTURE_1D_ARRAY:
3786   case TGSI_TEXTURE_2D_MSAA:
3787      dim = 2;
3788      break;
3789   case TGSI_TEXTURE_3D:
3790   case TGSI_TEXTURE_CUBE:
3791   case TGSI_TEXTURE_2D_ARRAY:
3792   case TGSI_TEXTURE_2D_ARRAY_MSAA:
3793   case TGSI_TEXTURE_CUBE_ARRAY:
3794      dim = 3;
3795      break;
3796   default:
3797      assert(!"unknown texture target");
3798      dim = 0;
3799      break;
3800   }
3801
3802   return dim;
3803}
3804
3805static int
3806get_image_coord_sample(unsigned tgsi_tex)
3807{
3808   int sample = 0;
3809   switch (tgsi_tex) {
3810   case TGSI_TEXTURE_2D_MSAA:
3811      sample = 3;
3812      break;
3813   case TGSI_TEXTURE_2D_ARRAY_MSAA:
3814      sample = 4;
3815      break;
3816   default:
3817      break;
3818   }
3819   return sample;
3820}
3821
3822static void
3823exec_load_img(struct tgsi_exec_machine *mach,
3824              const struct tgsi_full_instruction *inst)
3825{
3826   union tgsi_exec_channel r[4], sample_r;
3827   uint unit;
3828   int sample;
3829   int i, j;
3830   int dim;
3831   uint chan;
3832   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3833   struct tgsi_image_params params;
3834   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3835
3836   unit = fetch_sampler_unit(mach, inst, 0);
3837   dim = get_image_coord_dim(inst->Memory.Texture);
3838   sample = get_image_coord_sample(inst->Memory.Texture);
3839   assert(dim <= 3);
3840
3841   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3842   params.unit = unit;
3843   params.tgsi_tex_instr = inst->Memory.Texture;
3844   params.format = inst->Memory.Format;
3845
3846   for (i = 0; i < dim; i++) {
3847      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3848   }
3849
3850   if (sample)
3851      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3852
3853   mach->Image->load(mach->Image, &params,
3854                     r[0].i, r[1].i, r[2].i, sample_r.i,
3855                     rgba);
3856   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3857      r[0].f[j] = rgba[0][j];
3858      r[1].f[j] = rgba[1][j];
3859      r[2].f[j] = rgba[2][j];
3860      r[3].f[j] = rgba[3][j];
3861   }
3862   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3863      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3864         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3865      }
3866   }
3867}
3868
3869static void
3870exec_load_buf(struct tgsi_exec_machine *mach,
3871              const struct tgsi_full_instruction *inst)
3872{
3873   union tgsi_exec_channel r[4];
3874   uint unit;
3875   int j;
3876   uint chan;
3877   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3878   struct tgsi_buffer_params params;
3879   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3880
3881   unit = fetch_sampler_unit(mach, inst, 0);
3882
3883   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3884   params.unit = unit;
3885   IFETCH(&r[0], 1, TGSI_CHAN_X);
3886
3887   mach->Buffer->load(mach->Buffer, &params,
3888                      r[0].i, rgba);
3889   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3890      r[0].f[j] = rgba[0][j];
3891      r[1].f[j] = rgba[1][j];
3892      r[2].f[j] = rgba[2][j];
3893      r[3].f[j] = rgba[3][j];
3894   }
3895   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3896      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3897         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3898      }
3899   }
3900}
3901
3902static void
3903exec_load_mem(struct tgsi_exec_machine *mach,
3904              const struct tgsi_full_instruction *inst)
3905{
3906   union tgsi_exec_channel r[4];
3907   uint chan;
3908   char *ptr = mach->LocalMem;
3909   uint32_t offset;
3910   int j;
3911
3912   IFETCH(&r[0], 1, TGSI_CHAN_X);
3913   if (r[0].u[0] >= mach->LocalMemSize)
3914      return;
3915
3916   offset = r[0].u[0];
3917   ptr += offset;
3918
3919   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3920      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3921         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3922            memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
3923         }
3924      }
3925   }
3926
3927   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3928      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3929         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3930      }
3931   }
3932}
3933
3934static void
3935exec_load(struct tgsi_exec_machine *mach,
3936          const struct tgsi_full_instruction *inst)
3937{
3938   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
3939      exec_load_img(mach, inst);
3940   else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
3941      exec_load_buf(mach, inst);
3942   else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
3943      exec_load_mem(mach, inst);
3944}
3945
3946static void
3947exec_store_img(struct tgsi_exec_machine *mach,
3948               const struct tgsi_full_instruction *inst)
3949{
3950   union tgsi_exec_channel r[3], sample_r;
3951   union tgsi_exec_channel value[4];
3952   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3953   struct tgsi_image_params params;
3954   int dim;
3955   int sample;
3956   int i, j;
3957   uint unit;
3958   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3959   unit = inst->Dst[0].Register.Index;
3960   dim = get_image_coord_dim(inst->Memory.Texture);
3961   sample = get_image_coord_sample(inst->Memory.Texture);
3962   assert(dim <= 3);
3963
3964   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3965   params.unit = unit;
3966   params.tgsi_tex_instr = inst->Memory.Texture;
3967   params.format = inst->Memory.Format;
3968
3969   for (i = 0; i < dim; i++) {
3970      IFETCH(&r[i], 0, TGSI_CHAN_X + i);
3971   }
3972
3973   for (i = 0; i < 4; i++) {
3974      FETCH(&value[i], 1, TGSI_CHAN_X + i);
3975   }
3976   if (sample)
3977      IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
3978
3979   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3980      rgba[0][j] = value[0].f[j];
3981      rgba[1][j] = value[1].f[j];
3982      rgba[2][j] = value[2].f[j];
3983      rgba[3][j] = value[3].f[j];
3984   }
3985
3986   mach->Image->store(mach->Image, &params,
3987                      r[0].i, r[1].i, r[2].i, sample_r.i,
3988                      rgba);
3989}
3990
3991static void
3992exec_store_buf(struct tgsi_exec_machine *mach,
3993               const struct tgsi_full_instruction *inst)
3994{
3995   union tgsi_exec_channel r[3];
3996   union tgsi_exec_channel value[4];
3997   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3998   struct tgsi_buffer_params params;
3999   int i, j;
4000   uint unit;
4001   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4002
4003   unit = inst->Dst[0].Register.Index;
4004
4005   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4006   params.unit = unit;
4007   params.writemask = inst->Dst[0].Register.WriteMask;
4008
4009   IFETCH(&r[0], 0, TGSI_CHAN_X);
4010   for (i = 0; i < 4; i++) {
4011      FETCH(&value[i], 1, TGSI_CHAN_X + i);
4012   }
4013
4014   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4015      rgba[0][j] = value[0].f[j];
4016      rgba[1][j] = value[1].f[j];
4017      rgba[2][j] = value[2].f[j];
4018      rgba[3][j] = value[3].f[j];
4019   }
4020
4021   mach->Buffer->store(mach->Buffer, &params,
4022                      r[0].i,
4023                      rgba);
4024}
4025
4026static void
4027exec_store_mem(struct tgsi_exec_machine *mach,
4028               const struct tgsi_full_instruction *inst)
4029{
4030   union tgsi_exec_channel r[3];
4031   union tgsi_exec_channel value[4];
4032   uint i, chan;
4033   char *ptr = mach->LocalMem;
4034   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4035   int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4036
4037   IFETCH(&r[0], 0, TGSI_CHAN_X);
4038
4039   for (i = 0; i < 4; i++) {
4040      FETCH(&value[i], 1, TGSI_CHAN_X + i);
4041   }
4042
4043   if (r[0].u[0] >= mach->LocalMemSize)
4044      return;
4045   ptr += r[0].u[0];
4046
4047   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4048      if (execmask & (1 << i)) {
4049         for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4050            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4051               memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
4052            }
4053         }
4054      }
4055   }
4056}
4057
4058static void
4059exec_store(struct tgsi_exec_machine *mach,
4060           const struct tgsi_full_instruction *inst)
4061{
4062   if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
4063      exec_store_img(mach, inst);
4064   else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
4065      exec_store_buf(mach, inst);
4066   else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
4067      exec_store_mem(mach, inst);
4068}
4069
4070static void
4071exec_atomop_img(struct tgsi_exec_machine *mach,
4072                const struct tgsi_full_instruction *inst)
4073{
4074   union tgsi_exec_channel r[4], sample_r;
4075   union tgsi_exec_channel value[4], value2[4];
4076   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4077   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4078   struct tgsi_image_params params;
4079   int dim;
4080   int sample;
4081   int i, j;
4082   uint unit, chan;
4083   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4084   unit = fetch_sampler_unit(mach, inst, 0);
4085   dim = get_image_coord_dim(inst->Memory.Texture);
4086   sample = get_image_coord_sample(inst->Memory.Texture);
4087   assert(dim <= 3);
4088
4089   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4090   params.unit = unit;
4091   params.tgsi_tex_instr = inst->Memory.Texture;
4092   params.format = inst->Memory.Format;
4093
4094   for (i = 0; i < dim; i++) {
4095      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
4096   }
4097
4098   for (i = 0; i < 4; i++) {
4099      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4100      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4101         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4102   }
4103   if (sample)
4104      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
4105
4106   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4107      rgba[0][j] = value[0].f[j];
4108      rgba[1][j] = value[1].f[j];
4109      rgba[2][j] = value[2].f[j];
4110      rgba[3][j] = value[3].f[j];
4111   }
4112   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4113      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4114         rgba2[0][j] = value2[0].f[j];
4115         rgba2[1][j] = value2[1].f[j];
4116         rgba2[2][j] = value2[2].f[j];
4117         rgba2[3][j] = value2[3].f[j];
4118      }
4119   }
4120
4121   mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
4122                   r[0].i, r[1].i, r[2].i, sample_r.i,
4123                   rgba, rgba2);
4124
4125   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4126      r[0].f[j] = rgba[0][j];
4127      r[1].f[j] = rgba[1][j];
4128      r[2].f[j] = rgba[2][j];
4129      r[3].f[j] = rgba[3][j];
4130   }
4131   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4132      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4133         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4134      }
4135   }
4136}
4137
4138static void
4139exec_atomop_buf(struct tgsi_exec_machine *mach,
4140                const struct tgsi_full_instruction *inst)
4141{
4142   union tgsi_exec_channel r[4];
4143   union tgsi_exec_channel value[4], value2[4];
4144   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4145   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4146   struct tgsi_buffer_params params;
4147   int i, j;
4148   uint unit, chan;
4149   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4150
4151   unit = fetch_sampler_unit(mach, inst, 0);
4152
4153   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4154   params.unit = unit;
4155   params.writemask = inst->Dst[0].Register.WriteMask;
4156
4157   IFETCH(&r[0], 1, TGSI_CHAN_X);
4158
4159   for (i = 0; i < 4; i++) {
4160      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4161      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4162         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4163   }
4164
4165   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4166      rgba[0][j] = value[0].f[j];
4167      rgba[1][j] = value[1].f[j];
4168      rgba[2][j] = value[2].f[j];
4169      rgba[3][j] = value[3].f[j];
4170   }
4171   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4172      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4173         rgba2[0][j] = value2[0].f[j];
4174         rgba2[1][j] = value2[1].f[j];
4175         rgba2[2][j] = value2[2].f[j];
4176         rgba2[3][j] = value2[3].f[j];
4177      }
4178   }
4179
4180   mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
4181                   r[0].i,
4182                   rgba, rgba2);
4183
4184   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4185      r[0].f[j] = rgba[0][j];
4186      r[1].f[j] = rgba[1][j];
4187      r[2].f[j] = rgba[2][j];
4188      r[3].f[j] = rgba[3][j];
4189   }
4190   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4191      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4192         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4193      }
4194   }
4195}
4196
4197static void
4198exec_atomop_mem(struct tgsi_exec_machine *mach,
4199                const struct tgsi_full_instruction *inst)
4200{
4201   union tgsi_exec_channel r[4];
4202   union tgsi_exec_channel value[4], value2[4];
4203   char *ptr = mach->LocalMem;
4204   uint32_t val;
4205   uint chan, i;
4206   uint32_t offset;
4207   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4208   int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4209   IFETCH(&r[0], 1, TGSI_CHAN_X);
4210
4211   if (r[0].u[0] >= mach->LocalMemSize)
4212      return;
4213
4214   offset = r[0].u[0];
4215   ptr += offset;
4216   for (i = 0; i < 4; i++) {
4217      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4218      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4219         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4220   }
4221
4222   memcpy(&r[0].u[0], ptr, 4);
4223   val = r[0].u[0];
4224   switch (inst->Instruction.Opcode) {
4225   case TGSI_OPCODE_ATOMUADD:
4226      val += value[0].u[0];
4227      break;
4228   case TGSI_OPCODE_ATOMXOR:
4229      val ^= value[0].u[0];
4230      break;
4231   case TGSI_OPCODE_ATOMOR:
4232      val |= value[0].u[0];
4233      break;
4234   case TGSI_OPCODE_ATOMAND:
4235      val &= value[0].u[0];
4236      break;
4237   case TGSI_OPCODE_ATOMUMIN:
4238      val = MIN2(val, value[0].u[0]);
4239      break;
4240   case TGSI_OPCODE_ATOMUMAX:
4241      val = MAX2(val, value[0].u[0]);
4242      break;
4243   case TGSI_OPCODE_ATOMIMIN:
4244      val = MIN2(r[0].i[0], value[0].i[0]);
4245      break;
4246   case TGSI_OPCODE_ATOMIMAX:
4247      val = MAX2(r[0].i[0], value[0].i[0]);
4248      break;
4249   case TGSI_OPCODE_ATOMXCHG:
4250      val = value[0].i[0];
4251      break;
4252   case TGSI_OPCODE_ATOMCAS:
4253      if (val == value[0].u[0])
4254         val = value2[0].u[0];
4255      break;
4256   default:
4257      break;
4258   }
4259   for (i = 0; i < TGSI_QUAD_SIZE; i++)
4260      if (execmask & (1 << i))
4261         memcpy(ptr, &val, 4);
4262
4263   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4264      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4265         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4266      }
4267   }
4268}
4269
4270static void
4271exec_atomop(struct tgsi_exec_machine *mach,
4272            const struct tgsi_full_instruction *inst)
4273{
4274   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4275      exec_atomop_img(mach, inst);
4276   else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4277      exec_atomop_buf(mach, inst);
4278   else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4279      exec_atomop_mem(mach, inst);
4280}
4281
4282static void
4283exec_resq_img(struct tgsi_exec_machine *mach,
4284              const struct tgsi_full_instruction *inst)
4285{
4286   int result[4];
4287   union tgsi_exec_channel r[4];
4288   uint unit;
4289   int i, chan, j;
4290   struct tgsi_image_params params;
4291   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4292
4293   unit = fetch_sampler_unit(mach, inst, 0);
4294
4295   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4296   params.unit = unit;
4297   params.tgsi_tex_instr = inst->Memory.Texture;
4298   params.format = inst->Memory.Format;
4299
4300   mach->Image->get_dims(mach->Image, &params, result);
4301
4302   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4303      for (j = 0; j < 4; j++) {
4304         r[j].i[i] = result[j];
4305      }
4306   }
4307
4308   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4309      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4310         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4311                    TGSI_EXEC_DATA_INT);
4312      }
4313   }
4314}
4315
4316static void
4317exec_resq_buf(struct tgsi_exec_machine *mach,
4318              const struct tgsi_full_instruction *inst)
4319{
4320   int result;
4321   union tgsi_exec_channel r[4];
4322   uint unit;
4323   int i, chan;
4324   struct tgsi_buffer_params params;
4325   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4326
4327   unit = fetch_sampler_unit(mach, inst, 0);
4328
4329   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4330   params.unit = unit;
4331
4332   mach->Buffer->get_dims(mach->Buffer, &params, &result);
4333
4334   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4335      r[0].i[i] = result;
4336   }
4337
4338   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4339      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4340         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4341                    TGSI_EXEC_DATA_INT);
4342      }
4343   }
4344}
4345
4346static void
4347exec_resq(struct tgsi_exec_machine *mach,
4348          const struct tgsi_full_instruction *inst)
4349{
4350   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4351      exec_resq_img(mach, inst);
4352   else
4353      exec_resq_buf(mach, inst);
4354}
4355
4356static void
4357micro_f2u64(union tgsi_double_channel *dst,
4358            const union tgsi_exec_channel *src)
4359{
4360   dst->u64[0] = (uint64_t)src->f[0];
4361   dst->u64[1] = (uint64_t)src->f[1];
4362   dst->u64[2] = (uint64_t)src->f[2];
4363   dst->u64[3] = (uint64_t)src->f[3];
4364}
4365
4366static void
4367micro_f2i64(union tgsi_double_channel *dst,
4368            const union tgsi_exec_channel *src)
4369{
4370   dst->i64[0] = (int64_t)src->f[0];
4371   dst->i64[1] = (int64_t)src->f[1];
4372   dst->i64[2] = (int64_t)src->f[2];
4373   dst->i64[3] = (int64_t)src->f[3];
4374}
4375
4376static void
4377micro_u2i64(union tgsi_double_channel *dst,
4378            const union tgsi_exec_channel *src)
4379{
4380   dst->u64[0] = (uint64_t)src->u[0];
4381   dst->u64[1] = (uint64_t)src->u[1];
4382   dst->u64[2] = (uint64_t)src->u[2];
4383   dst->u64[3] = (uint64_t)src->u[3];
4384}
4385
4386static void
4387micro_i2i64(union tgsi_double_channel *dst,
4388            const union tgsi_exec_channel *src)
4389{
4390   dst->i64[0] = (int64_t)src->i[0];
4391   dst->i64[1] = (int64_t)src->i[1];
4392   dst->i64[2] = (int64_t)src->i[2];
4393   dst->i64[3] = (int64_t)src->i[3];
4394}
4395
4396static void
4397micro_d2u64(union tgsi_double_channel *dst,
4398           const union tgsi_double_channel *src)
4399{
4400   dst->u64[0] = (uint64_t)src->d[0];
4401   dst->u64[1] = (uint64_t)src->d[1];
4402   dst->u64[2] = (uint64_t)src->d[2];
4403   dst->u64[3] = (uint64_t)src->d[3];
4404}
4405
4406static void
4407micro_d2i64(union tgsi_double_channel *dst,
4408           const union tgsi_double_channel *src)
4409{
4410   dst->i64[0] = (int64_t)src->d[0];
4411   dst->i64[1] = (int64_t)src->d[1];
4412   dst->i64[2] = (int64_t)src->d[2];
4413   dst->i64[3] = (int64_t)src->d[3];
4414}
4415
4416static void
4417micro_u642d(union tgsi_double_channel *dst,
4418           const union tgsi_double_channel *src)
4419{
4420   dst->d[0] = (double)src->u64[0];
4421   dst->d[1] = (double)src->u64[1];
4422   dst->d[2] = (double)src->u64[2];
4423   dst->d[3] = (double)src->u64[3];
4424}
4425
4426static void
4427micro_i642d(union tgsi_double_channel *dst,
4428           const union tgsi_double_channel *src)
4429{
4430   dst->d[0] = (double)src->i64[0];
4431   dst->d[1] = (double)src->i64[1];
4432   dst->d[2] = (double)src->i64[2];
4433   dst->d[3] = (double)src->i64[3];
4434}
4435
4436static void
4437micro_u642f(union tgsi_exec_channel *dst,
4438            const union tgsi_double_channel *src)
4439{
4440   dst->f[0] = (float)src->u64[0];
4441   dst->f[1] = (float)src->u64[1];
4442   dst->f[2] = (float)src->u64[2];
4443   dst->f[3] = (float)src->u64[3];
4444}
4445
4446static void
4447micro_i642f(union tgsi_exec_channel *dst,
4448            const union tgsi_double_channel *src)
4449{
4450   dst->f[0] = (float)src->i64[0];
4451   dst->f[1] = (float)src->i64[1];
4452   dst->f[2] = (float)src->i64[2];
4453   dst->f[3] = (float)src->i64[3];
4454}
4455
4456static void
4457exec_t_2_64(struct tgsi_exec_machine *mach,
4458          const struct tgsi_full_instruction *inst,
4459          micro_dop_s op,
4460          enum tgsi_exec_datatype src_datatype)
4461{
4462   union tgsi_exec_channel src;
4463   union tgsi_double_channel dst;
4464
4465   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
4466      fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
4467      op(&dst, &src);
4468      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
4469   }
4470   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
4471      fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
4472      op(&dst, &src);
4473      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
4474   }
4475}
4476
4477static void
4478exec_64_2_t(struct tgsi_exec_machine *mach,
4479            const struct tgsi_full_instruction *inst,
4480            micro_sop_d op,
4481            enum tgsi_exec_datatype dst_datatype)
4482{
4483   union tgsi_double_channel src;
4484   union tgsi_exec_channel dst;
4485   int wm = inst->Dst[0].Register.WriteMask;
4486   int i;
4487   int bit;
4488   for (i = 0; i < 2; i++) {
4489      bit = ffs(wm);
4490      if (bit) {
4491         wm &= ~(1 << (bit - 1));
4492         if (i == 0)
4493            fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
4494         else
4495            fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
4496         op(&dst, &src);
4497         store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, dst_datatype);
4498      }
4499   }
4500}
4501
4502static void
4503micro_i2f(union tgsi_exec_channel *dst,
4504          const union tgsi_exec_channel *src)
4505{
4506   dst->f[0] = (float)src->i[0];
4507   dst->f[1] = (float)src->i[1];
4508   dst->f[2] = (float)src->i[2];
4509   dst->f[3] = (float)src->i[3];
4510}
4511
4512static void
4513micro_not(union tgsi_exec_channel *dst,
4514          const union tgsi_exec_channel *src)
4515{
4516   dst->u[0] = ~src->u[0];
4517   dst->u[1] = ~src->u[1];
4518   dst->u[2] = ~src->u[2];
4519   dst->u[3] = ~src->u[3];
4520}
4521
4522static void
4523micro_shl(union tgsi_exec_channel *dst,
4524          const union tgsi_exec_channel *src0,
4525          const union tgsi_exec_channel *src1)
4526{
4527   unsigned masked_count;
4528   masked_count = src1->u[0] & 0x1f;
4529   dst->u[0] = src0->u[0] << masked_count;
4530   masked_count = src1->u[1] & 0x1f;
4531   dst->u[1] = src0->u[1] << masked_count;
4532   masked_count = src1->u[2] & 0x1f;
4533   dst->u[2] = src0->u[2] << masked_count;
4534   masked_count = src1->u[3] & 0x1f;
4535   dst->u[3] = src0->u[3] << masked_count;
4536}
4537
4538static void
4539micro_and(union tgsi_exec_channel *dst,
4540          const union tgsi_exec_channel *src0,
4541          const union tgsi_exec_channel *src1)
4542{
4543   dst->u[0] = src0->u[0] & src1->u[0];
4544   dst->u[1] = src0->u[1] & src1->u[1];
4545   dst->u[2] = src0->u[2] & src1->u[2];
4546   dst->u[3] = src0->u[3] & src1->u[3];
4547}
4548
4549static void
4550micro_or(union tgsi_exec_channel *dst,
4551         const union tgsi_exec_channel *src0,
4552         const union tgsi_exec_channel *src1)
4553{
4554   dst->u[0] = src0->u[0] | src1->u[0];
4555   dst->u[1] = src0->u[1] | src1->u[1];
4556   dst->u[2] = src0->u[2] | src1->u[2];
4557   dst->u[3] = src0->u[3] | src1->u[3];
4558}
4559
4560static void
4561micro_xor(union tgsi_exec_channel *dst,
4562          const union tgsi_exec_channel *src0,
4563          const union tgsi_exec_channel *src1)
4564{
4565   dst->u[0] = src0->u[0] ^ src1->u[0];
4566   dst->u[1] = src0->u[1] ^ src1->u[1];
4567   dst->u[2] = src0->u[2] ^ src1->u[2];
4568   dst->u[3] = src0->u[3] ^ src1->u[3];
4569}
4570
4571static void
4572micro_mod(union tgsi_exec_channel *dst,
4573          const union tgsi_exec_channel *src0,
4574          const union tgsi_exec_channel *src1)
4575{
4576   dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
4577   dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
4578   dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
4579   dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
4580}
4581
4582static void
4583micro_f2i(union tgsi_exec_channel *dst,
4584          const union tgsi_exec_channel *src)
4585{
4586   dst->i[0] = (int)src->f[0];
4587   dst->i[1] = (int)src->f[1];
4588   dst->i[2] = (int)src->f[2];
4589   dst->i[3] = (int)src->f[3];
4590}
4591
4592static void
4593micro_fseq(union tgsi_exec_channel *dst,
4594           const union tgsi_exec_channel *src0,
4595           const union tgsi_exec_channel *src1)
4596{
4597   dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4598   dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4599   dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4600   dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4601}
4602
4603static void
4604micro_fsge(union tgsi_exec_channel *dst,
4605           const union tgsi_exec_channel *src0,
4606           const union tgsi_exec_channel *src1)
4607{
4608   dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4609   dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4610   dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4611   dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4612}
4613
4614static void
4615micro_fslt(union tgsi_exec_channel *dst,
4616           const union tgsi_exec_channel *src0,
4617           const union tgsi_exec_channel *src1)
4618{
4619   dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4620   dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4621   dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4622   dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4623}
4624
4625static void
4626micro_fsne(union tgsi_exec_channel *dst,
4627           const union tgsi_exec_channel *src0,
4628           const union tgsi_exec_channel *src1)
4629{
4630   dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4631   dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4632   dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4633   dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4634}
4635
4636static void
4637micro_idiv(union tgsi_exec_channel *dst,
4638           const union tgsi_exec_channel *src0,
4639           const union tgsi_exec_channel *src1)
4640{
4641   dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4642   dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4643   dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4644   dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4645}
4646
4647static void
4648micro_imax(union tgsi_exec_channel *dst,
4649           const union tgsi_exec_channel *src0,
4650           const union tgsi_exec_channel *src1)
4651{
4652   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4653   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4654   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4655   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4656}
4657
4658static void
4659micro_imin(union tgsi_exec_channel *dst,
4660           const union tgsi_exec_channel *src0,
4661           const union tgsi_exec_channel *src1)
4662{
4663   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4664   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4665   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4666   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4667}
4668
4669static void
4670micro_isge(union tgsi_exec_channel *dst,
4671           const union tgsi_exec_channel *src0,
4672           const union tgsi_exec_channel *src1)
4673{
4674   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4675   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4676   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4677   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4678}
4679
4680static void
4681micro_ishr(union tgsi_exec_channel *dst,
4682           const union tgsi_exec_channel *src0,
4683           const union tgsi_exec_channel *src1)
4684{
4685   unsigned masked_count;
4686   masked_count = src1->i[0] & 0x1f;
4687   dst->i[0] = src0->i[0] >> masked_count;
4688   masked_count = src1->i[1] & 0x1f;
4689   dst->i[1] = src0->i[1] >> masked_count;
4690   masked_count = src1->i[2] & 0x1f;
4691   dst->i[2] = src0->i[2] >> masked_count;
4692   masked_count = src1->i[3] & 0x1f;
4693   dst->i[3] = src0->i[3] >> masked_count;
4694}
4695
4696static void
4697micro_islt(union tgsi_exec_channel *dst,
4698           const union tgsi_exec_channel *src0,
4699           const union tgsi_exec_channel *src1)
4700{
4701   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4702   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4703   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4704   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4705}
4706
4707static void
4708micro_f2u(union tgsi_exec_channel *dst,
4709          const union tgsi_exec_channel *src)
4710{
4711   dst->u[0] = (uint)src->f[0];
4712   dst->u[1] = (uint)src->f[1];
4713   dst->u[2] = (uint)src->f[2];
4714   dst->u[3] = (uint)src->f[3];
4715}
4716
4717static void
4718micro_u2f(union tgsi_exec_channel *dst,
4719          const union tgsi_exec_channel *src)
4720{
4721   dst->f[0] = (float)src->u[0];
4722   dst->f[1] = (float)src->u[1];
4723   dst->f[2] = (float)src->u[2];
4724   dst->f[3] = (float)src->u[3];
4725}
4726
4727static void
4728micro_uadd(union tgsi_exec_channel *dst,
4729           const union tgsi_exec_channel *src0,
4730           const union tgsi_exec_channel *src1)
4731{
4732   dst->u[0] = src0->u[0] + src1->u[0];
4733   dst->u[1] = src0->u[1] + src1->u[1];
4734   dst->u[2] = src0->u[2] + src1->u[2];
4735   dst->u[3] = src0->u[3] + src1->u[3];
4736}
4737
4738static void
4739micro_udiv(union tgsi_exec_channel *dst,
4740           const union tgsi_exec_channel *src0,
4741           const union tgsi_exec_channel *src1)
4742{
4743   dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4744   dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4745   dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4746   dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4747}
4748
4749static void
4750micro_umad(union tgsi_exec_channel *dst,
4751           const union tgsi_exec_channel *src0,
4752           const union tgsi_exec_channel *src1,
4753           const union tgsi_exec_channel *src2)
4754{
4755   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4756   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4757   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4758   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4759}
4760
4761static void
4762micro_umax(union tgsi_exec_channel *dst,
4763           const union tgsi_exec_channel *src0,
4764           const union tgsi_exec_channel *src1)
4765{
4766   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4767   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4768   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4769   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4770}
4771
4772static void
4773micro_umin(union tgsi_exec_channel *dst,
4774           const union tgsi_exec_channel *src0,
4775           const union tgsi_exec_channel *src1)
4776{
4777   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4778   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4779   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4780   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4781}
4782
4783static void
4784micro_umod(union tgsi_exec_channel *dst,
4785           const union tgsi_exec_channel *src0,
4786           const union tgsi_exec_channel *src1)
4787{
4788   dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4789   dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4790   dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4791   dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4792}
4793
4794static void
4795micro_umul(union tgsi_exec_channel *dst,
4796           const union tgsi_exec_channel *src0,
4797           const union tgsi_exec_channel *src1)
4798{
4799   dst->u[0] = src0->u[0] * src1->u[0];
4800   dst->u[1] = src0->u[1] * src1->u[1];
4801   dst->u[2] = src0->u[2] * src1->u[2];
4802   dst->u[3] = src0->u[3] * src1->u[3];
4803}
4804
4805static void
4806micro_imul_hi(union tgsi_exec_channel *dst,
4807              const union tgsi_exec_channel *src0,
4808              const union tgsi_exec_channel *src1)
4809{
4810#define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4811   dst->i[0] = I64M(src0->i[0], src1->i[0]);
4812   dst->i[1] = I64M(src0->i[1], src1->i[1]);
4813   dst->i[2] = I64M(src0->i[2], src1->i[2]);
4814   dst->i[3] = I64M(src0->i[3], src1->i[3]);
4815#undef I64M
4816}
4817
4818static void
4819micro_umul_hi(union tgsi_exec_channel *dst,
4820              const union tgsi_exec_channel *src0,
4821              const union tgsi_exec_channel *src1)
4822{
4823#define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4824   dst->u[0] = U64M(src0->u[0], src1->u[0]);
4825   dst->u[1] = U64M(src0->u[1], src1->u[1]);
4826   dst->u[2] = U64M(src0->u[2], src1->u[2]);
4827   dst->u[3] = U64M(src0->u[3], src1->u[3]);
4828#undef U64M
4829}
4830
4831static void
4832micro_useq(union tgsi_exec_channel *dst,
4833           const union tgsi_exec_channel *src0,
4834           const union tgsi_exec_channel *src1)
4835{
4836   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4837   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4838   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4839   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4840}
4841
4842static void
4843micro_usge(union tgsi_exec_channel *dst,
4844           const union tgsi_exec_channel *src0,
4845           const union tgsi_exec_channel *src1)
4846{
4847   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4848   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4849   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4850   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4851}
4852
4853static void
4854micro_ushr(union tgsi_exec_channel *dst,
4855           const union tgsi_exec_channel *src0,
4856           const union tgsi_exec_channel *src1)
4857{
4858   unsigned masked_count;
4859   masked_count = src1->u[0] & 0x1f;
4860   dst->u[0] = src0->u[0] >> masked_count;
4861   masked_count = src1->u[1] & 0x1f;
4862   dst->u[1] = src0->u[1] >> masked_count;
4863   masked_count = src1->u[2] & 0x1f;
4864   dst->u[2] = src0->u[2] >> masked_count;
4865   masked_count = src1->u[3] & 0x1f;
4866   dst->u[3] = src0->u[3] >> masked_count;
4867}
4868
4869static void
4870micro_uslt(union tgsi_exec_channel *dst,
4871           const union tgsi_exec_channel *src0,
4872           const union tgsi_exec_channel *src1)
4873{
4874   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
4875   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
4876   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
4877   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
4878}
4879
4880static void
4881micro_usne(union tgsi_exec_channel *dst,
4882           const union tgsi_exec_channel *src0,
4883           const union tgsi_exec_channel *src1)
4884{
4885   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
4886   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
4887   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
4888   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
4889}
4890
4891static void
4892micro_uarl(union tgsi_exec_channel *dst,
4893           const union tgsi_exec_channel *src)
4894{
4895   dst->i[0] = src->u[0];
4896   dst->i[1] = src->u[1];
4897   dst->i[2] = src->u[2];
4898   dst->i[3] = src->u[3];
4899}
4900
4901/**
4902 * Signed bitfield extract (i.e. sign-extend the extracted bits)
4903 */
4904static void
4905micro_ibfe(union tgsi_exec_channel *dst,
4906           const union tgsi_exec_channel *src0,
4907           const union tgsi_exec_channel *src1,
4908           const union tgsi_exec_channel *src2)
4909{
4910   int i;
4911   for (i = 0; i < 4; i++) {
4912      int width = src2->i[i] & 0x1f;
4913      int offset = src1->i[i] & 0x1f;
4914      if (width == 0)
4915         dst->i[i] = 0;
4916      else if (width + offset < 32)
4917         dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
4918      else
4919         dst->i[i] = src0->i[i] >> offset;
4920   }
4921}
4922
4923/**
4924 * Unsigned bitfield extract
4925 */
4926static void
4927micro_ubfe(union tgsi_exec_channel *dst,
4928           const union tgsi_exec_channel *src0,
4929           const union tgsi_exec_channel *src1,
4930           const union tgsi_exec_channel *src2)
4931{
4932   int i;
4933   for (i = 0; i < 4; i++) {
4934      int width = src2->u[i] & 0x1f;
4935      int offset = src1->u[i] & 0x1f;
4936      if (width == 0)
4937         dst->u[i] = 0;
4938      else if (width + offset < 32)
4939         dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
4940      else
4941         dst->u[i] = src0->u[i] >> offset;
4942   }
4943}
4944
4945/**
4946 * Bitfield insert: copy low bits from src1 into a region of src0.
4947 */
4948static void
4949micro_bfi(union tgsi_exec_channel *dst,
4950          const union tgsi_exec_channel *src0,
4951          const union tgsi_exec_channel *src1,
4952          const union tgsi_exec_channel *src2,
4953          const union tgsi_exec_channel *src3)
4954{
4955   int i;
4956   for (i = 0; i < 4; i++) {
4957      int width = src3->u[i] & 0x1f;
4958      int offset = src2->u[i] & 0x1f;
4959      int bitmask = ((1 << width) - 1) << offset;
4960      dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
4961   }
4962}
4963
4964static void
4965micro_brev(union tgsi_exec_channel *dst,
4966           const union tgsi_exec_channel *src)
4967{
4968   dst->u[0] = util_bitreverse(src->u[0]);
4969   dst->u[1] = util_bitreverse(src->u[1]);
4970   dst->u[2] = util_bitreverse(src->u[2]);
4971   dst->u[3] = util_bitreverse(src->u[3]);
4972}
4973
4974static void
4975micro_popc(union tgsi_exec_channel *dst,
4976           const union tgsi_exec_channel *src)
4977{
4978   dst->u[0] = util_bitcount(src->u[0]);
4979   dst->u[1] = util_bitcount(src->u[1]);
4980   dst->u[2] = util_bitcount(src->u[2]);
4981   dst->u[3] = util_bitcount(src->u[3]);
4982}
4983
4984static void
4985micro_lsb(union tgsi_exec_channel *dst,
4986          const union tgsi_exec_channel *src)
4987{
4988   dst->i[0] = ffs(src->u[0]) - 1;
4989   dst->i[1] = ffs(src->u[1]) - 1;
4990   dst->i[2] = ffs(src->u[2]) - 1;
4991   dst->i[3] = ffs(src->u[3]) - 1;
4992}
4993
4994static void
4995micro_imsb(union tgsi_exec_channel *dst,
4996           const union tgsi_exec_channel *src)
4997{
4998   dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
4999   dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
5000   dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
5001   dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
5002}
5003
5004static void
5005micro_umsb(union tgsi_exec_channel *dst,
5006           const union tgsi_exec_channel *src)
5007{
5008   dst->i[0] = util_last_bit(src->u[0]) - 1;
5009   dst->i[1] = util_last_bit(src->u[1]) - 1;
5010   dst->i[2] = util_last_bit(src->u[2]) - 1;
5011   dst->i[3] = util_last_bit(src->u[3]) - 1;
5012}
5013
5014/**
5015 * Execute a TGSI instruction.
5016 * Returns TRUE if a barrier instruction is hit,
5017 * otherwise FALSE.
5018 */
5019static boolean
5020exec_instruction(
5021   struct tgsi_exec_machine *mach,
5022   const struct tgsi_full_instruction *inst,
5023   int *pc )
5024{
5025   union tgsi_exec_channel r[10];
5026
5027   (*pc)++;
5028
5029   switch (inst->Instruction.Opcode) {
5030   case TGSI_OPCODE_ARL:
5031      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5032      break;
5033
5034   case TGSI_OPCODE_MOV:
5035      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5036      break;
5037
5038   case TGSI_OPCODE_LIT:
5039      exec_lit(mach, inst);
5040      break;
5041
5042   case TGSI_OPCODE_RCP:
5043      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5044      break;
5045
5046   case TGSI_OPCODE_RSQ:
5047      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5048      break;
5049
5050   case TGSI_OPCODE_EXP:
5051      exec_exp(mach, inst);
5052      break;
5053
5054   case TGSI_OPCODE_LOG:
5055      exec_log(mach, inst);
5056      break;
5057
5058   case TGSI_OPCODE_MUL:
5059      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5060      break;
5061
5062   case TGSI_OPCODE_ADD:
5063      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5064      break;
5065
5066   case TGSI_OPCODE_DP3:
5067      exec_dp3(mach, inst);
5068      break;
5069
5070   case TGSI_OPCODE_DP4:
5071      exec_dp4(mach, inst);
5072      break;
5073
5074   case TGSI_OPCODE_DST:
5075      exec_dst(mach, inst);
5076      break;
5077
5078   case TGSI_OPCODE_MIN:
5079      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5080      break;
5081
5082   case TGSI_OPCODE_MAX:
5083      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5084      break;
5085
5086   case TGSI_OPCODE_SLT:
5087      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5088      break;
5089
5090   case TGSI_OPCODE_SGE:
5091      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5092      break;
5093
5094   case TGSI_OPCODE_MAD:
5095      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5096      break;
5097
5098   case TGSI_OPCODE_LRP:
5099      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5100      break;
5101
5102   case TGSI_OPCODE_SQRT:
5103      exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5104      break;
5105
5106   case TGSI_OPCODE_FRC:
5107      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5108      break;
5109
5110   case TGSI_OPCODE_FLR:
5111      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5112      break;
5113
5114   case TGSI_OPCODE_ROUND:
5115      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5116      break;
5117
5118   case TGSI_OPCODE_EX2:
5119      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5120      break;
5121
5122   case TGSI_OPCODE_LG2:
5123      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5124      break;
5125
5126   case TGSI_OPCODE_POW:
5127      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5128      break;
5129
5130   case TGSI_OPCODE_LDEXP:
5131      exec_vector_binary(mach, inst, micro_ldexp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5132      break;
5133
5134   case TGSI_OPCODE_COS:
5135      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5136      break;
5137
5138   case TGSI_OPCODE_DDX:
5139      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5140      break;
5141
5142   case TGSI_OPCODE_DDY:
5143      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5144      break;
5145
5146   case TGSI_OPCODE_KILL:
5147      exec_kill (mach);
5148      break;
5149
5150   case TGSI_OPCODE_KILL_IF:
5151      exec_kill_if (mach, inst);
5152      break;
5153
5154   case TGSI_OPCODE_PK2H:
5155      exec_pk2h(mach, inst);
5156      break;
5157
5158   case TGSI_OPCODE_PK2US:
5159      assert (0);
5160      break;
5161
5162   case TGSI_OPCODE_PK4B:
5163      assert (0);
5164      break;
5165
5166   case TGSI_OPCODE_PK4UB:
5167      assert (0);
5168      break;
5169
5170   case TGSI_OPCODE_SEQ:
5171      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5172      break;
5173
5174   case TGSI_OPCODE_SGT:
5175      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5176      break;
5177
5178   case TGSI_OPCODE_SIN:
5179      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5180      break;
5181
5182   case TGSI_OPCODE_SLE:
5183      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5184      break;
5185
5186   case TGSI_OPCODE_SNE:
5187      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5188      break;
5189
5190   case TGSI_OPCODE_TEX:
5191      /* simple texture lookup */
5192      /* src[0] = texcoord */
5193      /* src[1] = sampler unit */
5194      exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5195      break;
5196
5197   case TGSI_OPCODE_TXB:
5198      /* Texture lookup with lod bias */
5199      /* src[0] = texcoord (src[0].w = LOD bias) */
5200      /* src[1] = sampler unit */
5201      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5202      break;
5203
5204   case TGSI_OPCODE_TXD:
      /* Texture lookup with explicit partial derivatives */
5206      /* src[0] = texcoord */
5207      /* src[1] = d[strq]/dx */
5208      /* src[2] = d[strq]/dy */
5209      /* src[3] = sampler unit */
5210      exec_txd(mach, inst);
5211      break;
5212
5213   case TGSI_OPCODE_TXL:
      /* Texture lookup with explicit LOD */
5215      /* src[0] = texcoord (src[0].w = LOD) */
5216      /* src[1] = sampler unit */
5217      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5218      break;
5219
5220   case TGSI_OPCODE_TXP:
5221      /* Texture lookup with projection */
5222      /* src[0] = texcoord (src[0].w = projection) */
5223      /* src[1] = sampler unit */
5224      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5225      break;
5226
5227   case TGSI_OPCODE_TG4:
5228      /* src[0] = texcoord */
5229      /* src[1] = component */
5230      /* src[2] = sampler unit */
5231      exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5232      break;
5233
5234   case TGSI_OPCODE_LODQ:
5235      /* src[0] = texcoord */
5236      /* src[1] = sampler unit */
5237      exec_lodq(mach, inst);
5238      break;
5239
5240   case TGSI_OPCODE_UP2H:
5241      exec_up2h(mach, inst);
5242      break;
5243
5244   case TGSI_OPCODE_UP2US:
5245      assert (0);
5246      break;
5247
5248   case TGSI_OPCODE_UP4B:
5249      assert (0);
5250      break;
5251
5252   case TGSI_OPCODE_UP4UB:
5253      assert (0);
5254      break;
5255
5256   case TGSI_OPCODE_ARR:
5257      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5258      break;
5259
5260   case TGSI_OPCODE_CAL:
5261      /* skip the call if no execution channels are enabled */
5262      if (mach->ExecMask) {
5263         /* do the call */
5264
5265         /* First, record the depths of the execution stacks.
5266          * This is important for deeply nested/looped return statements.
5267          * We have to unwind the stacks by the correct amount.  For a
5268          * real code generator, we could determine the number of entries
5269          * to pop off each stack with simple static analysis and avoid
5270          * implementing this data structure at run time.
5271          */
5272         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5273         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5274         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5275         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5276         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5277         /* note that PC was already incremented above */
5278         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5279
5280         mach->CallStackTop++;
5281
5282         /* Second, push the Cond, Loop, Cont, Func stacks */
5283         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5284         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5285         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5286         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5287         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5288         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5289
5290         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5291         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5292         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5293         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5294         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5295         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5296
5297         /* Finally, jump to the subroutine.  The label is a pointer
5298          * (an instruction number) to the BGNSUB instruction.
5299          */
5300         *pc = inst->Label.Label;
5301         assert(mach->Instructions[*pc].Instruction.Opcode
5302                == TGSI_OPCODE_BGNSUB);
5303      }
5304      break;
5305
5306   case TGSI_OPCODE_RET:
5307      mach->FuncMask &= ~mach->ExecMask;
5308      UPDATE_EXEC_MASK(mach);
5309
5310      if (mach->FuncMask == 0x0) {
         /* really return now (otherwise, keep executing) */
5312
5313         if (mach->CallStackTop == 0) {
5314            /* returning from main() */
5315            mach->CondStackTop = 0;
5316            mach->LoopStackTop = 0;
5317            mach->ContStackTop = 0;
5318            mach->LoopLabelStackTop = 0;
5319            mach->SwitchStackTop = 0;
5320            mach->BreakStackTop = 0;
5321            *pc = -1;
5322            return FALSE;
5323         }
5324
5325         assert(mach->CallStackTop > 0);
5326         mach->CallStackTop--;
5327
5328         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5329         mach->CondMask = mach->CondStack[mach->CondStackTop];
5330
5331         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5332         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5333
5334         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5335         mach->ContMask = mach->ContStack[mach->ContStackTop];
5336
5337         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5338         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5339
5340         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5341         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5342
5343         assert(mach->FuncStackTop > 0);
5344         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5345
5346         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5347
5348         UPDATE_EXEC_MASK(mach);
5349      }
5350      break;
5351
5352   case TGSI_OPCODE_SSG:
5353      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5354      break;
5355
5356   case TGSI_OPCODE_CMP:
5357      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5358      break;
5359
5360   case TGSI_OPCODE_DIV:
5361      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5362      break;
5363
5364   case TGSI_OPCODE_DP2:
5365      exec_dp2(mach, inst);
5366      break;
5367
5368   case TGSI_OPCODE_IF:
5369      /* push CondMask */
5370      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5371      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5372      FETCH( &r[0], 0, TGSI_CHAN_X );
5373      /* update CondMask */
5374      if( ! r[0].f[0] ) {
5375         mach->CondMask &= ~0x1;
5376      }
5377      if( ! r[0].f[1] ) {
5378         mach->CondMask &= ~0x2;
5379      }
5380      if( ! r[0].f[2] ) {
5381         mach->CondMask &= ~0x4;
5382      }
5383      if( ! r[0].f[3] ) {
5384         mach->CondMask &= ~0x8;
5385      }
5386      UPDATE_EXEC_MASK(mach);
5387      /* Todo: If CondMask==0, jump to ELSE */
5388      break;
5389
5390   case TGSI_OPCODE_UIF:
5391      /* push CondMask */
5392      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5393      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5394      IFETCH( &r[0], 0, TGSI_CHAN_X );
5395      /* update CondMask */
5396      if( ! r[0].u[0] ) {
5397         mach->CondMask &= ~0x1;
5398      }
5399      if( ! r[0].u[1] ) {
5400         mach->CondMask &= ~0x2;
5401      }
5402      if( ! r[0].u[2] ) {
5403         mach->CondMask &= ~0x4;
5404      }
5405      if( ! r[0].u[3] ) {
5406         mach->CondMask &= ~0x8;
5407      }
5408      UPDATE_EXEC_MASK(mach);
5409      /* Todo: If CondMask==0, jump to ELSE */
5410      break;
5411
5412   case TGSI_OPCODE_ELSE:
5413      /* invert CondMask wrt previous mask */
5414      {
5415         uint prevMask;
5416         assert(mach->CondStackTop > 0);
5417         prevMask = mach->CondStack[mach->CondStackTop - 1];
5418         mach->CondMask = ~mach->CondMask & prevMask;
5419         UPDATE_EXEC_MASK(mach);
5420         /* Todo: If CondMask==0, jump to ENDIF */
5421      }
5422      break;
5423
5424   case TGSI_OPCODE_ENDIF:
5425      /* pop CondMask */
5426      assert(mach->CondStackTop > 0);
5427      mach->CondMask = mach->CondStack[--mach->CondStackTop];
5428      UPDATE_EXEC_MASK(mach);
5429      break;
5430
5431   case TGSI_OPCODE_END:
5432      /* make sure we end primitives which haven't
5433       * been explicitly emitted */
5434      conditional_emit_primitive(mach);
5435      /* halt execution */
5436      *pc = -1;
5437      break;
5438
5439   case TGSI_OPCODE_CEIL:
5440      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5441      break;
5442
5443   case TGSI_OPCODE_I2F:
5444      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
5445      break;
5446
5447   case TGSI_OPCODE_NOT:
5448      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5449      break;
5450
5451   case TGSI_OPCODE_TRUNC:
5452      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5453      break;
5454
5455   case TGSI_OPCODE_SHL:
5456      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5457      break;
5458
5459   case TGSI_OPCODE_AND:
5460      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5461      break;
5462
5463   case TGSI_OPCODE_OR:
5464      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5465      break;
5466
5467   case TGSI_OPCODE_MOD:
5468      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5469      break;
5470
5471   case TGSI_OPCODE_XOR:
5472      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5473      break;
5474
5475   case TGSI_OPCODE_TXF:
5476      exec_txf(mach, inst);
5477      break;
5478
5479   case TGSI_OPCODE_TXQ:
5480      exec_txq(mach, inst);
5481      break;
5482
5483   case TGSI_OPCODE_EMIT:
5484      emit_vertex(mach);
5485      break;
5486
5487   case TGSI_OPCODE_ENDPRIM:
5488      emit_primitive(mach);
5489      break;
5490
5491   case TGSI_OPCODE_BGNLOOP:
5492      /* push LoopMask and ContMasks */
5493      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5494      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5495      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5496      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5497
5498      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5499      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5500      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5501      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5502      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5503      break;
5504
5505   case TGSI_OPCODE_ENDLOOP:
5506      /* Restore ContMask, but don't pop */
5507      assert(mach->ContStackTop > 0);
5508      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5509      UPDATE_EXEC_MASK(mach);
5510      if (mach->ExecMask) {
5511         /* repeat loop: jump to instruction just past BGNLOOP */
5512         assert(mach->LoopLabelStackTop > 0);
5513         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5514      }
5515      else {
5516         /* exit loop: pop LoopMask */
5517         assert(mach->LoopStackTop > 0);
5518         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5519         /* pop ContMask */
5520         assert(mach->ContStackTop > 0);
5521         mach->ContMask = mach->ContStack[--mach->ContStackTop];
5522         assert(mach->LoopLabelStackTop > 0);
5523         --mach->LoopLabelStackTop;
5524
5525         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5526      }
5527      UPDATE_EXEC_MASK(mach);
5528      break;
5529
5530   case TGSI_OPCODE_BRK:
5531      exec_break(mach);
5532      break;
5533
5534   case TGSI_OPCODE_CONT:
5535      /* turn off cont channels for each enabled exec channel */
5536      mach->ContMask &= ~mach->ExecMask;
5537      /* Todo: if mach->LoopMask == 0, jump to end of loop */
5538      UPDATE_EXEC_MASK(mach);
5539      break;
5540
5541   case TGSI_OPCODE_BGNSUB:
5542      /* no-op */
5543      break;
5544
5545   case TGSI_OPCODE_ENDSUB:
5546      /*
5547       * XXX: This really should be a no-op. We should never reach this opcode.
5548       */
5549
5550      assert(mach->CallStackTop > 0);
5551      mach->CallStackTop--;
5552
5553      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5554      mach->CondMask = mach->CondStack[mach->CondStackTop];
5555
5556      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5557      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5558
5559      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5560      mach->ContMask = mach->ContStack[mach->ContStackTop];
5561
5562      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5563      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5564
5565      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5566      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5567
5568      assert(mach->FuncStackTop > 0);
5569      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5570
5571      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5572
5573      UPDATE_EXEC_MASK(mach);
5574      break;
5575
5576   case TGSI_OPCODE_NOP:
5577      break;
5578
5579   case TGSI_OPCODE_F2I:
5580      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5581      break;
5582
5583   case TGSI_OPCODE_FSEQ:
5584      exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5585      break;
5586
5587   case TGSI_OPCODE_FSGE:
5588      exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5589      break;
5590
5591   case TGSI_OPCODE_FSLT:
5592      exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5593      break;
5594
5595   case TGSI_OPCODE_FSNE:
5596      exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5597      break;
5598
5599   case TGSI_OPCODE_IDIV:
5600      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5601      break;
5602
5603   case TGSI_OPCODE_IMAX:
5604      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5605      break;
5606
5607   case TGSI_OPCODE_IMIN:
5608      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5609      break;
5610
5611   case TGSI_OPCODE_INEG:
5612      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5613      break;
5614
5615   case TGSI_OPCODE_ISGE:
5616      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5617      break;
5618
5619   case TGSI_OPCODE_ISHR:
5620      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5621      break;
5622
5623   case TGSI_OPCODE_ISLT:
5624      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5625      break;
5626
5627   case TGSI_OPCODE_F2U:
5628      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5629      break;
5630
5631   case TGSI_OPCODE_U2F:
5632      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
5633      break;
5634
5635   case TGSI_OPCODE_UADD:
5636      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5637      break;
5638
5639   case TGSI_OPCODE_UDIV:
5640      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5641      break;
5642
5643   case TGSI_OPCODE_UMAD:
5644      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5645      break;
5646
5647   case TGSI_OPCODE_UMAX:
5648      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5649      break;
5650
5651   case TGSI_OPCODE_UMIN:
5652      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5653      break;
5654
5655   case TGSI_OPCODE_UMOD:
5656      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5657      break;
5658
5659   case TGSI_OPCODE_UMUL:
5660      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5661      break;
5662
5663   case TGSI_OPCODE_IMUL_HI:
5664      exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5665      break;
5666
5667   case TGSI_OPCODE_UMUL_HI:
5668      exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5669      break;
5670
5671   case TGSI_OPCODE_USEQ:
5672      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5673      break;
5674
5675   case TGSI_OPCODE_USGE:
5676      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5677      break;
5678
5679   case TGSI_OPCODE_USHR:
5680      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5681      break;
5682
5683   case TGSI_OPCODE_USLT:
5684      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5685      break;
5686
5687   case TGSI_OPCODE_USNE:
5688      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5689      break;
5690
5691   case TGSI_OPCODE_SWITCH:
5692      exec_switch(mach, inst);
5693      break;
5694
5695   case TGSI_OPCODE_CASE:
5696      exec_case(mach, inst);
5697      break;
5698
5699   case TGSI_OPCODE_DEFAULT:
5700      exec_default(mach);
5701      break;
5702
5703   case TGSI_OPCODE_ENDSWITCH:
5704      exec_endswitch(mach);
5705      break;
5706
5707   case TGSI_OPCODE_SAMPLE_I:
5708      exec_txf(mach, inst);
5709      break;
5710
5711   case TGSI_OPCODE_SAMPLE_I_MS:
5712      exec_txf(mach, inst);
5713      break;
5714
5715   case TGSI_OPCODE_SAMPLE:
5716      exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
5717      break;
5718
5719   case TGSI_OPCODE_SAMPLE_B:
5720      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
5721      break;
5722
5723   case TGSI_OPCODE_SAMPLE_C:
5724      exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
5725      break;
5726
5727   case TGSI_OPCODE_SAMPLE_C_LZ:
5728      exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
5729      break;
5730
5731   case TGSI_OPCODE_SAMPLE_D:
5732      exec_sample_d(mach, inst);
5733      break;
5734
5735   case TGSI_OPCODE_SAMPLE_L:
5736      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
5737      break;
5738
5739   case TGSI_OPCODE_GATHER4:
5740      exec_sample(mach, inst, TEX_MODIFIER_GATHER, FALSE);
5741      break;
5742
5743   case TGSI_OPCODE_SVIEWINFO:
5744      exec_txq(mach, inst);
5745      break;
5746
5747   case TGSI_OPCODE_SAMPLE_POS:
5748      assert(0);
5749      break;
5750
5751   case TGSI_OPCODE_SAMPLE_INFO:
5752      assert(0);
5753      break;
5754
5755   case TGSI_OPCODE_LOD:
5756      exec_lodq(mach, inst);
5757      break;
5758
5759   case TGSI_OPCODE_UARL:
5760      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5761      break;
5762
5763   case TGSI_OPCODE_UCMP:
5764      exec_ucmp(mach, inst);
5765      break;
5766
5767   case TGSI_OPCODE_IABS:
5768      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5769      break;
5770
5771   case TGSI_OPCODE_ISSG:
5772      exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5773      break;
5774
5775   case TGSI_OPCODE_TEX2:
5776      /* simple texture lookup */
5777      /* src[0] = texcoord */
5778      /* src[1] = compare */
5779      /* src[2] = sampler unit */
5780      exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
5781      break;
5782   case TGSI_OPCODE_TXB2:
5783      /* simple texture lookup */
5784      /* src[0] = texcoord */
5785      /* src[1] = bias */
5786      /* src[2] = sampler unit */
5787      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
5788      break;
5789   case TGSI_OPCODE_TXL2:
5790      /* simple texture lookup */
5791      /* src[0] = texcoord */
5792      /* src[1] = lod */
5793      /* src[2] = sampler unit */
5794      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
5795      break;
5796
5797   case TGSI_OPCODE_IBFE:
5798      exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5799      break;
5800   case TGSI_OPCODE_UBFE:
5801      exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5802      break;
5803   case TGSI_OPCODE_BFI:
5804      exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5805      break;
5806   case TGSI_OPCODE_BREV:
5807      exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5808      break;
5809   case TGSI_OPCODE_POPC:
5810      exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5811      break;
5812   case TGSI_OPCODE_LSB:
5813      exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5814      break;
5815   case TGSI_OPCODE_IMSB:
5816      exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5817      break;
5818   case TGSI_OPCODE_UMSB:
5819      exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
5820      break;
5821
5822   case TGSI_OPCODE_F2D:
5823      exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
5824      break;
5825
5826   case TGSI_OPCODE_D2F:
5827      exec_64_2_t(mach, inst, micro_d2f, TGSI_EXEC_DATA_FLOAT);
5828      break;
5829
5830   case TGSI_OPCODE_DABS:
5831      exec_double_unary(mach, inst, micro_dabs);
5832      break;
5833
5834   case TGSI_OPCODE_DNEG:
5835      exec_double_unary(mach, inst, micro_dneg);
5836      break;
5837
5838   case TGSI_OPCODE_DADD:
5839      exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
5840      break;
5841
5842   case TGSI_OPCODE_DDIV:
5843      exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
5844      break;
5845
5846   case TGSI_OPCODE_DMUL:
5847      exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
5848      break;
5849
5850   case TGSI_OPCODE_DMAX:
5851      exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
5852      break;
5853
5854   case TGSI_OPCODE_DMIN:
5855      exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
5856      break;
5857
5858   case TGSI_OPCODE_DSLT:
5859      exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
5860      break;
5861
5862   case TGSI_OPCODE_DSGE:
5863      exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
5864      break;
5865
5866   case TGSI_OPCODE_DSEQ:
5867      exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
5868      break;
5869
5870   case TGSI_OPCODE_DSNE:
5871      exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
5872      break;
5873
5874   case TGSI_OPCODE_DRCP:
5875      exec_double_unary(mach, inst, micro_drcp);
5876      break;
5877
5878   case TGSI_OPCODE_DSQRT:
5879      exec_double_unary(mach, inst, micro_dsqrt);
5880      break;
5881
5882   case TGSI_OPCODE_DRSQ:
5883      exec_double_unary(mach, inst, micro_drsq);
5884      break;
5885
5886   case TGSI_OPCODE_DMAD:
5887      exec_double_trinary(mach, inst, micro_dmad);
5888      break;
5889
5890   case TGSI_OPCODE_DFRAC:
5891      exec_double_unary(mach, inst, micro_dfrac);
5892      break;
5893
5894   case TGSI_OPCODE_DLDEXP:
5895      exec_dldexp(mach, inst);
5896      break;
5897
5898   case TGSI_OPCODE_DFRACEXP:
5899      exec_dfracexp(mach, inst);
5900      break;
5901
5902   case TGSI_OPCODE_I2D:
5903      exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_INT);
5904      break;
5905
5906   case TGSI_OPCODE_D2I:
5907      exec_64_2_t(mach, inst, micro_d2i, TGSI_EXEC_DATA_INT);
5908      break;
5909
5910   case TGSI_OPCODE_U2D:
5911      exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_UINT);
5912      break;
5913
5914   case TGSI_OPCODE_D2U:
5915      exec_64_2_t(mach, inst, micro_d2u, TGSI_EXEC_DATA_INT);
5916      break;
5917
5918   case TGSI_OPCODE_LOAD:
5919      exec_load(mach, inst);
5920      break;
5921
5922   case TGSI_OPCODE_STORE:
5923      exec_store(mach, inst);
5924      break;
5925
5926   case TGSI_OPCODE_ATOMUADD:
5927   case TGSI_OPCODE_ATOMXCHG:
5928   case TGSI_OPCODE_ATOMCAS:
5929   case TGSI_OPCODE_ATOMAND:
5930   case TGSI_OPCODE_ATOMOR:
5931   case TGSI_OPCODE_ATOMXOR:
5932   case TGSI_OPCODE_ATOMUMIN:
5933   case TGSI_OPCODE_ATOMUMAX:
5934   case TGSI_OPCODE_ATOMIMIN:
5935   case TGSI_OPCODE_ATOMIMAX:
5936      exec_atomop(mach, inst);
5937      break;
5938
5939   case TGSI_OPCODE_RESQ:
5940      exec_resq(mach, inst);
5941      break;
5942   case TGSI_OPCODE_BARRIER:
5943   case TGSI_OPCODE_MEMBAR:
5944      return TRUE;
5945      break;
5946
5947   case TGSI_OPCODE_I64ABS:
5948      exec_double_unary(mach, inst, micro_i64abs);
5949      break;
5950
5951   case TGSI_OPCODE_I64SSG:
5952      exec_double_unary(mach, inst, micro_i64sgn);
5953      break;
5954
5955   case TGSI_OPCODE_I64NEG:
5956      exec_double_unary(mach, inst, micro_i64neg);
5957      break;
5958
5959   case TGSI_OPCODE_U64SEQ:
5960      exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
5961      break;
5962
5963   case TGSI_OPCODE_U64SNE:
5964      exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
5965      break;
5966
5967   case TGSI_OPCODE_I64SLT:
5968      exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
5969      break;
5970   case TGSI_OPCODE_U64SLT:
5971      exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
5972      break;
5973
5974   case TGSI_OPCODE_I64SGE:
5975      exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
5976      break;
5977   case TGSI_OPCODE_U64SGE:
5978      exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
5979      break;
5980
5981   case TGSI_OPCODE_I64MIN:
5982      exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
5983      break;
5984   case TGSI_OPCODE_U64MIN:
5985      exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
5986      break;
5987   case TGSI_OPCODE_I64MAX:
5988      exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
5989      break;
5990   case TGSI_OPCODE_U64MAX:
5991      exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
5992      break;
5993   case TGSI_OPCODE_U64ADD:
5994      exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
5995      break;
5996   case TGSI_OPCODE_U64MUL:
5997      exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
5998      break;
5999   case TGSI_OPCODE_U64SHL:
6000      exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
6001      break;
6002   case TGSI_OPCODE_I64SHR:
6003      exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
6004      break;
6005   case TGSI_OPCODE_U64SHR:
6006      exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
6007      break;
6008   case TGSI_OPCODE_U64DIV:
6009      exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
6010      break;
6011   case TGSI_OPCODE_I64DIV:
6012      exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
6013      break;
6014   case TGSI_OPCODE_U64MOD:
6015      exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
6016      break;
6017   case TGSI_OPCODE_I64MOD:
6018      exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
6019      break;
6020
6021   case TGSI_OPCODE_F2U64:
6022      exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
6023      break;
6024
6025   case TGSI_OPCODE_F2I64:
6026      exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
6027      break;
6028
6029   case TGSI_OPCODE_U2I64:
6030      exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
6031      break;
6032   case TGSI_OPCODE_I2I64:
6033      exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
6034      break;
6035
6036   case TGSI_OPCODE_D2U64:
6037      exec_double_unary(mach, inst, micro_d2u64);
6038      break;
6039
6040   case TGSI_OPCODE_D2I64:
6041      exec_double_unary(mach, inst, micro_d2i64);
6042      break;
6043
6044   case TGSI_OPCODE_U642F:
6045      exec_64_2_t(mach, inst, micro_u642f, TGSI_EXEC_DATA_FLOAT);
6046      break;
6047   case TGSI_OPCODE_I642F:
6048      exec_64_2_t(mach, inst, micro_i642f, TGSI_EXEC_DATA_FLOAT);
6049      break;
6050
6051   case TGSI_OPCODE_U642D:
6052      exec_double_unary(mach, inst, micro_u642d);
6053      break;
6054   case TGSI_OPCODE_I642D:
6055      exec_double_unary(mach, inst, micro_i642d);
6056      break;
6057
6058   default:
6059      assert( 0 );
6060   }
6061   return FALSE;
6062}
6063
6064static void
6065tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
6066{
6067   uint default_mask = 0xf;
6068
6069   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
6070   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
6071
6072   if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
6073      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
6074      mach->Primitives[0] = 0;
6075      /* GS runs on a single primitive for now */
6076      default_mask = 0x1;
6077   }
6078
6079   if (mach->NonHelperMask == 0)
6080      mach->NonHelperMask = default_mask;
6081   mach->CondMask = default_mask;
6082   mach->LoopMask = default_mask;
6083   mach->ContMask = default_mask;
6084   mach->FuncMask = default_mask;
6085   mach->ExecMask = default_mask;
6086
6087   mach->Switch.mask = default_mask;
6088
6089   assert(mach->CondStackTop == 0);
6090   assert(mach->LoopStackTop == 0);
6091   assert(mach->ContStackTop == 0);
6092   assert(mach->SwitchStackTop == 0);
6093   assert(mach->BreakStackTop == 0);
6094   assert(mach->CallStackTop == 0);
6095}
6096
6097/**
6098 * Run TGSI interpreter.
6099 * \return bitmask of "alive" quad components
6100 */
6101uint
6102tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
6103{
6104   uint i;
6105
6106   mach->pc = start_pc;
6107
6108   if (!start_pc) {
6109      tgsi_exec_machine_setup_masks(mach);
6110
6111      /* execute declarations (interpolants) */
6112      for (i = 0; i < mach->NumDeclarations; i++) {
6113         exec_declaration( mach, mach->Declarations+i );
6114      }
6115   }
6116
6117   {
6118#if DEBUG_EXECUTION
6119      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
6120      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
6121      uint inst = 1;
6122
6123      if (!start_pc) {
6124         memset(mach->Temps, 0, sizeof(temps));
6125         if (mach->Outputs)
6126            memset(mach->Outputs, 0, sizeof(outputs));
6127         memset(temps, 0, sizeof(temps));
6128         memset(outputs, 0, sizeof(outputs));
6129      }
6130#endif
6131
6132      /* execute instructions, until pc is set to -1 */
6133      while (mach->pc != -1) {
6134         boolean barrier_hit;
6135#if DEBUG_EXECUTION
6136         uint i;
6137
6138         tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
6139#endif
6140
6141         assert(mach->pc < (int) mach->NumInstructions);
6142         barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
6143
6144         /* for compute shaders if we hit a barrier return now for later rescheduling */
6145         if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
6146            return 0;
6147
6148#if DEBUG_EXECUTION
6149         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
6150            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
6151               uint j;
6152
6153               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
6154               debug_printf("TEMP[%2u] = ", i);
6155               for (j = 0; j < 4; j++) {
6156                  if (j > 0) {
6157                     debug_printf("           ");
6158                  }
6159                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6160                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
6161                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
6162                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
6163                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
6164               }
6165            }
6166         }
6167         if (mach->Outputs) {
6168            for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
6169               if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
6170                  uint j;
6171
6172                  memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
6173                  debug_printf("OUT[%2u] =  ", i);
6174                  for (j = 0; j < 4; j++) {
6175                     if (j > 0) {
6176                        debug_printf("           ");
6177                     }
6178                     debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6179                                  outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
6180                                  outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
6181                                  outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
6182                                  outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
6183                  }
6184               }
6185            }
6186         }
6187#endif
6188      }
6189   }
6190
6191#if 0
6192   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
6193   if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
6194      /*
6195       * Scale back depth component.
6196       */
6197      for (i = 0; i < 4; i++)
6198         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
6199   }
6200#endif
6201
6202   /* Strictly speaking, these assertions aren't really needed but they
6203    * can potentially catch some bugs in the control flow code.
6204    */
6205   assert(mach->CondStackTop == 0);
6206   assert(mach->LoopStackTop == 0);
6207   assert(mach->ContStackTop == 0);
6208   assert(mach->SwitchStackTop == 0);
6209   assert(mach->BreakStackTop == 0);
6210   assert(mach->CallStackTop == 0);
6211
6212   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
6213}
6214