1/**************************************************************************
2 *
3 * Copyright 2007-2008 VMware, Inc.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK) which are
 * controlled by the flow control instructions (for example IF/ELSE/ENDIF,
 * LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/u_half.h"
62#include "util/u_memory.h"
63#include "util/u_math.h"
64#include "util/rounding.h"
65
66
/* NOTE(review): presumably enables execution tracing; it is not referenced
 * in this part of the file -- confirm against the rest of the source. */
#define DEBUG_EXECUTION 0


/* When non-zero, micro_exp2() and micro_lg2() use the util_fast_*
 * approximations instead of the libm routines. */
#define FAST_MATH 0

/* Lane indices inside a four-wide channel, named after positions in a 2x2
 * tile.  micro_ddx() and micro_ddy() use them to pick the lanes to subtract. */
#define TILE_TOP_LEFT     0
#define TILE_TOP_RIGHT    1
#define TILE_BOTTOM_LEFT  2
#define TILE_BOTTOM_RIGHT 3
76
/**
 * One channel of 64-bit values, TGSI_QUAD_SIZE lanes wide.
 *
 * The members are overlapping views of the same storage: doubles, pairs of
 * unsigned words, unsigned 64-bit integers and signed 64-bit integers.
 * The comparison helpers in this file (micro_dslt() etc.) store their
 * result in u[lane][0] only.
 */
union tgsi_double_channel {
   double d[TGSI_QUAD_SIZE];
   unsigned u[TGSI_QUAD_SIZE][2];
   uint64_t u64[TGSI_QUAD_SIZE];
   int64_t i64[TGSI_QUAD_SIZE];
};
83
/**
 * A pair of 64-bit channels.
 *
 * NOTE(review): the member names suggest that 'xy' holds the 64-bit value
 * stored across the x/y components and 'zw' the one stored across z/w --
 * confirm against the users of this struct.
 */
struct tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
88
89static void
90micro_abs(union tgsi_exec_channel *dst,
91          const union tgsi_exec_channel *src)
92{
93   dst->f[0] = fabsf(src->f[0]);
94   dst->f[1] = fabsf(src->f[1]);
95   dst->f[2] = fabsf(src->f[2]);
96   dst->f[3] = fabsf(src->f[3]);
97}
98
99static void
100micro_arl(union tgsi_exec_channel *dst,
101          const union tgsi_exec_channel *src)
102{
103   dst->i[0] = (int)floorf(src->f[0]);
104   dst->i[1] = (int)floorf(src->f[1]);
105   dst->i[2] = (int)floorf(src->f[2]);
106   dst->i[3] = (int)floorf(src->f[3]);
107}
108
109static void
110micro_arr(union tgsi_exec_channel *dst,
111          const union tgsi_exec_channel *src)
112{
113   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
114   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
115   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
116   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
117}
118
119static void
120micro_ceil(union tgsi_exec_channel *dst,
121           const union tgsi_exec_channel *src)
122{
123   dst->f[0] = ceilf(src->f[0]);
124   dst->f[1] = ceilf(src->f[1]);
125   dst->f[2] = ceilf(src->f[2]);
126   dst->f[3] = ceilf(src->f[3]);
127}
128
129static void
130micro_cmp(union tgsi_exec_channel *dst,
131          const union tgsi_exec_channel *src0,
132          const union tgsi_exec_channel *src1,
133          const union tgsi_exec_channel *src2)
134{
135   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
136   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
137   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
138   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
139}
140
141static void
142micro_cos(union tgsi_exec_channel *dst,
143          const union tgsi_exec_channel *src)
144{
145   dst->f[0] = cosf(src->f[0]);
146   dst->f[1] = cosf(src->f[1]);
147   dst->f[2] = cosf(src->f[2]);
148   dst->f[3] = cosf(src->f[3]);
149}
150
151static void
152micro_d2f(union tgsi_exec_channel *dst,
153          const union tgsi_double_channel *src)
154{
155   dst->f[0] = (float)src->d[0];
156   dst->f[1] = (float)src->d[1];
157   dst->f[2] = (float)src->d[2];
158   dst->f[3] = (float)src->d[3];
159}
160
161static void
162micro_d2i(union tgsi_exec_channel *dst,
163          const union tgsi_double_channel *src)
164{
165   dst->i[0] = (int)src->d[0];
166   dst->i[1] = (int)src->d[1];
167   dst->i[2] = (int)src->d[2];
168   dst->i[3] = (int)src->d[3];
169}
170
171static void
172micro_d2u(union tgsi_exec_channel *dst,
173          const union tgsi_double_channel *src)
174{
175   dst->u[0] = (unsigned)src->d[0];
176   dst->u[1] = (unsigned)src->d[1];
177   dst->u[2] = (unsigned)src->d[2];
178   dst->u[3] = (unsigned)src->d[3];
179}
180static void
181micro_dabs(union tgsi_double_channel *dst,
182           const union tgsi_double_channel *src)
183{
184   dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
185   dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
186   dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
187   dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
188}
189
190static void
191micro_dadd(union tgsi_double_channel *dst,
192          const union tgsi_double_channel *src)
193{
194   dst->d[0] = src[0].d[0] + src[1].d[0];
195   dst->d[1] = src[0].d[1] + src[1].d[1];
196   dst->d[2] = src[0].d[2] + src[1].d[2];
197   dst->d[3] = src[0].d[3] + src[1].d[3];
198}
199
200static void
201micro_ddiv(union tgsi_double_channel *dst,
202          const union tgsi_double_channel *src)
203{
204   dst->d[0] = src[0].d[0] / src[1].d[0];
205   dst->d[1] = src[0].d[1] / src[1].d[1];
206   dst->d[2] = src[0].d[2] / src[1].d[2];
207   dst->d[3] = src[0].d[3] / src[1].d[3];
208}
209
210static void
211micro_ddx(union tgsi_exec_channel *dst,
212          const union tgsi_exec_channel *src)
213{
214   dst->f[0] =
215   dst->f[1] =
216   dst->f[2] =
217   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
218}
219
220static void
221micro_ddy(union tgsi_exec_channel *dst,
222          const union tgsi_exec_channel *src)
223{
224   dst->f[0] =
225   dst->f[1] =
226   dst->f[2] =
227   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
228}
229
230static void
231micro_dmul(union tgsi_double_channel *dst,
232           const union tgsi_double_channel *src)
233{
234   dst->d[0] = src[0].d[0] * src[1].d[0];
235   dst->d[1] = src[0].d[1] * src[1].d[1];
236   dst->d[2] = src[0].d[2] * src[1].d[2];
237   dst->d[3] = src[0].d[3] * src[1].d[3];
238}
239
240static void
241micro_dmax(union tgsi_double_channel *dst,
242           const union tgsi_double_channel *src)
243{
244   dst->d[0] = src[0].d[0] > src[1].d[0] ? src[0].d[0] : src[1].d[0];
245   dst->d[1] = src[0].d[1] > src[1].d[1] ? src[0].d[1] : src[1].d[1];
246   dst->d[2] = src[0].d[2] > src[1].d[2] ? src[0].d[2] : src[1].d[2];
247   dst->d[3] = src[0].d[3] > src[1].d[3] ? src[0].d[3] : src[1].d[3];
248}
249
250static void
251micro_dmin(union tgsi_double_channel *dst,
252           const union tgsi_double_channel *src)
253{
254   dst->d[0] = src[0].d[0] < src[1].d[0] ? src[0].d[0] : src[1].d[0];
255   dst->d[1] = src[0].d[1] < src[1].d[1] ? src[0].d[1] : src[1].d[1];
256   dst->d[2] = src[0].d[2] < src[1].d[2] ? src[0].d[2] : src[1].d[2];
257   dst->d[3] = src[0].d[3] < src[1].d[3] ? src[0].d[3] : src[1].d[3];
258}
259
260static void
261micro_dneg(union tgsi_double_channel *dst,
262           const union tgsi_double_channel *src)
263{
264   dst->d[0] = -src->d[0];
265   dst->d[1] = -src->d[1];
266   dst->d[2] = -src->d[2];
267   dst->d[3] = -src->d[3];
268}
269
270static void
271micro_dslt(union tgsi_double_channel *dst,
272           const union tgsi_double_channel *src)
273{
274   dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
275   dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
276   dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
277   dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
278}
279
280static void
281micro_dsne(union tgsi_double_channel *dst,
282           const union tgsi_double_channel *src)
283{
284   dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
285   dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
286   dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
287   dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
288}
289
290static void
291micro_dsge(union tgsi_double_channel *dst,
292           const union tgsi_double_channel *src)
293{
294   dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
295   dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
296   dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
297   dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
298}
299
300static void
301micro_dseq(union tgsi_double_channel *dst,
302           const union tgsi_double_channel *src)
303{
304   dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
305   dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
306   dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
307   dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
308}
309
310static void
311micro_drcp(union tgsi_double_channel *dst,
312           const union tgsi_double_channel *src)
313{
314   dst->d[0] = 1.0 / src->d[0];
315   dst->d[1] = 1.0 / src->d[1];
316   dst->d[2] = 1.0 / src->d[2];
317   dst->d[3] = 1.0 / src->d[3];
318}
319
320static void
321micro_dsqrt(union tgsi_double_channel *dst,
322            const union tgsi_double_channel *src)
323{
324   dst->d[0] = sqrt(src->d[0]);
325   dst->d[1] = sqrt(src->d[1]);
326   dst->d[2] = sqrt(src->d[2]);
327   dst->d[3] = sqrt(src->d[3]);
328}
329
330static void
331micro_drsq(union tgsi_double_channel *dst,
332          const union tgsi_double_channel *src)
333{
334   dst->d[0] = 1.0 / sqrt(src->d[0]);
335   dst->d[1] = 1.0 / sqrt(src->d[1]);
336   dst->d[2] = 1.0 / sqrt(src->d[2]);
337   dst->d[3] = 1.0 / sqrt(src->d[3]);
338}
339
340static void
341micro_dmad(union tgsi_double_channel *dst,
342           const union tgsi_double_channel *src)
343{
344   dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
345   dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
346   dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
347   dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
348}
349
350static void
351micro_dfrac(union tgsi_double_channel *dst,
352            const union tgsi_double_channel *src)
353{
354   dst->d[0] = src->d[0] - floor(src->d[0]);
355   dst->d[1] = src->d[1] - floor(src->d[1]);
356   dst->d[2] = src->d[2] - floor(src->d[2]);
357   dst->d[3] = src->d[3] - floor(src->d[3]);
358}
359
360static void
361micro_dldexp(union tgsi_double_channel *dst,
362             const union tgsi_double_channel *src0,
363             union tgsi_exec_channel *src1)
364{
365   dst->d[0] = ldexp(src0->d[0], src1->i[0]);
366   dst->d[1] = ldexp(src0->d[1], src1->i[1]);
367   dst->d[2] = ldexp(src0->d[2], src1->i[2]);
368   dst->d[3] = ldexp(src0->d[3], src1->i[3]);
369}
370
371static void
372micro_dfracexp(union tgsi_double_channel *dst,
373               union tgsi_exec_channel *dst_exp,
374               const union tgsi_double_channel *src)
375{
376   dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
377   dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
378   dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
379   dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
380}
381
382static void
383micro_exp2(union tgsi_exec_channel *dst,
384           const union tgsi_exec_channel *src)
385{
386#if FAST_MATH
387   dst->f[0] = util_fast_exp2(src->f[0]);
388   dst->f[1] = util_fast_exp2(src->f[1]);
389   dst->f[2] = util_fast_exp2(src->f[2]);
390   dst->f[3] = util_fast_exp2(src->f[3]);
391#else
392#if DEBUG
393   /* Inf is okay for this instruction, so clamp it to silence assertions. */
394   uint i;
395   union tgsi_exec_channel clamped;
396
397   for (i = 0; i < 4; i++) {
398      if (src->f[i] > 127.99999f) {
399         clamped.f[i] = 127.99999f;
400      } else if (src->f[i] < -126.99999f) {
401         clamped.f[i] = -126.99999f;
402      } else {
403         clamped.f[i] = src->f[i];
404      }
405   }
406   src = &clamped;
407#endif /* DEBUG */
408
409   dst->f[0] = powf(2.0f, src->f[0]);
410   dst->f[1] = powf(2.0f, src->f[1]);
411   dst->f[2] = powf(2.0f, src->f[2]);
412   dst->f[3] = powf(2.0f, src->f[3]);
413#endif /* FAST_MATH */
414}
415
416static void
417micro_f2d(union tgsi_double_channel *dst,
418          const union tgsi_exec_channel *src)
419{
420   dst->d[0] = (double)src->f[0];
421   dst->d[1] = (double)src->f[1];
422   dst->d[2] = (double)src->f[2];
423   dst->d[3] = (double)src->f[3];
424}
425
426static void
427micro_flr(union tgsi_exec_channel *dst,
428          const union tgsi_exec_channel *src)
429{
430   dst->f[0] = floorf(src->f[0]);
431   dst->f[1] = floorf(src->f[1]);
432   dst->f[2] = floorf(src->f[2]);
433   dst->f[3] = floorf(src->f[3]);
434}
435
436static void
437micro_frc(union tgsi_exec_channel *dst,
438          const union tgsi_exec_channel *src)
439{
440   dst->f[0] = src->f[0] - floorf(src->f[0]);
441   dst->f[1] = src->f[1] - floorf(src->f[1]);
442   dst->f[2] = src->f[2] - floorf(src->f[2]);
443   dst->f[3] = src->f[3] - floorf(src->f[3]);
444}
445
446static void
447micro_i2d(union tgsi_double_channel *dst,
448          const union tgsi_exec_channel *src)
449{
450   dst->d[0] = (double)src->i[0];
451   dst->d[1] = (double)src->i[1];
452   dst->d[2] = (double)src->i[2];
453   dst->d[3] = (double)src->i[3];
454}
455
456static void
457micro_iabs(union tgsi_exec_channel *dst,
458           const union tgsi_exec_channel *src)
459{
460   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
461   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
462   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
463   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
464}
465
466static void
467micro_ineg(union tgsi_exec_channel *dst,
468           const union tgsi_exec_channel *src)
469{
470   dst->i[0] = -src->i[0];
471   dst->i[1] = -src->i[1];
472   dst->i[2] = -src->i[2];
473   dst->i[3] = -src->i[3];
474}
475
476static void
477micro_lg2(union tgsi_exec_channel *dst,
478          const union tgsi_exec_channel *src)
479{
480#if FAST_MATH
481   dst->f[0] = util_fast_log2(src->f[0]);
482   dst->f[1] = util_fast_log2(src->f[1]);
483   dst->f[2] = util_fast_log2(src->f[2]);
484   dst->f[3] = util_fast_log2(src->f[3]);
485#else
486   dst->f[0] = logf(src->f[0]) * 1.442695f;
487   dst->f[1] = logf(src->f[1]) * 1.442695f;
488   dst->f[2] = logf(src->f[2]) * 1.442695f;
489   dst->f[3] = logf(src->f[3]) * 1.442695f;
490#endif
491}
492
493static void
494micro_lrp(union tgsi_exec_channel *dst,
495          const union tgsi_exec_channel *src0,
496          const union tgsi_exec_channel *src1,
497          const union tgsi_exec_channel *src2)
498{
499   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
500   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
501   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
502   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
503}
504
505static void
506micro_mad(union tgsi_exec_channel *dst,
507          const union tgsi_exec_channel *src0,
508          const union tgsi_exec_channel *src1,
509          const union tgsi_exec_channel *src2)
510{
511   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
512   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
513   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
514   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
515}
516
517static void
518micro_mov(union tgsi_exec_channel *dst,
519          const union tgsi_exec_channel *src)
520{
521   dst->u[0] = src->u[0];
522   dst->u[1] = src->u[1];
523   dst->u[2] = src->u[2];
524   dst->u[3] = src->u[3];
525}
526
527static void
528micro_rcp(union tgsi_exec_channel *dst,
529          const union tgsi_exec_channel *src)
530{
531#if 0 /* for debugging */
532   assert(src->f[0] != 0.0f);
533   assert(src->f[1] != 0.0f);
534   assert(src->f[2] != 0.0f);
535   assert(src->f[3] != 0.0f);
536#endif
537   dst->f[0] = 1.0f / src->f[0];
538   dst->f[1] = 1.0f / src->f[1];
539   dst->f[2] = 1.0f / src->f[2];
540   dst->f[3] = 1.0f / src->f[3];
541}
542
543static void
544micro_rnd(union tgsi_exec_channel *dst,
545          const union tgsi_exec_channel *src)
546{
547   dst->f[0] = _mesa_roundevenf(src->f[0]);
548   dst->f[1] = _mesa_roundevenf(src->f[1]);
549   dst->f[2] = _mesa_roundevenf(src->f[2]);
550   dst->f[3] = _mesa_roundevenf(src->f[3]);
551}
552
553static void
554micro_rsq(union tgsi_exec_channel *dst,
555          const union tgsi_exec_channel *src)
556{
557#if 0 /* for debugging */
558   assert(src->f[0] != 0.0f);
559   assert(src->f[1] != 0.0f);
560   assert(src->f[2] != 0.0f);
561   assert(src->f[3] != 0.0f);
562#endif
563   dst->f[0] = 1.0f / sqrtf(src->f[0]);
564   dst->f[1] = 1.0f / sqrtf(src->f[1]);
565   dst->f[2] = 1.0f / sqrtf(src->f[2]);
566   dst->f[3] = 1.0f / sqrtf(src->f[3]);
567}
568
569static void
570micro_sqrt(union tgsi_exec_channel *dst,
571           const union tgsi_exec_channel *src)
572{
573   dst->f[0] = sqrtf(src->f[0]);
574   dst->f[1] = sqrtf(src->f[1]);
575   dst->f[2] = sqrtf(src->f[2]);
576   dst->f[3] = sqrtf(src->f[3]);
577}
578
579static void
580micro_seq(union tgsi_exec_channel *dst,
581          const union tgsi_exec_channel *src0,
582          const union tgsi_exec_channel *src1)
583{
584   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
585   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
586   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
587   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
588}
589
590static void
591micro_sge(union tgsi_exec_channel *dst,
592          const union tgsi_exec_channel *src0,
593          const union tgsi_exec_channel *src1)
594{
595   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
596   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
597   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
598   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
599}
600
601static void
602micro_sgn(union tgsi_exec_channel *dst,
603          const union tgsi_exec_channel *src)
604{
605   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
606   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
607   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
608   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
609}
610
611static void
612micro_isgn(union tgsi_exec_channel *dst,
613          const union tgsi_exec_channel *src)
614{
615   dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
616   dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
617   dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
618   dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
619}
620
621static void
622micro_sgt(union tgsi_exec_channel *dst,
623          const union tgsi_exec_channel *src0,
624          const union tgsi_exec_channel *src1)
625{
626   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
627   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
628   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
629   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
630}
631
632static void
633micro_sin(union tgsi_exec_channel *dst,
634          const union tgsi_exec_channel *src)
635{
636   dst->f[0] = sinf(src->f[0]);
637   dst->f[1] = sinf(src->f[1]);
638   dst->f[2] = sinf(src->f[2]);
639   dst->f[3] = sinf(src->f[3]);
640}
641
642static void
643micro_sle(union tgsi_exec_channel *dst,
644          const union tgsi_exec_channel *src0,
645          const union tgsi_exec_channel *src1)
646{
647   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
648   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
649   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
650   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
651}
652
653static void
654micro_slt(union tgsi_exec_channel *dst,
655          const union tgsi_exec_channel *src0,
656          const union tgsi_exec_channel *src1)
657{
658   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
659   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
660   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
661   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
662}
663
664static void
665micro_sne(union tgsi_exec_channel *dst,
666          const union tgsi_exec_channel *src0,
667          const union tgsi_exec_channel *src1)
668{
669   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
670   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
671   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
672   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
673}
674
675static void
676micro_trunc(union tgsi_exec_channel *dst,
677            const union tgsi_exec_channel *src)
678{
679   dst->f[0] = truncf(src->f[0]);
680   dst->f[1] = truncf(src->f[1]);
681   dst->f[2] = truncf(src->f[2]);
682   dst->f[3] = truncf(src->f[3]);
683}
684
685static void
686micro_u2d(union tgsi_double_channel *dst,
687          const union tgsi_exec_channel *src)
688{
689   dst->d[0] = (double)src->u[0];
690   dst->d[1] = (double)src->u[1];
691   dst->d[2] = (double)src->u[2];
692   dst->d[3] = (double)src->u[3];
693}
694
695static void
696micro_i64abs(union tgsi_double_channel *dst,
697             const union tgsi_double_channel *src)
698{
699   dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
700   dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
701   dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
702   dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
703}
704
705static void
706micro_i64sgn(union tgsi_double_channel *dst,
707             const union tgsi_double_channel *src)
708{
709   dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
710   dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
711   dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
712   dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
713}
714
715static void
716micro_i64neg(union tgsi_double_channel *dst,
717             const union tgsi_double_channel *src)
718{
719   dst->i64[0] = -src->i64[0];
720   dst->i64[1] = -src->i64[1];
721   dst->i64[2] = -src->i64[2];
722   dst->i64[3] = -src->i64[3];
723}
724
725static void
726micro_u64seq(union tgsi_double_channel *dst,
727           const union tgsi_double_channel *src)
728{
729   dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
730   dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
731   dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
732   dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
733}
734
735static void
736micro_u64sne(union tgsi_double_channel *dst,
737             const union tgsi_double_channel *src)
738{
739   dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
740   dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
741   dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
742   dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
743}
744
745static void
746micro_i64slt(union tgsi_double_channel *dst,
747             const union tgsi_double_channel *src)
748{
749   dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
750   dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
751   dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
752   dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
753}
754
755static void
756micro_u64slt(union tgsi_double_channel *dst,
757             const union tgsi_double_channel *src)
758{
759   dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
760   dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
761   dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
762   dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
763}
764
765static void
766micro_i64sge(union tgsi_double_channel *dst,
767           const union tgsi_double_channel *src)
768{
769   dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
770   dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
771   dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
772   dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
773}
774
775static void
776micro_u64sge(union tgsi_double_channel *dst,
777             const union tgsi_double_channel *src)
778{
779   dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
780   dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
781   dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
782   dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
783}
784
785static void
786micro_u64max(union tgsi_double_channel *dst,
787             const union tgsi_double_channel *src)
788{
789   dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
790   dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
791   dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
792   dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
793}
794
795static void
796micro_i64max(union tgsi_double_channel *dst,
797             const union tgsi_double_channel *src)
798{
799   dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
800   dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
801   dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
802   dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
803}
804
805static void
806micro_u64min(union tgsi_double_channel *dst,
807             const union tgsi_double_channel *src)
808{
809   dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
810   dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
811   dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
812   dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
813}
814
815static void
816micro_i64min(union tgsi_double_channel *dst,
817             const union tgsi_double_channel *src)
818{
819   dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
820   dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
821   dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
822   dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
823}
824
825static void
826micro_u64add(union tgsi_double_channel *dst,
827             const union tgsi_double_channel *src)
828{
829   dst->u64[0] = src[0].u64[0] + src[1].u64[0];
830   dst->u64[1] = src[0].u64[1] + src[1].u64[1];
831   dst->u64[2] = src[0].u64[2] + src[1].u64[2];
832   dst->u64[3] = src[0].u64[3] + src[1].u64[3];
833}
834
835static void
836micro_u64mul(union tgsi_double_channel *dst,
837             const union tgsi_double_channel *src)
838{
839   dst->u64[0] = src[0].u64[0] * src[1].u64[0];
840   dst->u64[1] = src[0].u64[1] * src[1].u64[1];
841   dst->u64[2] = src[0].u64[2] * src[1].u64[2];
842   dst->u64[3] = src[0].u64[3] * src[1].u64[3];
843}
844
845static void
846micro_u64div(union tgsi_double_channel *dst,
847             const union tgsi_double_channel *src)
848{
849   dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
850   dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
851   dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
852   dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
853}
854
855static void
856micro_i64div(union tgsi_double_channel *dst,
857             const union tgsi_double_channel *src)
858{
859   dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
860   dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
861   dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
862   dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
863}
864
865static void
866micro_u64mod(union tgsi_double_channel *dst,
867             const union tgsi_double_channel *src)
868{
869   dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
870   dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
871   dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
872   dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
873}
874
875static void
876micro_i64mod(union tgsi_double_channel *dst,
877             const union tgsi_double_channel *src)
878{
879   dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
880   dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
881   dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
882   dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
883}
884
885static void
886micro_u64shl(union tgsi_double_channel *dst,
887             const union tgsi_double_channel *src0,
888             union tgsi_exec_channel *src1)
889{
890   unsigned masked_count;
891   masked_count = src1->u[0] & 0x3f;
892   dst->u64[0] = src0->u64[0] << masked_count;
893   masked_count = src1->u[1] & 0x3f;
894   dst->u64[1] = src0->u64[1] << masked_count;
895   masked_count = src1->u[2] & 0x3f;
896   dst->u64[2] = src0->u64[2] << masked_count;
897   masked_count = src1->u[3] & 0x3f;
898   dst->u64[3] = src0->u64[3] << masked_count;
899}
900
901static void
902micro_i64shr(union tgsi_double_channel *dst,
903             const union tgsi_double_channel *src0,
904             union tgsi_exec_channel *src1)
905{
906   unsigned masked_count;
907   masked_count = src1->u[0] & 0x3f;
908   dst->i64[0] = src0->i64[0] >> masked_count;
909   masked_count = src1->u[1] & 0x3f;
910   dst->i64[1] = src0->i64[1] >> masked_count;
911   masked_count = src1->u[2] & 0x3f;
912   dst->i64[2] = src0->i64[2] >> masked_count;
913   masked_count = src1->u[3] & 0x3f;
914   dst->i64[3] = src0->i64[3] >> masked_count;
915}
916
917static void
918micro_u64shr(union tgsi_double_channel *dst,
919             const union tgsi_double_channel *src0,
920             union tgsi_exec_channel *src1)
921{
922   unsigned masked_count;
923   masked_count = src1->u[0] & 0x3f;
924   dst->u64[0] = src0->u64[0] >> masked_count;
925   masked_count = src1->u[1] & 0x3f;
926   dst->u64[1] = src0->u64[1] >> masked_count;
927   masked_count = src1->u[2] & 0x3f;
928   dst->u64[2] = src0->u64[2] >> masked_count;
929   masked_count = src1->u[3] & 0x3f;
930   dst->u64[3] = src0->u64[3] >> masked_count;
931}
932
/**
 * Data type tags.
 *
 * NOTE(review): judging by the names these select how a channel's bits are
 * interpreted (float, signed/unsigned 32-bit, double, signed/unsigned
 * 64-bit); the users of this enum are outside this part of the file --
 * confirm there.
 */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT,
   TGSI_EXEC_DATA_INT,
   TGSI_EXEC_DATA_UINT,
   TGSI_EXEC_DATA_DOUBLE,
   TGSI_EXEC_DATA_INT64,
   TGSI_EXEC_DATA_UINT64,
};
941
/*
 * Shorthand locations of various utility registers (_I = Index, _C = Channel)
 *
 * Each alias expands to the macro of the same name with a TGSI_EXEC_
 * prefix; the actual values are defined outside this part of the file.
 */
#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_PRIMITIVE_S1_I   TGSI_EXEC_TEMP_PRIMITIVE_S1_I
#define TEMP_PRIMITIVE_S1_C   TGSI_EXEC_TEMP_PRIMITIVE_S1_C
#define TEMP_PRIMITIVE_S2_I   TGSI_EXEC_TEMP_PRIMITIVE_S2_I
#define TEMP_PRIMITIVE_S2_C   TGSI_EXEC_TEMP_PRIMITIVE_S2_C
#define TEMP_PRIMITIVE_S3_I   TGSI_EXEC_TEMP_PRIMITIVE_S3_I
#define TEMP_PRIMITIVE_S3_C   TGSI_EXEC_TEMP_PRIMITIVE_S3_C
957
958static const struct {
959   int idx;
960   int chan;
961} temp_prim_idxs[] = {
962   { TEMP_PRIMITIVE_I, TEMP_PRIMITIVE_C },
963   { TEMP_PRIMITIVE_S1_I, TEMP_PRIMITIVE_S1_C },
964   { TEMP_PRIMITIVE_S2_I, TEMP_PRIMITIVE_S2_C },
965   { TEMP_PRIMITIVE_S3_I, TEMP_PRIMITIVE_S3_C },
966};
967
/**
 * Recompute MACH->ExecMask as the bitwise AND of CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * works with any pointer expression and inside larger expressions.
 * MACH is evaluated several times, so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
      ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                          (MACH)->ContMask & (MACH)->Switch.mask & \
                          (MACH)->FuncMask)
971
972
973static const union tgsi_exec_channel ZeroVec =
974   { { 0.0, 0.0, 0.0, 0.0 } };
975
976static const union tgsi_exec_channel OneVec = {
977   {1.0f, 1.0f, 1.0f, 1.0f}
978};
979
980static const union tgsi_exec_channel P128Vec = {
981   {128.0f, 128.0f, 128.0f, 128.0f}
982};
983
984static const union tgsi_exec_channel M128Vec = {
985   {-128.0f, -128.0f, -128.0f, -128.0f}
986};
987
988
989/**
990 * Assert that none of the float values in 'chan' are infinite or NaN.
991 * NaN and Inf may occur normally during program execution and should
992 * not lead to crashes, etc.  But when debugging, it's helpful to catch
993 * them.
994 */
995static inline void
996check_inf_or_nan(const union tgsi_exec_channel *chan)
997{
998   assert(!util_is_inf_or_nan((chan)->f[0]));
999   assert(!util_is_inf_or_nan((chan)->f[1]));
1000   assert(!util_is_inf_or_nan((chan)->f[2]));
1001   assert(!util_is_inf_or_nan((chan)->f[3]));
1002}
1003
1004
#ifdef DEBUG
/** Debug helper: print the four float components of 'chan', labelled 'msg'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *f = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n", msg, f[0], f[1], f[2], f[3]);
}
#endif
1013
1014
#ifdef DEBUG
/**
 * Debug helper: dump mach->Temps[index], one line per X/Y/Z/W channel,
 * each showing that channel's four float components.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int c = 0;

   debug_printf("Temp[%u] =\n", index);
   while (c < 4) {
      const float *f = vec->xyzw[c].f;

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   "XYZW"[c], f[0], f[1], f[2], f[3]);
      c++;
   }
}
#endif
1032
1033
1034void
1035tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1036                               unsigned num_bufs,
1037                               const void **bufs,
1038                               const unsigned *buf_sizes)
1039{
1040   unsigned i;
1041
1042   for (i = 0; i < num_bufs; i++) {
1043      mach->Consts[i] = bufs[i];
1044      mach->ConstsSize[i] = buf_sizes[i];
1045   }
1046}
1047
1048
1049/**
1050 * Check if there's a potential src/dst register data dependency when
1051 * using SOA execution.
1052 * Example:
1053 *   MOV T, T.yxwz;
1054 * This would expand into:
1055 *   MOV t0, t1;
1056 *   MOV t1, t0;
1057 *   MOV t2, t3;
1058 *   MOV t3, t2;
1059 * The second instruction will have the wrong value for t0 if executed as-is.
1060 */
1061boolean
1062tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
1063{
1064   uint i, chan;
1065
1066   uint writemask = inst->Dst[0].Register.WriteMask;
1067   if (writemask == TGSI_WRITEMASK_X ||
1068       writemask == TGSI_WRITEMASK_Y ||
1069       writemask == TGSI_WRITEMASK_Z ||
1070       writemask == TGSI_WRITEMASK_W ||
1071       writemask == TGSI_WRITEMASK_NONE) {
1072      /* no chance of data dependency */
1073      return FALSE;
1074   }
1075
1076   /* loop over src regs */
1077   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1078      if ((inst->Src[i].Register.File ==
1079           inst->Dst[0].Register.File) &&
1080          ((inst->Src[i].Register.Index ==
1081            inst->Dst[0].Register.Index) ||
1082           inst->Src[i].Register.Indirect ||
1083           inst->Dst[0].Register.Indirect)) {
1084         /* loop over dest channels */
1085         uint channelsWritten = 0x0;
1086         for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
1087            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
1088               /* check if we're reading a channel that's been written */
1089               uint swizzle = tgsi_util_get_full_src_register_swizzle(&inst->Src[i], chan);
1090               if (channelsWritten & (1 << swizzle)) {
1091                  return TRUE;
1092               }
1093
1094               channelsWritten |= (1 << chan);
1095            }
1096         }
1097      }
1098   }
1099   return FALSE;
1100}
1101
1102
1103/**
1104 * Initialize machine state by expanding tokens to full instructions,
1105 * allocating temporary storage, setting up constants, etc.
1106 * After this, we can call tgsi_exec_machine_run() many times.
1107 */
1108void
1109tgsi_exec_machine_bind_shader(
1110   struct tgsi_exec_machine *mach,
1111   const struct tgsi_token *tokens,
1112   struct tgsi_sampler *sampler,
1113   struct tgsi_image *image,
1114   struct tgsi_buffer *buffer)
1115{
1116   uint k;
1117   struct tgsi_parse_context parse;
1118   struct tgsi_full_instruction *instructions;
1119   struct tgsi_full_declaration *declarations;
1120   uint maxInstructions = 10, numInstructions = 0;
1121   uint maxDeclarations = 10, numDeclarations = 0;
1122
1123#if 0
1124   tgsi_dump(tokens, 0);
1125#endif
1126
1127   util_init_math();
1128
1129
1130   mach->Tokens = tokens;
1131   mach->Sampler = sampler;
1132   mach->Image = image;
1133   mach->Buffer = buffer;
1134
1135   if (!tokens) {
1136      /* unbind and free all */
1137      FREE(mach->Declarations);
1138      mach->Declarations = NULL;
1139      mach->NumDeclarations = 0;
1140
1141      FREE(mach->Instructions);
1142      mach->Instructions = NULL;
1143      mach->NumInstructions = 0;
1144
1145      return;
1146   }
1147
1148   k = tgsi_parse_init (&parse, mach->Tokens);
1149   if (k != TGSI_PARSE_OK) {
1150      debug_printf( "Problem parsing!\n" );
1151      return;
1152   }
1153
1154   mach->ImmLimit = 0;
1155   mach->NumOutputs = 0;
1156
1157   for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1158      mach->SysSemanticToIndex[k] = -1;
1159
1160   if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1161       !mach->UsedGeometryShader) {
1162      struct tgsi_exec_vector *inputs;
1163      struct tgsi_exec_vector *outputs;
1164
1165      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1166                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1167                            16);
1168
1169      if (!inputs)
1170         return;
1171
1172      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1173                             TGSI_MAX_TOTAL_VERTICES, 16);
1174
1175      if (!outputs) {
1176         align_free(inputs);
1177         return;
1178      }
1179
1180      align_free(mach->Inputs);
1181      align_free(mach->Outputs);
1182
1183      mach->Inputs = inputs;
1184      mach->Outputs = outputs;
1185      mach->UsedGeometryShader = TRUE;
1186   }
1187
1188   declarations = (struct tgsi_full_declaration *)
1189      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1190
1191   if (!declarations) {
1192      return;
1193   }
1194
1195   instructions = (struct tgsi_full_instruction *)
1196      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1197
1198   if (!instructions) {
1199      FREE( declarations );
1200      return;
1201   }
1202
1203   while( !tgsi_parse_end_of_tokens( &parse ) ) {
1204      uint i;
1205
1206      tgsi_parse_token( &parse );
1207      switch( parse.FullToken.Token.Type ) {
1208      case TGSI_TOKEN_TYPE_DECLARATION:
1209         /* save expanded declaration */
1210         if (numDeclarations == maxDeclarations) {
1211            declarations = REALLOC(declarations,
1212                                   maxDeclarations
1213                                   * sizeof(struct tgsi_full_declaration),
1214                                   (maxDeclarations + 10)
1215                                   * sizeof(struct tgsi_full_declaration));
1216            maxDeclarations += 10;
1217         }
1218         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
1219            unsigned reg;
1220            for (reg = parse.FullToken.FullDeclaration.Range.First;
1221                 reg <= parse.FullToken.FullDeclaration.Range.Last;
1222                 ++reg) {
1223               ++mach->NumOutputs;
1224            }
1225         }
1226         else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1227            const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1228            mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1229         }
1230
1231         memcpy(declarations + numDeclarations,
1232                &parse.FullToken.FullDeclaration,
1233                sizeof(declarations[0]));
1234         numDeclarations++;
1235         break;
1236
1237      case TGSI_TOKEN_TYPE_IMMEDIATE:
1238         {
1239            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1240            assert( size <= 4 );
1241            if (mach->ImmLimit >= mach->ImmsReserved) {
1242               unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
1243               float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
1244               if (imms) {
1245                  mach->ImmsReserved = newReserved;
1246                  mach->Imms = imms;
1247               } else {
1248                  debug_printf("Unable to (re)allocate space for immidiate constants\n");
1249                  break;
1250               }
1251            }
1252
1253            for( i = 0; i < size; i++ ) {
1254               mach->Imms[mach->ImmLimit][i] =
1255		  parse.FullToken.FullImmediate.u[i].Float;
1256            }
1257            mach->ImmLimit += 1;
1258         }
1259         break;
1260
1261      case TGSI_TOKEN_TYPE_INSTRUCTION:
1262
1263         /* save expanded instruction */
1264         if (numInstructions == maxInstructions) {
1265            instructions = REALLOC(instructions,
1266                                   maxInstructions
1267                                   * sizeof(struct tgsi_full_instruction),
1268                                   (maxInstructions + 10)
1269                                   * sizeof(struct tgsi_full_instruction));
1270            maxInstructions += 10;
1271         }
1272
1273         memcpy(instructions + numInstructions,
1274                &parse.FullToken.FullInstruction,
1275                sizeof(instructions[0]));
1276
1277         numInstructions++;
1278         break;
1279
1280      case TGSI_TOKEN_TYPE_PROPERTY:
1281         if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1282            if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1283               mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1284            }
1285         }
1286         break;
1287
1288      default:
1289         assert( 0 );
1290      }
1291   }
1292   tgsi_parse_free (&parse);
1293
1294   FREE(mach->Declarations);
1295   mach->Declarations = declarations;
1296   mach->NumDeclarations = numDeclarations;
1297
1298   FREE(mach->Instructions);
1299   mach->Instructions = instructions;
1300   mach->NumInstructions = numInstructions;
1301}
1302
1303
1304struct tgsi_exec_machine *
1305tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1306{
1307   struct tgsi_exec_machine *mach;
1308   uint i;
1309
1310   mach = align_malloc( sizeof *mach, 16 );
1311   if (!mach)
1312      goto fail;
1313
1314   memset(mach, 0, sizeof(*mach));
1315
1316   mach->ShaderType = shader_type;
1317   mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
1318   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
1319
1320   if (shader_type != PIPE_SHADER_COMPUTE) {
1321      mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1322      mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1323      if (!mach->Inputs || !mach->Outputs)
1324         goto fail;
1325   }
1326
1327   if (shader_type == PIPE_SHADER_FRAGMENT) {
1328      mach->InputSampleOffsetApply = align_malloc(sizeof(apply_sample_offset_func) * PIPE_MAX_SHADER_INPUTS, 16);
1329      if (!mach->InputSampleOffsetApply)
1330         goto fail;
1331   }
1332
1333   /* Setup constants needed by the SSE2 executor. */
1334   for( i = 0; i < 4; i++ ) {
1335      mach->Temps[TGSI_EXEC_TEMP_00000000_I].xyzw[TGSI_EXEC_TEMP_00000000_C].u[i] = 0x00000000;
1336      mach->Temps[TGSI_EXEC_TEMP_7FFFFFFF_I].xyzw[TGSI_EXEC_TEMP_7FFFFFFF_C].u[i] = 0x7FFFFFFF;
1337      mach->Temps[TGSI_EXEC_TEMP_80000000_I].xyzw[TGSI_EXEC_TEMP_80000000_C].u[i] = 0x80000000;
1338      mach->Temps[TGSI_EXEC_TEMP_FFFFFFFF_I].xyzw[TGSI_EXEC_TEMP_FFFFFFFF_C].u[i] = 0xFFFFFFFF;    /* not used */
1339      mach->Temps[TGSI_EXEC_TEMP_ONE_I].xyzw[TGSI_EXEC_TEMP_ONE_C].f[i] = 1.0f;
1340      mach->Temps[TGSI_EXEC_TEMP_TWO_I].xyzw[TGSI_EXEC_TEMP_TWO_C].f[i] = 2.0f;    /* not used */
1341      mach->Temps[TGSI_EXEC_TEMP_128_I].xyzw[TGSI_EXEC_TEMP_128_C].f[i] = 128.0f;
1342      mach->Temps[TGSI_EXEC_TEMP_MINUS_128_I].xyzw[TGSI_EXEC_TEMP_MINUS_128_C].f[i] = -128.0f;
1343      mach->Temps[TGSI_EXEC_TEMP_THREE_I].xyzw[TGSI_EXEC_TEMP_THREE_C].f[i] = 3.0f;
1344      mach->Temps[TGSI_EXEC_TEMP_HALF_I].xyzw[TGSI_EXEC_TEMP_HALF_C].f[i] = 0.5f;
1345   }
1346
1347#ifdef DEBUG
1348   /* silence warnings */
1349   (void) print_chan;
1350   (void) print_temp;
1351#endif
1352
1353   return mach;
1354
1355fail:
1356   if (mach) {
1357      align_free(mach->InputSampleOffsetApply);
1358      align_free(mach->Inputs);
1359      align_free(mach->Outputs);
1360      align_free(mach);
1361   }
1362   return NULL;
1363}
1364
1365
1366void
1367tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1368{
1369   if (mach) {
1370      FREE(mach->Instructions);
1371      FREE(mach->Declarations);
1372      FREE(mach->Imms);
1373
1374      align_free(mach->InputSampleOffsetApply);
1375      align_free(mach->Inputs);
1376      align_free(mach->Outputs);
1377
1378      align_free(mach);
1379   }
1380}
1381
1382static void
1383micro_add(union tgsi_exec_channel *dst,
1384          const union tgsi_exec_channel *src0,
1385          const union tgsi_exec_channel *src1)
1386{
1387   dst->f[0] = src0->f[0] + src1->f[0];
1388   dst->f[1] = src0->f[1] + src1->f[1];
1389   dst->f[2] = src0->f[2] + src1->f[2];
1390   dst->f[3] = src0->f[3] + src1->f[3];
1391}
1392
1393static void
1394micro_div(
1395   union tgsi_exec_channel *dst,
1396   const union tgsi_exec_channel *src0,
1397   const union tgsi_exec_channel *src1 )
1398{
1399   if (src1->f[0] != 0) {
1400      dst->f[0] = src0->f[0] / src1->f[0];
1401   }
1402   if (src1->f[1] != 0) {
1403      dst->f[1] = src0->f[1] / src1->f[1];
1404   }
1405   if (src1->f[2] != 0) {
1406      dst->f[2] = src0->f[2] / src1->f[2];
1407   }
1408   if (src1->f[3] != 0) {
1409      dst->f[3] = src0->f[3] / src1->f[3];
1410   }
1411}
1412
1413static void
1414micro_lt(
1415   union tgsi_exec_channel *dst,
1416   const union tgsi_exec_channel *src0,
1417   const union tgsi_exec_channel *src1,
1418   const union tgsi_exec_channel *src2,
1419   const union tgsi_exec_channel *src3 )
1420{
1421   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1422   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1423   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1424   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1425}
1426
1427static void
1428micro_max(union tgsi_exec_channel *dst,
1429          const union tgsi_exec_channel *src0,
1430          const union tgsi_exec_channel *src1)
1431{
1432   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
1433   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
1434   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
1435   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
1436}
1437
1438static void
1439micro_min(union tgsi_exec_channel *dst,
1440          const union tgsi_exec_channel *src0,
1441          const union tgsi_exec_channel *src1)
1442{
1443   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
1444   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
1445   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
1446   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
1447}
1448
1449static void
1450micro_mul(union tgsi_exec_channel *dst,
1451          const union tgsi_exec_channel *src0,
1452          const union tgsi_exec_channel *src1)
1453{
1454   dst->f[0] = src0->f[0] * src1->f[0];
1455   dst->f[1] = src0->f[1] * src1->f[1];
1456   dst->f[2] = src0->f[2] * src1->f[2];
1457   dst->f[3] = src0->f[3] * src1->f[3];
1458}
1459
1460static void
1461micro_neg(
1462   union tgsi_exec_channel *dst,
1463   const union tgsi_exec_channel *src )
1464{
1465   dst->f[0] = -src->f[0];
1466   dst->f[1] = -src->f[1];
1467   dst->f[2] = -src->f[2];
1468   dst->f[3] = -src->f[3];
1469}
1470
1471static void
1472micro_pow(
1473   union tgsi_exec_channel *dst,
1474   const union tgsi_exec_channel *src0,
1475   const union tgsi_exec_channel *src1 )
1476{
1477#if FAST_MATH
1478   dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] );
1479   dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] );
1480   dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] );
1481   dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] );
1482#else
1483   dst->f[0] = powf( src0->f[0], src1->f[0] );
1484   dst->f[1] = powf( src0->f[1], src1->f[1] );
1485   dst->f[2] = powf( src0->f[2], src1->f[2] );
1486   dst->f[3] = powf( src0->f[3], src1->f[3] );
1487#endif
1488}
1489
1490static void
1491micro_ldexp(union tgsi_exec_channel *dst,
1492            const union tgsi_exec_channel *src0,
1493            const union tgsi_exec_channel *src1)
1494{
1495   dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
1496   dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
1497   dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
1498   dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
1499}
1500
1501static void
1502micro_sub(union tgsi_exec_channel *dst,
1503          const union tgsi_exec_channel *src0,
1504          const union tgsi_exec_channel *src1)
1505{
1506   dst->f[0] = src0->f[0] - src1->f[0];
1507   dst->f[1] = src0->f[1] - src1->f[1];
1508   dst->f[2] = src0->f[2] - src1->f[2];
1509   dst->f[3] = src0->f[3] - src1->f[3];
1510}
1511
1512static void
1513fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1514                       const uint file,
1515                       const uint swizzle,
1516                       const union tgsi_exec_channel *index,
1517                       const union tgsi_exec_channel *index2D,
1518                       union tgsi_exec_channel *chan)
1519{
1520   uint i;
1521
1522   assert(swizzle < 4);
1523
1524   switch (file) {
1525   case TGSI_FILE_CONSTANT:
1526      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1527         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
1528         assert(mach->Consts[index2D->i[i]]);
1529
1530         if (index->i[i] < 0) {
1531            chan->u[i] = 0;
1532         } else {
1533            /* NOTE: copying the const value as a uint instead of float */
1534            const uint constbuf = index2D->i[i];
1535            const uint *buf = (const uint *)mach->Consts[constbuf];
1536            const int pos = index->i[i] * 4 + swizzle;
1537            /* const buffer bounds check */
1538            if (pos < 0 || pos >= (int) mach->ConstsSize[constbuf]) {
1539               if (0) {
1540                  /* Debug: print warning */
1541                  static int count = 0;
1542                  if (count++ < 100)
1543                     debug_printf("TGSI Exec: const buffer index %d"
1544                                  " out of bounds\n", pos);
1545               }
1546               chan->u[i] = 0;
1547            }
1548            else
1549               chan->u[i] = buf[pos];
1550         }
1551      }
1552      break;
1553
1554   case TGSI_FILE_INPUT:
1555      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1556         /*
1557         if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1558            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1559                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1560                         index2D->i[i], index->i[i]);
1561                         }*/
1562         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1563         assert(pos >= 0);
1564         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1565         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1566      }
1567      break;
1568
1569   case TGSI_FILE_SYSTEM_VALUE:
1570      /* XXX no swizzling at this point.  Will be needed if we put
1571       * gl_FragCoord, for example, in a sys value register.
1572       */
1573      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1574         chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
1575      }
1576      break;
1577
1578   case TGSI_FILE_TEMPORARY:
1579      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1580         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1581         assert(index2D->i[i] == 0);
1582
1583         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1584      }
1585      break;
1586
1587   case TGSI_FILE_IMMEDIATE:
1588      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1589         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1590         assert(index2D->i[i] == 0);
1591
1592         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1593      }
1594      break;
1595
1596   case TGSI_FILE_ADDRESS:
1597      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1598         assert(index->i[i] >= 0);
1599         assert(index2D->i[i] == 0);
1600
1601         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1602      }
1603      break;
1604
1605   case TGSI_FILE_OUTPUT:
1606      /* vertex/fragment output vars can be read too */
1607      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1608         assert(index->i[i] >= 0);
1609         assert(index2D->i[i] == 0);
1610
1611         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1612      }
1613      break;
1614
1615   default:
1616      assert(0);
1617      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1618         chan->u[i] = 0;
1619      }
1620   }
1621}
1622
1623static void
1624get_index_registers(const struct tgsi_exec_machine *mach,
1625                    const struct tgsi_full_src_register *reg,
1626                    union tgsi_exec_channel *index,
1627                    union tgsi_exec_channel *index2D)
1628{
1629   uint swizzle;
1630
1631   /* We start with a direct index into a register file.
1632    *
1633    *    file[1],
1634    *    where:
1635    *       file = Register.File
1636    *       [1] = Register.Index
1637    */
1638   index->i[0] =
1639   index->i[1] =
1640   index->i[2] =
1641   index->i[3] = reg->Register.Index;
1642
1643   /* There is an extra source register that indirectly subscripts
1644    * a register file. The direct index now becomes an offset
1645    * that is being added to the indirect register.
1646    *
1647    *    file[ind[2].x+1],
1648    *    where:
1649    *       ind = Indirect.File
1650    *       [2] = Indirect.Index
1651    *       .x = Indirect.SwizzleX
1652    */
1653   if (reg->Register.Indirect) {
1654      union tgsi_exec_channel index2;
1655      union tgsi_exec_channel indir_index;
1656      const uint execmask = mach->ExecMask;
1657      uint i;
1658
1659      /* which address register (always zero now) */
1660      index2.i[0] =
1661      index2.i[1] =
1662      index2.i[2] =
1663      index2.i[3] = reg->Indirect.Index;
1664      /* get current value of address register[swizzle] */
1665      swizzle = reg->Indirect.Swizzle;
1666      fetch_src_file_channel(mach,
1667                             reg->Indirect.File,
1668                             swizzle,
1669                             &index2,
1670                             &ZeroVec,
1671                             &indir_index);
1672
1673      /* add value of address register to the offset */
1674      index->i[0] += indir_index.i[0];
1675      index->i[1] += indir_index.i[1];
1676      index->i[2] += indir_index.i[2];
1677      index->i[3] += indir_index.i[3];
1678
1679      /* for disabled execution channels, zero-out the index to
1680       * avoid using a potential garbage value.
1681       */
1682      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1683         if ((execmask & (1 << i)) == 0)
1684            index->i[i] = 0;
1685      }
1686   }
1687
1688   /* There is an extra source register that is a second
1689    * subscript to a register file. Effectively it means that
1690    * the register file is actually a 2D array of registers.
1691    *
1692    *    file[3][1],
1693    *    where:
1694    *       [3] = Dimension.Index
1695    */
1696   if (reg->Register.Dimension) {
1697      index2D->i[0] =
1698      index2D->i[1] =
1699      index2D->i[2] =
1700      index2D->i[3] = reg->Dimension.Index;
1701
1702      /* Again, the second subscript index can be addressed indirectly
1703       * identically to the first one.
1704       * Nothing stops us from indirectly addressing the indirect register,
1705       * but there is no need for that, so we won't exercise it.
1706       *
1707       *    file[ind[4].y+3][1],
1708       *    where:
1709       *       ind = DimIndirect.File
1710       *       [4] = DimIndirect.Index
1711       *       .y = DimIndirect.SwizzleX
1712       */
1713      if (reg->Dimension.Indirect) {
1714         union tgsi_exec_channel index2;
1715         union tgsi_exec_channel indir_index;
1716         const uint execmask = mach->ExecMask;
1717         uint i;
1718
1719         index2.i[0] =
1720         index2.i[1] =
1721         index2.i[2] =
1722         index2.i[3] = reg->DimIndirect.Index;
1723
1724         swizzle = reg->DimIndirect.Swizzle;
1725         fetch_src_file_channel(mach,
1726                                reg->DimIndirect.File,
1727                                swizzle,
1728                                &index2,
1729                                &ZeroVec,
1730                                &indir_index);
1731
1732         index2D->i[0] += indir_index.i[0];
1733         index2D->i[1] += indir_index.i[1];
1734         index2D->i[2] += indir_index.i[2];
1735         index2D->i[3] += indir_index.i[3];
1736
1737         /* for disabled execution channels, zero-out the index to
1738          * avoid using a potential garbage value.
1739          */
1740         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1741            if ((execmask & (1 << i)) == 0) {
1742               index2D->i[i] = 0;
1743            }
1744         }
1745      }
1746
1747      /* If by any chance there was a need for a 3D array of register
1748       * files, we would have to check whether Dimension is followed
1749       * by a dimension register and continue the saga.
1750       */
1751   } else {
1752      index2D->i[0] =
1753      index2D->i[1] =
1754      index2D->i[2] =
1755      index2D->i[3] = 0;
1756   }
1757}
1758
1759
1760static void
1761fetch_source_d(const struct tgsi_exec_machine *mach,
1762               union tgsi_exec_channel *chan,
1763               const struct tgsi_full_src_register *reg,
1764	       const uint chan_index)
1765{
1766   union tgsi_exec_channel index;
1767   union tgsi_exec_channel index2D;
1768   uint swizzle;
1769
1770   get_index_registers(mach, reg, &index, &index2D);
1771
1772
1773   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1774   fetch_src_file_channel(mach,
1775                          reg->Register.File,
1776                          swizzle,
1777                          &index,
1778                          &index2D,
1779                          chan);
1780}
1781
1782static void
1783fetch_source(const struct tgsi_exec_machine *mach,
1784             union tgsi_exec_channel *chan,
1785             const struct tgsi_full_src_register *reg,
1786             const uint chan_index,
1787             enum tgsi_exec_datatype src_datatype)
1788{
1789   fetch_source_d(mach, chan, reg, chan_index);
1790
1791   if (reg->Register.Absolute) {
1792      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1793         micro_abs(chan, chan);
1794      } else {
1795         micro_iabs(chan, chan);
1796      }
1797   }
1798
1799   if (reg->Register.Negate) {
1800      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1801         micro_neg(chan, chan);
1802      } else {
1803         micro_ineg(chan, chan);
1804      }
1805   }
1806}
1807
1808static union tgsi_exec_channel *
1809store_dest_dstret(struct tgsi_exec_machine *mach,
1810                 const union tgsi_exec_channel *chan,
1811                 const struct tgsi_full_dst_register *reg,
1812                 uint chan_index,
1813                 enum tgsi_exec_datatype dst_datatype)
1814{
1815   static union tgsi_exec_channel null;
1816   union tgsi_exec_channel *dst;
1817   union tgsi_exec_channel index2D;
1818   int offset = 0;  /* indirection offset */
1819   int index;
1820
1821   /* for debugging */
1822   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
1823      check_inf_or_nan(chan);
1824   }
1825
1826   /* There is an extra source register that indirectly subscripts
1827    * a register file. The direct index now becomes an offset
1828    * that is being added to the indirect register.
1829    *
1830    *    file[ind[2].x+1],
1831    *    where:
1832    *       ind = Indirect.File
1833    *       [2] = Indirect.Index
1834    *       .x = Indirect.SwizzleX
1835    */
1836   if (reg->Register.Indirect) {
1837      union tgsi_exec_channel index;
1838      union tgsi_exec_channel indir_index;
1839      uint swizzle;
1840
1841      /* which address register (always zero for now) */
1842      index.i[0] =
1843      index.i[1] =
1844      index.i[2] =
1845      index.i[3] = reg->Indirect.Index;
1846
1847      /* get current value of address register[swizzle] */
1848      swizzle = reg->Indirect.Swizzle;
1849
1850      /* fetch values from the address/indirection register */
1851      fetch_src_file_channel(mach,
1852                             reg->Indirect.File,
1853                             swizzle,
1854                             &index,
1855                             &ZeroVec,
1856                             &indir_index);
1857
1858      /* save indirection offset */
1859      offset = indir_index.i[0];
1860   }
1861
1862   /* There is an extra source register that is a second
1863    * subscript to a register file. Effectively it means that
1864    * the register file is actually a 2D array of registers.
1865    *
1866    *    file[3][1],
1867    *    where:
1868    *       [3] = Dimension.Index
1869    */
1870   if (reg->Register.Dimension) {
1871      index2D.i[0] =
1872      index2D.i[1] =
1873      index2D.i[2] =
1874      index2D.i[3] = reg->Dimension.Index;
1875
1876      /* Again, the second subscript index can be addressed indirectly
1877       * identically to the first one.
1878       * Nothing stops us from indirectly addressing the indirect register,
1879       * but there is no need for that, so we won't exercise it.
1880       *
1881       *    file[ind[4].y+3][1],
1882       *    where:
1883       *       ind = DimIndirect.File
1884       *       [4] = DimIndirect.Index
1885       *       .y = DimIndirect.SwizzleX
1886       */
1887      if (reg->Dimension.Indirect) {
1888         union tgsi_exec_channel index2;
1889         union tgsi_exec_channel indir_index;
1890         const uint execmask = mach->ExecMask;
1891         unsigned swizzle;
1892         uint i;
1893
1894         index2.i[0] =
1895         index2.i[1] =
1896         index2.i[2] =
1897         index2.i[3] = reg->DimIndirect.Index;
1898
1899         swizzle = reg->DimIndirect.Swizzle;
1900         fetch_src_file_channel(mach,
1901                                reg->DimIndirect.File,
1902                                swizzle,
1903                                &index2,
1904                                &ZeroVec,
1905                                &indir_index);
1906
1907         index2D.i[0] += indir_index.i[0];
1908         index2D.i[1] += indir_index.i[1];
1909         index2D.i[2] += indir_index.i[2];
1910         index2D.i[3] += indir_index.i[3];
1911
1912         /* for disabled execution channels, zero-out the index to
1913          * avoid using a potential garbage value.
1914          */
1915         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1916            if ((execmask & (1 << i)) == 0) {
1917               index2D.i[i] = 0;
1918            }
1919         }
1920      }
1921
1922      /* If by any chance there was a need for a 3D array of register
1923       * files, we would have to check whether Dimension is followed
1924       * by a dimension register and continue the saga.
1925       */
1926   } else {
1927      index2D.i[0] =
1928      index2D.i[1] =
1929      index2D.i[2] =
1930      index2D.i[3] = 0;
1931   }
1932
1933   switch (reg->Register.File) {
1934   case TGSI_FILE_NULL:
1935      dst = &null;
1936      break;
1937
1938   case TGSI_FILE_OUTPUT:
1939      index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
1940         + reg->Register.Index;
1941      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1942#if 0
1943      debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1944                   mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1945                   reg->Register.Index);
1946      if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1947         debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1948         for (i = 0; i < TGSI_QUAD_SIZE; i++)
1949            if (execmask & (1 << i))
1950               debug_printf("%f, ", chan->f[i]);
1951         debug_printf(")\n");
1952      }
1953#endif
1954      break;
1955
1956   case TGSI_FILE_TEMPORARY:
1957      index = reg->Register.Index;
1958      assert( index < TGSI_EXEC_NUM_TEMPS );
1959      dst = &mach->Temps[offset + index].xyzw[chan_index];
1960      break;
1961
1962   case TGSI_FILE_ADDRESS:
1963      index = reg->Register.Index;
1964      dst = &mach->Addrs[index].xyzw[chan_index];
1965      break;
1966
1967   default:
1968      assert( 0 );
1969      return NULL;
1970   }
1971
1972   return dst;
1973}
1974
1975static void
1976store_dest_double(struct tgsi_exec_machine *mach,
1977                 const union tgsi_exec_channel *chan,
1978                 const struct tgsi_full_dst_register *reg,
1979                 uint chan_index,
1980                 enum tgsi_exec_datatype dst_datatype)
1981{
1982   union tgsi_exec_channel *dst;
1983   const uint execmask = mach->ExecMask;
1984   int i;
1985
1986   dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
1987   if (!dst)
1988      return;
1989
1990   /* doubles path */
1991   for (i = 0; i < TGSI_QUAD_SIZE; i++)
1992      if (execmask & (1 << i))
1993         dst->i[i] = chan->i[i];
1994}
1995
1996static void
1997store_dest(struct tgsi_exec_machine *mach,
1998           const union tgsi_exec_channel *chan,
1999           const struct tgsi_full_dst_register *reg,
2000           const struct tgsi_full_instruction *inst,
2001           uint chan_index,
2002           enum tgsi_exec_datatype dst_datatype)
2003{
2004   union tgsi_exec_channel *dst;
2005   const uint execmask = mach->ExecMask;
2006   int i;
2007
2008   dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
2009   if (!dst)
2010      return;
2011
2012   if (!inst->Instruction.Saturate) {
2013      for (i = 0; i < TGSI_QUAD_SIZE; i++)
2014         if (execmask & (1 << i))
2015            dst->i[i] = chan->i[i];
2016   }
2017   else {
2018      for (i = 0; i < TGSI_QUAD_SIZE; i++)
2019         if (execmask & (1 << i)) {
2020            if (chan->f[i] < 0.0f)
2021               dst->f[i] = 0.0f;
2022            else if (chan->f[i] > 1.0f)
2023               dst->f[i] = 1.0f;
2024            else
2025               dst->i[i] = chan->i[i];
2026         }
2027   }
2028}
2029
/*
 * Shorthands for fetch_source() on operand INDEX of the current instruction.
 * Both expand to code that refers to variables named `mach` and `inst`,
 * which must be in scope wherever the macros are used.
 */

/* Fetch channel CHAN of inst->Src[INDEX] into VAL with TGSI_EXEC_DATA_FLOAT. */
#define FETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)

/* Fetch channel CHAN of inst->Src[INDEX] into VAL with TGSI_EXEC_DATA_INT. */
#define IFETCH(VAL, INDEX, CHAN) \
   fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_INT)
2035
2036
2037/**
2038 * Execute ARB-style KIL which is predicated by a src register.
2039 * Kill fragment if any of the four values is less than zero.
2040 */
2041static void
2042exec_kill_if(struct tgsi_exec_machine *mach,
2043             const struct tgsi_full_instruction *inst)
2044{
2045   uint uniquemask;
2046   uint chan_index;
2047   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2048   union tgsi_exec_channel r[1];
2049
2050   /* This mask stores component bits that were already tested. */
2051   uniquemask = 0;
2052
2053   for (chan_index = 0; chan_index < 4; chan_index++)
2054   {
2055      uint swizzle;
2056      uint i;
2057
2058      /* unswizzle channel */
2059      swizzle = tgsi_util_get_full_src_register_swizzle (
2060                        &inst->Src[0],
2061                        chan_index);
2062
2063      /* check if the component has not been already tested */
2064      if (uniquemask & (1 << swizzle))
2065         continue;
2066      uniquemask |= 1 << swizzle;
2067
2068      FETCH(&r[0], 0, chan_index);
2069      for (i = 0; i < 4; i++)
2070         if (r[0].f[i] < 0.0f)
2071            kilmask |= 1 << i;
2072   }
2073
2074   /* restrict to fragments currently executing */
2075   kilmask &= mach->ExecMask;
2076
2077   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2078}
2079
2080/**
2081 * Unconditional fragment kill/discard.
2082 */
2083static void
2084exec_kill(struct tgsi_exec_machine *mach)
2085{
2086   uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
2087
2088   /* kill fragment for all fragments currently executing */
2089   kilmask = mach->ExecMask;
2090   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
2091}
2092
2093static void
2094emit_vertex(struct tgsi_exec_machine *mach,
2095            const struct tgsi_full_instruction *inst)
2096{
2097   union tgsi_exec_channel r[1];
2098   unsigned stream_id;
2099   unsigned *prim_count;
2100   /* FIXME: check for exec mask correctly
2101   unsigned i;
2102   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2103         if ((mach->ExecMask & (1 << i)))
2104   */
2105   IFETCH(&r[0], 0, TGSI_CHAN_X);
2106   stream_id = r[0].u[0];
2107   prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2108   if (mach->ExecMask) {
2109      if (mach->Primitives[stream_id][*prim_count] >= mach->MaxOutputVertices)
2110         return;
2111
2112      if (mach->Primitives[stream_id][*prim_count] == 0)
2113         mach->PrimitiveOffsets[stream_id][*prim_count] = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0];
2114      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
2115      mach->Primitives[stream_id][*prim_count]++;
2116   }
2117}
2118
2119static void
2120emit_primitive(struct tgsi_exec_machine *mach,
2121               const struct tgsi_full_instruction *inst)
2122{
2123   unsigned *prim_count;
2124   union tgsi_exec_channel r[1];
2125   unsigned stream_id = 0;
2126   /* FIXME: check for exec mask correctly
2127   unsigned i;
2128   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
2129         if ((mach->ExecMask & (1 << i)))
2130   */
2131   if (inst) {
2132      IFETCH(&r[0], 0, TGSI_CHAN_X);
2133      stream_id = r[0].u[0];
2134   }
2135   prim_count = &mach->Temps[temp_prim_idxs[stream_id].idx].xyzw[temp_prim_idxs[stream_id].chan].u[0];
2136   if (mach->ExecMask) {
2137      ++(*prim_count);
2138      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
2139      mach->Primitives[stream_id][*prim_count] = 0;
2140   }
2141}
2142
2143static void
2144conditional_emit_primitive(struct tgsi_exec_machine *mach)
2145{
2146   if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
2147      int emitted_verts =
2148         mach->Primitives[0][mach->Temps[temp_prim_idxs[0].idx].xyzw[temp_prim_idxs[0].chan].u[0]];
2149      if (emitted_verts) {
2150         emit_primitive(mach, NULL);
2151      }
2152   }
2153}
2154
2155
2156/*
2157 * Fetch four texture samples using STR texture coordinates.
2158 */
2159static void
2160fetch_texel( struct tgsi_sampler *sampler,
2161             const unsigned sview_idx,
2162             const unsigned sampler_idx,
2163             const union tgsi_exec_channel *s,
2164             const union tgsi_exec_channel *t,
2165             const union tgsi_exec_channel *p,
2166             const union tgsi_exec_channel *c0,
2167             const union tgsi_exec_channel *c1,
2168             float derivs[3][2][TGSI_QUAD_SIZE],
2169             const int8_t offset[3],
2170             enum tgsi_sampler_control control,
2171             union tgsi_exec_channel *r,
2172             union tgsi_exec_channel *g,
2173             union tgsi_exec_channel *b,
2174             union tgsi_exec_channel *a )
2175{
2176   uint j;
2177   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2178
2179   /* FIXME: handle explicit derivs, offsets */
2180   sampler->get_samples(sampler, sview_idx, sampler_idx,
2181                        s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
2182
2183   for (j = 0; j < 4; j++) {
2184      r->f[j] = rgba[0][j];
2185      g->f[j] = rgba[1][j];
2186      b->f[j] = rgba[2][j];
2187      a->f[j] = rgba[3][j];
2188   }
2189}
2190
2191
/*
 * Texture instruction variants, passed as the `modifier` argument of the
 * texture/sample execution functions.
 */
#define TEX_MODIFIER_NONE         0  /* no extra operand */
#define TEX_MODIFIER_PROJECTED    1  /* extra operand divides the coordinates */
#define TEX_MODIFIER_LOD_BIAS     2  /* selects TGSI_SAMPLER_LOD_BIAS */
#define TEX_MODIFIER_EXPLICIT_LOD 3  /* selects TGSI_SAMPLER_LOD_EXPLICIT */
#define TEX_MODIFIER_LEVEL_ZERO   4  /* selects TGSI_SAMPLER_LOD_ZERO */
#define TEX_MODIFIER_GATHER       5  /* selects TGSI_SAMPLER_GATHER */
2198
2199/*
2200 * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
2201 */
2202static void
2203fetch_texel_offsets(struct tgsi_exec_machine *mach,
2204                    const struct tgsi_full_instruction *inst,
2205                    int8_t offsets[3])
2206{
2207   if (inst->Texture.NumOffsets == 1) {
2208      union tgsi_exec_channel index;
2209      union tgsi_exec_channel offset[3];
2210      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2211      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2212                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2213      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2214                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2215      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2216                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2217     offsets[0] = offset[0].i[0];
2218     offsets[1] = offset[1].i[0];
2219     offsets[2] = offset[2].i[0];
2220   } else {
2221     assert(inst->Texture.NumOffsets == 0);
2222     offsets[0] = offsets[1] = offsets[2] = 0;
2223   }
2224}
2225
2226
2227/*
2228 * Fetch dx and dy values for one channel (s, t or r).
2229 * Put dx values into one float array, dy values into another.
2230 */
2231static void
2232fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
2233                           const struct tgsi_full_instruction *inst,
2234                           unsigned regdsrcx,
2235                           unsigned chan,
2236                           float derivs[2][TGSI_QUAD_SIZE])
2237{
2238   union tgsi_exec_channel d;
2239   FETCH(&d, regdsrcx, chan);
2240   derivs[0][0] = d.f[0];
2241   derivs[0][1] = d.f[1];
2242   derivs[0][2] = d.f[2];
2243   derivs[0][3] = d.f[3];
2244   FETCH(&d, regdsrcx + 1, chan);
2245   derivs[1][0] = d.f[0];
2246   derivs[1][1] = d.f[1];
2247   derivs[1][2] = d.f[2];
2248   derivs[1][3] = d.f[3];
2249}
2250
2251static uint
2252fetch_sampler_unit(struct tgsi_exec_machine *mach,
2253                   const struct tgsi_full_instruction *inst,
2254                   uint sampler)
2255{
2256   uint unit = 0;
2257   int i;
2258   if (inst->Src[sampler].Register.Indirect) {
2259      const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2260      union tgsi_exec_channel indir_index, index2;
2261      const uint execmask = mach->ExecMask;
2262      index2.i[0] =
2263      index2.i[1] =
2264      index2.i[2] =
2265      index2.i[3] = reg->Indirect.Index;
2266
2267      fetch_src_file_channel(mach,
2268                             reg->Indirect.File,
2269                             reg->Indirect.Swizzle,
2270                             &index2,
2271                             &ZeroVec,
2272                             &indir_index);
2273      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2274         if (execmask & (1 << i)) {
2275            unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2276            break;
2277         }
2278      }
2279
2280   } else {
2281      unit = inst->Src[sampler].Register.Index;
2282   }
2283   return unit;
2284}
2285
2286/*
2287 * execute a texture instruction.
2288 *
2289 * modifier is used to control the channel routing for the
2290 * instruction variants like proj, lod, and texture with lod bias.
2291 * sampler indicates which src register the sampler is contained in.
2292 */
2293static void
2294exec_tex(struct tgsi_exec_machine *mach,
2295         const struct tgsi_full_instruction *inst,
2296         uint modifier, uint sampler)
2297{
2298   const union tgsi_exec_channel *args[5], *proj = NULL;
2299   union tgsi_exec_channel r[5];
2300   enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2301   uint chan;
2302   uint unit;
2303   int8_t offsets[3];
2304   int dim, shadow_ref, i;
2305
2306   unit = fetch_sampler_unit(mach, inst, sampler);
2307   /* always fetch all 3 offsets, overkill but keeps code simple */
2308   fetch_texel_offsets(mach, inst, offsets);
2309
2310   assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2311   assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2312
2313   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2314   shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2315
2316   assert(dim <= 4);
2317   if (shadow_ref >= 0)
2318      assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
2319
2320   /* fetch modifier to the last argument */
2321   if (modifier != TEX_MODIFIER_NONE) {
2322      const int last = ARRAY_SIZE(args) - 1;
2323
2324      /* fetch modifier from src0.w or src1.x */
2325      if (sampler == 1) {
2326         assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2327         FETCH(&r[last], 0, TGSI_CHAN_W);
2328      }
2329      else {
2330         FETCH(&r[last], 1, TGSI_CHAN_X);
2331      }
2332
2333      if (modifier != TEX_MODIFIER_PROJECTED) {
2334         args[last] = &r[last];
2335      }
2336      else {
2337         proj = &r[last];
2338         args[last] = &ZeroVec;
2339      }
2340
2341      /* point unused arguments to zero vector */
2342      for (i = dim; i < last; i++)
2343         args[i] = &ZeroVec;
2344
2345      if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2346         control = TGSI_SAMPLER_LOD_EXPLICIT;
2347      else if (modifier == TEX_MODIFIER_LOD_BIAS)
2348         control = TGSI_SAMPLER_LOD_BIAS;
2349      else if (modifier == TEX_MODIFIER_GATHER)
2350         control = TGSI_SAMPLER_GATHER;
2351   }
2352   else {
2353      for (i = dim; i < (int)ARRAY_SIZE(args); i++)
2354         args[i] = &ZeroVec;
2355   }
2356
2357   /* fetch coordinates */
2358   for (i = 0; i < dim; i++) {
2359      FETCH(&r[i], 0, TGSI_CHAN_X + i);
2360
2361      if (proj)
2362         micro_div(&r[i], &r[i], proj);
2363
2364      args[i] = &r[i];
2365   }
2366
2367   /* fetch reference value */
2368   if (shadow_ref >= 0) {
2369      FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2370
2371      if (proj)
2372         micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2373
2374      args[shadow_ref] = &r[shadow_ref];
2375   }
2376
2377   fetch_texel(mach->Sampler, unit, unit,
2378         args[0], args[1], args[2], args[3], args[4],
2379         NULL, offsets, control,
2380         &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2381
2382#if 0
2383   debug_printf("fetch r: %g %g %g %g\n",
2384         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2385   debug_printf("fetch g: %g %g %g %g\n",
2386         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2387   debug_printf("fetch b: %g %g %g %g\n",
2388         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2389   debug_printf("fetch a: %g %g %g %g\n",
2390         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2391#endif
2392
2393   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2394      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2395         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2396      }
2397   }
2398}
2399
2400static void
2401exec_lodq(struct tgsi_exec_machine *mach,
2402          const struct tgsi_full_instruction *inst)
2403{
2404   uint resource_unit, sampler_unit;
2405   unsigned dim;
2406   unsigned i;
2407   union tgsi_exec_channel coords[4];
2408   const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2409   union tgsi_exec_channel r[2];
2410
2411   resource_unit = fetch_sampler_unit(mach, inst, 1);
2412   if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2413      uint target = mach->SamplerViews[resource_unit].Resource;
2414      dim = tgsi_util_get_texture_coord_dim(target);
2415      sampler_unit = fetch_sampler_unit(mach, inst, 2);
2416   } else {
2417      dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2418      sampler_unit = resource_unit;
2419   }
2420   assert(dim <= ARRAY_SIZE(coords));
2421   /* fetch coordinates */
2422   for (i = 0; i < dim; i++) {
2423      FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2424      args[i] = &coords[i];
2425   }
2426   for (i = dim; i < ARRAY_SIZE(coords); i++) {
2427      args[i] = &ZeroVec;
2428   }
2429   mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
2430                            args[0]->f,
2431                            args[1]->f,
2432                            args[2]->f,
2433                            args[3]->f,
2434                            TGSI_SAMPLER_LOD_NONE,
2435                            r[0].f,
2436                            r[1].f);
2437
2438   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2439      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2440                 TGSI_EXEC_DATA_FLOAT);
2441   }
2442   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2443      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2444                 TGSI_EXEC_DATA_FLOAT);
2445   }
2446   if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2447      unsigned char swizzles[4];
2448      unsigned chan;
2449      swizzles[0] = inst->Src[1].Register.SwizzleX;
2450      swizzles[1] = inst->Src[1].Register.SwizzleY;
2451      swizzles[2] = inst->Src[1].Register.SwizzleZ;
2452      swizzles[3] = inst->Src[1].Register.SwizzleW;
2453
2454      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2455         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2456            if (swizzles[chan] >= 2) {
2457               store_dest(mach, &ZeroVec,
2458                          &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2459            } else {
2460               store_dest(mach, &r[swizzles[chan]],
2461                          &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2462            }
2463         }
2464      }
2465   } else {
2466      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2467         store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X,
2468                    TGSI_EXEC_DATA_FLOAT);
2469      }
2470      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2471         store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y,
2472                    TGSI_EXEC_DATA_FLOAT);
2473      }
2474   }
2475}
2476
2477static void
2478exec_txd(struct tgsi_exec_machine *mach,
2479         const struct tgsi_full_instruction *inst)
2480{
2481   union tgsi_exec_channel r[4];
2482   float derivs[3][2][TGSI_QUAD_SIZE];
2483   uint chan;
2484   uint unit;
2485   int8_t offsets[3];
2486
2487   unit = fetch_sampler_unit(mach, inst, 3);
2488   /* always fetch all 3 offsets, overkill but keeps code simple */
2489   fetch_texel_offsets(mach, inst, offsets);
2490
2491   switch (inst->Texture.Texture) {
2492   case TGSI_TEXTURE_1D:
2493      FETCH(&r[0], 0, TGSI_CHAN_X);
2494
2495      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2496
2497      fetch_texel(mach->Sampler, unit, unit,
2498                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2499                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2500                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2501      break;
2502
2503   case TGSI_TEXTURE_SHADOW1D:
2504   case TGSI_TEXTURE_1D_ARRAY:
2505   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2506      /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2507      FETCH(&r[0], 0, TGSI_CHAN_X);
2508      FETCH(&r[1], 0, TGSI_CHAN_Y);
2509      FETCH(&r[2], 0, TGSI_CHAN_Z);
2510
2511      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2512
2513      fetch_texel(mach->Sampler, unit, unit,
2514                  &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2515                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2516                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2517      break;
2518
2519   case TGSI_TEXTURE_2D:
2520   case TGSI_TEXTURE_RECT:
2521      FETCH(&r[0], 0, TGSI_CHAN_X);
2522      FETCH(&r[1], 0, TGSI_CHAN_Y);
2523
2524      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2525      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2526
2527      fetch_texel(mach->Sampler, unit, unit,
2528                  &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2529                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2530                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2531      break;
2532
2533
2534   case TGSI_TEXTURE_SHADOW2D:
2535   case TGSI_TEXTURE_SHADOWRECT:
2536   case TGSI_TEXTURE_2D_ARRAY:
2537   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2538      /* only SHADOW2D_ARRAY actually needs W */
2539      FETCH(&r[0], 0, TGSI_CHAN_X);
2540      FETCH(&r[1], 0, TGSI_CHAN_Y);
2541      FETCH(&r[2], 0, TGSI_CHAN_Z);
2542      FETCH(&r[3], 0, TGSI_CHAN_W);
2543
2544      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2545      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2546
2547      fetch_texel(mach->Sampler, unit, unit,
2548                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2549                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2550                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2551      break;
2552
2553   case TGSI_TEXTURE_3D:
2554   case TGSI_TEXTURE_CUBE:
2555   case TGSI_TEXTURE_CUBE_ARRAY:
2556   case TGSI_TEXTURE_SHADOWCUBE:
2557      /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2558      FETCH(&r[0], 0, TGSI_CHAN_X);
2559      FETCH(&r[1], 0, TGSI_CHAN_Y);
2560      FETCH(&r[2], 0, TGSI_CHAN_Z);
2561      FETCH(&r[3], 0, TGSI_CHAN_W);
2562
2563      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2564      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2565      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2566
2567      fetch_texel(mach->Sampler, unit, unit,
2568                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2569                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2570                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2571      break;
2572
2573   default:
2574      assert(0);
2575   }
2576
2577   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2578      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2579         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2580      }
2581   }
2582}
2583
2584
2585static void
2586exec_txf(struct tgsi_exec_machine *mach,
2587         const struct tgsi_full_instruction *inst)
2588{
2589   union tgsi_exec_channel r[4];
2590   uint chan;
2591   uint unit;
2592   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2593   int j;
2594   int8_t offsets[3];
2595   unsigned target;
2596
2597   unit = fetch_sampler_unit(mach, inst, 1);
2598   /* always fetch all 3 offsets, overkill but keeps code simple */
2599   fetch_texel_offsets(mach, inst, offsets);
2600
2601   IFETCH(&r[3], 0, TGSI_CHAN_W);
2602
2603   if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2604       inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2605      target = mach->SamplerViews[unit].Resource;
2606   }
2607   else {
2608      target = inst->Texture.Texture;
2609   }
2610   switch(target) {
2611   case TGSI_TEXTURE_3D:
2612   case TGSI_TEXTURE_2D_ARRAY:
2613   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2614   case TGSI_TEXTURE_2D_ARRAY_MSAA:
2615      IFETCH(&r[2], 0, TGSI_CHAN_Z);
2616      /* fallthrough */
2617   case TGSI_TEXTURE_2D:
2618   case TGSI_TEXTURE_RECT:
2619   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2620   case TGSI_TEXTURE_SHADOW2D:
2621   case TGSI_TEXTURE_SHADOWRECT:
2622   case TGSI_TEXTURE_1D_ARRAY:
2623   case TGSI_TEXTURE_2D_MSAA:
2624      IFETCH(&r[1], 0, TGSI_CHAN_Y);
2625      /* fallthrough */
2626   case TGSI_TEXTURE_BUFFER:
2627   case TGSI_TEXTURE_1D:
2628   case TGSI_TEXTURE_SHADOW1D:
2629      IFETCH(&r[0], 0, TGSI_CHAN_X);
2630      break;
2631   default:
2632      assert(0);
2633      break;
2634   }
2635
2636   mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2637                            offsets, rgba);
2638
2639   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2640      r[0].f[j] = rgba[0][j];
2641      r[1].f[j] = rgba[1][j];
2642      r[2].f[j] = rgba[2][j];
2643      r[3].f[j] = rgba[3][j];
2644   }
2645
2646   if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2647       inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2648      unsigned char swizzles[4];
2649      swizzles[0] = inst->Src[1].Register.SwizzleX;
2650      swizzles[1] = inst->Src[1].Register.SwizzleY;
2651      swizzles[2] = inst->Src[1].Register.SwizzleZ;
2652      swizzles[3] = inst->Src[1].Register.SwizzleW;
2653
2654      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2655         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2656            store_dest(mach, &r[swizzles[chan]],
2657                       &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2658         }
2659      }
2660   }
2661   else {
2662      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2663         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2664            store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2665         }
2666      }
2667   }
2668}
2669
2670static void
2671exec_txq(struct tgsi_exec_machine *mach,
2672         const struct tgsi_full_instruction *inst)
2673{
2674   int result[4];
2675   union tgsi_exec_channel r[4], src;
2676   uint chan;
2677   uint unit;
2678   int i,j;
2679
2680   unit = fetch_sampler_unit(mach, inst, 1);
2681
2682   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2683
2684   /* XXX: This interface can't return per-pixel values */
2685   mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2686
2687   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2688      for (j = 0; j < 4; j++) {
2689         r[j].i[i] = result[j];
2690      }
2691   }
2692
2693   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2694      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2695         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
2696                    TGSI_EXEC_DATA_INT);
2697      }
2698   }
2699}
2700
2701static void
2702exec_sample(struct tgsi_exec_machine *mach,
2703            const struct tgsi_full_instruction *inst,
2704            uint modifier, boolean compare)
2705{
2706   const uint resource_unit = inst->Src[1].Register.Index;
2707   const uint sampler_unit = inst->Src[2].Register.Index;
2708   union tgsi_exec_channel r[5], c1;
2709   const union tgsi_exec_channel *lod = &ZeroVec;
2710   enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2711   uint chan;
2712   unsigned char swizzles[4];
2713   int8_t offsets[3];
2714
2715   /* always fetch all 3 offsets, overkill but keeps code simple */
2716   fetch_texel_offsets(mach, inst, offsets);
2717
2718   assert(modifier != TEX_MODIFIER_PROJECTED);
2719
2720   if (modifier != TEX_MODIFIER_NONE) {
2721      if (modifier == TEX_MODIFIER_LOD_BIAS) {
2722         FETCH(&c1, 3, TGSI_CHAN_X);
2723         lod = &c1;
2724         control = TGSI_SAMPLER_LOD_BIAS;
2725      }
2726      else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2727         FETCH(&c1, 3, TGSI_CHAN_X);
2728         lod = &c1;
2729         control = TGSI_SAMPLER_LOD_EXPLICIT;
2730      }
2731      else if (modifier == TEX_MODIFIER_GATHER) {
2732         control = TGSI_SAMPLER_GATHER;
2733      }
2734      else {
2735         assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2736         control = TGSI_SAMPLER_LOD_ZERO;
2737      }
2738   }
2739
2740   FETCH(&r[0], 0, TGSI_CHAN_X);
2741
2742   switch (mach->SamplerViews[resource_unit].Resource) {
2743   case TGSI_TEXTURE_1D:
2744      if (compare) {
2745         FETCH(&r[2], 3, TGSI_CHAN_X);
2746         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2747                     &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2748                     NULL, offsets, control,
2749                     &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2750      }
2751      else {
2752         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2753                     &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2754                     NULL, offsets, control,
2755                     &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2756      }
2757      break;
2758
2759   case TGSI_TEXTURE_1D_ARRAY:
2760   case TGSI_TEXTURE_2D:
2761   case TGSI_TEXTURE_RECT:
2762      FETCH(&r[1], 0, TGSI_CHAN_Y);
2763      if (compare) {
2764         FETCH(&r[2], 3, TGSI_CHAN_X);
2765         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2766                     &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2767                     NULL, offsets, control,
2768                     &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2769      }
2770      else {
2771         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2772                     &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2773                     NULL, offsets, control,
2774                     &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2775      }
2776      break;
2777
2778   case TGSI_TEXTURE_2D_ARRAY:
2779   case TGSI_TEXTURE_3D:
2780   case TGSI_TEXTURE_CUBE:
2781      FETCH(&r[1], 0, TGSI_CHAN_Y);
2782      FETCH(&r[2], 0, TGSI_CHAN_Z);
2783      if(compare) {
2784         FETCH(&r[3], 3, TGSI_CHAN_X);
2785         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2786                     &r[0], &r[1], &r[2], &r[3], lod,
2787                     NULL, offsets, control,
2788                     &r[0], &r[1], &r[2], &r[3]);
2789      }
2790      else {
2791         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2792                     &r[0], &r[1], &r[2], &ZeroVec, lod,
2793                     NULL, offsets, control,
2794                     &r[0], &r[1], &r[2], &r[3]);
2795      }
2796      break;
2797
2798   case TGSI_TEXTURE_CUBE_ARRAY:
2799      FETCH(&r[1], 0, TGSI_CHAN_Y);
2800      FETCH(&r[2], 0, TGSI_CHAN_Z);
2801      FETCH(&r[3], 0, TGSI_CHAN_W);
2802      if(compare) {
2803         FETCH(&r[4], 3, TGSI_CHAN_X);
2804         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2805                     &r[0], &r[1], &r[2], &r[3], &r[4],
2806                     NULL, offsets, control,
2807                     &r[0], &r[1], &r[2], &r[3]);
2808      }
2809      else {
2810         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2811                     &r[0], &r[1], &r[2], &r[3], lod,
2812                     NULL, offsets, control,
2813                     &r[0], &r[1], &r[2], &r[3]);
2814      }
2815      break;
2816
2817
2818   default:
2819      assert(0);
2820   }
2821
2822   swizzles[0] = inst->Src[1].Register.SwizzleX;
2823   swizzles[1] = inst->Src[1].Register.SwizzleY;
2824   swizzles[2] = inst->Src[1].Register.SwizzleZ;
2825   swizzles[3] = inst->Src[1].Register.SwizzleW;
2826
2827   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2828      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2829         store_dest(mach, &r[swizzles[chan]],
2830                    &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2831      }
2832   }
2833}
2834
2835static void
2836exec_sample_d(struct tgsi_exec_machine *mach,
2837              const struct tgsi_full_instruction *inst)
2838{
2839   const uint resource_unit = inst->Src[1].Register.Index;
2840   const uint sampler_unit = inst->Src[2].Register.Index;
2841   union tgsi_exec_channel r[4];
2842   float derivs[3][2][TGSI_QUAD_SIZE];
2843   uint chan;
2844   unsigned char swizzles[4];
2845   int8_t offsets[3];
2846
2847   /* always fetch all 3 offsets, overkill but keeps code simple */
2848   fetch_texel_offsets(mach, inst, offsets);
2849
2850   FETCH(&r[0], 0, TGSI_CHAN_X);
2851
2852   switch (mach->SamplerViews[resource_unit].Resource) {
2853   case TGSI_TEXTURE_1D:
2854   case TGSI_TEXTURE_1D_ARRAY:
2855      /* only 1D array actually needs Y */
2856      FETCH(&r[1], 0, TGSI_CHAN_Y);
2857
2858      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2859
2860      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2861                  &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2862                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2863                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2864      break;
2865
2866   case TGSI_TEXTURE_2D:
2867   case TGSI_TEXTURE_RECT:
2868   case TGSI_TEXTURE_2D_ARRAY:
2869      /* only 2D array actually needs Z */
2870      FETCH(&r[1], 0, TGSI_CHAN_Y);
2871      FETCH(&r[2], 0, TGSI_CHAN_Z);
2872
2873      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2874      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2875
2876      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2877                  &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2878                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2879                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2880      break;
2881
2882   case TGSI_TEXTURE_3D:
2883   case TGSI_TEXTURE_CUBE:
2884   case TGSI_TEXTURE_CUBE_ARRAY:
2885      /* only cube array actually needs W */
2886      FETCH(&r[1], 0, TGSI_CHAN_Y);
2887      FETCH(&r[2], 0, TGSI_CHAN_Z);
2888      FETCH(&r[3], 0, TGSI_CHAN_W);
2889
2890      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2891      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2892      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2893
2894      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2895                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2896                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2897                  &r[0], &r[1], &r[2], &r[3]);
2898      break;
2899
2900   default:
2901      assert(0);
2902   }
2903
2904   swizzles[0] = inst->Src[1].Register.SwizzleX;
2905   swizzles[1] = inst->Src[1].Register.SwizzleY;
2906   swizzles[2] = inst->Src[1].Register.SwizzleZ;
2907   swizzles[3] = inst->Src[1].Register.SwizzleW;
2908
2909   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2910      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2911         store_dest(mach, &r[swizzles[chan]],
2912                    &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
2913      }
2914   }
2915}
2916
2917
2918/**
2919 * Evaluate a constant-valued coefficient at the position of the
2920 * current quad.
2921 */
2922static void
2923eval_constant_coef(
2924   struct tgsi_exec_machine *mach,
2925   unsigned attrib,
2926   unsigned chan )
2927{
2928   unsigned i;
2929
2930   for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2931      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2932   }
2933}
2934
2935static void
2936interp_constant_offset(
2937      UNUSED const struct tgsi_exec_machine *mach,
2938      UNUSED unsigned attrib,
2939      UNUSED unsigned chan,
2940      UNUSED float ofs_x,
2941      UNUSED float ofs_y,
2942      UNUSED union tgsi_exec_channel *out_chan)
2943{
2944}
2945
2946/**
2947 * Evaluate a linear-valued coefficient at the position of the
2948 * current quad.
2949 */
2950static void
2951interp_linear_offset(
2952      const struct tgsi_exec_machine *mach,
2953      unsigned attrib,
2954      unsigned chan,
2955      float ofs_x,
2956      float ofs_y,
2957      union tgsi_exec_channel *out_chan)
2958{
2959   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2960   const float dady = mach->InterpCoefs[attrib].dady[chan];
2961   const float delta = ofs_x * dadx + ofs_y * dady;
2962   out_chan->f[0] += delta;
2963   out_chan->f[1] += delta;
2964   out_chan->f[2] += delta;
2965   out_chan->f[3] += delta;
2966}
2967
2968static void
2969eval_linear_coef(struct tgsi_exec_machine *mach,
2970                 unsigned attrib,
2971                 unsigned chan)
2972{
2973   const float x = mach->QuadPos.xyzw[0].f[0];
2974   const float y = mach->QuadPos.xyzw[1].f[0];
2975   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2976   const float dady = mach->InterpCoefs[attrib].dady[chan];
2977   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2978
2979   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2980   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2981   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2982   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2983}
2984
2985/**
2986 * Evaluate a perspective-valued coefficient at the position of the
2987 * current quad.
2988 */
2989
2990static void
2991interp_perspective_offset(
2992   const struct tgsi_exec_machine *mach,
2993   unsigned attrib,
2994   unsigned chan,
2995   float ofs_x,
2996   float ofs_y,
2997   union tgsi_exec_channel *out_chan)
2998{
2999   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
3000   const float dady = mach->InterpCoefs[attrib].dady[chan];
3001   const float *w = mach->QuadPos.xyzw[3].f;
3002   const float delta = ofs_x * dadx + ofs_y * dady;
3003   out_chan->f[0] += delta / w[0];
3004   out_chan->f[1] += delta / w[1];
3005   out_chan->f[2] += delta / w[2];
3006   out_chan->f[3] += delta / w[3];
3007}
3008
3009static void
3010eval_perspective_coef(
3011   struct tgsi_exec_machine *mach,
3012   unsigned attrib,
3013   unsigned chan )
3014{
3015   const float x = mach->QuadPos.xyzw[0].f[0];
3016   const float y = mach->QuadPos.xyzw[1].f[0];
3017   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
3018   const float dady = mach->InterpCoefs[attrib].dady[chan];
3019   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
3020   const float *w = mach->QuadPos.xyzw[3].f;
3021   /* divide by W here */
3022   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
3023   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
3024   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
3025   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
3026}
3027
3028
/**
 * Signature of the eval_*_coef() functions: evaluate channel 'chan' of
 * input attribute 'attrib' for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
3033
3034static void
3035exec_declaration(struct tgsi_exec_machine *mach,
3036                 const struct tgsi_full_declaration *decl)
3037{
3038   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
3039      mach->SamplerViews[decl->Range.First] = decl->SamplerView;
3040      return;
3041   }
3042
3043   if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
3044      if (decl->Declaration.File == TGSI_FILE_INPUT) {
3045         uint first, last, mask;
3046
3047         first = decl->Range.First;
3048         last = decl->Range.Last;
3049         mask = decl->Declaration.UsageMask;
3050
3051         /* XXX we could remove this special-case code since
3052          * mach->InterpCoefs[first].a0 should already have the
3053          * front/back-face value.  But we should first update the
3054          * ureg code to emit the right UsageMask value (WRITEMASK_X).
3055          * Then, we could remove the tgsi_exec_machine::Face field.
3056          */
3057         /* XXX make FACE a system value */
3058         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
3059            uint i;
3060
3061            assert(decl->Semantic.Index == 0);
3062            assert(first == last);
3063
3064            for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3065               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
3066            }
3067         } else {
3068            eval_coef_func eval;
3069            apply_sample_offset_func interp;
3070            uint i, j;
3071
3072            switch (decl->Interp.Interpolate) {
3073            case TGSI_INTERPOLATE_CONSTANT:
3074               eval = eval_constant_coef;
3075               interp = interp_constant_offset;
3076               break;
3077
3078            case TGSI_INTERPOLATE_LINEAR:
3079               eval = eval_linear_coef;
3080               interp = interp_linear_offset;
3081               break;
3082
3083            case TGSI_INTERPOLATE_PERSPECTIVE:
3084               eval = eval_perspective_coef;
3085               interp = interp_perspective_offset;
3086               break;
3087
3088            case TGSI_INTERPOLATE_COLOR:
3089               eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
3090               break;
3091
3092            default:
3093               assert(0);
3094               return;
3095            }
3096
3097            for (i = first; i <= last; i++)
3098               mach->InputSampleOffsetApply[i] = interp;
3099
3100            for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3101               if (mask & (1 << j)) {
3102                  for (i = first; i <= last; i++) {
3103                     eval(mach, i, j);
3104                  }
3105               }
3106            }
3107         }
3108
3109         if (DEBUG_EXECUTION) {
3110            uint i, j;
3111            for (i = first; i <= last; ++i) {
3112               debug_printf("IN[%2u] = ", i);
3113               for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
3114                  if (j > 0) {
3115                     debug_printf("         ");
3116                  }
3117                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
3118                               mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
3119                               mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
3120                               mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
3121                               mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
3122               }
3123            }
3124         }
3125      }
3126   }
3127
3128}
3129
/** Per-channel operation with one source operand. */
typedef void (*micro_unary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src);
3132
3133static void
3134exec_scalar_unary(struct tgsi_exec_machine *mach,
3135                  const struct tgsi_full_instruction *inst,
3136                  micro_unary_op op,
3137                  enum tgsi_exec_datatype dst_datatype,
3138                  enum tgsi_exec_datatype src_datatype)
3139{
3140   unsigned int chan;
3141   union tgsi_exec_channel src;
3142   union tgsi_exec_channel dst;
3143
3144   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
3145   op(&dst, &src);
3146   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3147      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3148         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3149      }
3150   }
3151}
3152
3153static void
3154exec_vector_unary(struct tgsi_exec_machine *mach,
3155                  const struct tgsi_full_instruction *inst,
3156                  micro_unary_op op,
3157                  enum tgsi_exec_datatype dst_datatype,
3158                  enum tgsi_exec_datatype src_datatype)
3159{
3160   unsigned int chan;
3161   struct tgsi_exec_vector dst;
3162
3163   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3164      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3165         union tgsi_exec_channel src;
3166
3167         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
3168         op(&dst.xyzw[chan], &src);
3169      }
3170   }
3171   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3172      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3173         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3174      }
3175   }
3176}
3177
/** Per-channel operation with two source operands. */
typedef void (*micro_binary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1);
3181
3182static void
3183exec_scalar_binary(struct tgsi_exec_machine *mach,
3184                   const struct tgsi_full_instruction *inst,
3185                   micro_binary_op op,
3186                   enum tgsi_exec_datatype dst_datatype,
3187                   enum tgsi_exec_datatype src_datatype)
3188{
3189   unsigned int chan;
3190   union tgsi_exec_channel src[2];
3191   union tgsi_exec_channel dst;
3192
3193   fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
3194   fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
3195   op(&dst, &src[0], &src[1]);
3196   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3197      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3198         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
3199      }
3200   }
3201}
3202
3203static void
3204exec_vector_binary(struct tgsi_exec_machine *mach,
3205                   const struct tgsi_full_instruction *inst,
3206                   micro_binary_op op,
3207                   enum tgsi_exec_datatype dst_datatype,
3208                   enum tgsi_exec_datatype src_datatype)
3209{
3210   unsigned int chan;
3211   struct tgsi_exec_vector dst;
3212
3213   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3214      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3215         union tgsi_exec_channel src[2];
3216
3217         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3218         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3219         op(&dst.xyzw[chan], &src[0], &src[1]);
3220      }
3221   }
3222   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3223      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3224         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3225      }
3226   }
3227}
3228
/** Per-channel operation with three source operands. */
typedef void (*micro_trinary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2);
3233
3234static void
3235exec_vector_trinary(struct tgsi_exec_machine *mach,
3236                    const struct tgsi_full_instruction *inst,
3237                    micro_trinary_op op,
3238                    enum tgsi_exec_datatype dst_datatype,
3239                    enum tgsi_exec_datatype src_datatype)
3240{
3241   unsigned int chan;
3242   struct tgsi_exec_vector dst;
3243
3244   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3245      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3246         union tgsi_exec_channel src[3];
3247
3248         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3249         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3250         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3251         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3252      }
3253   }
3254   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3255      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3256         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3257      }
3258   }
3259}
3260
/** Per-channel operation with four source operands. */
typedef void (*micro_quaternary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3);
3266
3267static void
3268exec_vector_quaternary(struct tgsi_exec_machine *mach,
3269                       const struct tgsi_full_instruction *inst,
3270                       micro_quaternary_op op,
3271                       enum tgsi_exec_datatype dst_datatype,
3272                       enum tgsi_exec_datatype src_datatype)
3273{
3274   unsigned int chan;
3275   struct tgsi_exec_vector dst;
3276
3277   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3278      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3279         union tgsi_exec_channel src[4];
3280
3281         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3282         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3283         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3284         fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
3285         op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
3286      }
3287   }
3288   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3289      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3290         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
3291      }
3292   }
3293}
3294
3295static void
3296exec_dp3(struct tgsi_exec_machine *mach,
3297         const struct tgsi_full_instruction *inst)
3298{
3299   unsigned int chan;
3300   union tgsi_exec_channel arg[3];
3301
3302   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3303   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3304   micro_mul(&arg[2], &arg[0], &arg[1]);
3305
3306   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
3307      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3308      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3309      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3310   }
3311
3312   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3313      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3314         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3315      }
3316   }
3317}
3318
3319static void
3320exec_dp4(struct tgsi_exec_machine *mach,
3321         const struct tgsi_full_instruction *inst)
3322{
3323   unsigned int chan;
3324   union tgsi_exec_channel arg[3];
3325
3326   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3327   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3328   micro_mul(&arg[2], &arg[0], &arg[1]);
3329
3330   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
3331      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3332      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3333      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3334   }
3335
3336   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3337      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3338         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3339      }
3340   }
3341}
3342
3343static void
3344exec_dp2(struct tgsi_exec_machine *mach,
3345         const struct tgsi_full_instruction *inst)
3346{
3347   unsigned int chan;
3348   union tgsi_exec_channel arg[3];
3349
3350   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3351   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3352   micro_mul(&arg[2], &arg[0], &arg[1]);
3353
3354   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3355   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3356   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3357
3358   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3359      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3360         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3361      }
3362   }
3363}
3364
3365static void
3366exec_pk2h(struct tgsi_exec_machine *mach,
3367          const struct tgsi_full_instruction *inst)
3368{
3369   unsigned chan;
3370   union tgsi_exec_channel arg[2], dst;
3371
3372   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3373   fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3374   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3375      dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
3376         (util_float_to_half(arg[1].f[chan]) << 16);
3377   }
3378   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3379      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3380         store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
3381      }
3382   }
3383}
3384
3385static void
3386exec_up2h(struct tgsi_exec_machine *mach,
3387          const struct tgsi_full_instruction *inst)
3388{
3389   unsigned chan;
3390   union tgsi_exec_channel arg, dst[2];
3391
3392   fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3393   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3394      dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
3395      dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
3396   }
3397   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3398      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3399         store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3400      }
3401   }
3402}
3403
3404static void
3405micro_ucmp(union tgsi_exec_channel *dst,
3406           const union tgsi_exec_channel *src0,
3407           const union tgsi_exec_channel *src1,
3408           const union tgsi_exec_channel *src2)
3409{
3410   dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
3411   dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
3412   dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
3413   dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
3414}
3415
3416static void
3417exec_ucmp(struct tgsi_exec_machine *mach,
3418          const struct tgsi_full_instruction *inst)
3419{
3420   unsigned int chan;
3421   struct tgsi_exec_vector dst;
3422
3423   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3424      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3425         union tgsi_exec_channel src[3];
3426
3427         fetch_source(mach, &src[0], &inst->Src[0], chan,
3428                      TGSI_EXEC_DATA_UINT);
3429         fetch_source(mach, &src[1], &inst->Src[1], chan,
3430                      TGSI_EXEC_DATA_FLOAT);
3431         fetch_source(mach, &src[2], &inst->Src[2], chan,
3432                      TGSI_EXEC_DATA_FLOAT);
3433         micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3434      }
3435   }
3436   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3437      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3438         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan,
3439                    TGSI_EXEC_DATA_FLOAT);
3440      }
3441   }
3442}
3443
3444static void
3445exec_dst(struct tgsi_exec_machine *mach,
3446         const struct tgsi_full_instruction *inst)
3447{
3448   union tgsi_exec_channel r[2];
3449   union tgsi_exec_channel d[4];
3450
3451   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3452      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3453      fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3454      micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3455   }
3456   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3457      fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3458   }
3459   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3460      fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3461   }
3462
3463   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3464      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3465   }
3466   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3467      store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3468   }
3469   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3470      store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3471   }
3472   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3473      store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3474   }
3475}
3476
3477static void
3478exec_log(struct tgsi_exec_machine *mach,
3479         const struct tgsi_full_instruction *inst)
3480{
3481   union tgsi_exec_channel r[3];
3482
3483   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3484   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3485   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3486   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3487   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3488      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3489   }
3490   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3491      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3492      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3493      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3494   }
3495   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3496      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3497   }
3498   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3499      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3500   }
3501}
3502
3503static void
3504exec_exp(struct tgsi_exec_machine *mach,
3505         const struct tgsi_full_instruction *inst)
3506{
3507   union tgsi_exec_channel r[3];
3508
3509   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3510   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3511   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3512      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3513      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3514   }
3515   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3516      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3517      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3518   }
3519   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3520      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3521      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3522   }
3523   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3524      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3525   }
3526}
3527
3528static void
3529exec_lit(struct tgsi_exec_machine *mach,
3530         const struct tgsi_full_instruction *inst)
3531{
3532   union tgsi_exec_channel r[3];
3533   union tgsi_exec_channel d[3];
3534
3535   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3536      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3537      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3538         fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3539         micro_max(&r[1], &r[1], &ZeroVec);
3540
3541         fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3542         micro_min(&r[2], &r[2], &P128Vec);
3543         micro_max(&r[2], &r[2], &M128Vec);
3544         micro_pow(&r[1], &r[1], &r[2]);
3545         micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3546         store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3547      }
3548      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3549         micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3550         store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3551      }
3552   }
3553   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3554      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3555   }
3556
3557   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3558      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3559   }
3560}
3561
3562static void
3563exec_break(struct tgsi_exec_machine *mach)
3564{
3565   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3566      /* turn off loop channels for each enabled exec channel */
3567      mach->LoopMask &= ~mach->ExecMask;
3568      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3569      UPDATE_EXEC_MASK(mach);
3570   } else {
3571      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3572
3573      mach->Switch.mask = 0x0;
3574
3575      UPDATE_EXEC_MASK(mach);
3576   }
3577}
3578
3579static void
3580exec_switch(struct tgsi_exec_machine *mach,
3581            const struct tgsi_full_instruction *inst)
3582{
3583   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3584   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3585
3586   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3587   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3588   mach->Switch.mask = 0x0;
3589   mach->Switch.defaultMask = 0x0;
3590
3591   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3592   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3593
3594   UPDATE_EXEC_MASK(mach);
3595}
3596
3597static void
3598exec_case(struct tgsi_exec_machine *mach,
3599          const struct tgsi_full_instruction *inst)
3600{
3601   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3602   union tgsi_exec_channel src;
3603   uint mask = 0;
3604
3605   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3606
3607   if (mach->Switch.selector.u[0] == src.u[0]) {
3608      mask |= 0x1;
3609   }
3610   if (mach->Switch.selector.u[1] == src.u[1]) {
3611      mask |= 0x2;
3612   }
3613   if (mach->Switch.selector.u[2] == src.u[2]) {
3614      mask |= 0x4;
3615   }
3616   if (mach->Switch.selector.u[3] == src.u[3]) {
3617      mask |= 0x8;
3618   }
3619
3620   mach->Switch.defaultMask |= mask;
3621
3622   mach->Switch.mask |= mask & prevMask;
3623
3624   UPDATE_EXEC_MASK(mach);
3625}
3626
3627/* FIXME: this will only work if default is last */
3628static void
3629exec_default(struct tgsi_exec_machine *mach)
3630{
3631   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3632
3633   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3634
3635   UPDATE_EXEC_MASK(mach);
3636}
3637
3638static void
3639exec_endswitch(struct tgsi_exec_machine *mach)
3640{
3641   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3642   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3643
3644   UPDATE_EXEC_MASK(mach);
3645}
3646
/*
 * Function pointer types for micro-ops that involve 64-bit channels.
 */

/* 64-bit result from 64-bit source(s); callers may pass an array of sources */
typedef void (* micro_dop)(union tgsi_double_channel *result,
                           const union tgsi_double_channel *args);

/* 64-bit result from a 64-bit first source and a 32-bit second source */
typedef void (* micro_dop_sop)(union tgsi_double_channel *result,
                               const union tgsi_double_channel *arg0,
                               union tgsi_exec_channel *arg1);

/* 64-bit result from a 32-bit source */
typedef void (* micro_dop_s)(union tgsi_double_channel *result,
                             const union tgsi_exec_channel *arg);

/* 32-bit result from a 64-bit source */
typedef void (* micro_sop_d)(union tgsi_exec_channel *result,
                             const union tgsi_double_channel *arg);
3659
3660static void
3661fetch_double_channel(struct tgsi_exec_machine *mach,
3662                     union tgsi_double_channel *chan,
3663                     const struct tgsi_full_src_register *reg,
3664                     uint chan_0,
3665                     uint chan_1)
3666{
3667   union tgsi_exec_channel src[2];
3668   uint i;
3669
3670   fetch_source_d(mach, &src[0], reg, chan_0);
3671   fetch_source_d(mach, &src[1], reg, chan_1);
3672
3673   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3674      chan->u[i][0] = src[0].u[i];
3675      chan->u[i][1] = src[1].u[i];
3676   }
3677   if (reg->Register.Absolute) {
3678      micro_dabs(chan, chan);
3679   }
3680   if (reg->Register.Negate) {
3681      micro_dneg(chan, chan);
3682   }
3683}
3684
3685static void
3686store_double_channel(struct tgsi_exec_machine *mach,
3687                     const union tgsi_double_channel *chan,
3688                     const struct tgsi_full_dst_register *reg,
3689                     const struct tgsi_full_instruction *inst,
3690                     uint chan_0,
3691                     uint chan_1)
3692{
3693   union tgsi_exec_channel dst[2];
3694   uint i;
3695   union tgsi_double_channel temp;
3696   const uint execmask = mach->ExecMask;
3697
3698   if (!inst->Instruction.Saturate) {
3699      for (i = 0; i < TGSI_QUAD_SIZE; i++)
3700         if (execmask & (1 << i)) {
3701            dst[0].u[i] = chan->u[i][0];
3702            dst[1].u[i] = chan->u[i][1];
3703         }
3704   }
3705   else {
3706      for (i = 0; i < TGSI_QUAD_SIZE; i++)
3707         if (execmask & (1 << i)) {
3708            if (chan->d[i] < 0.0)
3709               temp.d[i] = 0.0;
3710            else if (chan->d[i] > 1.0)
3711               temp.d[i] = 1.0;
3712            else
3713               temp.d[i] = chan->d[i];
3714
3715            dst[0].u[i] = temp.u[i][0];
3716            dst[1].u[i] = temp.u[i][1];
3717         }
3718   }
3719
3720   store_dest_double(mach, &dst[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
3721   if (chan_1 != (unsigned)-1)
3722      store_dest_double(mach, &dst[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
3723}
3724
3725static void
3726exec_double_unary(struct tgsi_exec_machine *mach,
3727                  const struct tgsi_full_instruction *inst,
3728                  micro_dop op)
3729{
3730   union tgsi_double_channel src;
3731   union tgsi_double_channel dst;
3732
3733   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3734      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3735      op(&dst, &src);
3736      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3737   }
3738   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3739      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3740      op(&dst, &src);
3741      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3742   }
3743}
3744
3745static void
3746exec_double_binary(struct tgsi_exec_machine *mach,
3747                   const struct tgsi_full_instruction *inst,
3748                   micro_dop op,
3749                   enum tgsi_exec_datatype dst_datatype)
3750{
3751   union tgsi_double_channel src[2];
3752   union tgsi_double_channel dst;
3753   int first_dest_chan, second_dest_chan;
3754   int wmask;
3755
3756   wmask = inst->Dst[0].Register.WriteMask;
3757   /* these are & because of the way DSLT etc store their destinations */
3758   if (wmask & TGSI_WRITEMASK_XY) {
3759      first_dest_chan = TGSI_CHAN_X;
3760      second_dest_chan = TGSI_CHAN_Y;
3761      if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3762         first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3763         second_dest_chan = -1;
3764      }
3765
3766      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3767      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3768      op(&dst, src);
3769      store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3770   }
3771
3772   if (wmask & TGSI_WRITEMASK_ZW) {
3773      first_dest_chan = TGSI_CHAN_Z;
3774      second_dest_chan = TGSI_CHAN_W;
3775      if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3776         first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3777         second_dest_chan = -1;
3778      }
3779
3780      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3781      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3782      op(&dst, src);
3783      store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3784   }
3785}
3786
3787static void
3788exec_double_trinary(struct tgsi_exec_machine *mach,
3789                    const struct tgsi_full_instruction *inst,
3790                    micro_dop op)
3791{
3792   union tgsi_double_channel src[3];
3793   union tgsi_double_channel dst;
3794
3795   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3796      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3797      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3798      fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3799      op(&dst, src);
3800      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3801   }
3802   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3803      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3804      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3805      fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3806      op(&dst, src);
3807      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3808   }
3809}
3810
3811static void
3812exec_dldexp(struct tgsi_exec_machine *mach,
3813            const struct tgsi_full_instruction *inst)
3814{
3815   union tgsi_double_channel src0;
3816   union tgsi_exec_channel src1;
3817   union tgsi_double_channel dst;
3818   int wmask;
3819
3820   wmask = inst->Dst[0].Register.WriteMask;
3821   if (wmask & TGSI_WRITEMASK_XY) {
3822      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3823      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3824      micro_dldexp(&dst, &src0, &src1);
3825      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3826   }
3827
3828   if (wmask & TGSI_WRITEMASK_ZW) {
3829      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3830      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3831      micro_dldexp(&dst, &src0, &src1);
3832      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3833   }
3834}
3835
3836static void
3837exec_dfracexp(struct tgsi_exec_machine *mach,
3838              const struct tgsi_full_instruction *inst)
3839{
3840   union tgsi_double_channel src;
3841   union tgsi_double_channel dst;
3842   union tgsi_exec_channel dst_exp;
3843
3844   fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3845   micro_dfracexp(&dst, &dst_exp, &src);
3846   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
3847      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3848   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
3849      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3850   for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3851      if (inst->Dst[1].Register.WriteMask & (1 << chan))
3852         store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan, TGSI_EXEC_DATA_INT);
3853   }
3854}
3855
3856static void
3857exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
3858            const struct tgsi_full_instruction *inst,
3859            micro_dop_sop op)
3860{
3861   union tgsi_double_channel src0;
3862   union tgsi_exec_channel src1;
3863   union tgsi_double_channel dst;
3864   int wmask;
3865
3866   wmask = inst->Dst[0].Register.WriteMask;
3867   if (wmask & TGSI_WRITEMASK_XY) {
3868      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3869      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3870      op(&dst, &src0, &src1);
3871      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3872   }
3873
3874   if (wmask & TGSI_WRITEMASK_ZW) {
3875      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3876      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3877      op(&dst, &src0, &src1);
3878      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3879   }
3880}
3881
3882static int
3883get_image_coord_dim(unsigned tgsi_tex)
3884{
3885   int dim;
3886   switch (tgsi_tex) {
3887   case TGSI_TEXTURE_BUFFER:
3888   case TGSI_TEXTURE_1D:
3889      dim = 1;
3890      break;
3891   case TGSI_TEXTURE_2D:
3892   case TGSI_TEXTURE_RECT:
3893   case TGSI_TEXTURE_1D_ARRAY:
3894   case TGSI_TEXTURE_2D_MSAA:
3895      dim = 2;
3896      break;
3897   case TGSI_TEXTURE_3D:
3898   case TGSI_TEXTURE_CUBE:
3899   case TGSI_TEXTURE_2D_ARRAY:
3900   case TGSI_TEXTURE_2D_ARRAY_MSAA:
3901   case TGSI_TEXTURE_CUBE_ARRAY:
3902      dim = 3;
3903      break;
3904   default:
3905      assert(!"unknown texture target");
3906      dim = 0;
3907      break;
3908   }
3909
3910   return dim;
3911}
3912
3913static int
3914get_image_coord_sample(unsigned tgsi_tex)
3915{
3916   int sample = 0;
3917   switch (tgsi_tex) {
3918   case TGSI_TEXTURE_2D_MSAA:
3919      sample = 3;
3920      break;
3921   case TGSI_TEXTURE_2D_ARRAY_MSAA:
3922      sample = 4;
3923      break;
3924   default:
3925      break;
3926   }
3927   return sample;
3928}
3929
3930static void
3931exec_load_img(struct tgsi_exec_machine *mach,
3932              const struct tgsi_full_instruction *inst)
3933{
3934   union tgsi_exec_channel r[4], sample_r;
3935   uint unit;
3936   int sample;
3937   int i, j;
3938   int dim;
3939   uint chan;
3940   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3941   struct tgsi_image_params params;
3942   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3943
3944   unit = fetch_sampler_unit(mach, inst, 0);
3945   dim = get_image_coord_dim(inst->Memory.Texture);
3946   sample = get_image_coord_sample(inst->Memory.Texture);
3947   assert(dim <= 3);
3948
3949   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3950   params.unit = unit;
3951   params.tgsi_tex_instr = inst->Memory.Texture;
3952   params.format = inst->Memory.Format;
3953
3954   for (i = 0; i < dim; i++) {
3955      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3956   }
3957
3958   if (sample)
3959      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3960
3961   mach->Image->load(mach->Image, &params,
3962                     r[0].i, r[1].i, r[2].i, sample_r.i,
3963                     rgba);
3964   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3965      r[0].f[j] = rgba[0][j];
3966      r[1].f[j] = rgba[1][j];
3967      r[2].f[j] = rgba[2][j];
3968      r[3].f[j] = rgba[3][j];
3969   }
3970   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3971      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3972         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
3973      }
3974   }
3975}
3976
3977static void
3978exec_load_buf(struct tgsi_exec_machine *mach,
3979              const struct tgsi_full_instruction *inst)
3980{
3981   union tgsi_exec_channel r[4];
3982   uint unit;
3983   int j;
3984   uint chan;
3985   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3986   struct tgsi_buffer_params params;
3987   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
3988
3989   unit = fetch_sampler_unit(mach, inst, 0);
3990
3991   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
3992   params.unit = unit;
3993   IFETCH(&r[0], 1, TGSI_CHAN_X);
3994
3995   mach->Buffer->load(mach->Buffer, &params,
3996                      r[0].i, rgba);
3997   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3998      r[0].f[j] = rgba[0][j];
3999      r[1].f[j] = rgba[1][j];
4000      r[2].f[j] = rgba[2][j];
4001      r[3].f[j] = rgba[3][j];
4002   }
4003   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4004      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4005         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4006      }
4007   }
4008}
4009
4010static void
4011exec_load_mem(struct tgsi_exec_machine *mach,
4012              const struct tgsi_full_instruction *inst)
4013{
4014   union tgsi_exec_channel r[4];
4015   uint chan;
4016   char *ptr = mach->LocalMem;
4017   uint32_t offset;
4018   int j;
4019
4020   IFETCH(&r[0], 1, TGSI_CHAN_X);
4021   if (r[0].u[0] >= mach->LocalMemSize)
4022      return;
4023
4024   offset = r[0].u[0];
4025   ptr += offset;
4026
4027   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4028      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4029         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4030            memcpy(&r[chan].u[j], ptr + (4 * chan), 4);
4031         }
4032      }
4033   }
4034
4035   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4036      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4037         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4038      }
4039   }
4040}
4041
4042static void
4043exec_load(struct tgsi_exec_machine *mach,
4044          const struct tgsi_full_instruction *inst)
4045{
4046   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4047      exec_load_img(mach, inst);
4048   else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4049      exec_load_buf(mach, inst);
4050   else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4051      exec_load_mem(mach, inst);
4052}
4053
4054static uint
4055fetch_store_img_unit(struct tgsi_exec_machine *mach,
4056                     const struct tgsi_full_dst_register *dst)
4057{
4058   uint unit = 0;
4059   int i;
4060   if (dst->Register.Indirect) {
4061      union tgsi_exec_channel indir_index, index2;
4062      const uint execmask = mach->ExecMask;
4063      index2.i[0] =
4064      index2.i[1] =
4065      index2.i[2] =
4066      index2.i[3] = dst->Indirect.Index;
4067
4068      fetch_src_file_channel(mach,
4069                             dst->Indirect.File,
4070                             dst->Indirect.Swizzle,
4071                             &index2,
4072                             &ZeroVec,
4073                             &indir_index);
4074      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4075         if (execmask & (1 << i)) {
4076            unit = dst->Register.Index + indir_index.i[i];
4077            break;
4078         }
4079      }
4080   } else {
4081      unit = dst->Register.Index;
4082   }
4083   return unit;
4084}
4085
4086static void
4087exec_store_img(struct tgsi_exec_machine *mach,
4088               const struct tgsi_full_instruction *inst)
4089{
4090   union tgsi_exec_channel r[3], sample_r;
4091   union tgsi_exec_channel value[4];
4092   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4093   struct tgsi_image_params params;
4094   int dim;
4095   int sample;
4096   int i, j;
4097   uint unit;
4098   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4099   unit = fetch_store_img_unit(mach, &inst->Dst[0]);
4100   dim = get_image_coord_dim(inst->Memory.Texture);
4101   sample = get_image_coord_sample(inst->Memory.Texture);
4102   assert(dim <= 3);
4103
4104   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4105   params.unit = unit;
4106   params.tgsi_tex_instr = inst->Memory.Texture;
4107   params.format = inst->Memory.Format;
4108
4109   for (i = 0; i < dim; i++) {
4110      IFETCH(&r[i], 0, TGSI_CHAN_X + i);
4111   }
4112
4113   for (i = 0; i < 4; i++) {
4114      FETCH(&value[i], 1, TGSI_CHAN_X + i);
4115   }
4116   if (sample)
4117      IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
4118
4119   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4120      rgba[0][j] = value[0].f[j];
4121      rgba[1][j] = value[1].f[j];
4122      rgba[2][j] = value[2].f[j];
4123      rgba[3][j] = value[3].f[j];
4124   }
4125
4126   mach->Image->store(mach->Image, &params,
4127                      r[0].i, r[1].i, r[2].i, sample_r.i,
4128                      rgba);
4129}
4130
4131static void
4132exec_store_buf(struct tgsi_exec_machine *mach,
4133               const struct tgsi_full_instruction *inst)
4134{
4135   union tgsi_exec_channel r[3];
4136   union tgsi_exec_channel value[4];
4137   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4138   struct tgsi_buffer_params params;
4139   int i, j;
4140   uint unit;
4141   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4142
4143   unit = fetch_store_img_unit(mach, &inst->Dst[0]);
4144
4145   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4146   params.unit = unit;
4147   params.writemask = inst->Dst[0].Register.WriteMask;
4148
4149   IFETCH(&r[0], 0, TGSI_CHAN_X);
4150   for (i = 0; i < 4; i++) {
4151      FETCH(&value[i], 1, TGSI_CHAN_X + i);
4152   }
4153
4154   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4155      rgba[0][j] = value[0].f[j];
4156      rgba[1][j] = value[1].f[j];
4157      rgba[2][j] = value[2].f[j];
4158      rgba[3][j] = value[3].f[j];
4159   }
4160
4161   mach->Buffer->store(mach->Buffer, &params,
4162                      r[0].i,
4163                      rgba);
4164}
4165
4166static void
4167exec_store_mem(struct tgsi_exec_machine *mach,
4168               const struct tgsi_full_instruction *inst)
4169{
4170   union tgsi_exec_channel r[3];
4171   union tgsi_exec_channel value[4];
4172   uint i, chan;
4173   char *ptr = mach->LocalMem;
4174   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4175   int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4176
4177   IFETCH(&r[0], 0, TGSI_CHAN_X);
4178
4179   for (i = 0; i < 4; i++) {
4180      FETCH(&value[i], 1, TGSI_CHAN_X + i);
4181   }
4182
4183   if (r[0].u[0] >= mach->LocalMemSize)
4184      return;
4185   ptr += r[0].u[0];
4186
4187   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4188      if (execmask & (1 << i)) {
4189         for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4190            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4191               memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
4192            }
4193         }
4194      }
4195   }
4196}
4197
4198static void
4199exec_store(struct tgsi_exec_machine *mach,
4200           const struct tgsi_full_instruction *inst)
4201{
4202   if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
4203      exec_store_img(mach, inst);
4204   else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
4205      exec_store_buf(mach, inst);
4206   else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
4207      exec_store_mem(mach, inst);
4208}
4209
4210static void
4211exec_atomop_img(struct tgsi_exec_machine *mach,
4212                const struct tgsi_full_instruction *inst)
4213{
4214   union tgsi_exec_channel r[4], sample_r;
4215   union tgsi_exec_channel value[4], value2[4];
4216   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4217   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4218   struct tgsi_image_params params;
4219   int dim;
4220   int sample;
4221   int i, j;
4222   uint unit, chan;
4223   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4224   unit = fetch_sampler_unit(mach, inst, 0);
4225   dim = get_image_coord_dim(inst->Memory.Texture);
4226   sample = get_image_coord_sample(inst->Memory.Texture);
4227   assert(dim <= 3);
4228
4229   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4230   params.unit = unit;
4231   params.tgsi_tex_instr = inst->Memory.Texture;
4232   params.format = inst->Memory.Format;
4233
4234   for (i = 0; i < dim; i++) {
4235      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
4236   }
4237
4238   for (i = 0; i < 4; i++) {
4239      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4240      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4241         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4242   }
4243   if (sample)
4244      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
4245
4246   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4247      rgba[0][j] = value[0].f[j];
4248      rgba[1][j] = value[1].f[j];
4249      rgba[2][j] = value[2].f[j];
4250      rgba[3][j] = value[3].f[j];
4251   }
4252   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4253      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4254         rgba2[0][j] = value2[0].f[j];
4255         rgba2[1][j] = value2[1].f[j];
4256         rgba2[2][j] = value2[2].f[j];
4257         rgba2[3][j] = value2[3].f[j];
4258      }
4259   }
4260
4261   mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
4262                   r[0].i, r[1].i, r[2].i, sample_r.i,
4263                   rgba, rgba2);
4264
4265   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4266      r[0].f[j] = rgba[0][j];
4267      r[1].f[j] = rgba[1][j];
4268      r[2].f[j] = rgba[2][j];
4269      r[3].f[j] = rgba[3][j];
4270   }
4271   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4272      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4273         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4274      }
4275   }
4276}
4277
4278static void
4279exec_atomop_buf(struct tgsi_exec_machine *mach,
4280                const struct tgsi_full_instruction *inst)
4281{
4282   union tgsi_exec_channel r[4];
4283   union tgsi_exec_channel value[4], value2[4];
4284   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4285   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
4286   struct tgsi_buffer_params params;
4287   int i, j;
4288   uint unit, chan;
4289   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4290
4291   unit = fetch_sampler_unit(mach, inst, 0);
4292
4293   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4294   params.unit = unit;
4295   params.writemask = inst->Dst[0].Register.WriteMask;
4296
4297   IFETCH(&r[0], 1, TGSI_CHAN_X);
4298
4299   for (i = 0; i < 4; i++) {
4300      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4301      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4302         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4303   }
4304
4305   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4306      rgba[0][j] = value[0].f[j];
4307      rgba[1][j] = value[1].f[j];
4308      rgba[2][j] = value[2].f[j];
4309      rgba[3][j] = value[3].f[j];
4310   }
4311   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4312      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4313         rgba2[0][j] = value2[0].f[j];
4314         rgba2[1][j] = value2[1].f[j];
4315         rgba2[2][j] = value2[2].f[j];
4316         rgba2[3][j] = value2[3].f[j];
4317      }
4318   }
4319
4320   mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
4321                   r[0].i,
4322                   rgba, rgba2);
4323
4324   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4325      r[0].f[j] = rgba[0][j];
4326      r[1].f[j] = rgba[1][j];
4327      r[2].f[j] = rgba[2][j];
4328      r[3].f[j] = rgba[3][j];
4329   }
4330   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4331      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4332         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4333      }
4334   }
4335}
4336
4337static void
4338exec_atomop_mem(struct tgsi_exec_machine *mach,
4339                const struct tgsi_full_instruction *inst)
4340{
4341   union tgsi_exec_channel r[4];
4342   union tgsi_exec_channel value[4], value2[4];
4343   char *ptr = mach->LocalMem;
4344   uint32_t val;
4345   uint chan, i;
4346   uint32_t offset;
4347   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4348   int execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4349   IFETCH(&r[0], 1, TGSI_CHAN_X);
4350
4351   if (r[0].u[0] >= mach->LocalMemSize)
4352      return;
4353
4354   offset = r[0].u[0];
4355   ptr += offset;
4356   for (i = 0; i < 4; i++) {
4357      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4358      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4359         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4360   }
4361
4362   memcpy(&r[0].u[0], ptr, 4);
4363   val = r[0].u[0];
4364   switch (inst->Instruction.Opcode) {
4365   case TGSI_OPCODE_ATOMUADD:
4366      val += value[0].u[0];
4367      break;
4368   case TGSI_OPCODE_ATOMXOR:
4369      val ^= value[0].u[0];
4370      break;
4371   case TGSI_OPCODE_ATOMOR:
4372      val |= value[0].u[0];
4373      break;
4374   case TGSI_OPCODE_ATOMAND:
4375      val &= value[0].u[0];
4376      break;
4377   case TGSI_OPCODE_ATOMUMIN:
4378      val = MIN2(val, value[0].u[0]);
4379      break;
4380   case TGSI_OPCODE_ATOMUMAX:
4381      val = MAX2(val, value[0].u[0]);
4382      break;
4383   case TGSI_OPCODE_ATOMIMIN:
4384      val = MIN2(r[0].i[0], value[0].i[0]);
4385      break;
4386   case TGSI_OPCODE_ATOMIMAX:
4387      val = MAX2(r[0].i[0], value[0].i[0]);
4388      break;
4389   case TGSI_OPCODE_ATOMXCHG:
4390      val = value[0].i[0];
4391      break;
4392   case TGSI_OPCODE_ATOMCAS:
4393      if (val == value[0].u[0])
4394         val = value2[0].u[0];
4395      break;
4396   case TGSI_OPCODE_ATOMFADD:
4397      val = fui(r[0].f[0] + value[0].f[0]);
4398      break;
4399   default:
4400      break;
4401   }
4402   for (i = 0; i < TGSI_QUAD_SIZE; i++)
4403      if (execmask & (1 << i))
4404         memcpy(ptr, &val, 4);
4405
4406   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4407      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4408         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
4409      }
4410   }
4411}
4412
4413static void
4414exec_atomop(struct tgsi_exec_machine *mach,
4415            const struct tgsi_full_instruction *inst)
4416{
4417   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4418      exec_atomop_img(mach, inst);
4419   else if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
4420      exec_atomop_buf(mach, inst);
4421   else if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
4422      exec_atomop_mem(mach, inst);
4423}
4424
4425static void
4426exec_resq_img(struct tgsi_exec_machine *mach,
4427              const struct tgsi_full_instruction *inst)
4428{
4429   int result[4];
4430   union tgsi_exec_channel r[4];
4431   uint unit;
4432   int i, chan, j;
4433   struct tgsi_image_params params;
4434   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4435
4436   unit = fetch_sampler_unit(mach, inst, 0);
4437
4438   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4439   params.unit = unit;
4440   params.tgsi_tex_instr = inst->Memory.Texture;
4441   params.format = inst->Memory.Format;
4442
4443   mach->Image->get_dims(mach->Image, &params, result);
4444
4445   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4446      for (j = 0; j < 4; j++) {
4447         r[j].i[i] = result[j];
4448      }
4449   }
4450
4451   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4452      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4453         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4454                    TGSI_EXEC_DATA_INT);
4455      }
4456   }
4457}
4458
4459static void
4460exec_resq_buf(struct tgsi_exec_machine *mach,
4461              const struct tgsi_full_instruction *inst)
4462{
4463   int result;
4464   union tgsi_exec_channel r[4];
4465   uint unit;
4466   int i, chan;
4467   struct tgsi_buffer_params params;
4468   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
4469
4470   unit = fetch_sampler_unit(mach, inst, 0);
4471
4472   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
4473   params.unit = unit;
4474
4475   mach->Buffer->get_dims(mach->Buffer, &params, &result);
4476
4477   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4478      r[0].i[i] = result;
4479   }
4480
4481   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4482      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4483         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
4484                    TGSI_EXEC_DATA_INT);
4485      }
4486   }
4487}
4488
4489static void
4490exec_resq(struct tgsi_exec_machine *mach,
4491          const struct tgsi_full_instruction *inst)
4492{
4493   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4494      exec_resq_img(mach, inst);
4495   else
4496      exec_resq_buf(mach, inst);
4497}
4498
4499static void
4500micro_f2u64(union tgsi_double_channel *dst,
4501            const union tgsi_exec_channel *src)
4502{
4503   dst->u64[0] = (uint64_t)src->f[0];
4504   dst->u64[1] = (uint64_t)src->f[1];
4505   dst->u64[2] = (uint64_t)src->f[2];
4506   dst->u64[3] = (uint64_t)src->f[3];
4507}
4508
4509static void
4510micro_f2i64(union tgsi_double_channel *dst,
4511            const union tgsi_exec_channel *src)
4512{
4513   dst->i64[0] = (int64_t)src->f[0];
4514   dst->i64[1] = (int64_t)src->f[1];
4515   dst->i64[2] = (int64_t)src->f[2];
4516   dst->i64[3] = (int64_t)src->f[3];
4517}
4518
4519static void
4520micro_u2i64(union tgsi_double_channel *dst,
4521            const union tgsi_exec_channel *src)
4522{
4523   dst->u64[0] = (uint64_t)src->u[0];
4524   dst->u64[1] = (uint64_t)src->u[1];
4525   dst->u64[2] = (uint64_t)src->u[2];
4526   dst->u64[3] = (uint64_t)src->u[3];
4527}
4528
4529static void
4530micro_i2i64(union tgsi_double_channel *dst,
4531            const union tgsi_exec_channel *src)
4532{
4533   dst->i64[0] = (int64_t)src->i[0];
4534   dst->i64[1] = (int64_t)src->i[1];
4535   dst->i64[2] = (int64_t)src->i[2];
4536   dst->i64[3] = (int64_t)src->i[3];
4537}
4538
4539static void
4540micro_d2u64(union tgsi_double_channel *dst,
4541           const union tgsi_double_channel *src)
4542{
4543   dst->u64[0] = (uint64_t)src->d[0];
4544   dst->u64[1] = (uint64_t)src->d[1];
4545   dst->u64[2] = (uint64_t)src->d[2];
4546   dst->u64[3] = (uint64_t)src->d[3];
4547}
4548
4549static void
4550micro_d2i64(union tgsi_double_channel *dst,
4551           const union tgsi_double_channel *src)
4552{
4553   dst->i64[0] = (int64_t)src->d[0];
4554   dst->i64[1] = (int64_t)src->d[1];
4555   dst->i64[2] = (int64_t)src->d[2];
4556   dst->i64[3] = (int64_t)src->d[3];
4557}
4558
4559static void
4560micro_u642d(union tgsi_double_channel *dst,
4561           const union tgsi_double_channel *src)
4562{
4563   dst->d[0] = (double)src->u64[0];
4564   dst->d[1] = (double)src->u64[1];
4565   dst->d[2] = (double)src->u64[2];
4566   dst->d[3] = (double)src->u64[3];
4567}
4568
4569static void
4570micro_i642d(union tgsi_double_channel *dst,
4571           const union tgsi_double_channel *src)
4572{
4573   dst->d[0] = (double)src->i64[0];
4574   dst->d[1] = (double)src->i64[1];
4575   dst->d[2] = (double)src->i64[2];
4576   dst->d[3] = (double)src->i64[3];
4577}
4578
4579static void
4580micro_u642f(union tgsi_exec_channel *dst,
4581            const union tgsi_double_channel *src)
4582{
4583   dst->f[0] = (float)src->u64[0];
4584   dst->f[1] = (float)src->u64[1];
4585   dst->f[2] = (float)src->u64[2];
4586   dst->f[3] = (float)src->u64[3];
4587}
4588
4589static void
4590micro_i642f(union tgsi_exec_channel *dst,
4591            const union tgsi_double_channel *src)
4592{
4593   dst->f[0] = (float)src->i64[0];
4594   dst->f[1] = (float)src->i64[1];
4595   dst->f[2] = (float)src->i64[2];
4596   dst->f[3] = (float)src->i64[3];
4597}
4598
4599static void
4600exec_t_2_64(struct tgsi_exec_machine *mach,
4601          const struct tgsi_full_instruction *inst,
4602          micro_dop_s op,
4603          enum tgsi_exec_datatype src_datatype)
4604{
4605   union tgsi_exec_channel src;
4606   union tgsi_double_channel dst;
4607
4608   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
4609      fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
4610      op(&dst, &src);
4611      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
4612   }
4613   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
4614      fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
4615      op(&dst, &src);
4616      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
4617   }
4618}
4619
4620static void
4621exec_64_2_t(struct tgsi_exec_machine *mach,
4622            const struct tgsi_full_instruction *inst,
4623            micro_sop_d op,
4624            enum tgsi_exec_datatype dst_datatype)
4625{
4626   union tgsi_double_channel src;
4627   union tgsi_exec_channel dst;
4628   int wm = inst->Dst[0].Register.WriteMask;
4629   int i;
4630   int bit;
4631   for (i = 0; i < 2; i++) {
4632      bit = ffs(wm);
4633      if (bit) {
4634         wm &= ~(1 << (bit - 1));
4635         if (i == 0)
4636            fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
4637         else
4638            fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
4639         op(&dst, &src);
4640         store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1, dst_datatype);
4641      }
4642   }
4643}
4644
4645static void
4646micro_i2f(union tgsi_exec_channel *dst,
4647          const union tgsi_exec_channel *src)
4648{
4649   dst->f[0] = (float)src->i[0];
4650   dst->f[1] = (float)src->i[1];
4651   dst->f[2] = (float)src->i[2];
4652   dst->f[3] = (float)src->i[3];
4653}
4654
4655static void
4656micro_not(union tgsi_exec_channel *dst,
4657          const union tgsi_exec_channel *src)
4658{
4659   dst->u[0] = ~src->u[0];
4660   dst->u[1] = ~src->u[1];
4661   dst->u[2] = ~src->u[2];
4662   dst->u[3] = ~src->u[3];
4663}
4664
4665static void
4666micro_shl(union tgsi_exec_channel *dst,
4667          const union tgsi_exec_channel *src0,
4668          const union tgsi_exec_channel *src1)
4669{
4670   unsigned masked_count;
4671   masked_count = src1->u[0] & 0x1f;
4672   dst->u[0] = src0->u[0] << masked_count;
4673   masked_count = src1->u[1] & 0x1f;
4674   dst->u[1] = src0->u[1] << masked_count;
4675   masked_count = src1->u[2] & 0x1f;
4676   dst->u[2] = src0->u[2] << masked_count;
4677   masked_count = src1->u[3] & 0x1f;
4678   dst->u[3] = src0->u[3] << masked_count;
4679}
4680
4681static void
4682micro_and(union tgsi_exec_channel *dst,
4683          const union tgsi_exec_channel *src0,
4684          const union tgsi_exec_channel *src1)
4685{
4686   dst->u[0] = src0->u[0] & src1->u[0];
4687   dst->u[1] = src0->u[1] & src1->u[1];
4688   dst->u[2] = src0->u[2] & src1->u[2];
4689   dst->u[3] = src0->u[3] & src1->u[3];
4690}
4691
4692static void
4693micro_or(union tgsi_exec_channel *dst,
4694         const union tgsi_exec_channel *src0,
4695         const union tgsi_exec_channel *src1)
4696{
4697   dst->u[0] = src0->u[0] | src1->u[0];
4698   dst->u[1] = src0->u[1] | src1->u[1];
4699   dst->u[2] = src0->u[2] | src1->u[2];
4700   dst->u[3] = src0->u[3] | src1->u[3];
4701}
4702
4703static void
4704micro_xor(union tgsi_exec_channel *dst,
4705          const union tgsi_exec_channel *src0,
4706          const union tgsi_exec_channel *src1)
4707{
4708   dst->u[0] = src0->u[0] ^ src1->u[0];
4709   dst->u[1] = src0->u[1] ^ src1->u[1];
4710   dst->u[2] = src0->u[2] ^ src1->u[2];
4711   dst->u[3] = src0->u[3] ^ src1->u[3];
4712}
4713
4714static void
4715micro_mod(union tgsi_exec_channel *dst,
4716          const union tgsi_exec_channel *src0,
4717          const union tgsi_exec_channel *src1)
4718{
4719   dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
4720   dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
4721   dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
4722   dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
4723}
4724
4725static void
4726micro_f2i(union tgsi_exec_channel *dst,
4727          const union tgsi_exec_channel *src)
4728{
4729   dst->i[0] = (int)src->f[0];
4730   dst->i[1] = (int)src->f[1];
4731   dst->i[2] = (int)src->f[2];
4732   dst->i[3] = (int)src->f[3];
4733}
4734
4735static void
4736micro_fseq(union tgsi_exec_channel *dst,
4737           const union tgsi_exec_channel *src0,
4738           const union tgsi_exec_channel *src1)
4739{
4740   dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4741   dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4742   dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4743   dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4744}
4745
4746static void
4747micro_fsge(union tgsi_exec_channel *dst,
4748           const union tgsi_exec_channel *src0,
4749           const union tgsi_exec_channel *src1)
4750{
4751   dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4752   dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4753   dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4754   dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4755}
4756
4757static void
4758micro_fslt(union tgsi_exec_channel *dst,
4759           const union tgsi_exec_channel *src0,
4760           const union tgsi_exec_channel *src1)
4761{
4762   dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4763   dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4764   dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4765   dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4766}
4767
4768static void
4769micro_fsne(union tgsi_exec_channel *dst,
4770           const union tgsi_exec_channel *src0,
4771           const union tgsi_exec_channel *src1)
4772{
4773   dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4774   dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4775   dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4776   dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4777}
4778
4779static void
4780micro_idiv(union tgsi_exec_channel *dst,
4781           const union tgsi_exec_channel *src0,
4782           const union tgsi_exec_channel *src1)
4783{
4784   dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4785   dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4786   dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4787   dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4788}
4789
4790static void
4791micro_imax(union tgsi_exec_channel *dst,
4792           const union tgsi_exec_channel *src0,
4793           const union tgsi_exec_channel *src1)
4794{
4795   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4796   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4797   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4798   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4799}
4800
4801static void
4802micro_imin(union tgsi_exec_channel *dst,
4803           const union tgsi_exec_channel *src0,
4804           const union tgsi_exec_channel *src1)
4805{
4806   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4807   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4808   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4809   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4810}
4811
4812static void
4813micro_isge(union tgsi_exec_channel *dst,
4814           const union tgsi_exec_channel *src0,
4815           const union tgsi_exec_channel *src1)
4816{
4817   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4818   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4819   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4820   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4821}
4822
4823static void
4824micro_ishr(union tgsi_exec_channel *dst,
4825           const union tgsi_exec_channel *src0,
4826           const union tgsi_exec_channel *src1)
4827{
4828   unsigned masked_count;
4829   masked_count = src1->i[0] & 0x1f;
4830   dst->i[0] = src0->i[0] >> masked_count;
4831   masked_count = src1->i[1] & 0x1f;
4832   dst->i[1] = src0->i[1] >> masked_count;
4833   masked_count = src1->i[2] & 0x1f;
4834   dst->i[2] = src0->i[2] >> masked_count;
4835   masked_count = src1->i[3] & 0x1f;
4836   dst->i[3] = src0->i[3] >> masked_count;
4837}
4838
4839static void
4840micro_islt(union tgsi_exec_channel *dst,
4841           const union tgsi_exec_channel *src0,
4842           const union tgsi_exec_channel *src1)
4843{
4844   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4845   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4846   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4847   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4848}
4849
4850static void
4851micro_f2u(union tgsi_exec_channel *dst,
4852          const union tgsi_exec_channel *src)
4853{
4854   dst->u[0] = (uint)src->f[0];
4855   dst->u[1] = (uint)src->f[1];
4856   dst->u[2] = (uint)src->f[2];
4857   dst->u[3] = (uint)src->f[3];
4858}
4859
4860static void
4861micro_u2f(union tgsi_exec_channel *dst,
4862          const union tgsi_exec_channel *src)
4863{
4864   dst->f[0] = (float)src->u[0];
4865   dst->f[1] = (float)src->u[1];
4866   dst->f[2] = (float)src->u[2];
4867   dst->f[3] = (float)src->u[3];
4868}
4869
4870static void
4871micro_uadd(union tgsi_exec_channel *dst,
4872           const union tgsi_exec_channel *src0,
4873           const union tgsi_exec_channel *src1)
4874{
4875   dst->u[0] = src0->u[0] + src1->u[0];
4876   dst->u[1] = src0->u[1] + src1->u[1];
4877   dst->u[2] = src0->u[2] + src1->u[2];
4878   dst->u[3] = src0->u[3] + src1->u[3];
4879}
4880
4881static void
4882micro_udiv(union tgsi_exec_channel *dst,
4883           const union tgsi_exec_channel *src0,
4884           const union tgsi_exec_channel *src1)
4885{
4886   dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4887   dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4888   dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4889   dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4890}
4891
4892static void
4893micro_umad(union tgsi_exec_channel *dst,
4894           const union tgsi_exec_channel *src0,
4895           const union tgsi_exec_channel *src1,
4896           const union tgsi_exec_channel *src2)
4897{
4898   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4899   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4900   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4901   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4902}
4903
4904static void
4905micro_umax(union tgsi_exec_channel *dst,
4906           const union tgsi_exec_channel *src0,
4907           const union tgsi_exec_channel *src1)
4908{
4909   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4910   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4911   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4912   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4913}
4914
4915static void
4916micro_umin(union tgsi_exec_channel *dst,
4917           const union tgsi_exec_channel *src0,
4918           const union tgsi_exec_channel *src1)
4919{
4920   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4921   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4922   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4923   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4924}
4925
4926static void
4927micro_umod(union tgsi_exec_channel *dst,
4928           const union tgsi_exec_channel *src0,
4929           const union tgsi_exec_channel *src1)
4930{
4931   dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4932   dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4933   dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4934   dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4935}
4936
4937static void
4938micro_umul(union tgsi_exec_channel *dst,
4939           const union tgsi_exec_channel *src0,
4940           const union tgsi_exec_channel *src1)
4941{
4942   dst->u[0] = src0->u[0] * src1->u[0];
4943   dst->u[1] = src0->u[1] * src1->u[1];
4944   dst->u[2] = src0->u[2] * src1->u[2];
4945   dst->u[3] = src0->u[3] * src1->u[3];
4946}
4947
4948static void
4949micro_imul_hi(union tgsi_exec_channel *dst,
4950              const union tgsi_exec_channel *src0,
4951              const union tgsi_exec_channel *src1)
4952{
4953#define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4954   dst->i[0] = I64M(src0->i[0], src1->i[0]);
4955   dst->i[1] = I64M(src0->i[1], src1->i[1]);
4956   dst->i[2] = I64M(src0->i[2], src1->i[2]);
4957   dst->i[3] = I64M(src0->i[3], src1->i[3]);
4958#undef I64M
4959}
4960
4961static void
4962micro_umul_hi(union tgsi_exec_channel *dst,
4963              const union tgsi_exec_channel *src0,
4964              const union tgsi_exec_channel *src1)
4965{
4966#define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4967   dst->u[0] = U64M(src0->u[0], src1->u[0]);
4968   dst->u[1] = U64M(src0->u[1], src1->u[1]);
4969   dst->u[2] = U64M(src0->u[2], src1->u[2]);
4970   dst->u[3] = U64M(src0->u[3], src1->u[3]);
4971#undef U64M
4972}
4973
4974static void
4975micro_useq(union tgsi_exec_channel *dst,
4976           const union tgsi_exec_channel *src0,
4977           const union tgsi_exec_channel *src1)
4978{
4979   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4980   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4981   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4982   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4983}
4984
4985static void
4986micro_usge(union tgsi_exec_channel *dst,
4987           const union tgsi_exec_channel *src0,
4988           const union tgsi_exec_channel *src1)
4989{
4990   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4991   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4992   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4993   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4994}
4995
4996static void
4997micro_ushr(union tgsi_exec_channel *dst,
4998           const union tgsi_exec_channel *src0,
4999           const union tgsi_exec_channel *src1)
5000{
5001   unsigned masked_count;
5002   masked_count = src1->u[0] & 0x1f;
5003   dst->u[0] = src0->u[0] >> masked_count;
5004   masked_count = src1->u[1] & 0x1f;
5005   dst->u[1] = src0->u[1] >> masked_count;
5006   masked_count = src1->u[2] & 0x1f;
5007   dst->u[2] = src0->u[2] >> masked_count;
5008   masked_count = src1->u[3] & 0x1f;
5009   dst->u[3] = src0->u[3] >> masked_count;
5010}
5011
5012static void
5013micro_uslt(union tgsi_exec_channel *dst,
5014           const union tgsi_exec_channel *src0,
5015           const union tgsi_exec_channel *src1)
5016{
5017   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
5018   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
5019   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
5020   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
5021}
5022
5023static void
5024micro_usne(union tgsi_exec_channel *dst,
5025           const union tgsi_exec_channel *src0,
5026           const union tgsi_exec_channel *src1)
5027{
5028   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
5029   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
5030   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
5031   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
5032}
5033
5034static void
5035micro_uarl(union tgsi_exec_channel *dst,
5036           const union tgsi_exec_channel *src)
5037{
5038   dst->i[0] = src->u[0];
5039   dst->i[1] = src->u[1];
5040   dst->i[2] = src->u[2];
5041   dst->i[3] = src->u[3];
5042}
5043
5044/**
5045 * Signed bitfield extract (i.e. sign-extend the extracted bits)
5046 */
5047static void
5048micro_ibfe(union tgsi_exec_channel *dst,
5049           const union tgsi_exec_channel *src0,
5050           const union tgsi_exec_channel *src1,
5051           const union tgsi_exec_channel *src2)
5052{
5053   int i;
5054   for (i = 0; i < 4; i++) {
5055      int width = src2->i[i];
5056      int offset = src1->i[i] & 0x1f;
5057      if (width == 32 && offset == 0) {
5058         dst->i[i] = src0->i[i];
5059         continue;
5060      }
5061      width &= 0x1f;
5062      if (width == 0)
5063         dst->i[i] = 0;
5064      else if (width + offset < 32)
5065         dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
5066      else
5067         dst->i[i] = src0->i[i] >> offset;
5068   }
5069}
5070
5071/**
5072 * Unsigned bitfield extract
5073 */
5074static void
5075micro_ubfe(union tgsi_exec_channel *dst,
5076           const union tgsi_exec_channel *src0,
5077           const union tgsi_exec_channel *src1,
5078           const union tgsi_exec_channel *src2)
5079{
5080   int i;
5081   for (i = 0; i < 4; i++) {
5082      int width = src2->u[i];
5083      int offset = src1->u[i] & 0x1f;
5084      if (width == 32 && offset == 0) {
5085         dst->u[i] = src0->u[i];
5086         continue;
5087      }
5088      width &= 0x1f;
5089      if (width == 0)
5090         dst->u[i] = 0;
5091      else if (width + offset < 32)
5092         dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
5093      else
5094         dst->u[i] = src0->u[i] >> offset;
5095   }
5096}
5097
5098/**
5099 * Bitfield insert: copy low bits from src1 into a region of src0.
5100 */
5101static void
5102micro_bfi(union tgsi_exec_channel *dst,
5103          const union tgsi_exec_channel *src0,
5104          const union tgsi_exec_channel *src1,
5105          const union tgsi_exec_channel *src2,
5106          const union tgsi_exec_channel *src3)
5107{
5108   int i;
5109   for (i = 0; i < 4; i++) {
5110      int width = src3->u[i];
5111      int offset = src2->u[i] & 0x1f;
5112      if (width == 32) {
5113         dst->u[i] = src1->u[i];
5114      } else {
5115         int bitmask = ((1 << width) - 1) << offset;
5116         dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
5117      }
5118   }
5119}
5120
5121static void
5122micro_brev(union tgsi_exec_channel *dst,
5123           const union tgsi_exec_channel *src)
5124{
5125   dst->u[0] = util_bitreverse(src->u[0]);
5126   dst->u[1] = util_bitreverse(src->u[1]);
5127   dst->u[2] = util_bitreverse(src->u[2]);
5128   dst->u[3] = util_bitreverse(src->u[3]);
5129}
5130
5131static void
5132micro_popc(union tgsi_exec_channel *dst,
5133           const union tgsi_exec_channel *src)
5134{
5135   dst->u[0] = util_bitcount(src->u[0]);
5136   dst->u[1] = util_bitcount(src->u[1]);
5137   dst->u[2] = util_bitcount(src->u[2]);
5138   dst->u[3] = util_bitcount(src->u[3]);
5139}
5140
5141static void
5142micro_lsb(union tgsi_exec_channel *dst,
5143          const union tgsi_exec_channel *src)
5144{
5145   dst->i[0] = ffs(src->u[0]) - 1;
5146   dst->i[1] = ffs(src->u[1]) - 1;
5147   dst->i[2] = ffs(src->u[2]) - 1;
5148   dst->i[3] = ffs(src->u[3]) - 1;
5149}
5150
5151static void
5152micro_imsb(union tgsi_exec_channel *dst,
5153           const union tgsi_exec_channel *src)
5154{
5155   dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
5156   dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
5157   dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
5158   dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
5159}
5160
5161static void
5162micro_umsb(union tgsi_exec_channel *dst,
5163           const union tgsi_exec_channel *src)
5164{
5165   dst->i[0] = util_last_bit(src->u[0]) - 1;
5166   dst->i[1] = util_last_bit(src->u[1]) - 1;
5167   dst->i[2] = util_last_bit(src->u[2]) - 1;
5168   dst->i[3] = util_last_bit(src->u[3]) - 1;
5169}
5170
5171
5172static void
5173exec_interp_at_sample(struct tgsi_exec_machine *mach,
5174                      const struct tgsi_full_instruction *inst)
5175{
5176   union tgsi_exec_channel index;
5177   union tgsi_exec_channel index2D;
5178   union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
5179   const struct tgsi_full_src_register *reg = &inst->Src[0];
5180
5181   assert(reg->Register.File == TGSI_FILE_INPUT);
5182   assert(inst->Src[1].Register.File == TGSI_FILE_IMMEDIATE);
5183
5184   get_index_registers(mach, reg, &index, &index2D);
5185   float sample = mach->Imms[inst->Src[1].Register.Index][inst->Src[1].Register.SwizzleX];
5186
5187   /* Short cut: sample 0 is like a normal fetch */
5188   for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5189      if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5190         continue;
5191
5192      fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
5193                             &result[chan]);
5194      if (sample != 0.0f) {
5195
5196      /* TODO: define the samples > 0, but so far we only do fake MSAA */
5197         float x = 0;
5198         float y = 0;
5199
5200         unsigned pos = index2D.i[chan] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[chan];
5201         assert(pos >= 0);
5202         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
5203         mach->InputSampleOffsetApply[pos](mach, pos, chan, x, y, &result[chan]);
5204      }
5205      store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5206   }
5207}
5208
5209
5210static void
5211exec_interp_at_offset(struct tgsi_exec_machine *mach,
5212                      const struct tgsi_full_instruction *inst)
5213{
5214   union tgsi_exec_channel index;
5215   union tgsi_exec_channel index2D;
5216   union tgsi_exec_channel ofsx;
5217   union tgsi_exec_channel ofsy;
5218   const struct tgsi_full_src_register *reg = &inst->Src[0];
5219
5220   assert(reg->Register.File == TGSI_FILE_INPUT);
5221
5222   get_index_registers(mach, reg, &index, &index2D);
5223   unsigned pos = index2D.i[0] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[0];
5224
5225   fetch_source(mach, &ofsx, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
5226   fetch_source(mach, &ofsy, &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
5227
5228   for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5229      if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5230         continue;
5231      union tgsi_exec_channel result;
5232      fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D, &result);
5233      mach->InputSampleOffsetApply[pos](mach, pos, chan, ofsx.f[chan], ofsy.f[chan], &result);
5234      store_dest(mach, &result, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5235   }
5236}
5237
5238
5239static void
5240exec_interp_at_centroid(struct tgsi_exec_machine *mach,
5241                        const struct tgsi_full_instruction *inst)
5242{
5243   union tgsi_exec_channel index;
5244   union tgsi_exec_channel index2D;
5245   union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
5246   const struct tgsi_full_src_register *reg = &inst->Src[0];
5247
5248   assert(reg->Register.File == TGSI_FILE_INPUT);
5249   get_index_registers(mach, reg, &index, &index2D);
5250
5251   for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
5252      if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
5253         continue;
5254
5255      /* Here we should add the change to use a sample that lies within the
5256       * primitive (Section 15.2):
5257       *
5258       * "When interpolating variables declared using centroid in ,
5259       * the variable is sampled at a location within the pixel covered
5260       * by the primitive generating the fragment.
5261       * ...
5262       * The built-in functions interpolateAtCentroid ... will sample
5263       * variables as though they were declared with the centroid ...
5264       * qualifier[s]."
5265       *
5266       * Since we only support 1 sample currently, this is just a pass-through.
5267       */
5268      fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
5269                             &result[chan]);
5270      store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
5271   }
5272
5273}
5274
5275
5276/**
5277 * Execute a TGSI instruction.
5278 * Returns TRUE if a barrier instruction is hit,
5279 * otherwise FALSE.
5280 */
5281static boolean
5282exec_instruction(
5283   struct tgsi_exec_machine *mach,
5284   const struct tgsi_full_instruction *inst,
5285   int *pc )
5286{
5287   union tgsi_exec_channel r[10];
5288
5289   (*pc)++;
5290
5291   switch (inst->Instruction.Opcode) {
5292   case TGSI_OPCODE_ARL:
5293      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5294      break;
5295
5296   case TGSI_OPCODE_MOV:
5297      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5298      break;
5299
5300   case TGSI_OPCODE_LIT:
5301      exec_lit(mach, inst);
5302      break;
5303
5304   case TGSI_OPCODE_RCP:
5305      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5306      break;
5307
5308   case TGSI_OPCODE_RSQ:
5309      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5310      break;
5311
5312   case TGSI_OPCODE_EXP:
5313      exec_exp(mach, inst);
5314      break;
5315
5316   case TGSI_OPCODE_LOG:
5317      exec_log(mach, inst);
5318      break;
5319
5320   case TGSI_OPCODE_MUL:
5321      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5322      break;
5323
5324   case TGSI_OPCODE_ADD:
5325      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5326      break;
5327
5328   case TGSI_OPCODE_DP3:
5329      exec_dp3(mach, inst);
5330      break;
5331
5332   case TGSI_OPCODE_DP4:
5333      exec_dp4(mach, inst);
5334      break;
5335
5336   case TGSI_OPCODE_DST:
5337      exec_dst(mach, inst);
5338      break;
5339
5340   case TGSI_OPCODE_MIN:
5341      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5342      break;
5343
5344   case TGSI_OPCODE_MAX:
5345      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5346      break;
5347
5348   case TGSI_OPCODE_SLT:
5349      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5350      break;
5351
5352   case TGSI_OPCODE_SGE:
5353      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5354      break;
5355
5356   case TGSI_OPCODE_MAD:
5357      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5358      break;
5359
5360   case TGSI_OPCODE_LRP:
5361      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5362      break;
5363
5364   case TGSI_OPCODE_SQRT:
5365      exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5366      break;
5367
5368   case TGSI_OPCODE_FRC:
5369      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5370      break;
5371
5372   case TGSI_OPCODE_FLR:
5373      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5374      break;
5375
5376   case TGSI_OPCODE_ROUND:
5377      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5378      break;
5379
5380   case TGSI_OPCODE_EX2:
5381      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5382      break;
5383
5384   case TGSI_OPCODE_LG2:
5385      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5386      break;
5387
5388   case TGSI_OPCODE_POW:
5389      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5390      break;
5391
5392   case TGSI_OPCODE_LDEXP:
5393      exec_vector_binary(mach, inst, micro_ldexp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5394      break;
5395
5396   case TGSI_OPCODE_COS:
5397      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5398      break;
5399
5400   case TGSI_OPCODE_DDX:
5401      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5402      break;
5403
5404   case TGSI_OPCODE_DDY:
5405      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5406      break;
5407
5408   case TGSI_OPCODE_KILL:
5409      exec_kill (mach);
5410      break;
5411
5412   case TGSI_OPCODE_KILL_IF:
5413      exec_kill_if (mach, inst);
5414      break;
5415
5416   case TGSI_OPCODE_PK2H:
5417      exec_pk2h(mach, inst);
5418      break;
5419
5420   case TGSI_OPCODE_PK2US:
5421      assert (0);
5422      break;
5423
5424   case TGSI_OPCODE_PK4B:
5425      assert (0);
5426      break;
5427
5428   case TGSI_OPCODE_PK4UB:
5429      assert (0);
5430      break;
5431
5432   case TGSI_OPCODE_SEQ:
5433      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5434      break;
5435
5436   case TGSI_OPCODE_SGT:
5437      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5438      break;
5439
5440   case TGSI_OPCODE_SIN:
5441      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5442      break;
5443
5444   case TGSI_OPCODE_SLE:
5445      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5446      break;
5447
5448   case TGSI_OPCODE_SNE:
5449      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5450      break;
5451
5452   case TGSI_OPCODE_TEX:
5453      /* simple texture lookup */
5454      /* src[0] = texcoord */
5455      /* src[1] = sampler unit */
5456      exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5457      break;
5458
5459   case TGSI_OPCODE_TXB:
5460      /* Texture lookup with lod bias */
5461      /* src[0] = texcoord (src[0].w = LOD bias) */
5462      /* src[1] = sampler unit */
5463      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5464      break;
5465
5466   case TGSI_OPCODE_TXD:
5467      /* Texture lookup with explict partial derivatives */
5468      /* src[0] = texcoord */
5469      /* src[1] = d[strq]/dx */
5470      /* src[2] = d[strq]/dy */
5471      /* src[3] = sampler unit */
5472      exec_txd(mach, inst);
5473      break;
5474
5475   case TGSI_OPCODE_TXL:
5476      /* Texture lookup with explit LOD */
5477      /* src[0] = texcoord (src[0].w = LOD) */
5478      /* src[1] = sampler unit */
5479      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5480      break;
5481
5482   case TGSI_OPCODE_TXP:
5483      /* Texture lookup with projection */
5484      /* src[0] = texcoord (src[0].w = projection) */
5485      /* src[1] = sampler unit */
5486      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5487      break;
5488
5489   case TGSI_OPCODE_TG4:
5490      /* src[0] = texcoord */
5491      /* src[1] = component */
5492      /* src[2] = sampler unit */
5493      exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5494      break;
5495
5496   case TGSI_OPCODE_LODQ:
5497      /* src[0] = texcoord */
5498      /* src[1] = sampler unit */
5499      exec_lodq(mach, inst);
5500      break;
5501
5502   case TGSI_OPCODE_UP2H:
5503      exec_up2h(mach, inst);
5504      break;
5505
5506   case TGSI_OPCODE_UP2US:
5507      assert (0);
5508      break;
5509
5510   case TGSI_OPCODE_UP4B:
5511      assert (0);
5512      break;
5513
5514   case TGSI_OPCODE_UP4UB:
5515      assert (0);
5516      break;
5517
5518   case TGSI_OPCODE_ARR:
5519      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5520      break;
5521
5522   case TGSI_OPCODE_CAL:
5523      /* skip the call if no execution channels are enabled */
5524      if (mach->ExecMask) {
5525         /* do the call */
5526
5527         /* First, record the depths of the execution stacks.
5528          * This is important for deeply nested/looped return statements.
5529          * We have to unwind the stacks by the correct amount.  For a
5530          * real code generator, we could determine the number of entries
5531          * to pop off each stack with simple static analysis and avoid
5532          * implementing this data structure at run time.
5533          */
5534         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5535         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5536         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5537         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5538         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5539         /* note that PC was already incremented above */
5540         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5541
5542         mach->CallStackTop++;
5543
5544         /* Second, push the Cond, Loop, Cont, Func stacks */
5545         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5546         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5547         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5548         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5549         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5550         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5551
5552         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5553         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5554         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5555         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5556         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5557         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5558
5559         /* Finally, jump to the subroutine.  The label is a pointer
5560          * (an instruction number) to the BGNSUB instruction.
5561          */
5562         *pc = inst->Label.Label;
5563         assert(mach->Instructions[*pc].Instruction.Opcode
5564                == TGSI_OPCODE_BGNSUB);
5565      }
5566      break;
5567
5568   case TGSI_OPCODE_RET:
5569      mach->FuncMask &= ~mach->ExecMask;
5570      UPDATE_EXEC_MASK(mach);
5571
5572      if (mach->FuncMask == 0x0) {
5573         /* really return now (otherwise, keep executing */
5574
5575         if (mach->CallStackTop == 0) {
5576            /* returning from main() */
5577            mach->CondStackTop = 0;
5578            mach->LoopStackTop = 0;
5579            mach->ContStackTop = 0;
5580            mach->LoopLabelStackTop = 0;
5581            mach->SwitchStackTop = 0;
5582            mach->BreakStackTop = 0;
5583            *pc = -1;
5584            return FALSE;
5585         }
5586
5587         assert(mach->CallStackTop > 0);
5588         mach->CallStackTop--;
5589
5590         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5591         mach->CondMask = mach->CondStack[mach->CondStackTop];
5592
5593         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5594         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5595
5596         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5597         mach->ContMask = mach->ContStack[mach->ContStackTop];
5598
5599         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5600         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5601
5602         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5603         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5604
5605         assert(mach->FuncStackTop > 0);
5606         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5607
5608         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5609
5610         UPDATE_EXEC_MASK(mach);
5611      }
5612      break;
5613
5614   case TGSI_OPCODE_SSG:
5615      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5616      break;
5617
5618   case TGSI_OPCODE_CMP:
5619      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5620      break;
5621
5622   case TGSI_OPCODE_DIV:
5623      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5624      break;
5625
5626   case TGSI_OPCODE_DP2:
5627      exec_dp2(mach, inst);
5628      break;
5629
5630   case TGSI_OPCODE_IF:
5631      /* push CondMask */
5632      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5633      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5634      FETCH( &r[0], 0, TGSI_CHAN_X );
5635      /* update CondMask */
5636      if( ! r[0].f[0] ) {
5637         mach->CondMask &= ~0x1;
5638      }
5639      if( ! r[0].f[1] ) {
5640         mach->CondMask &= ~0x2;
5641      }
5642      if( ! r[0].f[2] ) {
5643         mach->CondMask &= ~0x4;
5644      }
5645      if( ! r[0].f[3] ) {
5646         mach->CondMask &= ~0x8;
5647      }
5648      UPDATE_EXEC_MASK(mach);
5649      /* Todo: If CondMask==0, jump to ELSE */
5650      break;
5651
5652   case TGSI_OPCODE_UIF:
5653      /* push CondMask */
5654      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5655      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5656      IFETCH( &r[0], 0, TGSI_CHAN_X );
5657      /* update CondMask */
5658      if( ! r[0].u[0] ) {
5659         mach->CondMask &= ~0x1;
5660      }
5661      if( ! r[0].u[1] ) {
5662         mach->CondMask &= ~0x2;
5663      }
5664      if( ! r[0].u[2] ) {
5665         mach->CondMask &= ~0x4;
5666      }
5667      if( ! r[0].u[3] ) {
5668         mach->CondMask &= ~0x8;
5669      }
5670      UPDATE_EXEC_MASK(mach);
5671      /* Todo: If CondMask==0, jump to ELSE */
5672      break;
5673
5674   case TGSI_OPCODE_ELSE:
5675      /* invert CondMask wrt previous mask */
5676      {
5677         uint prevMask;
5678         assert(mach->CondStackTop > 0);
5679         prevMask = mach->CondStack[mach->CondStackTop - 1];
5680         mach->CondMask = ~mach->CondMask & prevMask;
5681         UPDATE_EXEC_MASK(mach);
5682         /* Todo: If CondMask==0, jump to ENDIF */
5683      }
5684      break;
5685
5686   case TGSI_OPCODE_ENDIF:
5687      /* pop CondMask */
5688      assert(mach->CondStackTop > 0);
5689      mach->CondMask = mach->CondStack[--mach->CondStackTop];
5690      UPDATE_EXEC_MASK(mach);
5691      break;
5692
5693   case TGSI_OPCODE_END:
5694      /* make sure we end primitives which haven't
5695       * been explicitly emitted */
5696      conditional_emit_primitive(mach);
5697      /* halt execution */
5698      *pc = -1;
5699      break;
5700
5701   case TGSI_OPCODE_CEIL:
5702      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5703      break;
5704
5705   case TGSI_OPCODE_I2F:
5706      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
5707      break;
5708
5709   case TGSI_OPCODE_NOT:
5710      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5711      break;
5712
5713   case TGSI_OPCODE_TRUNC:
5714      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
5715      break;
5716
5717   case TGSI_OPCODE_SHL:
5718      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5719      break;
5720
5721   case TGSI_OPCODE_AND:
5722      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5723      break;
5724
5725   case TGSI_OPCODE_OR:
5726      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5727      break;
5728
5729   case TGSI_OPCODE_MOD:
5730      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5731      break;
5732
5733   case TGSI_OPCODE_XOR:
5734      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5735      break;
5736
5737   case TGSI_OPCODE_TXF:
5738      exec_txf(mach, inst);
5739      break;
5740
5741   case TGSI_OPCODE_TXQ:
5742      exec_txq(mach, inst);
5743      break;
5744
5745   case TGSI_OPCODE_EMIT:
5746      emit_vertex(mach, inst);
5747      break;
5748
5749   case TGSI_OPCODE_ENDPRIM:
5750      emit_primitive(mach, inst);
5751      break;
5752
5753   case TGSI_OPCODE_BGNLOOP:
5754      /* push LoopMask and ContMasks */
5755      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5756      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5757      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5758      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5759
5760      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5761      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5762      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5763      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5764      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5765      break;
5766
5767   case TGSI_OPCODE_ENDLOOP:
5768      /* Restore ContMask, but don't pop */
5769      assert(mach->ContStackTop > 0);
5770      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5771      UPDATE_EXEC_MASK(mach);
5772      if (mach->ExecMask) {
5773         /* repeat loop: jump to instruction just past BGNLOOP */
5774         assert(mach->LoopLabelStackTop > 0);
5775         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5776      }
5777      else {
5778         /* exit loop: pop LoopMask */
5779         assert(mach->LoopStackTop > 0);
5780         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5781         /* pop ContMask */
5782         assert(mach->ContStackTop > 0);
5783         mach->ContMask = mach->ContStack[--mach->ContStackTop];
5784         assert(mach->LoopLabelStackTop > 0);
5785         --mach->LoopLabelStackTop;
5786
5787         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5788      }
5789      UPDATE_EXEC_MASK(mach);
5790      break;
5791
5792   case TGSI_OPCODE_BRK:
5793      exec_break(mach);
5794      break;
5795
5796   case TGSI_OPCODE_CONT:
5797      /* turn off cont channels for each enabled exec channel */
5798      mach->ContMask &= ~mach->ExecMask;
5799      /* Todo: if mach->LoopMask == 0, jump to end of loop */
5800      UPDATE_EXEC_MASK(mach);
5801      break;
5802
5803   case TGSI_OPCODE_BGNSUB:
5804      /* no-op */
5805      break;
5806
5807   case TGSI_OPCODE_ENDSUB:
5808      /*
5809       * XXX: This really should be a no-op. We should never reach this opcode.
5810       */
5811
5812      assert(mach->CallStackTop > 0);
5813      mach->CallStackTop--;
5814
5815      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5816      mach->CondMask = mach->CondStack[mach->CondStackTop];
5817
5818      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5819      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5820
5821      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5822      mach->ContMask = mach->ContStack[mach->ContStackTop];
5823
5824      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5825      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5826
5827      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5828      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5829
5830      assert(mach->FuncStackTop > 0);
5831      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5832
5833      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5834
5835      UPDATE_EXEC_MASK(mach);
5836      break;
5837
5838   case TGSI_OPCODE_NOP:
5839      break;
5840
5841   case TGSI_OPCODE_F2I:
5842      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
5843      break;
5844
5845   case TGSI_OPCODE_FSEQ:
5846      exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5847      break;
5848
5849   case TGSI_OPCODE_FSGE:
5850      exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5851      break;
5852
5853   case TGSI_OPCODE_FSLT:
5854      exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5855      break;
5856
5857   case TGSI_OPCODE_FSNE:
5858      exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5859      break;
5860
5861   case TGSI_OPCODE_IDIV:
5862      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5863      break;
5864
5865   case TGSI_OPCODE_IMAX:
5866      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5867      break;
5868
5869   case TGSI_OPCODE_IMIN:
5870      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5871      break;
5872
5873   case TGSI_OPCODE_INEG:
5874      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5875      break;
5876
5877   case TGSI_OPCODE_ISGE:
5878      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5879      break;
5880
5881   case TGSI_OPCODE_ISHR:
5882      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5883      break;
5884
5885   case TGSI_OPCODE_ISLT:
5886      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5887      break;
5888
5889   case TGSI_OPCODE_F2U:
5890      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
5891      break;
5892
5893   case TGSI_OPCODE_U2F:
5894      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
5895      break;
5896
5897   case TGSI_OPCODE_UADD:
5898      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5899      break;
5900
5901   case TGSI_OPCODE_UDIV:
5902      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5903      break;
5904
5905   case TGSI_OPCODE_UMAD:
5906      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5907      break;
5908
5909   case TGSI_OPCODE_UMAX:
5910      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5911      break;
5912
5913   case TGSI_OPCODE_UMIN:
5914      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5915      break;
5916
5917   case TGSI_OPCODE_UMOD:
5918      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5919      break;
5920
5921   case TGSI_OPCODE_UMUL:
5922      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5923      break;
5924
5925   case TGSI_OPCODE_IMUL_HI:
5926      exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
5927      break;
5928
5929   case TGSI_OPCODE_UMUL_HI:
5930      exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5931      break;
5932
5933   case TGSI_OPCODE_USEQ:
5934      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5935      break;
5936
5937   case TGSI_OPCODE_USGE:
5938      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5939      break;
5940
5941   case TGSI_OPCODE_USHR:
5942      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5943      break;
5944
5945   case TGSI_OPCODE_USLT:
5946      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5947      break;
5948
5949   case TGSI_OPCODE_USNE:
5950      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
5951      break;
5952
5953   case TGSI_OPCODE_SWITCH:
5954      exec_switch(mach, inst);
5955      break;
5956
5957   case TGSI_OPCODE_CASE:
5958      exec_case(mach, inst);
5959      break;
5960
5961   case TGSI_OPCODE_DEFAULT:
5962      exec_default(mach);
5963      break;
5964
5965   case TGSI_OPCODE_ENDSWITCH:
5966      exec_endswitch(mach);
5967      break;
5968
5969   case TGSI_OPCODE_SAMPLE_I:
5970      exec_txf(mach, inst);
5971      break;
5972
5973   case TGSI_OPCODE_SAMPLE_I_MS:
5974      exec_txf(mach, inst);
5975      break;
5976
5977   case TGSI_OPCODE_SAMPLE:
5978      exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
5979      break;
5980
5981   case TGSI_OPCODE_SAMPLE_B:
5982      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
5983      break;
5984
5985   case TGSI_OPCODE_SAMPLE_C:
5986      exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
5987      break;
5988
5989   case TGSI_OPCODE_SAMPLE_C_LZ:
5990      exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
5991      break;
5992
5993   case TGSI_OPCODE_SAMPLE_D:
5994      exec_sample_d(mach, inst);
5995      break;
5996
5997   case TGSI_OPCODE_SAMPLE_L:
5998      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
5999      break;
6000
6001   case TGSI_OPCODE_GATHER4:
6002      exec_sample(mach, inst, TEX_MODIFIER_GATHER, FALSE);
6003      break;
6004
6005   case TGSI_OPCODE_SVIEWINFO:
6006      exec_txq(mach, inst);
6007      break;
6008
6009   case TGSI_OPCODE_SAMPLE_POS:
6010      assert(0);
6011      break;
6012
6013   case TGSI_OPCODE_SAMPLE_INFO:
6014      assert(0);
6015      break;
6016
6017   case TGSI_OPCODE_LOD:
6018      exec_lodq(mach, inst);
6019      break;
6020
6021   case TGSI_OPCODE_UARL:
6022      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6023      break;
6024
6025   case TGSI_OPCODE_UCMP:
6026      exec_ucmp(mach, inst);
6027      break;
6028
6029   case TGSI_OPCODE_IABS:
6030      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6031      break;
6032
6033   case TGSI_OPCODE_ISSG:
6034      exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6035      break;
6036
6037   case TGSI_OPCODE_TEX2:
6038      /* simple texture lookup */
6039      /* src[0] = texcoord */
6040      /* src[1] = compare */
6041      /* src[2] = sampler unit */
6042      exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
6043      break;
6044   case TGSI_OPCODE_TXB2:
6045      /* simple texture lookup */
6046      /* src[0] = texcoord */
6047      /* src[1] = bias */
6048      /* src[2] = sampler unit */
6049      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
6050      break;
6051   case TGSI_OPCODE_TXL2:
6052      /* simple texture lookup */
6053      /* src[0] = texcoord */
6054      /* src[1] = lod */
6055      /* src[2] = sampler unit */
6056      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
6057      break;
6058
6059   case TGSI_OPCODE_IBFE:
6060      exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6061      break;
6062   case TGSI_OPCODE_UBFE:
6063      exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6064      break;
6065   case TGSI_OPCODE_BFI:
6066      exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6067      break;
6068   case TGSI_OPCODE_BREV:
6069      exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6070      break;
6071   case TGSI_OPCODE_POPC:
6072      exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
6073      break;
6074   case TGSI_OPCODE_LSB:
6075      exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6076      break;
6077   case TGSI_OPCODE_IMSB:
6078      exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
6079      break;
6080   case TGSI_OPCODE_UMSB:
6081      exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_UINT);
6082      break;
6083
6084   case TGSI_OPCODE_F2D:
6085      exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
6086      break;
6087
6088   case TGSI_OPCODE_D2F:
6089      exec_64_2_t(mach, inst, micro_d2f, TGSI_EXEC_DATA_FLOAT);
6090      break;
6091
6092   case TGSI_OPCODE_DABS:
6093      exec_double_unary(mach, inst, micro_dabs);
6094      break;
6095
6096   case TGSI_OPCODE_DNEG:
6097      exec_double_unary(mach, inst, micro_dneg);
6098      break;
6099
6100   case TGSI_OPCODE_DADD:
6101      exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
6102      break;
6103
6104   case TGSI_OPCODE_DDIV:
6105      exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
6106      break;
6107
6108   case TGSI_OPCODE_DMUL:
6109      exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
6110      break;
6111
6112   case TGSI_OPCODE_DMAX:
6113      exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
6114      break;
6115
6116   case TGSI_OPCODE_DMIN:
6117      exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
6118      break;
6119
6120   case TGSI_OPCODE_DSLT:
6121      exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
6122      break;
6123
6124   case TGSI_OPCODE_DSGE:
6125      exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
6126      break;
6127
6128   case TGSI_OPCODE_DSEQ:
6129      exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
6130      break;
6131
6132   case TGSI_OPCODE_DSNE:
6133      exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
6134      break;
6135
6136   case TGSI_OPCODE_DRCP:
6137      exec_double_unary(mach, inst, micro_drcp);
6138      break;
6139
6140   case TGSI_OPCODE_DSQRT:
6141      exec_double_unary(mach, inst, micro_dsqrt);
6142      break;
6143
6144   case TGSI_OPCODE_DRSQ:
6145      exec_double_unary(mach, inst, micro_drsq);
6146      break;
6147
6148   case TGSI_OPCODE_DMAD:
6149      exec_double_trinary(mach, inst, micro_dmad);
6150      break;
6151
6152   case TGSI_OPCODE_DFRAC:
6153      exec_double_unary(mach, inst, micro_dfrac);
6154      break;
6155
6156   case TGSI_OPCODE_DLDEXP:
6157      exec_dldexp(mach, inst);
6158      break;
6159
6160   case TGSI_OPCODE_DFRACEXP:
6161      exec_dfracexp(mach, inst);
6162      break;
6163
6164   case TGSI_OPCODE_I2D:
6165      exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_INT);
6166      break;
6167
6168   case TGSI_OPCODE_D2I:
6169      exec_64_2_t(mach, inst, micro_d2i, TGSI_EXEC_DATA_INT);
6170      break;
6171
6172   case TGSI_OPCODE_U2D:
6173      exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_UINT);
6174      break;
6175
6176   case TGSI_OPCODE_D2U:
6177      exec_64_2_t(mach, inst, micro_d2u, TGSI_EXEC_DATA_INT);
6178      break;
6179
6180   case TGSI_OPCODE_LOAD:
6181      exec_load(mach, inst);
6182      break;
6183
6184   case TGSI_OPCODE_STORE:
6185      exec_store(mach, inst);
6186      break;
6187
6188   case TGSI_OPCODE_ATOMUADD:
6189   case TGSI_OPCODE_ATOMXCHG:
6190   case TGSI_OPCODE_ATOMCAS:
6191   case TGSI_OPCODE_ATOMAND:
6192   case TGSI_OPCODE_ATOMOR:
6193   case TGSI_OPCODE_ATOMXOR:
6194   case TGSI_OPCODE_ATOMUMIN:
6195   case TGSI_OPCODE_ATOMUMAX:
6196   case TGSI_OPCODE_ATOMIMIN:
6197   case TGSI_OPCODE_ATOMIMAX:
6198   case TGSI_OPCODE_ATOMFADD:
6199      exec_atomop(mach, inst);
6200      break;
6201
6202   case TGSI_OPCODE_RESQ:
6203      exec_resq(mach, inst);
6204      break;
6205   case TGSI_OPCODE_BARRIER:
6206   case TGSI_OPCODE_MEMBAR:
6207      return TRUE;
6208      break;
6209
6210   case TGSI_OPCODE_I64ABS:
6211      exec_double_unary(mach, inst, micro_i64abs);
6212      break;
6213
6214   case TGSI_OPCODE_I64SSG:
6215      exec_double_unary(mach, inst, micro_i64sgn);
6216      break;
6217
6218   case TGSI_OPCODE_I64NEG:
6219      exec_double_unary(mach, inst, micro_i64neg);
6220      break;
6221
6222   case TGSI_OPCODE_U64SEQ:
6223      exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
6224      break;
6225
6226   case TGSI_OPCODE_U64SNE:
6227      exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
6228      break;
6229
6230   case TGSI_OPCODE_I64SLT:
6231      exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
6232      break;
6233   case TGSI_OPCODE_U64SLT:
6234      exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
6235      break;
6236
6237   case TGSI_OPCODE_I64SGE:
6238      exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
6239      break;
6240   case TGSI_OPCODE_U64SGE:
6241      exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
6242      break;
6243
6244   case TGSI_OPCODE_I64MIN:
6245      exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
6246      break;
6247   case TGSI_OPCODE_U64MIN:
6248      exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
6249      break;
6250   case TGSI_OPCODE_I64MAX:
6251      exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
6252      break;
6253   case TGSI_OPCODE_U64MAX:
6254      exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
6255      break;
6256   case TGSI_OPCODE_U64ADD:
6257      exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
6258      break;
6259   case TGSI_OPCODE_U64MUL:
6260      exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
6261      break;
6262   case TGSI_OPCODE_U64SHL:
6263      exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
6264      break;
6265   case TGSI_OPCODE_I64SHR:
6266      exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
6267      break;
6268   case TGSI_OPCODE_U64SHR:
6269      exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
6270      break;
6271   case TGSI_OPCODE_U64DIV:
6272      exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
6273      break;
6274   case TGSI_OPCODE_I64DIV:
6275      exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
6276      break;
6277   case TGSI_OPCODE_U64MOD:
6278      exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
6279      break;
6280   case TGSI_OPCODE_I64MOD:
6281      exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
6282      break;
6283
6284   case TGSI_OPCODE_F2U64:
6285      exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
6286      break;
6287
6288   case TGSI_OPCODE_F2I64:
6289      exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
6290      break;
6291
6292   case TGSI_OPCODE_U2I64:
6293      exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
6294      break;
6295   case TGSI_OPCODE_I2I64:
6296      exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
6297      break;
6298
6299   case TGSI_OPCODE_D2U64:
6300      exec_double_unary(mach, inst, micro_d2u64);
6301      break;
6302
6303   case TGSI_OPCODE_D2I64:
6304      exec_double_unary(mach, inst, micro_d2i64);
6305      break;
6306
6307   case TGSI_OPCODE_U642F:
6308      exec_64_2_t(mach, inst, micro_u642f, TGSI_EXEC_DATA_FLOAT);
6309      break;
6310   case TGSI_OPCODE_I642F:
6311      exec_64_2_t(mach, inst, micro_i642f, TGSI_EXEC_DATA_FLOAT);
6312      break;
6313
6314   case TGSI_OPCODE_U642D:
6315      exec_double_unary(mach, inst, micro_u642d);
6316      break;
6317   case TGSI_OPCODE_I642D:
6318      exec_double_unary(mach, inst, micro_i642d);
6319      break;
6320   case TGSI_OPCODE_INTERP_SAMPLE:
6321      exec_interp_at_sample(mach, inst);
6322      break;
6323   case TGSI_OPCODE_INTERP_OFFSET:
6324      exec_interp_at_offset(mach, inst);
6325      break;
6326   case TGSI_OPCODE_INTERP_CENTROID:
6327      exec_interp_at_centroid(mach, inst);
6328      break;
6329   default:
6330      assert( 0 );
6331   }
6332   return FALSE;
6333}
6334
6335static void
6336tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
6337{
6338   uint default_mask = 0xf;
6339
6340   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
6341   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
6342
6343   if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
6344      for (unsigned i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++) {
6345         mach->Temps[temp_prim_idxs[i].idx].xyzw[temp_prim_idxs[i].chan].u[0] = 0;
6346         mach->Primitives[i][0] = 0;
6347      }
6348      /* GS runs on a single primitive for now */
6349      default_mask = 0x1;
6350   }
6351
6352   if (mach->NonHelperMask == 0)
6353      mach->NonHelperMask = default_mask;
6354   mach->CondMask = default_mask;
6355   mach->LoopMask = default_mask;
6356   mach->ContMask = default_mask;
6357   mach->FuncMask = default_mask;
6358   mach->ExecMask = default_mask;
6359
6360   mach->Switch.mask = default_mask;
6361
6362   assert(mach->CondStackTop == 0);
6363   assert(mach->LoopStackTop == 0);
6364   assert(mach->ContStackTop == 0);
6365   assert(mach->SwitchStackTop == 0);
6366   assert(mach->BreakStackTop == 0);
6367   assert(mach->CallStackTop == 0);
6368}
6369
6370/**
6371 * Run TGSI interpreter.
6372 * \return bitmask of "alive" quad components
6373 */
6374uint
6375tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
6376{
6377   uint i;
6378
6379   mach->pc = start_pc;
6380
6381   if (!start_pc) {
6382      tgsi_exec_machine_setup_masks(mach);
6383
6384      /* execute declarations (interpolants) */
6385      for (i = 0; i < mach->NumDeclarations; i++) {
6386         exec_declaration( mach, mach->Declarations+i );
6387      }
6388   }
6389
6390   {
6391#if DEBUG_EXECUTION
6392      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS];
6393      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
6394      uint inst = 1;
6395
6396      if (!start_pc) {
6397         memset(mach->Temps, 0, sizeof(temps));
6398         if (mach->Outputs)
6399            memset(mach->Outputs, 0, sizeof(outputs));
6400         memset(temps, 0, sizeof(temps));
6401         memset(outputs, 0, sizeof(outputs));
6402      }
6403#endif
6404
6405      /* execute instructions, until pc is set to -1 */
6406      while (mach->pc != -1) {
6407         boolean barrier_hit;
6408#if DEBUG_EXECUTION
6409         uint i;
6410
6411         tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
6412#endif
6413
6414         assert(mach->pc < (int) mach->NumInstructions);
6415         barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
6416
6417         /* for compute shaders if we hit a barrier return now for later rescheduling */
6418         if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
6419            return 0;
6420
6421#if DEBUG_EXECUTION
6422         for (i = 0; i < TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS; i++) {
6423            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
6424               uint j;
6425
6426               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
6427               debug_printf("TEMP[%2u] = ", i);
6428               for (j = 0; j < 4; j++) {
6429                  if (j > 0) {
6430                     debug_printf("           ");
6431                  }
6432                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6433                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
6434                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
6435                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
6436                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
6437               }
6438            }
6439         }
6440         if (mach->Outputs) {
6441            for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
6442               if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
6443                  uint j;
6444
6445                  memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
6446                  debug_printf("OUT[%2u] =  ", i);
6447                  for (j = 0; j < 4; j++) {
6448                     if (j > 0) {
6449                        debug_printf("           ");
6450                     }
6451                     debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6452                                  outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
6453                                  outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
6454                                  outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
6455                                  outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
6456                  }
6457               }
6458            }
6459         }
6460#endif
6461      }
6462   }
6463
6464#if 0
6465   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
6466   if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
6467      /*
6468       * Scale back depth component.
6469       */
6470      for (i = 0; i < 4; i++)
6471         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
6472   }
6473#endif
6474
6475   /* Strictly speaking, these assertions aren't really needed but they
6476    * can potentially catch some bugs in the control flow code.
6477    */
6478   assert(mach->CondStackTop == 0);
6479   assert(mach->LoopStackTop == 0);
6480   assert(mach->ContStackTop == 0);
6481   assert(mach->SwitchStackTop == 0);
6482   assert(mach->BreakStackTop == 0);
6483   assert(mach->CallStackTop == 0);
6484
6485   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
6486}
6487