tgsi_exec.c revision 7ec681f3
1/**************************************************************************
2 *
3 * Copyright 2007-2008 VMware, Inc.
4 * All Rights Reserved.
5 * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29/**
30 * TGSI interpreter/executor.
31 *
32 * Flow control information:
33 *
34 * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
35 * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
36 * care since a condition may be true for some quad components but false
37 * for other components.
38 *
39 * We basically execute all statements (even if they're in the part of
40 * an IF/ELSE clause that's "not taken") and use a special mask to
41 * control writing to destination registers.  This is the ExecMask.
42 * See store_dest().
43 *
 * The ExecMask is computed from several other masks (CondMask, LoopMask,
 * ContMask, Switch.mask and FuncMask; see UPDATE_EXEC_MASK).  CondMask,
 * LoopMask and ContMask are controlled by the flow control instructions
 * (namely: IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
47 *
48 *
49 * Authors:
50 *   Michal Krol
51 *   Brian Paul
52 */
53
54#include "pipe/p_compiler.h"
55#include "pipe/p_state.h"
56#include "pipe/p_shader_tokens.h"
57#include "tgsi/tgsi_dump.h"
58#include "tgsi/tgsi_parse.h"
59#include "tgsi/tgsi_util.h"
60#include "tgsi_exec.h"
61#include "util/compiler.h"
62#include "util/half_float.h"
63#include "util/u_memory.h"
64#include "util/u_math.h"
65#include "util/rounding.h"
66
67
68#define DEBUG_EXECUTION 0
69
70
71#define TILE_TOP_LEFT     0
72#define TILE_TOP_RIGHT    1
73#define TILE_BOTTOM_LEFT  2
74#define TILE_BOTTOM_RIGHT 3
75
/**
 * One 64-bit channel for a quad: TGSI_QUAD_SIZE values that can be viewed
 * as doubles, as pairs of unsigned words, or as 64-bit integers.
 */
union tgsi_double_channel {
   double d[TGSI_QUAD_SIZE];        /* double-precision view */
   unsigned u[TGSI_QUAD_SIZE][2];   /* each value as two unsigned words */
   uint64_t u64[TGSI_QUAD_SIZE];    /* unsigned 64-bit integer view */
   int64_t i64[TGSI_QUAD_SIZE];     /* signed 64-bit integer view */
} ALIGN16;
82
/**
 * Two 64-bit channels grouped together, named after the component pairs
 * they are labelled with (xy and zw).
 */
struct ALIGN16 tgsi_double_vector {
   union tgsi_double_channel xy;
   union tgsi_double_channel zw;
};
87
88static void
89micro_abs(union tgsi_exec_channel *dst,
90          const union tgsi_exec_channel *src)
91{
92   dst->f[0] = fabsf(src->f[0]);
93   dst->f[1] = fabsf(src->f[1]);
94   dst->f[2] = fabsf(src->f[2]);
95   dst->f[3] = fabsf(src->f[3]);
96}
97
98static void
99micro_arl(union tgsi_exec_channel *dst,
100          const union tgsi_exec_channel *src)
101{
102   dst->i[0] = (int)floorf(src->f[0]);
103   dst->i[1] = (int)floorf(src->f[1]);
104   dst->i[2] = (int)floorf(src->f[2]);
105   dst->i[3] = (int)floorf(src->f[3]);
106}
107
108static void
109micro_arr(union tgsi_exec_channel *dst,
110          const union tgsi_exec_channel *src)
111{
112   dst->i[0] = (int)floorf(src->f[0] + 0.5f);
113   dst->i[1] = (int)floorf(src->f[1] + 0.5f);
114   dst->i[2] = (int)floorf(src->f[2] + 0.5f);
115   dst->i[3] = (int)floorf(src->f[3] + 0.5f);
116}
117
118static void
119micro_ceil(union tgsi_exec_channel *dst,
120           const union tgsi_exec_channel *src)
121{
122   dst->f[0] = ceilf(src->f[0]);
123   dst->f[1] = ceilf(src->f[1]);
124   dst->f[2] = ceilf(src->f[2]);
125   dst->f[3] = ceilf(src->f[3]);
126}
127
128static void
129micro_cmp(union tgsi_exec_channel *dst,
130          const union tgsi_exec_channel *src0,
131          const union tgsi_exec_channel *src1,
132          const union tgsi_exec_channel *src2)
133{
134   dst->f[0] = src0->f[0] < 0.0f ? src1->f[0] : src2->f[0];
135   dst->f[1] = src0->f[1] < 0.0f ? src1->f[1] : src2->f[1];
136   dst->f[2] = src0->f[2] < 0.0f ? src1->f[2] : src2->f[2];
137   dst->f[3] = src0->f[3] < 0.0f ? src1->f[3] : src2->f[3];
138}
139
140static void
141micro_cos(union tgsi_exec_channel *dst,
142          const union tgsi_exec_channel *src)
143{
144   dst->f[0] = cosf(src->f[0]);
145   dst->f[1] = cosf(src->f[1]);
146   dst->f[2] = cosf(src->f[2]);
147   dst->f[3] = cosf(src->f[3]);
148}
149
150static void
151micro_d2f(union tgsi_exec_channel *dst,
152          const union tgsi_double_channel *src)
153{
154   dst->f[0] = (float)src->d[0];
155   dst->f[1] = (float)src->d[1];
156   dst->f[2] = (float)src->d[2];
157   dst->f[3] = (float)src->d[3];
158}
159
160static void
161micro_d2i(union tgsi_exec_channel *dst,
162          const union tgsi_double_channel *src)
163{
164   dst->i[0] = (int)src->d[0];
165   dst->i[1] = (int)src->d[1];
166   dst->i[2] = (int)src->d[2];
167   dst->i[3] = (int)src->d[3];
168}
169
170static void
171micro_d2u(union tgsi_exec_channel *dst,
172          const union tgsi_double_channel *src)
173{
174   dst->u[0] = (unsigned)src->d[0];
175   dst->u[1] = (unsigned)src->d[1];
176   dst->u[2] = (unsigned)src->d[2];
177   dst->u[3] = (unsigned)src->d[3];
178}
179static void
180micro_dabs(union tgsi_double_channel *dst,
181           const union tgsi_double_channel *src)
182{
183   dst->d[0] = src->d[0] >= 0.0 ? src->d[0] : -src->d[0];
184   dst->d[1] = src->d[1] >= 0.0 ? src->d[1] : -src->d[1];
185   dst->d[2] = src->d[2] >= 0.0 ? src->d[2] : -src->d[2];
186   dst->d[3] = src->d[3] >= 0.0 ? src->d[3] : -src->d[3];
187}
188
189static void
190micro_dadd(union tgsi_double_channel *dst,
191          const union tgsi_double_channel *src)
192{
193   dst->d[0] = src[0].d[0] + src[1].d[0];
194   dst->d[1] = src[0].d[1] + src[1].d[1];
195   dst->d[2] = src[0].d[2] + src[1].d[2];
196   dst->d[3] = src[0].d[3] + src[1].d[3];
197}
198
199static void
200micro_ddiv(union tgsi_double_channel *dst,
201          const union tgsi_double_channel *src)
202{
203   dst->d[0] = src[0].d[0] / src[1].d[0];
204   dst->d[1] = src[0].d[1] / src[1].d[1];
205   dst->d[2] = src[0].d[2] / src[1].d[2];
206   dst->d[3] = src[0].d[3] / src[1].d[3];
207}
208
209static void
210micro_ddx(union tgsi_exec_channel *dst,
211          const union tgsi_exec_channel *src)
212{
213   dst->f[0] =
214   dst->f[1] =
215   dst->f[2] =
216   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
217}
218
219static void
220micro_ddx_fine(union tgsi_exec_channel *dst,
221          const union tgsi_exec_channel *src)
222{
223   dst->f[0] =
224   dst->f[1] = src->f[TILE_TOP_RIGHT] - src->f[TILE_TOP_LEFT];
225   dst->f[2] =
226   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
227}
228
229
230static void
231micro_ddy(union tgsi_exec_channel *dst,
232          const union tgsi_exec_channel *src)
233{
234   dst->f[0] =
235   dst->f[1] =
236   dst->f[2] =
237   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
238}
239
240static void
241micro_ddy_fine(union tgsi_exec_channel *dst,
242          const union tgsi_exec_channel *src)
243{
244   dst->f[0] =
245   dst->f[2] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
246   dst->f[1] =
247   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_TOP_RIGHT];
248}
249
250static void
251micro_dmul(union tgsi_double_channel *dst,
252           const union tgsi_double_channel *src)
253{
254   dst->d[0] = src[0].d[0] * src[1].d[0];
255   dst->d[1] = src[0].d[1] * src[1].d[1];
256   dst->d[2] = src[0].d[2] * src[1].d[2];
257   dst->d[3] = src[0].d[3] * src[1].d[3];
258}
259
260static void
261micro_dmax(union tgsi_double_channel *dst,
262           const union tgsi_double_channel *src)
263{
264   dst->d[0] = fmax(src[0].d[0], src[1].d[0]);
265   dst->d[1] = fmax(src[0].d[1], src[1].d[1]);
266   dst->d[2] = fmax(src[0].d[2], src[1].d[2]);
267   dst->d[3] = fmax(src[0].d[3], src[1].d[3]);
268}
269
270static void
271micro_dmin(union tgsi_double_channel *dst,
272           const union tgsi_double_channel *src)
273{
274   dst->d[0] = fmin(src[0].d[0], src[1].d[0]);
275   dst->d[1] = fmin(src[0].d[1], src[1].d[1]);
276   dst->d[2] = fmin(src[0].d[2], src[1].d[2]);
277   dst->d[3] = fmin(src[0].d[3], src[1].d[3]);
278}
279
280static void
281micro_dneg(union tgsi_double_channel *dst,
282           const union tgsi_double_channel *src)
283{
284   dst->d[0] = -src->d[0];
285   dst->d[1] = -src->d[1];
286   dst->d[2] = -src->d[2];
287   dst->d[3] = -src->d[3];
288}
289
290static void
291micro_dslt(union tgsi_double_channel *dst,
292           const union tgsi_double_channel *src)
293{
294   dst->u[0][0] = src[0].d[0] < src[1].d[0] ? ~0U : 0U;
295   dst->u[1][0] = src[0].d[1] < src[1].d[1] ? ~0U : 0U;
296   dst->u[2][0] = src[0].d[2] < src[1].d[2] ? ~0U : 0U;
297   dst->u[3][0] = src[0].d[3] < src[1].d[3] ? ~0U : 0U;
298}
299
300static void
301micro_dsne(union tgsi_double_channel *dst,
302           const union tgsi_double_channel *src)
303{
304   dst->u[0][0] = src[0].d[0] != src[1].d[0] ? ~0U : 0U;
305   dst->u[1][0] = src[0].d[1] != src[1].d[1] ? ~0U : 0U;
306   dst->u[2][0] = src[0].d[2] != src[1].d[2] ? ~0U : 0U;
307   dst->u[3][0] = src[0].d[3] != src[1].d[3] ? ~0U : 0U;
308}
309
310static void
311micro_dsge(union tgsi_double_channel *dst,
312           const union tgsi_double_channel *src)
313{
314   dst->u[0][0] = src[0].d[0] >= src[1].d[0] ? ~0U : 0U;
315   dst->u[1][0] = src[0].d[1] >= src[1].d[1] ? ~0U : 0U;
316   dst->u[2][0] = src[0].d[2] >= src[1].d[2] ? ~0U : 0U;
317   dst->u[3][0] = src[0].d[3] >= src[1].d[3] ? ~0U : 0U;
318}
319
320static void
321micro_dseq(union tgsi_double_channel *dst,
322           const union tgsi_double_channel *src)
323{
324   dst->u[0][0] = src[0].d[0] == src[1].d[0] ? ~0U : 0U;
325   dst->u[1][0] = src[0].d[1] == src[1].d[1] ? ~0U : 0U;
326   dst->u[2][0] = src[0].d[2] == src[1].d[2] ? ~0U : 0U;
327   dst->u[3][0] = src[0].d[3] == src[1].d[3] ? ~0U : 0U;
328}
329
330static void
331micro_drcp(union tgsi_double_channel *dst,
332           const union tgsi_double_channel *src)
333{
334   dst->d[0] = 1.0 / src->d[0];
335   dst->d[1] = 1.0 / src->d[1];
336   dst->d[2] = 1.0 / src->d[2];
337   dst->d[3] = 1.0 / src->d[3];
338}
339
340static void
341micro_dsqrt(union tgsi_double_channel *dst,
342            const union tgsi_double_channel *src)
343{
344   dst->d[0] = sqrt(src->d[0]);
345   dst->d[1] = sqrt(src->d[1]);
346   dst->d[2] = sqrt(src->d[2]);
347   dst->d[3] = sqrt(src->d[3]);
348}
349
350static void
351micro_drsq(union tgsi_double_channel *dst,
352          const union tgsi_double_channel *src)
353{
354   dst->d[0] = 1.0 / sqrt(src->d[0]);
355   dst->d[1] = 1.0 / sqrt(src->d[1]);
356   dst->d[2] = 1.0 / sqrt(src->d[2]);
357   dst->d[3] = 1.0 / sqrt(src->d[3]);
358}
359
360static void
361micro_dmad(union tgsi_double_channel *dst,
362           const union tgsi_double_channel *src)
363{
364   dst->d[0] = src[0].d[0] * src[1].d[0] + src[2].d[0];
365   dst->d[1] = src[0].d[1] * src[1].d[1] + src[2].d[1];
366   dst->d[2] = src[0].d[2] * src[1].d[2] + src[2].d[2];
367   dst->d[3] = src[0].d[3] * src[1].d[3] + src[2].d[3];
368}
369
370static void
371micro_dfrac(union tgsi_double_channel *dst,
372            const union tgsi_double_channel *src)
373{
374   dst->d[0] = src->d[0] - floor(src->d[0]);
375   dst->d[1] = src->d[1] - floor(src->d[1]);
376   dst->d[2] = src->d[2] - floor(src->d[2]);
377   dst->d[3] = src->d[3] - floor(src->d[3]);
378}
379
380static void
381micro_dflr(union tgsi_double_channel *dst,
382           const union tgsi_double_channel *src)
383{
384   dst->d[0] = floor(src->d[0]);
385   dst->d[1] = floor(src->d[1]);
386   dst->d[2] = floor(src->d[2]);
387   dst->d[3] = floor(src->d[3]);
388}
389
390static void
391micro_dldexp(union tgsi_double_channel *dst,
392             const union tgsi_double_channel *src0,
393             union tgsi_exec_channel *src1)
394{
395   dst->d[0] = ldexp(src0->d[0], src1->i[0]);
396   dst->d[1] = ldexp(src0->d[1], src1->i[1]);
397   dst->d[2] = ldexp(src0->d[2], src1->i[2]);
398   dst->d[3] = ldexp(src0->d[3], src1->i[3]);
399}
400
401static void
402micro_dfracexp(union tgsi_double_channel *dst,
403               union tgsi_exec_channel *dst_exp,
404               const union tgsi_double_channel *src)
405{
406   dst->d[0] = frexp(src->d[0], &dst_exp->i[0]);
407   dst->d[1] = frexp(src->d[1], &dst_exp->i[1]);
408   dst->d[2] = frexp(src->d[2], &dst_exp->i[2]);
409   dst->d[3] = frexp(src->d[3], &dst_exp->i[3]);
410}
411
412static void
413micro_exp2(union tgsi_exec_channel *dst,
414           const union tgsi_exec_channel *src)
415{
416#if DEBUG
417   /* Inf is okay for this instruction, so clamp it to silence assertions. */
418   uint i;
419   union tgsi_exec_channel clamped;
420
421   for (i = 0; i < 4; i++) {
422      if (src->f[i] > 127.99999f) {
423         clamped.f[i] = 127.99999f;
424      } else if (src->f[i] < -126.99999f) {
425         clamped.f[i] = -126.99999f;
426      } else {
427         clamped.f[i] = src->f[i];
428      }
429   }
430   src = &clamped;
431#endif /* DEBUG */
432
433   dst->f[0] = powf(2.0f, src->f[0]);
434   dst->f[1] = powf(2.0f, src->f[1]);
435   dst->f[2] = powf(2.0f, src->f[2]);
436   dst->f[3] = powf(2.0f, src->f[3]);
437}
438
439static void
440micro_f2d(union tgsi_double_channel *dst,
441          const union tgsi_exec_channel *src)
442{
443   dst->d[0] = (double)src->f[0];
444   dst->d[1] = (double)src->f[1];
445   dst->d[2] = (double)src->f[2];
446   dst->d[3] = (double)src->f[3];
447}
448
449static void
450micro_flr(union tgsi_exec_channel *dst,
451          const union tgsi_exec_channel *src)
452{
453   dst->f[0] = floorf(src->f[0]);
454   dst->f[1] = floorf(src->f[1]);
455   dst->f[2] = floorf(src->f[2]);
456   dst->f[3] = floorf(src->f[3]);
457}
458
459static void
460micro_frc(union tgsi_exec_channel *dst,
461          const union tgsi_exec_channel *src)
462{
463   dst->f[0] = src->f[0] - floorf(src->f[0]);
464   dst->f[1] = src->f[1] - floorf(src->f[1]);
465   dst->f[2] = src->f[2] - floorf(src->f[2]);
466   dst->f[3] = src->f[3] - floorf(src->f[3]);
467}
468
469static void
470micro_i2d(union tgsi_double_channel *dst,
471          const union tgsi_exec_channel *src)
472{
473   dst->d[0] = (double)src->i[0];
474   dst->d[1] = (double)src->i[1];
475   dst->d[2] = (double)src->i[2];
476   dst->d[3] = (double)src->i[3];
477}
478
479static void
480micro_iabs(union tgsi_exec_channel *dst,
481           const union tgsi_exec_channel *src)
482{
483   dst->i[0] = src->i[0] >= 0 ? src->i[0] : -src->i[0];
484   dst->i[1] = src->i[1] >= 0 ? src->i[1] : -src->i[1];
485   dst->i[2] = src->i[2] >= 0 ? src->i[2] : -src->i[2];
486   dst->i[3] = src->i[3] >= 0 ? src->i[3] : -src->i[3];
487}
488
489static void
490micro_ineg(union tgsi_exec_channel *dst,
491           const union tgsi_exec_channel *src)
492{
493   dst->i[0] = -src->i[0];
494   dst->i[1] = -src->i[1];
495   dst->i[2] = -src->i[2];
496   dst->i[3] = -src->i[3];
497}
498
499static void
500micro_lg2(union tgsi_exec_channel *dst,
501          const union tgsi_exec_channel *src)
502{
503   dst->f[0] = logf(src->f[0]) * 1.442695f;
504   dst->f[1] = logf(src->f[1]) * 1.442695f;
505   dst->f[2] = logf(src->f[2]) * 1.442695f;
506   dst->f[3] = logf(src->f[3]) * 1.442695f;
507}
508
509static void
510micro_lrp(union tgsi_exec_channel *dst,
511          const union tgsi_exec_channel *src0,
512          const union tgsi_exec_channel *src1,
513          const union tgsi_exec_channel *src2)
514{
515   dst->f[0] = src0->f[0] * (src1->f[0] - src2->f[0]) + src2->f[0];
516   dst->f[1] = src0->f[1] * (src1->f[1] - src2->f[1]) + src2->f[1];
517   dst->f[2] = src0->f[2] * (src1->f[2] - src2->f[2]) + src2->f[2];
518   dst->f[3] = src0->f[3] * (src1->f[3] - src2->f[3]) + src2->f[3];
519}
520
521static void
522micro_mad(union tgsi_exec_channel *dst,
523          const union tgsi_exec_channel *src0,
524          const union tgsi_exec_channel *src1,
525          const union tgsi_exec_channel *src2)
526{
527   dst->f[0] = src0->f[0] * src1->f[0] + src2->f[0];
528   dst->f[1] = src0->f[1] * src1->f[1] + src2->f[1];
529   dst->f[2] = src0->f[2] * src1->f[2] + src2->f[2];
530   dst->f[3] = src0->f[3] * src1->f[3] + src2->f[3];
531}
532
533static void
534micro_mov(union tgsi_exec_channel *dst,
535          const union tgsi_exec_channel *src)
536{
537   dst->u[0] = src->u[0];
538   dst->u[1] = src->u[1];
539   dst->u[2] = src->u[2];
540   dst->u[3] = src->u[3];
541}
542
543static void
544micro_rcp(union tgsi_exec_channel *dst,
545          const union tgsi_exec_channel *src)
546{
547#if 0 /* for debugging */
548   assert(src->f[0] != 0.0f);
549   assert(src->f[1] != 0.0f);
550   assert(src->f[2] != 0.0f);
551   assert(src->f[3] != 0.0f);
552#endif
553   dst->f[0] = 1.0f / src->f[0];
554   dst->f[1] = 1.0f / src->f[1];
555   dst->f[2] = 1.0f / src->f[2];
556   dst->f[3] = 1.0f / src->f[3];
557}
558
559static void
560micro_rnd(union tgsi_exec_channel *dst,
561          const union tgsi_exec_channel *src)
562{
563   dst->f[0] = _mesa_roundevenf(src->f[0]);
564   dst->f[1] = _mesa_roundevenf(src->f[1]);
565   dst->f[2] = _mesa_roundevenf(src->f[2]);
566   dst->f[3] = _mesa_roundevenf(src->f[3]);
567}
568
569static void
570micro_rsq(union tgsi_exec_channel *dst,
571          const union tgsi_exec_channel *src)
572{
573#if 0 /* for debugging */
574   assert(src->f[0] != 0.0f);
575   assert(src->f[1] != 0.0f);
576   assert(src->f[2] != 0.0f);
577   assert(src->f[3] != 0.0f);
578#endif
579   dst->f[0] = 1.0f / sqrtf(src->f[0]);
580   dst->f[1] = 1.0f / sqrtf(src->f[1]);
581   dst->f[2] = 1.0f / sqrtf(src->f[2]);
582   dst->f[3] = 1.0f / sqrtf(src->f[3]);
583}
584
585static void
586micro_sqrt(union tgsi_exec_channel *dst,
587           const union tgsi_exec_channel *src)
588{
589   dst->f[0] = sqrtf(src->f[0]);
590   dst->f[1] = sqrtf(src->f[1]);
591   dst->f[2] = sqrtf(src->f[2]);
592   dst->f[3] = sqrtf(src->f[3]);
593}
594
595static void
596micro_seq(union tgsi_exec_channel *dst,
597          const union tgsi_exec_channel *src0,
598          const union tgsi_exec_channel *src1)
599{
600   dst->f[0] = src0->f[0] == src1->f[0] ? 1.0f : 0.0f;
601   dst->f[1] = src0->f[1] == src1->f[1] ? 1.0f : 0.0f;
602   dst->f[2] = src0->f[2] == src1->f[2] ? 1.0f : 0.0f;
603   dst->f[3] = src0->f[3] == src1->f[3] ? 1.0f : 0.0f;
604}
605
606static void
607micro_sge(union tgsi_exec_channel *dst,
608          const union tgsi_exec_channel *src0,
609          const union tgsi_exec_channel *src1)
610{
611   dst->f[0] = src0->f[0] >= src1->f[0] ? 1.0f : 0.0f;
612   dst->f[1] = src0->f[1] >= src1->f[1] ? 1.0f : 0.0f;
613   dst->f[2] = src0->f[2] >= src1->f[2] ? 1.0f : 0.0f;
614   dst->f[3] = src0->f[3] >= src1->f[3] ? 1.0f : 0.0f;
615}
616
617static void
618micro_sgn(union tgsi_exec_channel *dst,
619          const union tgsi_exec_channel *src)
620{
621   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
622   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
623   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
624   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
625}
626
627static void
628micro_isgn(union tgsi_exec_channel *dst,
629          const union tgsi_exec_channel *src)
630{
631   dst->i[0] = src->i[0] < 0 ? -1 : src->i[0] > 0 ? 1 : 0;
632   dst->i[1] = src->i[1] < 0 ? -1 : src->i[1] > 0 ? 1 : 0;
633   dst->i[2] = src->i[2] < 0 ? -1 : src->i[2] > 0 ? 1 : 0;
634   dst->i[3] = src->i[3] < 0 ? -1 : src->i[3] > 0 ? 1 : 0;
635}
636
637static void
638micro_sgt(union tgsi_exec_channel *dst,
639          const union tgsi_exec_channel *src0,
640          const union tgsi_exec_channel *src1)
641{
642   dst->f[0] = src0->f[0] > src1->f[0] ? 1.0f : 0.0f;
643   dst->f[1] = src0->f[1] > src1->f[1] ? 1.0f : 0.0f;
644   dst->f[2] = src0->f[2] > src1->f[2] ? 1.0f : 0.0f;
645   dst->f[3] = src0->f[3] > src1->f[3] ? 1.0f : 0.0f;
646}
647
648static void
649micro_sin(union tgsi_exec_channel *dst,
650          const union tgsi_exec_channel *src)
651{
652   dst->f[0] = sinf(src->f[0]);
653   dst->f[1] = sinf(src->f[1]);
654   dst->f[2] = sinf(src->f[2]);
655   dst->f[3] = sinf(src->f[3]);
656}
657
658static void
659micro_sle(union tgsi_exec_channel *dst,
660          const union tgsi_exec_channel *src0,
661          const union tgsi_exec_channel *src1)
662{
663   dst->f[0] = src0->f[0] <= src1->f[0] ? 1.0f : 0.0f;
664   dst->f[1] = src0->f[1] <= src1->f[1] ? 1.0f : 0.0f;
665   dst->f[2] = src0->f[2] <= src1->f[2] ? 1.0f : 0.0f;
666   dst->f[3] = src0->f[3] <= src1->f[3] ? 1.0f : 0.0f;
667}
668
669static void
670micro_slt(union tgsi_exec_channel *dst,
671          const union tgsi_exec_channel *src0,
672          const union tgsi_exec_channel *src1)
673{
674   dst->f[0] = src0->f[0] < src1->f[0] ? 1.0f : 0.0f;
675   dst->f[1] = src0->f[1] < src1->f[1] ? 1.0f : 0.0f;
676   dst->f[2] = src0->f[2] < src1->f[2] ? 1.0f : 0.0f;
677   dst->f[3] = src0->f[3] < src1->f[3] ? 1.0f : 0.0f;
678}
679
680static void
681micro_sne(union tgsi_exec_channel *dst,
682          const union tgsi_exec_channel *src0,
683          const union tgsi_exec_channel *src1)
684{
685   dst->f[0] = src0->f[0] != src1->f[0] ? 1.0f : 0.0f;
686   dst->f[1] = src0->f[1] != src1->f[1] ? 1.0f : 0.0f;
687   dst->f[2] = src0->f[2] != src1->f[2] ? 1.0f : 0.0f;
688   dst->f[3] = src0->f[3] != src1->f[3] ? 1.0f : 0.0f;
689}
690
691static void
692micro_trunc(union tgsi_exec_channel *dst,
693            const union tgsi_exec_channel *src)
694{
695   dst->f[0] = truncf(src->f[0]);
696   dst->f[1] = truncf(src->f[1]);
697   dst->f[2] = truncf(src->f[2]);
698   dst->f[3] = truncf(src->f[3]);
699}
700
701static void
702micro_u2d(union tgsi_double_channel *dst,
703          const union tgsi_exec_channel *src)
704{
705   dst->d[0] = (double)src->u[0];
706   dst->d[1] = (double)src->u[1];
707   dst->d[2] = (double)src->u[2];
708   dst->d[3] = (double)src->u[3];
709}
710
711static void
712micro_i64abs(union tgsi_double_channel *dst,
713             const union tgsi_double_channel *src)
714{
715   dst->i64[0] = src->i64[0] >= 0.0 ? src->i64[0] : -src->i64[0];
716   dst->i64[1] = src->i64[1] >= 0.0 ? src->i64[1] : -src->i64[1];
717   dst->i64[2] = src->i64[2] >= 0.0 ? src->i64[2] : -src->i64[2];
718   dst->i64[3] = src->i64[3] >= 0.0 ? src->i64[3] : -src->i64[3];
719}
720
721static void
722micro_i64sgn(union tgsi_double_channel *dst,
723             const union tgsi_double_channel *src)
724{
725   dst->i64[0] = src->i64[0] < 0 ? -1 : src->i64[0] > 0 ? 1 : 0;
726   dst->i64[1] = src->i64[1] < 0 ? -1 : src->i64[1] > 0 ? 1 : 0;
727   dst->i64[2] = src->i64[2] < 0 ? -1 : src->i64[2] > 0 ? 1 : 0;
728   dst->i64[3] = src->i64[3] < 0 ? -1 : src->i64[3] > 0 ? 1 : 0;
729}
730
731static void
732micro_i64neg(union tgsi_double_channel *dst,
733             const union tgsi_double_channel *src)
734{
735   dst->i64[0] = -src->i64[0];
736   dst->i64[1] = -src->i64[1];
737   dst->i64[2] = -src->i64[2];
738   dst->i64[3] = -src->i64[3];
739}
740
741static void
742micro_u64seq(union tgsi_double_channel *dst,
743           const union tgsi_double_channel *src)
744{
745   dst->u[0][0] = src[0].u64[0] == src[1].u64[0] ? ~0U : 0U;
746   dst->u[1][0] = src[0].u64[1] == src[1].u64[1] ? ~0U : 0U;
747   dst->u[2][0] = src[0].u64[2] == src[1].u64[2] ? ~0U : 0U;
748   dst->u[3][0] = src[0].u64[3] == src[1].u64[3] ? ~0U : 0U;
749}
750
751static void
752micro_u64sne(union tgsi_double_channel *dst,
753             const union tgsi_double_channel *src)
754{
755   dst->u[0][0] = src[0].u64[0] != src[1].u64[0] ? ~0U : 0U;
756   dst->u[1][0] = src[0].u64[1] != src[1].u64[1] ? ~0U : 0U;
757   dst->u[2][0] = src[0].u64[2] != src[1].u64[2] ? ~0U : 0U;
758   dst->u[3][0] = src[0].u64[3] != src[1].u64[3] ? ~0U : 0U;
759}
760
761static void
762micro_i64slt(union tgsi_double_channel *dst,
763             const union tgsi_double_channel *src)
764{
765   dst->u[0][0] = src[0].i64[0] < src[1].i64[0] ? ~0U : 0U;
766   dst->u[1][0] = src[0].i64[1] < src[1].i64[1] ? ~0U : 0U;
767   dst->u[2][0] = src[0].i64[2] < src[1].i64[2] ? ~0U : 0U;
768   dst->u[3][0] = src[0].i64[3] < src[1].i64[3] ? ~0U : 0U;
769}
770
771static void
772micro_u64slt(union tgsi_double_channel *dst,
773             const union tgsi_double_channel *src)
774{
775   dst->u[0][0] = src[0].u64[0] < src[1].u64[0] ? ~0U : 0U;
776   dst->u[1][0] = src[0].u64[1] < src[1].u64[1] ? ~0U : 0U;
777   dst->u[2][0] = src[0].u64[2] < src[1].u64[2] ? ~0U : 0U;
778   dst->u[3][0] = src[0].u64[3] < src[1].u64[3] ? ~0U : 0U;
779}
780
781static void
782micro_i64sge(union tgsi_double_channel *dst,
783           const union tgsi_double_channel *src)
784{
785   dst->u[0][0] = src[0].i64[0] >= src[1].i64[0] ? ~0U : 0U;
786   dst->u[1][0] = src[0].i64[1] >= src[1].i64[1] ? ~0U : 0U;
787   dst->u[2][0] = src[0].i64[2] >= src[1].i64[2] ? ~0U : 0U;
788   dst->u[3][0] = src[0].i64[3] >= src[1].i64[3] ? ~0U : 0U;
789}
790
791static void
792micro_u64sge(union tgsi_double_channel *dst,
793             const union tgsi_double_channel *src)
794{
795   dst->u[0][0] = src[0].u64[0] >= src[1].u64[0] ? ~0U : 0U;
796   dst->u[1][0] = src[0].u64[1] >= src[1].u64[1] ? ~0U : 0U;
797   dst->u[2][0] = src[0].u64[2] >= src[1].u64[2] ? ~0U : 0U;
798   dst->u[3][0] = src[0].u64[3] >= src[1].u64[3] ? ~0U : 0U;
799}
800
801static void
802micro_u64max(union tgsi_double_channel *dst,
803             const union tgsi_double_channel *src)
804{
805   dst->u64[0] = src[0].u64[0] > src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
806   dst->u64[1] = src[0].u64[1] > src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
807   dst->u64[2] = src[0].u64[2] > src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
808   dst->u64[3] = src[0].u64[3] > src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
809}
810
811static void
812micro_i64max(union tgsi_double_channel *dst,
813             const union tgsi_double_channel *src)
814{
815   dst->i64[0] = src[0].i64[0] > src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
816   dst->i64[1] = src[0].i64[1] > src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
817   dst->i64[2] = src[0].i64[2] > src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
818   dst->i64[3] = src[0].i64[3] > src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
819}
820
821static void
822micro_u64min(union tgsi_double_channel *dst,
823             const union tgsi_double_channel *src)
824{
825   dst->u64[0] = src[0].u64[0] < src[1].u64[0] ? src[0].u64[0] : src[1].u64[0];
826   dst->u64[1] = src[0].u64[1] < src[1].u64[1] ? src[0].u64[1] : src[1].u64[1];
827   dst->u64[2] = src[0].u64[2] < src[1].u64[2] ? src[0].u64[2] : src[1].u64[2];
828   dst->u64[3] = src[0].u64[3] < src[1].u64[3] ? src[0].u64[3] : src[1].u64[3];
829}
830
831static void
832micro_i64min(union tgsi_double_channel *dst,
833             const union tgsi_double_channel *src)
834{
835   dst->i64[0] = src[0].i64[0] < src[1].i64[0] ? src[0].i64[0] : src[1].i64[0];
836   dst->i64[1] = src[0].i64[1] < src[1].i64[1] ? src[0].i64[1] : src[1].i64[1];
837   dst->i64[2] = src[0].i64[2] < src[1].i64[2] ? src[0].i64[2] : src[1].i64[2];
838   dst->i64[3] = src[0].i64[3] < src[1].i64[3] ? src[0].i64[3] : src[1].i64[3];
839}
840
841static void
842micro_u64add(union tgsi_double_channel *dst,
843             const union tgsi_double_channel *src)
844{
845   dst->u64[0] = src[0].u64[0] + src[1].u64[0];
846   dst->u64[1] = src[0].u64[1] + src[1].u64[1];
847   dst->u64[2] = src[0].u64[2] + src[1].u64[2];
848   dst->u64[3] = src[0].u64[3] + src[1].u64[3];
849}
850
851static void
852micro_u64mul(union tgsi_double_channel *dst,
853             const union tgsi_double_channel *src)
854{
855   dst->u64[0] = src[0].u64[0] * src[1].u64[0];
856   dst->u64[1] = src[0].u64[1] * src[1].u64[1];
857   dst->u64[2] = src[0].u64[2] * src[1].u64[2];
858   dst->u64[3] = src[0].u64[3] * src[1].u64[3];
859}
860
861static void
862micro_u64div(union tgsi_double_channel *dst,
863             const union tgsi_double_channel *src)
864{
865   dst->u64[0] = src[1].u64[0] ? src[0].u64[0] / src[1].u64[0] : ~0ull;
866   dst->u64[1] = src[1].u64[1] ? src[0].u64[1] / src[1].u64[1] : ~0ull;
867   dst->u64[2] = src[1].u64[2] ? src[0].u64[2] / src[1].u64[2] : ~0ull;
868   dst->u64[3] = src[1].u64[3] ? src[0].u64[3] / src[1].u64[3] : ~0ull;
869}
870
871static void
872micro_i64div(union tgsi_double_channel *dst,
873             const union tgsi_double_channel *src)
874{
875   dst->i64[0] = src[1].i64[0] ? src[0].i64[0] / src[1].i64[0] : 0;
876   dst->i64[1] = src[1].i64[1] ? src[0].i64[1] / src[1].i64[1] : 0;
877   dst->i64[2] = src[1].i64[2] ? src[0].i64[2] / src[1].i64[2] : 0;
878   dst->i64[3] = src[1].i64[3] ? src[0].i64[3] / src[1].i64[3] : 0;
879}
880
881static void
882micro_u64mod(union tgsi_double_channel *dst,
883             const union tgsi_double_channel *src)
884{
885   dst->u64[0] = src[1].u64[0] ? src[0].u64[0] % src[1].u64[0] : ~0ull;
886   dst->u64[1] = src[1].u64[1] ? src[0].u64[1] % src[1].u64[1] : ~0ull;
887   dst->u64[2] = src[1].u64[2] ? src[0].u64[2] % src[1].u64[2] : ~0ull;
888   dst->u64[3] = src[1].u64[3] ? src[0].u64[3] % src[1].u64[3] : ~0ull;
889}
890
891static void
892micro_i64mod(union tgsi_double_channel *dst,
893             const union tgsi_double_channel *src)
894{
895   dst->i64[0] = src[1].i64[0] ? src[0].i64[0] % src[1].i64[0] : ~0ll;
896   dst->i64[1] = src[1].i64[1] ? src[0].i64[1] % src[1].i64[1] : ~0ll;
897   dst->i64[2] = src[1].i64[2] ? src[0].i64[2] % src[1].i64[2] : ~0ll;
898   dst->i64[3] = src[1].i64[3] ? src[0].i64[3] % src[1].i64[3] : ~0ll;
899}
900
901static void
902micro_u64shl(union tgsi_double_channel *dst,
903             const union tgsi_double_channel *src0,
904             union tgsi_exec_channel *src1)
905{
906   unsigned masked_count;
907   masked_count = src1->u[0] & 0x3f;
908   dst->u64[0] = src0->u64[0] << masked_count;
909   masked_count = src1->u[1] & 0x3f;
910   dst->u64[1] = src0->u64[1] << masked_count;
911   masked_count = src1->u[2] & 0x3f;
912   dst->u64[2] = src0->u64[2] << masked_count;
913   masked_count = src1->u[3] & 0x3f;
914   dst->u64[3] = src0->u64[3] << masked_count;
915}
916
917static void
918micro_i64shr(union tgsi_double_channel *dst,
919             const union tgsi_double_channel *src0,
920             union tgsi_exec_channel *src1)
921{
922   unsigned masked_count;
923   masked_count = src1->u[0] & 0x3f;
924   dst->i64[0] = src0->i64[0] >> masked_count;
925   masked_count = src1->u[1] & 0x3f;
926   dst->i64[1] = src0->i64[1] >> masked_count;
927   masked_count = src1->u[2] & 0x3f;
928   dst->i64[2] = src0->i64[2] >> masked_count;
929   masked_count = src1->u[3] & 0x3f;
930   dst->i64[3] = src0->i64[3] >> masked_count;
931}
932
933static void
934micro_u64shr(union tgsi_double_channel *dst,
935             const union tgsi_double_channel *src0,
936             union tgsi_exec_channel *src1)
937{
938   unsigned masked_count;
939   masked_count = src1->u[0] & 0x3f;
940   dst->u64[0] = src0->u64[0] >> masked_count;
941   masked_count = src1->u[1] & 0x3f;
942   dst->u64[1] = src0->u64[1] >> masked_count;
943   masked_count = src1->u[2] & 0x3f;
944   dst->u64[2] = src0->u64[2] >> masked_count;
945   masked_count = src1->u[3] & 0x3f;
946   dst->u64[3] = src0->u64[3] >> masked_count;
947}
948
/** Tags naming the data types handled by the executor. */
enum tgsi_exec_datatype {
   TGSI_EXEC_DATA_FLOAT  = 0,
   TGSI_EXEC_DATA_INT    = 1,
   TGSI_EXEC_DATA_UINT   = 2,
   TGSI_EXEC_DATA_DOUBLE = 3,
   TGSI_EXEC_DATA_INT64  = 4,
   TGSI_EXEC_DATA_UINT64 = 5,
};
957
/**
 * Recompute the execution mask of MACH.
 *
 * ExecMask becomes the bitwise AND of the conditional, loop, continue,
 * switch and function masks.  The argument and the whole expansion are
 * parenthesized so that any pointer expression can be passed and the
 * macro can be used inside a larger expression.  MACH is evaluated
 * several times, so it must not have side effects.
 */
#define UPDATE_EXEC_MASK(MACH) \
   ((MACH)->ExecMask = (MACH)->CondMask & (MACH)->LoopMask & \
                       (MACH)->ContMask & (MACH)->Switch.mask & \
                       (MACH)->FuncMask)
961
962
963static const union tgsi_exec_channel ZeroVec =
964   { { 0.0, 0.0, 0.0, 0.0 } };
965
966static const union tgsi_exec_channel OneVec = {
967   {1.0f, 1.0f, 1.0f, 1.0f}
968};
969
970static const union tgsi_exec_channel P128Vec = {
971   {128.0f, 128.0f, 128.0f, 128.0f}
972};
973
974static const union tgsi_exec_channel M128Vec = {
975   {-128.0f, -128.0f, -128.0f, -128.0f}
976};
977
978
979/**
980 * Assert that none of the float values in 'chan' are infinite or NaN.
981 * NaN and Inf may occur normally during program execution and should
982 * not lead to crashes, etc.  But when debugging, it's helpful to catch
983 * them.
984 */
985static inline void
986check_inf_or_nan(const union tgsi_exec_channel *chan)
987{
988   assert(!util_is_inf_or_nan((chan)->f[0]));
989   assert(!util_is_inf_or_nan((chan)->f[1]));
990   assert(!util_is_inf_or_nan((chan)->f[2]));
991   assert(!util_is_inf_or_nan((chan)->f[3]));
992}
993
994
#ifdef DEBUG
/** Debug helper: print the four float values of 'chan', labelled 'msg'. */
static void
print_chan(const char *msg, const union tgsi_exec_channel *chan)
{
   const float *v = chan->f;

   debug_printf("%s = {%f, %f, %f, %f}\n", msg, v[0], v[1], v[2], v[3]);
}
#endif
1003
1004
#ifdef DEBUG
/**
 * Debug helper: print temporary register 'index', one line per
 * component (X, Y, Z, W), each showing its four float values.
 */
static void
print_temp(const struct tgsi_exec_machine *mach, uint index)
{
   static const char comp_names[] = "XYZW";
   const struct tgsi_exec_vector *vec = &mach->Temps[index];
   int comp;

   debug_printf("Temp[%u] =\n", index);
   for (comp = 0; comp < 4; comp++) {
      const union tgsi_exec_channel *ch = &vec->xyzw[comp];

      debug_printf("  %c: { %f, %f, %f, %f }\n",
                   comp_names[comp],
                   ch->f[0], ch->f[1], ch->f[2], ch->f[3]);
   }
}
#endif
1022
1023
1024void
1025tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
1026                               unsigned num_bufs,
1027                               const void **bufs,
1028                               const unsigned *buf_sizes)
1029{
1030   unsigned i;
1031
1032   for (i = 0; i < num_bufs; i++) {
1033      mach->Consts[i] = bufs[i];
1034      mach->ConstsSize[i] = buf_sizes[i];
1035   }
1036}
1037
1038/**
1039 * Initialize machine state by expanding tokens to full instructions,
1040 * allocating temporary storage, setting up constants, etc.
1041 * After this, we can call tgsi_exec_machine_run() many times.
1042 */
1043void
1044tgsi_exec_machine_bind_shader(
1045   struct tgsi_exec_machine *mach,
1046   const struct tgsi_token *tokens,
1047   struct tgsi_sampler *sampler,
1048   struct tgsi_image *image,
1049   struct tgsi_buffer *buffer)
1050{
1051   uint k;
1052   struct tgsi_parse_context parse;
1053   struct tgsi_full_instruction *instructions;
1054   struct tgsi_full_declaration *declarations;
1055   uint maxInstructions = 10, numInstructions = 0;
1056   uint maxDeclarations = 10, numDeclarations = 0;
1057
1058#if 0
1059   tgsi_dump(tokens, 0);
1060#endif
1061
1062   mach->Tokens = tokens;
1063   mach->Sampler = sampler;
1064   mach->Image = image;
1065   mach->Buffer = buffer;
1066
1067   if (!tokens) {
1068      /* unbind and free all */
1069      FREE(mach->Declarations);
1070      mach->Declarations = NULL;
1071      mach->NumDeclarations = 0;
1072
1073      FREE(mach->Instructions);
1074      mach->Instructions = NULL;
1075      mach->NumInstructions = 0;
1076
1077      return;
1078   }
1079
1080   k = tgsi_parse_init (&parse, mach->Tokens);
1081   if (k != TGSI_PARSE_OK) {
1082      debug_printf( "Problem parsing!\n" );
1083      return;
1084   }
1085
1086   mach->ImmLimit = 0;
1087   mach->NumOutputs = 0;
1088
1089   for (k = 0; k < TGSI_SEMANTIC_COUNT; k++)
1090      mach->SysSemanticToIndex[k] = -1;
1091
1092   if (mach->ShaderType == PIPE_SHADER_GEOMETRY &&
1093       !mach->UsedGeometryShader) {
1094      struct tgsi_exec_vector *inputs;
1095      struct tgsi_exec_vector *outputs;
1096
1097      inputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1098                            TGSI_MAX_PRIM_VERTICES * PIPE_MAX_SHADER_INPUTS,
1099                            16);
1100
1101      if (!inputs)
1102         return;
1103
1104      outputs = align_malloc(sizeof(struct tgsi_exec_vector) *
1105                             TGSI_MAX_TOTAL_VERTICES, 16);
1106
1107      if (!outputs) {
1108         align_free(inputs);
1109         return;
1110      }
1111
1112      align_free(mach->Inputs);
1113      align_free(mach->Outputs);
1114
1115      mach->Inputs = inputs;
1116      mach->Outputs = outputs;
1117      mach->UsedGeometryShader = TRUE;
1118   }
1119
1120   declarations = (struct tgsi_full_declaration *)
1121      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
1122
1123   if (!declarations) {
1124      return;
1125   }
1126
1127   instructions = (struct tgsi_full_instruction *)
1128      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
1129
1130   if (!instructions) {
1131      FREE( declarations );
1132      return;
1133   }
1134
1135   while( !tgsi_parse_end_of_tokens( &parse ) ) {
1136      uint i;
1137
1138      tgsi_parse_token( &parse );
1139      switch( parse.FullToken.Token.Type ) {
1140      case TGSI_TOKEN_TYPE_DECLARATION:
1141         /* save expanded declaration */
1142         if (numDeclarations == maxDeclarations) {
1143            declarations = REALLOC(declarations,
1144                                   maxDeclarations
1145                                   * sizeof(struct tgsi_full_declaration),
1146                                   (maxDeclarations + 10)
1147                                   * sizeof(struct tgsi_full_declaration));
1148            maxDeclarations += 10;
1149         }
1150         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT)
1151            mach->NumOutputs = MAX2(mach->NumOutputs, parse.FullToken.FullDeclaration.Range.Last + 1);
1152         else if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1153            const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration;
1154            mach->SysSemanticToIndex[decl->Semantic.Name] = decl->Range.First;
1155         }
1156
1157         memcpy(declarations + numDeclarations,
1158                &parse.FullToken.FullDeclaration,
1159                sizeof(declarations[0]));
1160         numDeclarations++;
1161         break;
1162
1163      case TGSI_TOKEN_TYPE_IMMEDIATE:
1164         {
1165            uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
1166            assert( size <= 4 );
1167            if (mach->ImmLimit >= mach->ImmsReserved) {
1168               unsigned newReserved = mach->ImmsReserved ? 2 * mach->ImmsReserved : 128;
1169               float4 *imms = REALLOC(mach->Imms, mach->ImmsReserved, newReserved * sizeof(float4));
1170               if (imms) {
1171                  mach->ImmsReserved = newReserved;
1172                  mach->Imms = imms;
1173               } else {
1174                  debug_printf("Unable to (re)allocate space for immidiate constants\n");
1175                  break;
1176               }
1177            }
1178
1179            for( i = 0; i < size; i++ ) {
1180               mach->Imms[mach->ImmLimit][i] =
1181		  parse.FullToken.FullImmediate.u[i].Float;
1182            }
1183            mach->ImmLimit += 1;
1184         }
1185         break;
1186
1187      case TGSI_TOKEN_TYPE_INSTRUCTION:
1188
1189         /* save expanded instruction */
1190         if (numInstructions == maxInstructions) {
1191            instructions = REALLOC(instructions,
1192                                   maxInstructions
1193                                   * sizeof(struct tgsi_full_instruction),
1194                                   (maxInstructions + 10)
1195                                   * sizeof(struct tgsi_full_instruction));
1196            maxInstructions += 10;
1197         }
1198
1199         memcpy(instructions + numInstructions,
1200                &parse.FullToken.FullInstruction,
1201                sizeof(instructions[0]));
1202
1203         numInstructions++;
1204         break;
1205
1206      case TGSI_TOKEN_TYPE_PROPERTY:
1207         if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
1208            if (parse.FullToken.FullProperty.Property.PropertyName == TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) {
1209               mach->MaxOutputVertices = parse.FullToken.FullProperty.u[0].Data;
1210            }
1211         }
1212         break;
1213
1214      default:
1215         assert( 0 );
1216      }
1217   }
1218   tgsi_parse_free (&parse);
1219
1220   FREE(mach->Declarations);
1221   mach->Declarations = declarations;
1222   mach->NumDeclarations = numDeclarations;
1223
1224   FREE(mach->Instructions);
1225   mach->Instructions = instructions;
1226   mach->NumInstructions = numInstructions;
1227}
1228
1229
1230struct tgsi_exec_machine *
1231tgsi_exec_machine_create(enum pipe_shader_type shader_type)
1232{
1233   struct tgsi_exec_machine *mach;
1234
1235   mach = align_malloc( sizeof *mach, 16 );
1236   if (!mach)
1237      goto fail;
1238
1239   memset(mach, 0, sizeof(*mach));
1240
1241   mach->ShaderType = shader_type;
1242
1243   if (shader_type != PIPE_SHADER_COMPUTE) {
1244      mach->Inputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_INPUTS, 16);
1245      mach->Outputs = align_malloc(sizeof(struct tgsi_exec_vector) * PIPE_MAX_SHADER_OUTPUTS, 16);
1246      if (!mach->Inputs || !mach->Outputs)
1247         goto fail;
1248   }
1249
1250   if (shader_type == PIPE_SHADER_FRAGMENT) {
1251      mach->InputSampleOffsetApply = align_malloc(sizeof(apply_sample_offset_func) * PIPE_MAX_SHADER_INPUTS, 16);
1252      if (!mach->InputSampleOffsetApply)
1253         goto fail;
1254   }
1255
1256#ifdef DEBUG
1257   /* silence warnings */
1258   (void) print_chan;
1259   (void) print_temp;
1260#endif
1261
1262   return mach;
1263
1264fail:
1265   if (mach) {
1266      align_free(mach->InputSampleOffsetApply);
1267      align_free(mach->Inputs);
1268      align_free(mach->Outputs);
1269      align_free(mach);
1270   }
1271   return NULL;
1272}
1273
1274
1275void
1276tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
1277{
1278   if (mach) {
1279      FREE(mach->Instructions);
1280      FREE(mach->Declarations);
1281      FREE(mach->Imms);
1282
1283      align_free(mach->InputSampleOffsetApply);
1284      align_free(mach->Inputs);
1285      align_free(mach->Outputs);
1286
1287      align_free(mach);
1288   }
1289}
1290
1291static void
1292micro_add(union tgsi_exec_channel *dst,
1293          const union tgsi_exec_channel *src0,
1294          const union tgsi_exec_channel *src1)
1295{
1296   dst->f[0] = src0->f[0] + src1->f[0];
1297   dst->f[1] = src0->f[1] + src1->f[1];
1298   dst->f[2] = src0->f[2] + src1->f[2];
1299   dst->f[3] = src0->f[3] + src1->f[3];
1300}
1301
1302static void
1303micro_div(
1304   union tgsi_exec_channel *dst,
1305   const union tgsi_exec_channel *src0,
1306   const union tgsi_exec_channel *src1 )
1307{
1308   if (src1->f[0] != 0) {
1309      dst->f[0] = src0->f[0] / src1->f[0];
1310   }
1311   if (src1->f[1] != 0) {
1312      dst->f[1] = src0->f[1] / src1->f[1];
1313   }
1314   if (src1->f[2] != 0) {
1315      dst->f[2] = src0->f[2] / src1->f[2];
1316   }
1317   if (src1->f[3] != 0) {
1318      dst->f[3] = src0->f[3] / src1->f[3];
1319   }
1320}
1321
1322static void
1323micro_lt(
1324   union tgsi_exec_channel *dst,
1325   const union tgsi_exec_channel *src0,
1326   const union tgsi_exec_channel *src1,
1327   const union tgsi_exec_channel *src2,
1328   const union tgsi_exec_channel *src3 )
1329{
1330   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
1331   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
1332   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
1333   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
1334}
1335
1336static void
1337micro_max(union tgsi_exec_channel *dst,
1338          const union tgsi_exec_channel *src0,
1339          const union tgsi_exec_channel *src1)
1340{
1341   dst->f[0] = fmaxf(src0->f[0], src1->f[0]);
1342   dst->f[1] = fmaxf(src0->f[1], src1->f[1]);
1343   dst->f[2] = fmaxf(src0->f[2], src1->f[2]);
1344   dst->f[3] = fmaxf(src0->f[3], src1->f[3]);
1345}
1346
1347static void
1348micro_min(union tgsi_exec_channel *dst,
1349          const union tgsi_exec_channel *src0,
1350          const union tgsi_exec_channel *src1)
1351{
1352   dst->f[0] = fminf(src0->f[0], src1->f[0]);
1353   dst->f[1] = fminf(src0->f[1], src1->f[1]);
1354   dst->f[2] = fminf(src0->f[2], src1->f[2]);
1355   dst->f[3] = fminf(src0->f[3], src1->f[3]);
1356}
1357
1358static void
1359micro_mul(union tgsi_exec_channel *dst,
1360          const union tgsi_exec_channel *src0,
1361          const union tgsi_exec_channel *src1)
1362{
1363   dst->f[0] = src0->f[0] * src1->f[0];
1364   dst->f[1] = src0->f[1] * src1->f[1];
1365   dst->f[2] = src0->f[2] * src1->f[2];
1366   dst->f[3] = src0->f[3] * src1->f[3];
1367}
1368
1369static void
1370micro_neg(
1371   union tgsi_exec_channel *dst,
1372   const union tgsi_exec_channel *src )
1373{
1374   dst->f[0] = -src->f[0];
1375   dst->f[1] = -src->f[1];
1376   dst->f[2] = -src->f[2];
1377   dst->f[3] = -src->f[3];
1378}
1379
1380static void
1381micro_pow(
1382   union tgsi_exec_channel *dst,
1383   const union tgsi_exec_channel *src0,
1384   const union tgsi_exec_channel *src1 )
1385{
1386   dst->f[0] = powf( src0->f[0], src1->f[0] );
1387   dst->f[1] = powf( src0->f[1], src1->f[1] );
1388   dst->f[2] = powf( src0->f[2], src1->f[2] );
1389   dst->f[3] = powf( src0->f[3], src1->f[3] );
1390}
1391
1392static void
1393micro_ldexp(union tgsi_exec_channel *dst,
1394            const union tgsi_exec_channel *src0,
1395            const union tgsi_exec_channel *src1)
1396{
1397   dst->f[0] = ldexpf(src0->f[0], src1->i[0]);
1398   dst->f[1] = ldexpf(src0->f[1], src1->i[1]);
1399   dst->f[2] = ldexpf(src0->f[2], src1->i[2]);
1400   dst->f[3] = ldexpf(src0->f[3], src1->i[3]);
1401}
1402
1403static void
1404micro_sub(union tgsi_exec_channel *dst,
1405          const union tgsi_exec_channel *src0,
1406          const union tgsi_exec_channel *src1)
1407{
1408   dst->f[0] = src0->f[0] - src1->f[0];
1409   dst->f[1] = src0->f[1] - src1->f[1];
1410   dst->f[2] = src0->f[2] - src1->f[2];
1411   dst->f[3] = src0->f[3] - src1->f[3];
1412}
1413
1414static void
1415fetch_src_file_channel(const struct tgsi_exec_machine *mach,
1416                       const uint file,
1417                       const uint swizzle,
1418                       const union tgsi_exec_channel *index,
1419                       const union tgsi_exec_channel *index2D,
1420                       union tgsi_exec_channel *chan)
1421{
1422   uint i;
1423
1424   assert(swizzle < 4);
1425
1426   switch (file) {
1427   case TGSI_FILE_CONSTANT:
1428      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1429         /* NOTE: copying the const value as a uint instead of float */
1430         const uint constbuf = index2D->i[i];
1431         const unsigned pos = index->i[i] * 4 + swizzle;
1432         /* const buffer bounds check */
1433         if (pos >= mach->ConstsSize[constbuf] / 4) {
1434            if (0) {
1435               /* Debug: print warning */
1436               static int count = 0;
1437               if (count++ < 100)
1438                  debug_printf("TGSI Exec: const buffer index %d"
1439                                 " out of bounds\n", pos);
1440            }
1441            chan->u[i] = 0;
1442         } else {
1443            const uint *buf = (const uint *)mach->Consts[constbuf];
1444            chan->u[i] = buf[pos];
1445         }
1446      }
1447      break;
1448
1449   case TGSI_FILE_INPUT:
1450      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1451         /*
1452         if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1453            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
1454                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
1455                         index2D->i[i], index->i[i]);
1456                         }*/
1457         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
1458         assert(pos >= 0);
1459         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
1460         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
1461      }
1462      break;
1463
1464   case TGSI_FILE_SYSTEM_VALUE:
1465      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1466         chan->u[i] = mach->SystemValue[index->i[i]].xyzw[swizzle].u[i];
1467      }
1468      break;
1469
1470   case TGSI_FILE_TEMPORARY:
1471      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1472         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
1473         assert(index2D->i[i] == 0);
1474
1475         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
1476      }
1477      break;
1478
1479   case TGSI_FILE_IMMEDIATE:
1480      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1481         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
1482         assert(index2D->i[i] == 0);
1483
1484         chan->f[i] = mach->Imms[index->i[i]][swizzle];
1485      }
1486      break;
1487
1488   case TGSI_FILE_ADDRESS:
1489      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1490         assert(index->i[i] >= 0 && index->i[i] < ARRAY_SIZE(mach->Addrs));
1491         assert(index2D->i[i] == 0);
1492
1493         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
1494      }
1495      break;
1496
1497   case TGSI_FILE_OUTPUT:
1498      /* vertex/fragment output vars can be read too */
1499      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1500         assert(index->i[i] >= 0);
1501         assert(index2D->i[i] == 0);
1502
1503         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
1504      }
1505      break;
1506
1507   default:
1508      assert(0);
1509      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1510         chan->u[i] = 0;
1511      }
1512   }
1513}
1514
1515static void
1516get_index_registers(const struct tgsi_exec_machine *mach,
1517                    const struct tgsi_full_src_register *reg,
1518                    union tgsi_exec_channel *index,
1519                    union tgsi_exec_channel *index2D)
1520{
1521   uint swizzle;
1522
1523   /* We start with a direct index into a register file.
1524    *
1525    *    file[1],
1526    *    where:
1527    *       file = Register.File
1528    *       [1] = Register.Index
1529    */
1530   index->i[0] =
1531   index->i[1] =
1532   index->i[2] =
1533   index->i[3] = reg->Register.Index;
1534
1535   /* There is an extra source register that indirectly subscripts
1536    * a register file. The direct index now becomes an offset
1537    * that is being added to the indirect register.
1538    *
1539    *    file[ind[2].x+1],
1540    *    where:
1541    *       ind = Indirect.File
1542    *       [2] = Indirect.Index
1543    *       .x = Indirect.SwizzleX
1544    */
1545   if (reg->Register.Indirect) {
1546      union tgsi_exec_channel index2;
1547      union tgsi_exec_channel indir_index;
1548      const uint execmask = mach->ExecMask;
1549      uint i;
1550
1551      /* which address register (always zero now) */
1552      index2.i[0] =
1553      index2.i[1] =
1554      index2.i[2] =
1555      index2.i[3] = reg->Indirect.Index;
1556      /* get current value of address register[swizzle] */
1557      swizzle = reg->Indirect.Swizzle;
1558      fetch_src_file_channel(mach,
1559                             reg->Indirect.File,
1560                             swizzle,
1561                             &index2,
1562                             &ZeroVec,
1563                             &indir_index);
1564
1565      /* add value of address register to the offset */
1566      index->i[0] += indir_index.i[0];
1567      index->i[1] += indir_index.i[1];
1568      index->i[2] += indir_index.i[2];
1569      index->i[3] += indir_index.i[3];
1570
1571      /* for disabled execution channels, zero-out the index to
1572       * avoid using a potential garbage value.
1573       */
1574      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1575         if ((execmask & (1 << i)) == 0)
1576            index->i[i] = 0;
1577      }
1578   }
1579
1580   /* There is an extra source register that is a second
1581    * subscript to a register file. Effectively it means that
1582    * the register file is actually a 2D array of registers.
1583    *
1584    *    file[3][1],
1585    *    where:
1586    *       [3] = Dimension.Index
1587    */
1588   if (reg->Register.Dimension) {
1589      index2D->i[0] =
1590      index2D->i[1] =
1591      index2D->i[2] =
1592      index2D->i[3] = reg->Dimension.Index;
1593
1594      /* Again, the second subscript index can be addressed indirectly
1595       * identically to the first one.
1596       * Nothing stops us from indirectly addressing the indirect register,
1597       * but there is no need for that, so we won't exercise it.
1598       *
1599       *    file[ind[4].y+3][1],
1600       *    where:
1601       *       ind = DimIndirect.File
1602       *       [4] = DimIndirect.Index
1603       *       .y = DimIndirect.SwizzleX
1604       */
1605      if (reg->Dimension.Indirect) {
1606         union tgsi_exec_channel index2;
1607         union tgsi_exec_channel indir_index;
1608         const uint execmask = mach->ExecMask;
1609         uint i;
1610
1611         index2.i[0] =
1612         index2.i[1] =
1613         index2.i[2] =
1614         index2.i[3] = reg->DimIndirect.Index;
1615
1616         swizzle = reg->DimIndirect.Swizzle;
1617         fetch_src_file_channel(mach,
1618                                reg->DimIndirect.File,
1619                                swizzle,
1620                                &index2,
1621                                &ZeroVec,
1622                                &indir_index);
1623
1624         index2D->i[0] += indir_index.i[0];
1625         index2D->i[1] += indir_index.i[1];
1626         index2D->i[2] += indir_index.i[2];
1627         index2D->i[3] += indir_index.i[3];
1628
1629         /* for disabled execution channels, zero-out the index to
1630          * avoid using a potential garbage value.
1631          */
1632         for (i = 0; i < TGSI_QUAD_SIZE; i++) {
1633            if ((execmask & (1 << i)) == 0) {
1634               index2D->i[i] = 0;
1635            }
1636         }
1637      }
1638
1639      /* If by any chance there was a need for a 3D array of register
1640       * files, we would have to check whether Dimension is followed
1641       * by a dimension register and continue the saga.
1642       */
1643   } else {
1644      index2D->i[0] =
1645      index2D->i[1] =
1646      index2D->i[2] =
1647      index2D->i[3] = 0;
1648   }
1649}
1650
1651
1652static void
1653fetch_source_d(const struct tgsi_exec_machine *mach,
1654               union tgsi_exec_channel *chan,
1655               const struct tgsi_full_src_register *reg,
1656	       const uint chan_index)
1657{
1658   union tgsi_exec_channel index;
1659   union tgsi_exec_channel index2D;
1660   uint swizzle;
1661
1662   get_index_registers(mach, reg, &index, &index2D);
1663
1664
1665   swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
1666   fetch_src_file_channel(mach,
1667                          reg->Register.File,
1668                          swizzle,
1669                          &index,
1670                          &index2D,
1671                          chan);
1672}
1673
1674static void
1675fetch_source(const struct tgsi_exec_machine *mach,
1676             union tgsi_exec_channel *chan,
1677             const struct tgsi_full_src_register *reg,
1678             const uint chan_index,
1679             enum tgsi_exec_datatype src_datatype)
1680{
1681   fetch_source_d(mach, chan, reg, chan_index);
1682
1683   if (reg->Register.Absolute) {
1684      assert(src_datatype == TGSI_EXEC_DATA_FLOAT);
1685      micro_abs(chan, chan);
1686   }
1687
1688   if (reg->Register.Negate) {
1689      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
1690         micro_neg(chan, chan);
1691      } else {
1692         micro_ineg(chan, chan);
1693      }
1694   }
1695}
1696
1697static union tgsi_exec_channel *
1698store_dest_dstret(struct tgsi_exec_machine *mach,
1699                 const union tgsi_exec_channel *chan,
1700                 const struct tgsi_full_dst_register *reg,
1701                 uint chan_index)
1702{
1703   static union tgsi_exec_channel null;
1704   union tgsi_exec_channel *dst;
1705   int offset = 0;  /* indirection offset */
1706   int index;
1707
1708
1709   /* There is an extra source register that indirectly subscripts
1710    * a register file. The direct index now becomes an offset
1711    * that is being added to the indirect register.
1712    *
1713    *    file[ind[2].x+1],
1714    *    where:
1715    *       ind = Indirect.File
1716    *       [2] = Indirect.Index
1717    *       .x = Indirect.SwizzleX
1718    */
1719   if (reg->Register.Indirect) {
1720      union tgsi_exec_channel index;
1721      union tgsi_exec_channel indir_index;
1722      uint swizzle;
1723
1724      /* which address register (always zero for now) */
1725      index.i[0] =
1726      index.i[1] =
1727      index.i[2] =
1728      index.i[3] = reg->Indirect.Index;
1729
1730      /* get current value of address register[swizzle] */
1731      swizzle = reg->Indirect.Swizzle;
1732
1733      /* fetch values from the address/indirection register */
1734      fetch_src_file_channel(mach,
1735                             reg->Indirect.File,
1736                             swizzle,
1737                             &index,
1738                             &ZeroVec,
1739                             &indir_index);
1740
1741      /* save indirection offset */
1742      offset = indir_index.i[0];
1743   }
1744
1745   switch (reg->Register.File) {
1746   case TGSI_FILE_NULL:
1747      dst = &null;
1748      break;
1749
1750   case TGSI_FILE_OUTPUT:
1751      index = mach->OutputVertexOffset + reg->Register.Index;
1752      dst = &mach->Outputs[offset + index].xyzw[chan_index];
1753#if 0
1754      debug_printf("NumOutputs = %d, TEMP_O_C/I = %d, redindex = %d\n",
1755                   mach->NumOutputs, mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0],
1756                   reg->Register.Index);
1757      if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1758         debug_printf("STORING OUT[%d] mask(%d), = (", offset + index, execmask);
1759         for (i = 0; i < TGSI_QUAD_SIZE; i++)
1760            if (execmask & (1 << i))
1761               debug_printf("%f, ", chan->f[i]);
1762         debug_printf(")\n");
1763      }
1764#endif
1765      break;
1766
1767   case TGSI_FILE_TEMPORARY:
1768      index = reg->Register.Index;
1769      assert( index < TGSI_EXEC_NUM_TEMPS );
1770      dst = &mach->Temps[offset + index].xyzw[chan_index];
1771      break;
1772
1773   case TGSI_FILE_ADDRESS:
1774      index = reg->Register.Index;
1775      assert(index >= 0 && index < ARRAY_SIZE(mach->Addrs));
1776      dst = &mach->Addrs[index].xyzw[chan_index];
1777      break;
1778
1779   default:
1780      unreachable("Bad destination file");
1781   }
1782
1783   return dst;
1784}
1785
1786static void
1787store_dest_double(struct tgsi_exec_machine *mach,
1788                 const union tgsi_exec_channel *chan,
1789                 const struct tgsi_full_dst_register *reg,
1790                 uint chan_index)
1791{
1792   union tgsi_exec_channel *dst;
1793   const uint execmask = mach->ExecMask;
1794   int i;
1795
1796   dst = store_dest_dstret(mach, chan, reg, chan_index);
1797   if (!dst)
1798      return;
1799
1800   /* doubles path */
1801   for (i = 0; i < TGSI_QUAD_SIZE; i++)
1802      if (execmask & (1 << i))
1803         dst->i[i] = chan->i[i];
1804}
1805
1806static void
1807store_dest(struct tgsi_exec_machine *mach,
1808           const union tgsi_exec_channel *chan,
1809           const struct tgsi_full_dst_register *reg,
1810           const struct tgsi_full_instruction *inst,
1811           uint chan_index)
1812{
1813   union tgsi_exec_channel *dst;
1814   const uint execmask = mach->ExecMask;
1815   int i;
1816
1817   dst = store_dest_dstret(mach, chan, reg, chan_index);
1818   if (!dst)
1819      return;
1820
1821   if (!inst->Instruction.Saturate) {
1822      for (i = 0; i < TGSI_QUAD_SIZE; i++)
1823         if (execmask & (1 << i))
1824            dst->i[i] = chan->i[i];
1825   }
1826   else {
1827      for (i = 0; i < TGSI_QUAD_SIZE; i++)
1828         if (execmask & (1 << i))
1829            dst->f[i] = fminf(fmaxf(chan->f[i], 0.0f), 1.0f);
1830   }
1831}
1832
/**
 * Fetch channel CHAN of source operand INDEX of the current instruction
 * into VAL, treating the source as float (FETCH) or as signed integer
 * (IFETCH).  Both macros expect variables named 'mach' and 'inst' to be
 * in scope at the point of use.  Arguments are parenthesized so that
 * arbitrary expressions can be passed.
 */
#define FETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_FLOAT)

#define IFETCH(VAL,INDEX,CHAN)\
    fetch_source(mach, (VAL), &inst->Src[(INDEX)], (CHAN), TGSI_EXEC_DATA_INT)
1838
1839
1840/**
1841 * Execute ARB-style KIL which is predicated by a src register.
1842 * Kill fragment if any of the four values is less than zero.
1843 */
1844static void
1845exec_kill_if(struct tgsi_exec_machine *mach,
1846             const struct tgsi_full_instruction *inst)
1847{
1848   uint uniquemask;
1849   uint chan_index;
1850   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
1851   union tgsi_exec_channel r[1];
1852
1853   /* This mask stores component bits that were already tested. */
1854   uniquemask = 0;
1855
1856   for (chan_index = 0; chan_index < 4; chan_index++)
1857   {
1858      uint swizzle;
1859      uint i;
1860
1861      /* unswizzle channel */
1862      swizzle = tgsi_util_get_full_src_register_swizzle (
1863                        &inst->Src[0],
1864                        chan_index);
1865
1866      /* check if the component has not been already tested */
1867      if (uniquemask & (1 << swizzle))
1868         continue;
1869      uniquemask |= 1 << swizzle;
1870
1871      FETCH(&r[0], 0, chan_index);
1872      for (i = 0; i < 4; i++)
1873         if (r[0].f[i] < 0.0f)
1874            kilmask |= 1 << i;
1875   }
1876
1877   /* restrict to fragments currently executing */
1878   kilmask &= mach->ExecMask;
1879
1880   mach->KillMask |= kilmask;
1881}
1882
1883/**
1884 * Unconditional fragment kill/discard.
1885 */
1886static void
1887exec_kill(struct tgsi_exec_machine *mach)
1888{
1889   /* kill fragment for all fragments currently executing.
1890    * bit 0 = pixel 0, bit 1 = pixel 1, etc.
1891    */
1892   mach->KillMask |= mach->ExecMask;
1893}
1894
1895static void
1896emit_vertex(struct tgsi_exec_machine *mach,
1897            const struct tgsi_full_instruction *inst)
1898{
1899   union tgsi_exec_channel r[1];
1900   unsigned stream_id;
1901   unsigned prim_count;
1902   /* FIXME: check for exec mask correctly
1903   unsigned i;
1904   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
1905         if ((mach->ExecMask & (1 << i)))
1906   */
1907   IFETCH(&r[0], 0, TGSI_CHAN_X);
1908   stream_id = r[0].u[0];
1909   prim_count = mach->OutputPrimCount[stream_id];
1910   if (mach->ExecMask) {
1911      if (mach->Primitives[stream_id][prim_count] >= mach->MaxOutputVertices)
1912         return;
1913
1914      if (mach->Primitives[stream_id][prim_count] == 0)
1915         mach->PrimitiveOffsets[stream_id][prim_count] = mach->OutputVertexOffset;
1916      mach->OutputVertexOffset += mach->NumOutputs;
1917      mach->Primitives[stream_id][prim_count]++;
1918   }
1919}
1920
1921static void
1922emit_primitive(struct tgsi_exec_machine *mach,
1923               const struct tgsi_full_instruction *inst)
1924{
1925   unsigned *prim_count;
1926   union tgsi_exec_channel r[1];
1927   unsigned stream_id = 0;
1928   /* FIXME: check for exec mask correctly
1929   unsigned i;
1930   for (i = 0; i < TGSI_QUAD_SIZE; ++i) {
1931         if ((mach->ExecMask & (1 << i)))
1932   */
1933   if (inst) {
1934      IFETCH(&r[0], 0, TGSI_CHAN_X);
1935      stream_id = r[0].u[0];
1936   }
1937   prim_count = &mach->OutputPrimCount[stream_id];
1938   if (mach->ExecMask) {
1939      ++(*prim_count);
1940      debug_assert((*prim_count * mach->NumOutputs) < TGSI_MAX_TOTAL_VERTICES);
1941      mach->Primitives[stream_id][*prim_count] = 0;
1942   }
1943}
1944
1945static void
1946conditional_emit_primitive(struct tgsi_exec_machine *mach)
1947{
1948   if (PIPE_SHADER_GEOMETRY == mach->ShaderType) {
1949      int emitted_verts = mach->Primitives[0][mach->OutputPrimCount[0]];
1950      if (emitted_verts) {
1951         emit_primitive(mach, NULL);
1952      }
1953   }
1954}
1955
1956
1957/*
1958 * Fetch four texture samples using STR texture coordinates.
1959 */
1960static void
1961fetch_texel( struct tgsi_sampler *sampler,
1962             const unsigned sview_idx,
1963             const unsigned sampler_idx,
1964             const union tgsi_exec_channel *s,
1965             const union tgsi_exec_channel *t,
1966             const union tgsi_exec_channel *p,
1967             const union tgsi_exec_channel *c0,
1968             const union tgsi_exec_channel *c1,
1969             float derivs[3][2][TGSI_QUAD_SIZE],
1970             const int8_t offset[3],
1971             enum tgsi_sampler_control control,
1972             union tgsi_exec_channel *r,
1973             union tgsi_exec_channel *g,
1974             union tgsi_exec_channel *b,
1975             union tgsi_exec_channel *a )
1976{
1977   uint j;
1978   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
1979
1980   /* FIXME: handle explicit derivs, offsets */
1981   sampler->get_samples(sampler, sview_idx, sampler_idx,
1982                        s->f, t->f, p->f, c0->f, c1->f, derivs, offset, control, rgba);
1983
1984   for (j = 0; j < 4; j++) {
1985      r->f[j] = rgba[0][j];
1986      g->f[j] = rgba[1][j];
1987      b->f[j] = rgba[2][j];
1988      a->f[j] = rgba[3][j];
1989   }
1990}
1991
1992
/*
 * Texture instruction modifiers, passed as the 'modifier' argument of
 * exec_tex() and exec_sample() to select the instruction variant.
 */
#define TEX_MODIFIER_NONE           0  /* plain lookup, TGSI_SAMPLER_LOD_NONE */
#define TEX_MODIFIER_PROJECTED      1  /* coords are divided by the modifier value (exec_tex only) */
#define TEX_MODIFIER_LOD_BIAS       2  /* modifier value is a LOD bias */
#define TEX_MODIFIER_EXPLICIT_LOD   3  /* modifier value is an explicit LOD */
#define TEX_MODIFIER_LEVEL_ZERO     4  /* TGSI_SAMPLER_LOD_ZERO (exec_sample only) */
#define TEX_MODIFIER_GATHER         5  /* TGSI_SAMPLER_GATHER */
1999
2000/*
2001 * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
2002 */
2003static void
2004fetch_texel_offsets(struct tgsi_exec_machine *mach,
2005                    const struct tgsi_full_instruction *inst,
2006                    int8_t offsets[3])
2007{
2008   if (inst->Texture.NumOffsets == 1) {
2009      union tgsi_exec_channel index;
2010      union tgsi_exec_channel offset[3];
2011      index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
2012      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2013                             inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
2014      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2015                             inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
2016      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
2017                             inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
2018     offsets[0] = offset[0].i[0];
2019     offsets[1] = offset[1].i[0];
2020     offsets[2] = offset[2].i[0];
2021   } else {
2022     assert(inst->Texture.NumOffsets == 0);
2023     offsets[0] = offsets[1] = offsets[2] = 0;
2024   }
2025}
2026
2027
2028/*
2029 * Fetch dx and dy values for one channel (s, t or r).
2030 * Put dx values into one float array, dy values into another.
2031 */
2032static void
2033fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
2034                           const struct tgsi_full_instruction *inst,
2035                           unsigned regdsrcx,
2036                           unsigned chan,
2037                           float derivs[2][TGSI_QUAD_SIZE])
2038{
2039   union tgsi_exec_channel d;
2040   FETCH(&d, regdsrcx, chan);
2041   derivs[0][0] = d.f[0];
2042   derivs[0][1] = d.f[1];
2043   derivs[0][2] = d.f[2];
2044   derivs[0][3] = d.f[3];
2045   FETCH(&d, regdsrcx + 1, chan);
2046   derivs[1][0] = d.f[0];
2047   derivs[1][1] = d.f[1];
2048   derivs[1][2] = d.f[2];
2049   derivs[1][3] = d.f[3];
2050}
2051
2052static uint
2053fetch_sampler_unit(struct tgsi_exec_machine *mach,
2054                   const struct tgsi_full_instruction *inst,
2055                   uint sampler)
2056{
2057   uint unit = 0;
2058   int i;
2059   if (inst->Src[sampler].Register.Indirect) {
2060      const struct tgsi_full_src_register *reg = &inst->Src[sampler];
2061      union tgsi_exec_channel indir_index, index2;
2062      const uint execmask = mach->ExecMask;
2063      index2.i[0] =
2064      index2.i[1] =
2065      index2.i[2] =
2066      index2.i[3] = reg->Indirect.Index;
2067
2068      fetch_src_file_channel(mach,
2069                             reg->Indirect.File,
2070                             reg->Indirect.Swizzle,
2071                             &index2,
2072                             &ZeroVec,
2073                             &indir_index);
2074      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2075         if (execmask & (1 << i)) {
2076            unit = inst->Src[sampler].Register.Index + indir_index.i[i];
2077            break;
2078         }
2079      }
2080
2081   } else {
2082      unit = inst->Src[sampler].Register.Index;
2083   }
2084   return unit;
2085}
2086
2087/*
2088 * execute a texture instruction.
2089 *
2090 * modifier is used to control the channel routing for the
2091 * instruction variants like proj, lod, and texture with lod bias.
2092 * sampler indicates which src register the sampler is contained in.
2093 */
2094static void
2095exec_tex(struct tgsi_exec_machine *mach,
2096         const struct tgsi_full_instruction *inst,
2097         uint modifier, uint sampler)
2098{
2099   const union tgsi_exec_channel *args[5], *proj = NULL;
2100   union tgsi_exec_channel r[5];
2101   enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2102   uint chan;
2103   uint unit;
2104   int8_t offsets[3];
2105   int dim, shadow_ref, i;
2106
2107   unit = fetch_sampler_unit(mach, inst, sampler);
2108   /* always fetch all 3 offsets, overkill but keeps code simple */
2109   fetch_texel_offsets(mach, inst, offsets);
2110
2111   assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
2112   assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
2113
2114   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2115   shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
2116
2117   assert(dim <= 4);
2118   if (shadow_ref >= 0)
2119      assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
2120
2121   /* fetch modifier to the last argument */
2122   if (modifier != TEX_MODIFIER_NONE) {
2123      const int last = ARRAY_SIZE(args) - 1;
2124
2125      /* fetch modifier from src0.w or src1.x */
2126      if (sampler == 1) {
2127         assert(dim <= TGSI_CHAN_W && shadow_ref != TGSI_CHAN_W);
2128         FETCH(&r[last], 0, TGSI_CHAN_W);
2129      }
2130      else {
2131         FETCH(&r[last], 1, TGSI_CHAN_X);
2132      }
2133
2134      if (modifier != TEX_MODIFIER_PROJECTED) {
2135         args[last] = &r[last];
2136      }
2137      else {
2138         proj = &r[last];
2139         args[last] = &ZeroVec;
2140      }
2141
2142      /* point unused arguments to zero vector */
2143      for (i = dim; i < last; i++)
2144         args[i] = &ZeroVec;
2145
2146      if (modifier == TEX_MODIFIER_EXPLICIT_LOD)
2147         control = TGSI_SAMPLER_LOD_EXPLICIT;
2148      else if (modifier == TEX_MODIFIER_LOD_BIAS)
2149         control = TGSI_SAMPLER_LOD_BIAS;
2150      else if (modifier == TEX_MODIFIER_GATHER)
2151         control = TGSI_SAMPLER_GATHER;
2152   }
2153   else {
2154      for (i = dim; i < (int)ARRAY_SIZE(args); i++)
2155         args[i] = &ZeroVec;
2156   }
2157
2158   /* fetch coordinates */
2159   for (i = 0; i < dim; i++) {
2160      FETCH(&r[i], 0, TGSI_CHAN_X + i);
2161
2162      if (proj)
2163         micro_div(&r[i], &r[i], proj);
2164
2165      args[i] = &r[i];
2166   }
2167
2168   /* fetch reference value */
2169   if (shadow_ref >= 0) {
2170      FETCH(&r[shadow_ref], shadow_ref / 4, TGSI_CHAN_X + (shadow_ref % 4));
2171
2172      if (proj)
2173         micro_div(&r[shadow_ref], &r[shadow_ref], proj);
2174
2175      args[shadow_ref] = &r[shadow_ref];
2176   }
2177
2178   fetch_texel(mach->Sampler, unit, unit,
2179         args[0], args[1], args[2], args[3], args[4],
2180         NULL, offsets, control,
2181         &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2182
2183#if 0
2184   debug_printf("fetch r: %g %g %g %g\n",
2185         r[0].f[0], r[0].f[1], r[0].f[2], r[0].f[3]);
2186   debug_printf("fetch g: %g %g %g %g\n",
2187         r[1].f[0], r[1].f[1], r[1].f[2], r[1].f[3]);
2188   debug_printf("fetch b: %g %g %g %g\n",
2189         r[2].f[0], r[2].f[1], r[2].f[2], r[2].f[3]);
2190   debug_printf("fetch a: %g %g %g %g\n",
2191         r[3].f[0], r[3].f[1], r[3].f[2], r[3].f[3]);
2192#endif
2193
2194   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2195      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2196         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
2197      }
2198   }
2199}
2200
2201static void
2202exec_lodq(struct tgsi_exec_machine *mach,
2203          const struct tgsi_full_instruction *inst)
2204{
2205   uint resource_unit, sampler_unit;
2206   unsigned dim;
2207   unsigned i;
2208   union tgsi_exec_channel coords[4];
2209   const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
2210   union tgsi_exec_channel r[2];
2211
2212   resource_unit = fetch_sampler_unit(mach, inst, 1);
2213   if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2214      uint target = mach->SamplerViews[resource_unit].Resource;
2215      dim = tgsi_util_get_texture_coord_dim(target);
2216      sampler_unit = fetch_sampler_unit(mach, inst, 2);
2217   } else {
2218      dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
2219      sampler_unit = resource_unit;
2220   }
2221   assert(dim <= ARRAY_SIZE(coords));
2222   /* fetch coordinates */
2223   for (i = 0; i < dim; i++) {
2224      FETCH(&coords[i], 0, TGSI_CHAN_X + i);
2225      args[i] = &coords[i];
2226   }
2227   for (i = dim; i < ARRAY_SIZE(coords); i++) {
2228      args[i] = &ZeroVec;
2229   }
2230   mach->Sampler->query_lod(mach->Sampler, resource_unit, sampler_unit,
2231                            args[0]->f,
2232                            args[1]->f,
2233                            args[2]->f,
2234                            args[3]->f,
2235                            TGSI_SAMPLER_LOD_NONE,
2236                            r[0].f,
2237                            r[1].f);
2238
2239   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2240      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X);
2241   }
2242   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2243      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y);
2244   }
2245   if (inst->Instruction.Opcode == TGSI_OPCODE_LOD) {
2246      unsigned char swizzles[4];
2247      unsigned chan;
2248      swizzles[0] = inst->Src[1].Register.SwizzleX;
2249      swizzles[1] = inst->Src[1].Register.SwizzleY;
2250      swizzles[2] = inst->Src[1].Register.SwizzleZ;
2251      swizzles[3] = inst->Src[1].Register.SwizzleW;
2252
2253      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2254         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2255            if (swizzles[chan] >= 2) {
2256               store_dest(mach, &ZeroVec,
2257                          &inst->Dst[0], inst, chan);
2258            } else {
2259               store_dest(mach, &r[swizzles[chan]],
2260                          &inst->Dst[0], inst, chan);
2261            }
2262         }
2263      }
2264   } else {
2265      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2266         store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X);
2267      }
2268      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2269         store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Y);
2270      }
2271   }
2272}
2273
2274static void
2275exec_txd(struct tgsi_exec_machine *mach,
2276         const struct tgsi_full_instruction *inst)
2277{
2278   union tgsi_exec_channel r[4];
2279   float derivs[3][2][TGSI_QUAD_SIZE];
2280   uint chan;
2281   uint unit;
2282   int8_t offsets[3];
2283
2284   unit = fetch_sampler_unit(mach, inst, 3);
2285   /* always fetch all 3 offsets, overkill but keeps code simple */
2286   fetch_texel_offsets(mach, inst, offsets);
2287
2288   switch (inst->Texture.Texture) {
2289   case TGSI_TEXTURE_1D:
2290      FETCH(&r[0], 0, TGSI_CHAN_X);
2291
2292      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2293
2294      fetch_texel(mach->Sampler, unit, unit,
2295                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2296                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2297                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2298      break;
2299
2300   case TGSI_TEXTURE_SHADOW1D:
2301   case TGSI_TEXTURE_1D_ARRAY:
2302   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2303      /* SHADOW1D/1D_ARRAY would not need Y/Z respectively, but don't bother */
2304      FETCH(&r[0], 0, TGSI_CHAN_X);
2305      FETCH(&r[1], 0, TGSI_CHAN_Y);
2306      FETCH(&r[2], 0, TGSI_CHAN_Z);
2307
2308      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2309
2310      fetch_texel(mach->Sampler, unit, unit,
2311                  &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2312                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2313                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2314      break;
2315
2316   case TGSI_TEXTURE_2D:
2317   case TGSI_TEXTURE_RECT:
2318      FETCH(&r[0], 0, TGSI_CHAN_X);
2319      FETCH(&r[1], 0, TGSI_CHAN_Y);
2320
2321      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2322      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2323
2324      fetch_texel(mach->Sampler, unit, unit,
2325                  &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2326                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2327                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2328      break;
2329
2330
2331   case TGSI_TEXTURE_SHADOW2D:
2332   case TGSI_TEXTURE_SHADOWRECT:
2333   case TGSI_TEXTURE_2D_ARRAY:
2334   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2335      /* only SHADOW2D_ARRAY actually needs W */
2336      FETCH(&r[0], 0, TGSI_CHAN_X);
2337      FETCH(&r[1], 0, TGSI_CHAN_Y);
2338      FETCH(&r[2], 0, TGSI_CHAN_Z);
2339      FETCH(&r[3], 0, TGSI_CHAN_W);
2340
2341      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2342      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2343
2344      fetch_texel(mach->Sampler, unit, unit,
2345                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2346                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2347                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2348      break;
2349
2350   case TGSI_TEXTURE_3D:
2351   case TGSI_TEXTURE_CUBE:
2352   case TGSI_TEXTURE_CUBE_ARRAY:
2353   case TGSI_TEXTURE_SHADOWCUBE:
2354      /* only TEXTURE_CUBE_ARRAY and TEXTURE_SHADOWCUBE actually need W */
2355      FETCH(&r[0], 0, TGSI_CHAN_X);
2356      FETCH(&r[1], 0, TGSI_CHAN_Y);
2357      FETCH(&r[2], 0, TGSI_CHAN_Z);
2358      FETCH(&r[3], 0, TGSI_CHAN_W);
2359
2360      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_X, derivs[0]);
2361      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Y, derivs[1]);
2362      fetch_assign_deriv_channel(mach, inst, 1, TGSI_CHAN_Z, derivs[2]);
2363
2364      fetch_texel(mach->Sampler, unit, unit,
2365                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,   /* inputs */
2366                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2367                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2368      break;
2369
2370   default:
2371      assert(0);
2372   }
2373
2374   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2375      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2376         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
2377      }
2378   }
2379}
2380
2381
2382static void
2383exec_txf(struct tgsi_exec_machine *mach,
2384         const struct tgsi_full_instruction *inst)
2385{
2386   union tgsi_exec_channel r[4];
2387   uint chan;
2388   uint unit;
2389   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
2390   int j;
2391   int8_t offsets[3];
2392   unsigned target;
2393
2394   unit = fetch_sampler_unit(mach, inst, 1);
2395   /* always fetch all 3 offsets, overkill but keeps code simple */
2396   fetch_texel_offsets(mach, inst, offsets);
2397
2398   IFETCH(&r[3], 0, TGSI_CHAN_W);
2399
2400   if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2401       inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2402      target = mach->SamplerViews[unit].Resource;
2403   }
2404   else {
2405      target = inst->Texture.Texture;
2406   }
2407   switch(target) {
2408   case TGSI_TEXTURE_3D:
2409   case TGSI_TEXTURE_2D_ARRAY:
2410   case TGSI_TEXTURE_SHADOW2D_ARRAY:
2411   case TGSI_TEXTURE_2D_ARRAY_MSAA:
2412      IFETCH(&r[2], 0, TGSI_CHAN_Z);
2413      FALLTHROUGH;
2414   case TGSI_TEXTURE_2D:
2415   case TGSI_TEXTURE_RECT:
2416   case TGSI_TEXTURE_SHADOW1D_ARRAY:
2417   case TGSI_TEXTURE_SHADOW2D:
2418   case TGSI_TEXTURE_SHADOWRECT:
2419   case TGSI_TEXTURE_1D_ARRAY:
2420   case TGSI_TEXTURE_2D_MSAA:
2421      IFETCH(&r[1], 0, TGSI_CHAN_Y);
2422      FALLTHROUGH;
2423   case TGSI_TEXTURE_BUFFER:
2424   case TGSI_TEXTURE_1D:
2425   case TGSI_TEXTURE_SHADOW1D:
2426      IFETCH(&r[0], 0, TGSI_CHAN_X);
2427      break;
2428   default:
2429      assert(0);
2430      break;
2431   }
2432
2433   mach->Sampler->get_texel(mach->Sampler, unit, r[0].i, r[1].i, r[2].i, r[3].i,
2434                            offsets, rgba);
2435
2436   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
2437      r[0].f[j] = rgba[0][j];
2438      r[1].f[j] = rgba[1][j];
2439      r[2].f[j] = rgba[2][j];
2440      r[3].f[j] = rgba[3][j];
2441   }
2442
2443   if (inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I ||
2444       inst->Instruction.Opcode == TGSI_OPCODE_SAMPLE_I_MS) {
2445      unsigned char swizzles[4];
2446      swizzles[0] = inst->Src[1].Register.SwizzleX;
2447      swizzles[1] = inst->Src[1].Register.SwizzleY;
2448      swizzles[2] = inst->Src[1].Register.SwizzleZ;
2449      swizzles[3] = inst->Src[1].Register.SwizzleW;
2450
2451      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2452         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2453            store_dest(mach, &r[swizzles[chan]],
2454                       &inst->Dst[0], inst, chan);
2455         }
2456      }
2457   }
2458   else {
2459      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2460         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2461            store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
2462         }
2463      }
2464   }
2465}
2466
2467static void
2468exec_txq(struct tgsi_exec_machine *mach,
2469         const struct tgsi_full_instruction *inst)
2470{
2471   int result[4];
2472   union tgsi_exec_channel r[4], src;
2473   uint chan;
2474   uint unit;
2475   int i,j;
2476
2477   unit = fetch_sampler_unit(mach, inst, 1);
2478
2479   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
2480
2481   /* XXX: This interface can't return per-pixel values */
2482   mach->Sampler->get_dims(mach->Sampler, unit, src.i[0], result);
2483
2484   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2485      for (j = 0; j < 4; j++) {
2486         r[j].i[i] = result[j];
2487      }
2488   }
2489
2490   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2491      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2492         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
2493      }
2494   }
2495}
2496
2497static void
2498exec_sample(struct tgsi_exec_machine *mach,
2499            const struct tgsi_full_instruction *inst,
2500            uint modifier, boolean compare)
2501{
2502   const uint resource_unit = inst->Src[1].Register.Index;
2503   const uint sampler_unit = inst->Src[2].Register.Index;
2504   union tgsi_exec_channel r[5], c1;
2505   const union tgsi_exec_channel *lod = &ZeroVec;
2506   enum tgsi_sampler_control control = TGSI_SAMPLER_LOD_NONE;
2507   uint chan;
2508   unsigned char swizzles[4];
2509   int8_t offsets[3];
2510
2511   /* always fetch all 3 offsets, overkill but keeps code simple */
2512   fetch_texel_offsets(mach, inst, offsets);
2513
2514   assert(modifier != TEX_MODIFIER_PROJECTED);
2515
2516   if (modifier != TEX_MODIFIER_NONE) {
2517      if (modifier == TEX_MODIFIER_LOD_BIAS) {
2518         FETCH(&c1, 3, TGSI_CHAN_X);
2519         lod = &c1;
2520         control = TGSI_SAMPLER_LOD_BIAS;
2521      }
2522      else if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
2523         FETCH(&c1, 3, TGSI_CHAN_X);
2524         lod = &c1;
2525         control = TGSI_SAMPLER_LOD_EXPLICIT;
2526      }
2527      else if (modifier == TEX_MODIFIER_GATHER) {
2528         control = TGSI_SAMPLER_GATHER;
2529      }
2530      else {
2531         assert(modifier == TEX_MODIFIER_LEVEL_ZERO);
2532         control = TGSI_SAMPLER_LOD_ZERO;
2533      }
2534   }
2535
2536   FETCH(&r[0], 0, TGSI_CHAN_X);
2537
2538   switch (mach->SamplerViews[resource_unit].Resource) {
2539   case TGSI_TEXTURE_1D:
2540      if (compare) {
2541         FETCH(&r[2], 3, TGSI_CHAN_X);
2542         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2543                     &r[0], &ZeroVec, &r[2], &ZeroVec, lod, /* S, T, P, C, LOD */
2544                     NULL, offsets, control,
2545                     &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2546      }
2547      else {
2548         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2549                     &r[0], &ZeroVec, &ZeroVec, &ZeroVec, lod, /* S, T, P, C, LOD */
2550                     NULL, offsets, control,
2551                     &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
2552      }
2553      break;
2554
2555   case TGSI_TEXTURE_1D_ARRAY:
2556   case TGSI_TEXTURE_2D:
2557   case TGSI_TEXTURE_RECT:
2558      FETCH(&r[1], 0, TGSI_CHAN_Y);
2559      if (compare) {
2560         FETCH(&r[2], 3, TGSI_CHAN_X);
2561         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2562                     &r[0], &r[1], &r[2], &ZeroVec, lod,    /* S, T, P, C, LOD */
2563                     NULL, offsets, control,
2564                     &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2565      }
2566      else {
2567         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2568                     &r[0], &r[1], &ZeroVec, &ZeroVec, lod,    /* S, T, P, C, LOD */
2569                     NULL, offsets, control,
2570                     &r[0], &r[1], &r[2], &r[3]);  /* outputs */
2571      }
2572      break;
2573
2574   case TGSI_TEXTURE_2D_ARRAY:
2575   case TGSI_TEXTURE_3D:
2576   case TGSI_TEXTURE_CUBE:
2577      FETCH(&r[1], 0, TGSI_CHAN_Y);
2578      FETCH(&r[2], 0, TGSI_CHAN_Z);
2579      if(compare) {
2580         FETCH(&r[3], 3, TGSI_CHAN_X);
2581         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2582                     &r[0], &r[1], &r[2], &r[3], lod,
2583                     NULL, offsets, control,
2584                     &r[0], &r[1], &r[2], &r[3]);
2585      }
2586      else {
2587         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2588                     &r[0], &r[1], &r[2], &ZeroVec, lod,
2589                     NULL, offsets, control,
2590                     &r[0], &r[1], &r[2], &r[3]);
2591      }
2592      break;
2593
2594   case TGSI_TEXTURE_CUBE_ARRAY:
2595      FETCH(&r[1], 0, TGSI_CHAN_Y);
2596      FETCH(&r[2], 0, TGSI_CHAN_Z);
2597      FETCH(&r[3], 0, TGSI_CHAN_W);
2598      if(compare) {
2599         FETCH(&r[4], 3, TGSI_CHAN_X);
2600         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2601                     &r[0], &r[1], &r[2], &r[3], &r[4],
2602                     NULL, offsets, control,
2603                     &r[0], &r[1], &r[2], &r[3]);
2604      }
2605      else {
2606         fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2607                     &r[0], &r[1], &r[2], &r[3], lod,
2608                     NULL, offsets, control,
2609                     &r[0], &r[1], &r[2], &r[3]);
2610      }
2611      break;
2612
2613
2614   default:
2615      assert(0);
2616   }
2617
2618   swizzles[0] = inst->Src[1].Register.SwizzleX;
2619   swizzles[1] = inst->Src[1].Register.SwizzleY;
2620   swizzles[2] = inst->Src[1].Register.SwizzleZ;
2621   swizzles[3] = inst->Src[1].Register.SwizzleW;
2622
2623   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2624      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2625         store_dest(mach, &r[swizzles[chan]],
2626                    &inst->Dst[0], inst, chan);
2627      }
2628   }
2629}
2630
2631static void
2632exec_sample_d(struct tgsi_exec_machine *mach,
2633              const struct tgsi_full_instruction *inst)
2634{
2635   const uint resource_unit = inst->Src[1].Register.Index;
2636   const uint sampler_unit = inst->Src[2].Register.Index;
2637   union tgsi_exec_channel r[4];
2638   float derivs[3][2][TGSI_QUAD_SIZE];
2639   uint chan;
2640   unsigned char swizzles[4];
2641   int8_t offsets[3];
2642
2643   /* always fetch all 3 offsets, overkill but keeps code simple */
2644   fetch_texel_offsets(mach, inst, offsets);
2645
2646   FETCH(&r[0], 0, TGSI_CHAN_X);
2647
2648   switch (mach->SamplerViews[resource_unit].Resource) {
2649   case TGSI_TEXTURE_1D:
2650   case TGSI_TEXTURE_1D_ARRAY:
2651      /* only 1D array actually needs Y */
2652      FETCH(&r[1], 0, TGSI_CHAN_Y);
2653
2654      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2655
2656      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2657                  &r[0], &r[1], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, C, LOD */
2658                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2659                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
2660      break;
2661
2662   case TGSI_TEXTURE_2D:
2663   case TGSI_TEXTURE_RECT:
2664   case TGSI_TEXTURE_2D_ARRAY:
2665      /* only 2D array actually needs Z */
2666      FETCH(&r[1], 0, TGSI_CHAN_Y);
2667      FETCH(&r[2], 0, TGSI_CHAN_Z);
2668
2669      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2670      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2671
2672      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2673                  &r[0], &r[1], &r[2], &ZeroVec, &ZeroVec,   /* inputs */
2674                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2675                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
2676      break;
2677
2678   case TGSI_TEXTURE_3D:
2679   case TGSI_TEXTURE_CUBE:
2680   case TGSI_TEXTURE_CUBE_ARRAY:
2681      /* only cube array actually needs W */
2682      FETCH(&r[1], 0, TGSI_CHAN_Y);
2683      FETCH(&r[2], 0, TGSI_CHAN_Z);
2684      FETCH(&r[3], 0, TGSI_CHAN_W);
2685
2686      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_X, derivs[0]);
2687      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Y, derivs[1]);
2688      fetch_assign_deriv_channel(mach, inst, 3, TGSI_CHAN_Z, derivs[2]);
2689
2690      fetch_texel(mach->Sampler, resource_unit, sampler_unit,
2691                  &r[0], &r[1], &r[2], &r[3], &ZeroVec,
2692                  derivs, offsets, TGSI_SAMPLER_DERIVS_EXPLICIT,
2693                  &r[0], &r[1], &r[2], &r[3]);
2694      break;
2695
2696   default:
2697      assert(0);
2698   }
2699
2700   swizzles[0] = inst->Src[1].Register.SwizzleX;
2701   swizzles[1] = inst->Src[1].Register.SwizzleY;
2702   swizzles[2] = inst->Src[1].Register.SwizzleZ;
2703   swizzles[3] = inst->Src[1].Register.SwizzleW;
2704
2705   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2706      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2707         store_dest(mach, &r[swizzles[chan]],
2708                    &inst->Dst[0], inst, chan);
2709      }
2710   }
2711}
2712
2713
2714/**
2715 * Evaluate a constant-valued coefficient at the position of the
2716 * current quad.
2717 */
2718static void
2719eval_constant_coef(
2720   struct tgsi_exec_machine *mach,
2721   unsigned attrib,
2722   unsigned chan )
2723{
2724   unsigned i;
2725
2726   for( i = 0; i < TGSI_QUAD_SIZE; i++ ) {
2727      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
2728   }
2729}
2730
/**
 * Offset callback paired with constant interpolation.
 *
 * Intentionally empty: none of the parameters are used and out_chan is
 * left exactly as it was passed in.
 */
static void
interp_constant_offset(
      UNUSED const struct tgsi_exec_machine *mach,
      UNUSED unsigned attrib,
      UNUSED unsigned chan,
      UNUSED float ofs_x,
      UNUSED float ofs_y,
      UNUSED union tgsi_exec_channel *out_chan)
{
}
2741
2742/**
2743 * Evaluate a linear-valued coefficient at the position of the
2744 * current quad.
2745 */
2746static void
2747interp_linear_offset(
2748      const struct tgsi_exec_machine *mach,
2749      unsigned attrib,
2750      unsigned chan,
2751      float ofs_x,
2752      float ofs_y,
2753      union tgsi_exec_channel *out_chan)
2754{
2755   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2756   const float dady = mach->InterpCoefs[attrib].dady[chan];
2757   const float delta = ofs_x * dadx + ofs_y * dady;
2758   out_chan->f[0] += delta;
2759   out_chan->f[1] += delta;
2760   out_chan->f[2] += delta;
2761   out_chan->f[3] += delta;
2762}
2763
2764static void
2765eval_linear_coef(struct tgsi_exec_machine *mach,
2766                 unsigned attrib,
2767                 unsigned chan)
2768{
2769   const float x = mach->QuadPos.xyzw[0].f[0];
2770   const float y = mach->QuadPos.xyzw[1].f[0];
2771   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2772   const float dady = mach->InterpCoefs[attrib].dady[chan];
2773   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2774
2775   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
2776   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
2777   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
2778   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
2779}
2780
2781/**
2782 * Evaluate a perspective-valued coefficient at the position of the
2783 * current quad.
2784 */
2785
2786static void
2787interp_perspective_offset(
2788   const struct tgsi_exec_machine *mach,
2789   unsigned attrib,
2790   unsigned chan,
2791   float ofs_x,
2792   float ofs_y,
2793   union tgsi_exec_channel *out_chan)
2794{
2795   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2796   const float dady = mach->InterpCoefs[attrib].dady[chan];
2797   const float *w = mach->QuadPos.xyzw[3].f;
2798   const float delta = ofs_x * dadx + ofs_y * dady;
2799   out_chan->f[0] += delta / w[0];
2800   out_chan->f[1] += delta / w[1];
2801   out_chan->f[2] += delta / w[2];
2802   out_chan->f[3] += delta / w[3];
2803}
2804
2805static void
2806eval_perspective_coef(
2807   struct tgsi_exec_machine *mach,
2808   unsigned attrib,
2809   unsigned chan )
2810{
2811   const float x = mach->QuadPos.xyzw[0].f[0];
2812   const float y = mach->QuadPos.xyzw[1].f[0];
2813   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
2814   const float dady = mach->InterpCoefs[attrib].dady[chan];
2815   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
2816   const float *w = mach->QuadPos.xyzw[3].f;
2817   /* divide by W here */
2818   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
2819   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
2820   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
2821   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
2822}
2823
2824
/**
 * Signature of the eval_*_coef routines: evaluate channel 'chan' of input
 * attribute 'attrib' for the current quad.
 */
typedef void (*eval_coef_func)(struct tgsi_exec_machine *mach,
                               unsigned attrib,
                               unsigned chan);
2829
2830static void
2831exec_declaration(struct tgsi_exec_machine *mach,
2832                 const struct tgsi_full_declaration *decl)
2833{
2834   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
2835      mach->SamplerViews[decl->Range.First] = decl->SamplerView;
2836      return;
2837   }
2838
2839   if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
2840      if (decl->Declaration.File == TGSI_FILE_INPUT) {
2841         uint first, last, mask;
2842
2843         first = decl->Range.First;
2844         last = decl->Range.Last;
2845         mask = decl->Declaration.UsageMask;
2846
2847         /* XXX we could remove this special-case code since
2848          * mach->InterpCoefs[first].a0 should already have the
2849          * front/back-face value.  But we should first update the
2850          * ureg code to emit the right UsageMask value (WRITEMASK_X).
2851          * Then, we could remove the tgsi_exec_machine::Face field.
2852          */
2853         /* XXX make FACE a system value */
2854         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
2855            uint i;
2856
2857            assert(decl->Semantic.Index == 0);
2858            assert(first == last);
2859
2860            for (i = 0; i < TGSI_QUAD_SIZE; i++) {
2861               mach->Inputs[first].xyzw[0].f[i] = mach->Face;
2862            }
2863         } else {
2864            eval_coef_func eval;
2865            apply_sample_offset_func interp;
2866            uint i, j;
2867
2868            switch (decl->Interp.Interpolate) {
2869            case TGSI_INTERPOLATE_CONSTANT:
2870               eval = eval_constant_coef;
2871               interp = interp_constant_offset;
2872               break;
2873
2874            case TGSI_INTERPOLATE_LINEAR:
2875               eval = eval_linear_coef;
2876               interp = interp_linear_offset;
2877               break;
2878
2879            case TGSI_INTERPOLATE_PERSPECTIVE:
2880               eval = eval_perspective_coef;
2881               interp = interp_perspective_offset;
2882               break;
2883
2884            case TGSI_INTERPOLATE_COLOR:
2885               eval = mach->flatshade_color ? eval_constant_coef : eval_perspective_coef;
2886               interp = mach->flatshade_color ? interp_constant_offset : interp_perspective_offset;
2887               break;
2888
2889            default:
2890               assert(0);
2891               return;
2892            }
2893
2894            for (i = first; i <= last; i++)
2895               mach->InputSampleOffsetApply[i] = interp;
2896
2897            for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
2898               if (mask & (1 << j)) {
2899                  for (i = first; i <= last; i++) {
2900                     eval(mach, i, j);
2901                  }
2902               }
2903            }
2904         }
2905
2906         if (DEBUG_EXECUTION) {
2907            uint i, j;
2908            for (i = first; i <= last; ++i) {
2909               debug_printf("IN[%2u] = ", i);
2910               for (j = 0; j < TGSI_NUM_CHANNELS; j++) {
2911                  if (j > 0) {
2912                     debug_printf("         ");
2913                  }
2914                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
2915                               mach->Inputs[i].xyzw[0].f[j], mach->Inputs[i].xyzw[0].u[j],
2916                               mach->Inputs[i].xyzw[1].f[j], mach->Inputs[i].xyzw[1].u[j],
2917                               mach->Inputs[i].xyzw[2].f[j], mach->Inputs[i].xyzw[2].u[j],
2918                               mach->Inputs[i].xyzw[3].f[j], mach->Inputs[i].xyzw[3].u[j]);
2919               }
2920            }
2921         }
2922      }
2923   }
2924
2925}
2926
/** Callback that computes a destination channel from one source channel. */
typedef void (*micro_unary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src);
2929
2930static void
2931exec_scalar_unary(struct tgsi_exec_machine *mach,
2932                  const struct tgsi_full_instruction *inst,
2933                  micro_unary_op op,
2934                  enum tgsi_exec_datatype src_datatype)
2935{
2936   unsigned int chan;
2937   union tgsi_exec_channel src;
2938   union tgsi_exec_channel dst;
2939
2940   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
2941   op(&dst, &src);
2942   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2943      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2944         store_dest(mach, &dst, &inst->Dst[0], inst, chan);
2945      }
2946   }
2947}
2948
2949static void
2950exec_vector_unary(struct tgsi_exec_machine *mach,
2951                  const struct tgsi_full_instruction *inst,
2952                  micro_unary_op op,
2953                  enum tgsi_exec_datatype src_datatype)
2954{
2955   unsigned int chan;
2956   struct tgsi_exec_vector dst;
2957
2958   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2959      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2960         union tgsi_exec_channel src;
2961
2962         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
2963         op(&dst.xyzw[chan], &src);
2964      }
2965   }
2966   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2967      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2968         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
2969      }
2970   }
2971}
2972
/** Callback that computes a destination channel from two source channels. */
typedef void (*micro_binary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1);
2976
2977static void
2978exec_scalar_binary(struct tgsi_exec_machine *mach,
2979                   const struct tgsi_full_instruction *inst,
2980                   micro_binary_op op,
2981                   enum tgsi_exec_datatype src_datatype)
2982{
2983   unsigned int chan;
2984   union tgsi_exec_channel src[2];
2985   union tgsi_exec_channel dst;
2986
2987   fetch_source(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, src_datatype);
2988   fetch_source(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, src_datatype);
2989   op(&dst, &src[0], &src[1]);
2990   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
2991      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
2992         store_dest(mach, &dst, &inst->Dst[0], inst, chan);
2993      }
2994   }
2995}
2996
2997static void
2998exec_vector_binary(struct tgsi_exec_machine *mach,
2999                   const struct tgsi_full_instruction *inst,
3000                   micro_binary_op op,
3001                   enum tgsi_exec_datatype src_datatype)
3002{
3003   unsigned int chan;
3004   struct tgsi_exec_vector dst;
3005
3006   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3007      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3008         union tgsi_exec_channel src[2];
3009
3010         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3011         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3012         op(&dst.xyzw[chan], &src[0], &src[1]);
3013      }
3014   }
3015   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3016      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3017         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
3018      }
3019   }
3020}
3021
/** Callback that computes a destination channel from three source channels. */
typedef void (*micro_trinary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2);
3026
3027static void
3028exec_vector_trinary(struct tgsi_exec_machine *mach,
3029                    const struct tgsi_full_instruction *inst,
3030                    micro_trinary_op op,
3031                    enum tgsi_exec_datatype src_datatype)
3032{
3033   unsigned int chan;
3034   struct tgsi_exec_vector dst;
3035
3036   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3037      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3038         union tgsi_exec_channel src[3];
3039
3040         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3041         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3042         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3043         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3044      }
3045   }
3046   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3047      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3048         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
3049      }
3050   }
3051}
3052
/** Callback that computes a destination channel from four source channels. */
typedef void (*micro_quaternary_op)(
   union tgsi_exec_channel *dst,
   const union tgsi_exec_channel *src0,
   const union tgsi_exec_channel *src1,
   const union tgsi_exec_channel *src2,
   const union tgsi_exec_channel *src3);
3058
3059static void
3060exec_vector_quaternary(struct tgsi_exec_machine *mach,
3061                       const struct tgsi_full_instruction *inst,
3062                       micro_quaternary_op op,
3063                       enum tgsi_exec_datatype src_datatype)
3064{
3065   unsigned int chan;
3066   struct tgsi_exec_vector dst;
3067
3068   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3069      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3070         union tgsi_exec_channel src[4];
3071
3072         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
3073         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
3074         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
3075         fetch_source(mach, &src[3], &inst->Src[3], chan, src_datatype);
3076         op(&dst.xyzw[chan], &src[0], &src[1], &src[2], &src[3]);
3077      }
3078   }
3079   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3080      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3081         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
3082      }
3083   }
3084}
3085
3086static void
3087exec_dp3(struct tgsi_exec_machine *mach,
3088         const struct tgsi_full_instruction *inst)
3089{
3090   unsigned int chan;
3091   union tgsi_exec_channel arg[3];
3092
3093   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3094   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3095   micro_mul(&arg[2], &arg[0], &arg[1]);
3096
3097   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_Z; chan++) {
3098      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3099      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3100      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3101   }
3102
3103   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3104      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3105         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan);
3106      }
3107   }
3108}
3109
3110static void
3111exec_dp4(struct tgsi_exec_machine *mach,
3112         const struct tgsi_full_instruction *inst)
3113{
3114   unsigned int chan;
3115   union tgsi_exec_channel arg[3];
3116
3117   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3118   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3119   micro_mul(&arg[2], &arg[0], &arg[1]);
3120
3121   for (chan = TGSI_CHAN_Y; chan <= TGSI_CHAN_W; chan++) {
3122      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
3123      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
3124      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3125   }
3126
3127   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3128      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3129         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan);
3130      }
3131   }
3132}
3133
3134static void
3135exec_dp2(struct tgsi_exec_machine *mach,
3136         const struct tgsi_full_instruction *inst)
3137{
3138   unsigned int chan;
3139   union tgsi_exec_channel arg[3];
3140
3141   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3142   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3143   micro_mul(&arg[2], &arg[0], &arg[1]);
3144
3145   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3146   fetch_source(mach, &arg[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3147   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
3148
3149   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3150      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3151         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan);
3152      }
3153   }
3154}
3155
3156static void
3157exec_pk2h(struct tgsi_exec_machine *mach,
3158          const struct tgsi_full_instruction *inst)
3159{
3160   unsigned chan;
3161   union tgsi_exec_channel arg[2], dst;
3162
3163   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3164   fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3165   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3166      dst.u[chan] = _mesa_float_to_half(arg[0].f[chan]) |
3167         (_mesa_float_to_half(arg[1].f[chan]) << 16);
3168   }
3169   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3170      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3171         store_dest(mach, &dst, &inst->Dst[0], inst, chan);
3172      }
3173   }
3174}
3175
3176static void
3177exec_up2h(struct tgsi_exec_machine *mach,
3178          const struct tgsi_full_instruction *inst)
3179{
3180   unsigned chan;
3181   union tgsi_exec_channel arg, dst[2];
3182
3183   fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3184   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
3185      dst[0].f[chan] = _mesa_half_to_float(arg.u[chan] & 0xffff);
3186      dst[1].f[chan] = _mesa_half_to_float(arg.u[chan] >> 16);
3187   }
3188   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3189      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3190         store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan);
3191      }
3192   }
3193}
3194
3195static void
3196micro_ucmp(union tgsi_exec_channel *dst,
3197           const union tgsi_exec_channel *src0,
3198           const union tgsi_exec_channel *src1,
3199           const union tgsi_exec_channel *src2)
3200{
3201   dst->f[0] = src0->u[0] ? src1->f[0] : src2->f[0];
3202   dst->f[1] = src0->u[1] ? src1->f[1] : src2->f[1];
3203   dst->f[2] = src0->u[2] ? src1->f[2] : src2->f[2];
3204   dst->f[3] = src0->u[3] ? src1->f[3] : src2->f[3];
3205}
3206
3207static void
3208exec_ucmp(struct tgsi_exec_machine *mach,
3209          const struct tgsi_full_instruction *inst)
3210{
3211   unsigned int chan;
3212   struct tgsi_exec_vector dst;
3213
3214   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3215      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3216         union tgsi_exec_channel src[3];
3217
3218         fetch_source(mach, &src[0], &inst->Src[0], chan,
3219                      TGSI_EXEC_DATA_UINT);
3220         fetch_source(mach, &src[1], &inst->Src[1], chan,
3221                      TGSI_EXEC_DATA_FLOAT);
3222         fetch_source(mach, &src[2], &inst->Src[2], chan,
3223                      TGSI_EXEC_DATA_FLOAT);
3224         micro_ucmp(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
3225      }
3226   }
3227   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3228      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3229         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan);
3230      }
3231   }
3232}
3233
3234static void
3235exec_dst(struct tgsi_exec_machine *mach,
3236         const struct tgsi_full_instruction *inst)
3237{
3238   union tgsi_exec_channel r[2];
3239   union tgsi_exec_channel d[4];
3240
3241   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3242      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3243      fetch_source(mach, &r[1], &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3244      micro_mul(&d[TGSI_CHAN_Y], &r[0], &r[1]);
3245   }
3246   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3247      fetch_source(mach, &d[TGSI_CHAN_Z], &inst->Src[0], TGSI_CHAN_Z, TGSI_EXEC_DATA_FLOAT);
3248   }
3249   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3250      fetch_source(mach, &d[TGSI_CHAN_W], &inst->Src[1], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3251   }
3252
3253   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3254      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X);
3255   }
3256   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3257      store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y);
3258   }
3259   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3260      store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z);
3261   }
3262   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3263      store_dest(mach, &d[TGSI_CHAN_W], &inst->Dst[0], inst, TGSI_CHAN_W);
3264   }
3265}
3266
3267static void
3268exec_log(struct tgsi_exec_machine *mach,
3269         const struct tgsi_full_instruction *inst)
3270{
3271   union tgsi_exec_channel r[3];
3272
3273   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3274   micro_abs(&r[2], &r[0]);  /* r2 = abs(r0) */
3275   micro_lg2(&r[1], &r[2]);  /* r1 = lg2(r2) */
3276   micro_flr(&r[0], &r[1]);  /* r0 = floor(r1) */
3277   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3278      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_X);
3279   }
3280   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3281      micro_exp2(&r[0], &r[0]);       /* r0 = 2 ^ r0 */
3282      micro_div(&r[0], &r[2], &r[0]); /* r0 = r2 / r0 */
3283      store_dest(mach, &r[0], &inst->Dst[0], inst, TGSI_CHAN_Y);
3284   }
3285   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3286      store_dest(mach, &r[1], &inst->Dst[0], inst, TGSI_CHAN_Z);
3287   }
3288   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3289      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W);
3290   }
3291}
3292
3293static void
3294exec_exp(struct tgsi_exec_machine *mach,
3295         const struct tgsi_full_instruction *inst)
3296{
3297   union tgsi_exec_channel r[3];
3298
3299   fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3300   micro_flr(&r[1], &r[0]);  /* r1 = floor(r0) */
3301   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3302      micro_exp2(&r[2], &r[1]);       /* r2 = 2 ^ r1 */
3303      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_X);
3304   }
3305   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3306      micro_sub(&r[2], &r[0], &r[1]); /* r2 = r0 - r1 */
3307      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Y);
3308   }
3309   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3310      micro_exp2(&r[2], &r[0]);       /* r2 = 2 ^ r0 */
3311      store_dest(mach, &r[2], &inst->Dst[0], inst, TGSI_CHAN_Z);
3312   }
3313   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3314      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W);
3315   }
3316}
3317
3318static void
3319exec_lit(struct tgsi_exec_machine *mach,
3320         const struct tgsi_full_instruction *inst)
3321{
3322   union tgsi_exec_channel r[3];
3323   union tgsi_exec_channel d[3];
3324
3325   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_YZ) {
3326      fetch_source(mach, &r[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
3327      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
3328         fetch_source(mach, &r[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
3329         micro_max(&r[1], &r[1], &ZeroVec);
3330
3331         fetch_source(mach, &r[2], &inst->Src[0], TGSI_CHAN_W, TGSI_EXEC_DATA_FLOAT);
3332         micro_min(&r[2], &r[2], &P128Vec);
3333         micro_max(&r[2], &r[2], &M128Vec);
3334         micro_pow(&r[1], &r[1], &r[2]);
3335         micro_lt(&d[TGSI_CHAN_Z], &ZeroVec, &r[0], &r[1], &ZeroVec);
3336         store_dest(mach, &d[TGSI_CHAN_Z], &inst->Dst[0], inst, TGSI_CHAN_Z);
3337      }
3338      if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
3339         micro_max(&d[TGSI_CHAN_Y], &r[0], &ZeroVec);
3340         store_dest(mach, &d[TGSI_CHAN_Y], &inst->Dst[0], inst, TGSI_CHAN_Y);
3341      }
3342   }
3343   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
3344      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_X);
3345   }
3346
3347   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
3348      store_dest(mach, &OneVec, &inst->Dst[0], inst, TGSI_CHAN_W);
3349   }
3350}
3351
3352static void
3353exec_break(struct tgsi_exec_machine *mach)
3354{
3355   if (mach->BreakType == TGSI_EXEC_BREAK_INSIDE_LOOP) {
3356      /* turn off loop channels for each enabled exec channel */
3357      mach->LoopMask &= ~mach->ExecMask;
3358      /* Todo: if mach->LoopMask == 0, jump to end of loop */
3359      UPDATE_EXEC_MASK(mach);
3360   } else {
3361      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
3362
3363      mach->Switch.mask = 0x0;
3364
3365      UPDATE_EXEC_MASK(mach);
3366   }
3367}
3368
3369static void
3370exec_switch(struct tgsi_exec_machine *mach,
3371            const struct tgsi_full_instruction *inst)
3372{
3373   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
3374   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
3375
3376   mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
3377   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3378   mach->Switch.mask = 0x0;
3379   mach->Switch.defaultMask = 0x0;
3380
3381   mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
3382   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
3383
3384   UPDATE_EXEC_MASK(mach);
3385}
3386
3387static void
3388exec_case(struct tgsi_exec_machine *mach,
3389          const struct tgsi_full_instruction *inst)
3390{
3391   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3392   union tgsi_exec_channel src;
3393   uint mask = 0;
3394
3395   fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
3396
3397   if (mach->Switch.selector.u[0] == src.u[0]) {
3398      mask |= 0x1;
3399   }
3400   if (mach->Switch.selector.u[1] == src.u[1]) {
3401      mask |= 0x2;
3402   }
3403   if (mach->Switch.selector.u[2] == src.u[2]) {
3404      mask |= 0x4;
3405   }
3406   if (mach->Switch.selector.u[3] == src.u[3]) {
3407      mask |= 0x8;
3408   }
3409
3410   mach->Switch.defaultMask |= mask;
3411
3412   mach->Switch.mask |= mask & prevMask;
3413
3414   UPDATE_EXEC_MASK(mach);
3415}
3416
3417/* FIXME: this will only work if default is last */
3418static void
3419exec_default(struct tgsi_exec_machine *mach)
3420{
3421   uint prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
3422
3423   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
3424
3425   UPDATE_EXEC_MASK(mach);
3426}
3427
3428static void
3429exec_endswitch(struct tgsi_exec_machine *mach)
3430{
3431   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
3432   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
3433
3434   UPDATE_EXEC_MASK(mach);
3435}
3436
/**
 * Callback for 64-bit operations.  'src' points at a single operand or at
 * an array of operands, depending on the operation.
 */
typedef void (*micro_dop)(
   union tgsi_double_channel *dst,
   const union tgsi_double_channel *src);

/** Callback taking a 64-bit first operand and a 32-bit second operand. */
typedef void (*micro_dop_sop)(
   union tgsi_double_channel *dst,
   const union tgsi_double_channel *src0,
   union tgsi_exec_channel *src1);

/** Callback producing a 64-bit result from a 32-bit operand. */
typedef void (*micro_dop_s)(
   union tgsi_double_channel *dst,
   const union tgsi_exec_channel *src);

/** Callback producing a 32-bit result from a 64-bit operand. */
typedef void (*micro_sop_d)(
   union tgsi_exec_channel *dst,
   const union tgsi_double_channel *src);
3449
3450static void
3451fetch_double_channel(struct tgsi_exec_machine *mach,
3452                     union tgsi_double_channel *chan,
3453                     const struct tgsi_full_src_register *reg,
3454                     uint chan_0,
3455                     uint chan_1)
3456{
3457   union tgsi_exec_channel src[2];
3458   uint i;
3459
3460   fetch_source_d(mach, &src[0], reg, chan_0);
3461   fetch_source_d(mach, &src[1], reg, chan_1);
3462
3463   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3464      chan->u[i][0] = src[0].u[i];
3465      chan->u[i][1] = src[1].u[i];
3466   }
3467   assert(!reg->Register.Absolute);
3468   assert(!reg->Register.Negate);
3469}
3470
3471static void
3472store_double_channel(struct tgsi_exec_machine *mach,
3473                     const union tgsi_double_channel *chan,
3474                     const struct tgsi_full_dst_register *reg,
3475                     const struct tgsi_full_instruction *inst,
3476                     uint chan_0,
3477                     uint chan_1)
3478{
3479   union tgsi_exec_channel dst[2];
3480   uint i;
3481   union tgsi_double_channel temp;
3482   const uint execmask = mach->ExecMask;
3483
3484   if (!inst->Instruction.Saturate) {
3485      for (i = 0; i < TGSI_QUAD_SIZE; i++)
3486         if (execmask & (1 << i)) {
3487            dst[0].u[i] = chan->u[i][0];
3488            dst[1].u[i] = chan->u[i][1];
3489         }
3490   }
3491   else {
3492      for (i = 0; i < TGSI_QUAD_SIZE; i++)
3493         if (execmask & (1 << i)) {
3494            if (chan->d[i] < 0.0 || isnan(chan->d[i]))
3495               temp.d[i] = 0.0;
3496            else if (chan->d[i] > 1.0)
3497               temp.d[i] = 1.0;
3498            else
3499               temp.d[i] = chan->d[i];
3500
3501            dst[0].u[i] = temp.u[i][0];
3502            dst[1].u[i] = temp.u[i][1];
3503         }
3504   }
3505
3506   store_dest_double(mach, &dst[0], reg, chan_0);
3507   if (chan_1 != (unsigned)-1)
3508      store_dest_double(mach, &dst[1], reg, chan_1);
3509}
3510
3511static void
3512exec_double_unary(struct tgsi_exec_machine *mach,
3513                  const struct tgsi_full_instruction *inst,
3514                  micro_dop op)
3515{
3516   union tgsi_double_channel src;
3517   union tgsi_double_channel dst;
3518
3519   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3520      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3521      op(&dst, &src);
3522      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3523   }
3524   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3525      fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3526      op(&dst, &src);
3527      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3528   }
3529}
3530
3531static void
3532exec_double_binary(struct tgsi_exec_machine *mach,
3533                   const struct tgsi_full_instruction *inst,
3534                   micro_dop op,
3535                   enum tgsi_exec_datatype dst_datatype)
3536{
3537   union tgsi_double_channel src[2];
3538   union tgsi_double_channel dst;
3539   int first_dest_chan, second_dest_chan;
3540   int wmask;
3541
3542   wmask = inst->Dst[0].Register.WriteMask;
3543   /* these are & because of the way DSLT etc store their destinations */
3544   if (wmask & TGSI_WRITEMASK_XY) {
3545      first_dest_chan = TGSI_CHAN_X;
3546      second_dest_chan = TGSI_CHAN_Y;
3547      if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3548         first_dest_chan = (wmask & TGSI_WRITEMASK_X) ? TGSI_CHAN_X : TGSI_CHAN_Y;
3549         second_dest_chan = -1;
3550      }
3551
3552      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3553      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3554      op(&dst, src);
3555      store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3556   }
3557
3558   if (wmask & TGSI_WRITEMASK_ZW) {
3559      first_dest_chan = TGSI_CHAN_Z;
3560      second_dest_chan = TGSI_CHAN_W;
3561      if (dst_datatype == TGSI_EXEC_DATA_UINT) {
3562         first_dest_chan = (wmask & TGSI_WRITEMASK_Z) ? TGSI_CHAN_Z : TGSI_CHAN_W;
3563         second_dest_chan = -1;
3564      }
3565
3566      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3567      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3568      op(&dst, src);
3569      store_double_channel(mach, &dst, &inst->Dst[0], inst, first_dest_chan, second_dest_chan);
3570   }
3571}
3572
3573static void
3574exec_double_trinary(struct tgsi_exec_machine *mach,
3575                    const struct tgsi_full_instruction *inst,
3576                    micro_dop op)
3577{
3578   union tgsi_double_channel src[3];
3579   union tgsi_double_channel dst;
3580
3581   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
3582      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3583      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_X, TGSI_CHAN_Y);
3584      fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_X, TGSI_CHAN_Y);
3585      op(&dst, src);
3586      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3587   }
3588   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
3589      fetch_double_channel(mach, &src[0], &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3590      fetch_double_channel(mach, &src[1], &inst->Src[1], TGSI_CHAN_Z, TGSI_CHAN_W);
3591      fetch_double_channel(mach, &src[2], &inst->Src[2], TGSI_CHAN_Z, TGSI_CHAN_W);
3592      op(&dst, src);
3593      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3594   }
3595}
3596
3597static void
3598exec_dldexp(struct tgsi_exec_machine *mach,
3599            const struct tgsi_full_instruction *inst)
3600{
3601   union tgsi_double_channel src0;
3602   union tgsi_exec_channel src1;
3603   union tgsi_double_channel dst;
3604   int wmask;
3605
3606   wmask = inst->Dst[0].Register.WriteMask;
3607   if (wmask & TGSI_WRITEMASK_XY) {
3608      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3609      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3610      micro_dldexp(&dst, &src0, &src1);
3611      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3612   }
3613
3614   if (wmask & TGSI_WRITEMASK_ZW) {
3615      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3616      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3617      micro_dldexp(&dst, &src0, &src1);
3618      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3619   }
3620}
3621
3622static void
3623exec_dfracexp(struct tgsi_exec_machine *mach,
3624              const struct tgsi_full_instruction *inst)
3625{
3626   union tgsi_double_channel src;
3627   union tgsi_double_channel dst;
3628   union tgsi_exec_channel dst_exp;
3629
3630   fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3631   micro_dfracexp(&dst, &dst_exp, &src);
3632   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY)
3633      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3634   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW)
3635      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3636   for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3637      if (inst->Dst[1].Register.WriteMask & (1 << chan))
3638         store_dest(mach, &dst_exp, &inst->Dst[1], inst, chan);
3639   }
3640}
3641
3642static void
3643exec_arg0_64_arg1_32(struct tgsi_exec_machine *mach,
3644            const struct tgsi_full_instruction *inst,
3645            micro_dop_sop op)
3646{
3647   union tgsi_double_channel src0;
3648   union tgsi_exec_channel src1;
3649   union tgsi_double_channel dst;
3650   int wmask;
3651
3652   wmask = inst->Dst[0].Register.WriteMask;
3653   if (wmask & TGSI_WRITEMASK_XY) {
3654      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
3655      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
3656      op(&dst, &src0, &src1);
3657      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
3658   }
3659
3660   if (wmask & TGSI_WRITEMASK_ZW) {
3661      fetch_double_channel(mach, &src0, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
3662      fetch_source(mach, &src1, &inst->Src[1], TGSI_CHAN_Z, TGSI_EXEC_DATA_INT);
3663      op(&dst, &src0, &src1);
3664      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
3665   }
3666}
3667
3668static int
3669get_image_coord_dim(unsigned tgsi_tex)
3670{
3671   int dim;
3672   switch (tgsi_tex) {
3673   case TGSI_TEXTURE_BUFFER:
3674   case TGSI_TEXTURE_1D:
3675      dim = 1;
3676      break;
3677   case TGSI_TEXTURE_2D:
3678   case TGSI_TEXTURE_RECT:
3679   case TGSI_TEXTURE_1D_ARRAY:
3680   case TGSI_TEXTURE_2D_MSAA:
3681      dim = 2;
3682      break;
3683   case TGSI_TEXTURE_3D:
3684   case TGSI_TEXTURE_CUBE:
3685   case TGSI_TEXTURE_2D_ARRAY:
3686   case TGSI_TEXTURE_2D_ARRAY_MSAA:
3687   case TGSI_TEXTURE_CUBE_ARRAY:
3688      dim = 3;
3689      break;
3690   default:
3691      assert(!"unknown texture target");
3692      dim = 0;
3693      break;
3694   }
3695
3696   return dim;
3697}
3698
3699static int
3700get_image_coord_sample(unsigned tgsi_tex)
3701{
3702   int sample = 0;
3703   switch (tgsi_tex) {
3704   case TGSI_TEXTURE_2D_MSAA:
3705      sample = 3;
3706      break;
3707   case TGSI_TEXTURE_2D_ARRAY_MSAA:
3708      sample = 4;
3709      break;
3710   default:
3711      break;
3712   }
3713   return sample;
3714}
3715
3716static void
3717exec_load_img(struct tgsi_exec_machine *mach,
3718              const struct tgsi_full_instruction *inst)
3719{
3720   union tgsi_exec_channel r[4], sample_r;
3721   uint unit;
3722   int sample;
3723   int i, j;
3724   int dim;
3725   uint chan;
3726   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3727   struct tgsi_image_params params;
3728
3729   unit = fetch_sampler_unit(mach, inst, 0);
3730   dim = get_image_coord_dim(inst->Memory.Texture);
3731   sample = get_image_coord_sample(inst->Memory.Texture);
3732   assert(dim <= 3);
3733
3734   params.execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3735   params.unit = unit;
3736   params.tgsi_tex_instr = inst->Memory.Texture;
3737   params.format = inst->Memory.Format;
3738
3739   for (i = 0; i < dim; i++) {
3740      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
3741   }
3742
3743   if (sample)
3744      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
3745
3746   mach->Image->load(mach->Image, &params,
3747                     r[0].i, r[1].i, r[2].i, sample_r.i,
3748                     rgba);
3749   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3750      r[0].f[j] = rgba[0][j];
3751      r[1].f[j] = rgba[1][j];
3752      r[2].f[j] = rgba[2][j];
3753      r[3].f[j] = rgba[3][j];
3754   }
3755   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3756      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3757         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
3758      }
3759   }
3760}
3761
3762static void
3763exec_load_membuf(struct tgsi_exec_machine *mach,
3764                 const struct tgsi_full_instruction *inst)
3765{
3766   uint32_t unit = fetch_sampler_unit(mach, inst, 0);
3767
3768   uint32_t size;
3769   const char *ptr;
3770   switch (inst->Src[0].Register.File) {
3771   case TGSI_FILE_MEMORY:
3772      ptr = mach->LocalMem;
3773      size = mach->LocalMemSize;
3774      break;
3775
3776   case TGSI_FILE_BUFFER:
3777      ptr = mach->Buffer->lookup(mach->Buffer, unit, &size);
3778      break;
3779
3780   case TGSI_FILE_CONSTANT:
3781      if (unit < ARRAY_SIZE(mach->Consts)) {
3782         ptr = mach->Consts[unit];
3783         size = mach->ConstsSize[unit];
3784      } else {
3785         ptr = NULL;
3786         size = 0;
3787      }
3788      break;
3789
3790   default:
3791      unreachable("unsupported TGSI_OPCODE_LOAD file");
3792   }
3793
3794   union tgsi_exec_channel offset;
3795   IFETCH(&offset, 1, TGSI_CHAN_X);
3796
3797   assert(inst->Dst[0].Register.WriteMask);
3798   uint32_t load_size = util_last_bit(inst->Dst[0].Register.WriteMask) * 4;
3799
3800   union tgsi_exec_channel rgba[TGSI_NUM_CHANNELS];
3801   memset(&rgba, 0, sizeof(rgba));
3802   for (int j = 0; j < TGSI_QUAD_SIZE; j++) {
3803      if (size >= load_size && offset.u[j] <= (size - load_size)) {
3804         for (int chan = 0; chan < load_size / 4; chan++)
3805            rgba[chan].u[j] = *(uint32_t *)(ptr + offset.u[j] + chan * 4);
3806      }
3807   }
3808
3809   for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3810      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3811         store_dest(mach, &rgba[chan], &inst->Dst[0], inst, chan);
3812      }
3813   }
3814}
3815
3816static void
3817exec_load(struct tgsi_exec_machine *mach,
3818          const struct tgsi_full_instruction *inst)
3819{
3820   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
3821      exec_load_img(mach, inst);
3822   else
3823      exec_load_membuf(mach, inst);
3824}
3825
3826static uint
3827fetch_store_img_unit(struct tgsi_exec_machine *mach,
3828                     const struct tgsi_full_dst_register *dst)
3829{
3830   uint unit = 0;
3831   int i;
3832   if (dst->Register.Indirect) {
3833      union tgsi_exec_channel indir_index, index2;
3834      const uint execmask = mach->ExecMask;
3835      index2.i[0] =
3836      index2.i[1] =
3837      index2.i[2] =
3838      index2.i[3] = dst->Indirect.Index;
3839
3840      fetch_src_file_channel(mach,
3841                             dst->Indirect.File,
3842                             dst->Indirect.Swizzle,
3843                             &index2,
3844                             &ZeroVec,
3845                             &indir_index);
3846      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3847         if (execmask & (1 << i)) {
3848            unit = dst->Register.Index + indir_index.i[i];
3849            break;
3850         }
3851      }
3852   } else {
3853      unit = dst->Register.Index;
3854   }
3855   return unit;
3856}
3857
3858static void
3859exec_store_img(struct tgsi_exec_machine *mach,
3860               const struct tgsi_full_instruction *inst)
3861{
3862   union tgsi_exec_channel r[3], sample_r;
3863   union tgsi_exec_channel value[4];
3864   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3865   struct tgsi_image_params params;
3866   int dim;
3867   int sample;
3868   int i, j;
3869   uint unit;
3870   unit = fetch_store_img_unit(mach, &inst->Dst[0]);
3871   dim = get_image_coord_dim(inst->Memory.Texture);
3872   sample = get_image_coord_sample(inst->Memory.Texture);
3873   assert(dim <= 3);
3874
3875   params.execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3876   params.unit = unit;
3877   params.tgsi_tex_instr = inst->Memory.Texture;
3878   params.format = inst->Memory.Format;
3879
3880   for (i = 0; i < dim; i++) {
3881      IFETCH(&r[i], 0, TGSI_CHAN_X + i);
3882   }
3883
3884   for (i = 0; i < 4; i++) {
3885      FETCH(&value[i], 1, TGSI_CHAN_X + i);
3886   }
3887   if (sample)
3888      IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
3889
3890   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
3891      rgba[0][j] = value[0].f[j];
3892      rgba[1][j] = value[1].f[j];
3893      rgba[2][j] = value[2].f[j];
3894      rgba[3][j] = value[3].f[j];
3895   }
3896
3897   mach->Image->store(mach->Image, &params,
3898                      r[0].i, r[1].i, r[2].i, sample_r.i,
3899                      rgba);
3900}
3901
3902static void
3903exec_store_buf(struct tgsi_exec_machine *mach,
3904               const struct tgsi_full_instruction *inst)
3905{
3906   uint32_t unit = fetch_store_img_unit(mach, &inst->Dst[0]);
3907   uint32_t size;
3908   char *ptr = mach->Buffer->lookup(mach->Buffer, unit, &size);
3909
3910   int execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3911
3912   union tgsi_exec_channel offset;
3913   IFETCH(&offset, 0, TGSI_CHAN_X);
3914
3915   union tgsi_exec_channel value[4];
3916   for (int i = 0; i < 4; i++)
3917      FETCH(&value[i], 1, TGSI_CHAN_X + i);
3918
3919   for (int j = 0; j < TGSI_QUAD_SIZE; j++) {
3920      if (!(execmask & (1 << j)))
3921         continue;
3922      if (size < offset.u[j])
3923         continue;
3924
3925      uint32_t *invocation_ptr = (uint32_t *)(ptr + offset.u[j]);
3926      uint32_t size_avail = size - offset.u[j];
3927
3928      for (int chan = 0; chan < MIN2(4, size_avail / 4); chan++) {
3929         if (inst->Dst[0].Register.WriteMask & (1 << chan))
3930            memcpy(&invocation_ptr[chan], &value[chan].u[j], 4);
3931      }
3932   }
3933}
3934
3935static void
3936exec_store_mem(struct tgsi_exec_machine *mach,
3937               const struct tgsi_full_instruction *inst)
3938{
3939   union tgsi_exec_channel r[3];
3940   union tgsi_exec_channel value[4];
3941   uint i, chan;
3942   char *ptr = mach->LocalMem;
3943   int execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3944
3945   IFETCH(&r[0], 0, TGSI_CHAN_X);
3946
3947   for (i = 0; i < 4; i++) {
3948      FETCH(&value[i], 1, TGSI_CHAN_X + i);
3949   }
3950
3951   if (r[0].u[0] >= mach->LocalMemSize)
3952      return;
3953   ptr += r[0].u[0];
3954
3955   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
3956      if (execmask & (1 << i)) {
3957         for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
3958            if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
3959               memcpy(ptr + (chan * 4), &value[chan].u[0], 4);
3960            }
3961         }
3962      }
3963   }
3964}
3965
3966static void
3967exec_store(struct tgsi_exec_machine *mach,
3968           const struct tgsi_full_instruction *inst)
3969{
3970   if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
3971      exec_store_img(mach, inst);
3972   else if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
3973      exec_store_buf(mach, inst);
3974   else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
3975      exec_store_mem(mach, inst);
3976}
3977
3978static void
3979exec_atomop_img(struct tgsi_exec_machine *mach,
3980                const struct tgsi_full_instruction *inst)
3981{
3982   union tgsi_exec_channel r[4], sample_r;
3983   union tgsi_exec_channel value[4], value2[4];
3984   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3985   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
3986   struct tgsi_image_params params;
3987   int dim;
3988   int sample;
3989   int i, j;
3990   uint unit, chan;
3991   unit = fetch_sampler_unit(mach, inst, 0);
3992   dim = get_image_coord_dim(inst->Memory.Texture);
3993   sample = get_image_coord_sample(inst->Memory.Texture);
3994   assert(dim <= 3);
3995
3996   params.execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
3997   params.unit = unit;
3998   params.tgsi_tex_instr = inst->Memory.Texture;
3999   params.format = inst->Memory.Format;
4000
4001   for (i = 0; i < dim; i++) {
4002      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
4003   }
4004
4005   for (i = 0; i < 4; i++) {
4006      FETCH(&value[i], 2, TGSI_CHAN_X + i);
4007      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4008         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
4009   }
4010   if (sample)
4011      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
4012
4013   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4014      rgba[0][j] = value[0].f[j];
4015      rgba[1][j] = value[1].f[j];
4016      rgba[2][j] = value[2].f[j];
4017      rgba[3][j] = value[3].f[j];
4018   }
4019   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
4020      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4021         rgba2[0][j] = value2[0].f[j];
4022         rgba2[1][j] = value2[1].f[j];
4023         rgba2[2][j] = value2[2].f[j];
4024         rgba2[3][j] = value2[3].f[j];
4025      }
4026   }
4027
4028   mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
4029                   r[0].i, r[1].i, r[2].i, sample_r.i,
4030                   rgba, rgba2);
4031
4032   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
4033      r[0].f[j] = rgba[0][j];
4034      r[1].f[j] = rgba[1][j];
4035      r[2].f[j] = rgba[2][j];
4036      r[3].f[j] = rgba[3][j];
4037   }
4038   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4039      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4040         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
4041      }
4042   }
4043}
4044
4045static void
4046exec_atomop_membuf(struct tgsi_exec_machine *mach,
4047                   const struct tgsi_full_instruction *inst)
4048{
4049   union tgsi_exec_channel offset, r0, r1;
4050   uint chan, i;
4051   int execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
4052   IFETCH(&offset, 1, TGSI_CHAN_X);
4053
4054   if (!(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X))
4055      return;
4056
4057   void *ptr[TGSI_QUAD_SIZE];
4058   if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
4059      uint32_t unit = fetch_sampler_unit(mach, inst, 0);
4060      uint32_t size;
4061      char *buffer = mach->Buffer->lookup(mach->Buffer, unit, &size);
4062      for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
4063         if (likely(size >= 4 && offset.u[i] <= size - 4))
4064            ptr[i] = buffer + offset.u[i];
4065         else
4066            ptr[i] = NULL;
4067      }
4068   } else {
4069      assert(inst->Src[0].Register.File == TGSI_FILE_MEMORY);
4070
4071      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4072         if (likely(mach->LocalMemSize >= 4 && offset.u[i] <= mach->LocalMemSize - 4))
4073            ptr[i] = (char *)mach->LocalMem + offset.u[i];
4074         else
4075            ptr[i] = NULL;
4076      }
4077   }
4078
4079   FETCH(&r0, 2, TGSI_CHAN_X);
4080   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
4081      FETCH(&r1, 3, TGSI_CHAN_X);
4082
4083   /* The load/op/store sequence has to happen inside the loop since ptr
4084    * may have the same ptr in some of the invocations.
4085    */
4086   for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
4087      if (!(execmask & (1 << i)))
4088         continue;
4089
4090      uint32_t val = 0;
4091      if (ptr[i]) {
4092         memcpy(&val, ptr[i], sizeof(val));
4093
4094         uint32_t result;
4095         switch (inst->Instruction.Opcode) {
4096         case TGSI_OPCODE_ATOMUADD:
4097            result = val + r0.u[i];
4098            break;
4099         case TGSI_OPCODE_ATOMXOR:
4100            result = val ^ r0.u[i];
4101            break;
4102         case TGSI_OPCODE_ATOMOR:
4103            result = val | r0.u[i];
4104            break;
4105         case TGSI_OPCODE_ATOMAND:
4106            result = val & r0.u[i];
4107            break;
4108         case TGSI_OPCODE_ATOMUMIN:
4109            result = MIN2(val, r0.u[i]);
4110            break;
4111         case TGSI_OPCODE_ATOMUMAX:
4112            result = MAX2(val, r0.u[i]);
4113            break;
4114         case TGSI_OPCODE_ATOMIMIN:
4115            result = MIN2((int32_t)val, r0.i[i]);
4116            break;
4117         case TGSI_OPCODE_ATOMIMAX:
4118            result = MAX2((int32_t)val, r0.i[i]);
4119            break;
4120         case TGSI_OPCODE_ATOMXCHG:
4121            result = r0.u[i];
4122            break;
4123         case TGSI_OPCODE_ATOMCAS:
4124            if (val == r0.u[i])
4125               result = r1.u[i];
4126            else
4127               result = val;
4128            break;
4129         case TGSI_OPCODE_ATOMFADD:
4130               result = fui(uif(val) + r0.f[i]);
4131            break;
4132         default:
4133            unreachable("bad atomic op");
4134         }
4135         memcpy(ptr[i], &result, sizeof(result));
4136      }
4137
4138      r0.u[i] = val;
4139   }
4140
4141   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
4142      store_dest(mach, &r0, &inst->Dst[0], inst, chan);
4143}
4144
4145static void
4146exec_atomop(struct tgsi_exec_machine *mach,
4147            const struct tgsi_full_instruction *inst)
4148{
4149   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4150      exec_atomop_img(mach, inst);
4151   else
4152      exec_atomop_membuf(mach, inst);
4153}
4154
4155static void
4156exec_resq_img(struct tgsi_exec_machine *mach,
4157              const struct tgsi_full_instruction *inst)
4158{
4159   int result[4];
4160   union tgsi_exec_channel r[4];
4161   uint unit;
4162   int i, chan, j;
4163   struct tgsi_image_params params;
4164
4165   unit = fetch_sampler_unit(mach, inst, 0);
4166
4167   params.execmask = mach->ExecMask & mach->NonHelperMask & ~mach->KillMask;
4168   params.unit = unit;
4169   params.tgsi_tex_instr = inst->Memory.Texture;
4170   params.format = inst->Memory.Format;
4171
4172   mach->Image->get_dims(mach->Image, &params, result);
4173
4174   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
4175      for (j = 0; j < 4; j++) {
4176         r[j].i[i] = result[j];
4177      }
4178   }
4179
4180   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4181      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
4182         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan);
4183      }
4184   }
4185}
4186
4187static void
4188exec_resq_buf(struct tgsi_exec_machine *mach,
4189              const struct tgsi_full_instruction *inst)
4190{
4191   uint32_t unit = fetch_sampler_unit(mach, inst, 0);
4192   uint32_t size;
4193   (void)mach->Buffer->lookup(mach->Buffer, unit, &size);
4194
4195   union tgsi_exec_channel r;
4196   for (int i = 0; i < TGSI_QUAD_SIZE; i++)
4197      r.i[i] = size;
4198
4199   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
4200      for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4201         store_dest(mach, &r, &inst->Dst[0], inst, TGSI_CHAN_X);
4202      }
4203   }
4204}
4205
4206static void
4207exec_resq(struct tgsi_exec_machine *mach,
4208          const struct tgsi_full_instruction *inst)
4209{
4210   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
4211      exec_resq_img(mach, inst);
4212   else
4213      exec_resq_buf(mach, inst);
4214}
4215
4216static void
4217micro_f2u64(union tgsi_double_channel *dst,
4218            const union tgsi_exec_channel *src)
4219{
4220   dst->u64[0] = (uint64_t)src->f[0];
4221   dst->u64[1] = (uint64_t)src->f[1];
4222   dst->u64[2] = (uint64_t)src->f[2];
4223   dst->u64[3] = (uint64_t)src->f[3];
4224}
4225
4226static void
4227micro_f2i64(union tgsi_double_channel *dst,
4228            const union tgsi_exec_channel *src)
4229{
4230   dst->i64[0] = (int64_t)src->f[0];
4231   dst->i64[1] = (int64_t)src->f[1];
4232   dst->i64[2] = (int64_t)src->f[2];
4233   dst->i64[3] = (int64_t)src->f[3];
4234}
4235
4236static void
4237micro_u2i64(union tgsi_double_channel *dst,
4238            const union tgsi_exec_channel *src)
4239{
4240   dst->u64[0] = (uint64_t)src->u[0];
4241   dst->u64[1] = (uint64_t)src->u[1];
4242   dst->u64[2] = (uint64_t)src->u[2];
4243   dst->u64[3] = (uint64_t)src->u[3];
4244}
4245
4246static void
4247micro_i2i64(union tgsi_double_channel *dst,
4248            const union tgsi_exec_channel *src)
4249{
4250   dst->i64[0] = (int64_t)src->i[0];
4251   dst->i64[1] = (int64_t)src->i[1];
4252   dst->i64[2] = (int64_t)src->i[2];
4253   dst->i64[3] = (int64_t)src->i[3];
4254}
4255
4256static void
4257micro_d2u64(union tgsi_double_channel *dst,
4258           const union tgsi_double_channel *src)
4259{
4260   dst->u64[0] = (uint64_t)src->d[0];
4261   dst->u64[1] = (uint64_t)src->d[1];
4262   dst->u64[2] = (uint64_t)src->d[2];
4263   dst->u64[3] = (uint64_t)src->d[3];
4264}
4265
4266static void
4267micro_d2i64(union tgsi_double_channel *dst,
4268           const union tgsi_double_channel *src)
4269{
4270   dst->i64[0] = (int64_t)src->d[0];
4271   dst->i64[1] = (int64_t)src->d[1];
4272   dst->i64[2] = (int64_t)src->d[2];
4273   dst->i64[3] = (int64_t)src->d[3];
4274}
4275
4276static void
4277micro_u642d(union tgsi_double_channel *dst,
4278           const union tgsi_double_channel *src)
4279{
4280   dst->d[0] = (double)src->u64[0];
4281   dst->d[1] = (double)src->u64[1];
4282   dst->d[2] = (double)src->u64[2];
4283   dst->d[3] = (double)src->u64[3];
4284}
4285
4286static void
4287micro_i642d(union tgsi_double_channel *dst,
4288           const union tgsi_double_channel *src)
4289{
4290   dst->d[0] = (double)src->i64[0];
4291   dst->d[1] = (double)src->i64[1];
4292   dst->d[2] = (double)src->i64[2];
4293   dst->d[3] = (double)src->i64[3];
4294}
4295
4296static void
4297micro_u642f(union tgsi_exec_channel *dst,
4298            const union tgsi_double_channel *src)
4299{
4300   dst->f[0] = (float)src->u64[0];
4301   dst->f[1] = (float)src->u64[1];
4302   dst->f[2] = (float)src->u64[2];
4303   dst->f[3] = (float)src->u64[3];
4304}
4305
4306static void
4307micro_i642f(union tgsi_exec_channel *dst,
4308            const union tgsi_double_channel *src)
4309{
4310   dst->f[0] = (float)src->i64[0];
4311   dst->f[1] = (float)src->i64[1];
4312   dst->f[2] = (float)src->i64[2];
4313   dst->f[3] = (float)src->i64[3];
4314}
4315
4316static void
4317exec_t_2_64(struct tgsi_exec_machine *mach,
4318          const struct tgsi_full_instruction *inst,
4319          micro_dop_s op,
4320          enum tgsi_exec_datatype src_datatype)
4321{
4322   union tgsi_exec_channel src;
4323   union tgsi_double_channel dst;
4324
4325   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY) == TGSI_WRITEMASK_XY) {
4326      fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, src_datatype);
4327      op(&dst, &src);
4328      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_X, TGSI_CHAN_Y);
4329   }
4330   if ((inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_ZW) == TGSI_WRITEMASK_ZW) {
4331      fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_Y, src_datatype);
4332      op(&dst, &src);
4333      store_double_channel(mach, &dst, &inst->Dst[0], inst, TGSI_CHAN_Z, TGSI_CHAN_W);
4334   }
4335}
4336
4337static void
4338exec_64_2_t(struct tgsi_exec_machine *mach,
4339            const struct tgsi_full_instruction *inst,
4340            micro_sop_d op)
4341{
4342   union tgsi_double_channel src;
4343   union tgsi_exec_channel dst;
4344   int wm = inst->Dst[0].Register.WriteMask;
4345   int i;
4346   int bit;
4347   for (i = 0; i < 2; i++) {
4348      bit = ffs(wm);
4349      if (bit) {
4350         wm &= ~(1 << (bit - 1));
4351         if (i == 0)
4352            fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_CHAN_Y);
4353         else
4354            fetch_double_channel(mach, &src, &inst->Src[0], TGSI_CHAN_Z, TGSI_CHAN_W);
4355         op(&dst, &src);
4356         store_dest(mach, &dst, &inst->Dst[0], inst, bit - 1);
4357      }
4358   }
4359}
4360
4361static void
4362micro_i2f(union tgsi_exec_channel *dst,
4363          const union tgsi_exec_channel *src)
4364{
4365   dst->f[0] = (float)src->i[0];
4366   dst->f[1] = (float)src->i[1];
4367   dst->f[2] = (float)src->i[2];
4368   dst->f[3] = (float)src->i[3];
4369}
4370
4371static void
4372micro_not(union tgsi_exec_channel *dst,
4373          const union tgsi_exec_channel *src)
4374{
4375   dst->u[0] = ~src->u[0];
4376   dst->u[1] = ~src->u[1];
4377   dst->u[2] = ~src->u[2];
4378   dst->u[3] = ~src->u[3];
4379}
4380
4381static void
4382micro_shl(union tgsi_exec_channel *dst,
4383          const union tgsi_exec_channel *src0,
4384          const union tgsi_exec_channel *src1)
4385{
4386   unsigned masked_count;
4387   masked_count = src1->u[0] & 0x1f;
4388   dst->u[0] = src0->u[0] << masked_count;
4389   masked_count = src1->u[1] & 0x1f;
4390   dst->u[1] = src0->u[1] << masked_count;
4391   masked_count = src1->u[2] & 0x1f;
4392   dst->u[2] = src0->u[2] << masked_count;
4393   masked_count = src1->u[3] & 0x1f;
4394   dst->u[3] = src0->u[3] << masked_count;
4395}
4396
4397static void
4398micro_and(union tgsi_exec_channel *dst,
4399          const union tgsi_exec_channel *src0,
4400          const union tgsi_exec_channel *src1)
4401{
4402   dst->u[0] = src0->u[0] & src1->u[0];
4403   dst->u[1] = src0->u[1] & src1->u[1];
4404   dst->u[2] = src0->u[2] & src1->u[2];
4405   dst->u[3] = src0->u[3] & src1->u[3];
4406}
4407
4408static void
4409micro_or(union tgsi_exec_channel *dst,
4410         const union tgsi_exec_channel *src0,
4411         const union tgsi_exec_channel *src1)
4412{
4413   dst->u[0] = src0->u[0] | src1->u[0];
4414   dst->u[1] = src0->u[1] | src1->u[1];
4415   dst->u[2] = src0->u[2] | src1->u[2];
4416   dst->u[3] = src0->u[3] | src1->u[3];
4417}
4418
4419static void
4420micro_xor(union tgsi_exec_channel *dst,
4421          const union tgsi_exec_channel *src0,
4422          const union tgsi_exec_channel *src1)
4423{
4424   dst->u[0] = src0->u[0] ^ src1->u[0];
4425   dst->u[1] = src0->u[1] ^ src1->u[1];
4426   dst->u[2] = src0->u[2] ^ src1->u[2];
4427   dst->u[3] = src0->u[3] ^ src1->u[3];
4428}
4429
4430static void
4431micro_mod(union tgsi_exec_channel *dst,
4432          const union tgsi_exec_channel *src0,
4433          const union tgsi_exec_channel *src1)
4434{
4435   dst->i[0] = src1->i[0] ? src0->i[0] % src1->i[0] : ~0;
4436   dst->i[1] = src1->i[1] ? src0->i[1] % src1->i[1] : ~0;
4437   dst->i[2] = src1->i[2] ? src0->i[2] % src1->i[2] : ~0;
4438   dst->i[3] = src1->i[3] ? src0->i[3] % src1->i[3] : ~0;
4439}
4440
4441static void
4442micro_f2i(union tgsi_exec_channel *dst,
4443          const union tgsi_exec_channel *src)
4444{
4445   dst->i[0] = (int)src->f[0];
4446   dst->i[1] = (int)src->f[1];
4447   dst->i[2] = (int)src->f[2];
4448   dst->i[3] = (int)src->f[3];
4449}
4450
4451static void
4452micro_fseq(union tgsi_exec_channel *dst,
4453           const union tgsi_exec_channel *src0,
4454           const union tgsi_exec_channel *src1)
4455{
4456   dst->u[0] = src0->f[0] == src1->f[0] ? ~0 : 0;
4457   dst->u[1] = src0->f[1] == src1->f[1] ? ~0 : 0;
4458   dst->u[2] = src0->f[2] == src1->f[2] ? ~0 : 0;
4459   dst->u[3] = src0->f[3] == src1->f[3] ? ~0 : 0;
4460}
4461
4462static void
4463micro_fsge(union tgsi_exec_channel *dst,
4464           const union tgsi_exec_channel *src0,
4465           const union tgsi_exec_channel *src1)
4466{
4467   dst->u[0] = src0->f[0] >= src1->f[0] ? ~0 : 0;
4468   dst->u[1] = src0->f[1] >= src1->f[1] ? ~0 : 0;
4469   dst->u[2] = src0->f[2] >= src1->f[2] ? ~0 : 0;
4470   dst->u[3] = src0->f[3] >= src1->f[3] ? ~0 : 0;
4471}
4472
4473static void
4474micro_fslt(union tgsi_exec_channel *dst,
4475           const union tgsi_exec_channel *src0,
4476           const union tgsi_exec_channel *src1)
4477{
4478   dst->u[0] = src0->f[0] < src1->f[0] ? ~0 : 0;
4479   dst->u[1] = src0->f[1] < src1->f[1] ? ~0 : 0;
4480   dst->u[2] = src0->f[2] < src1->f[2] ? ~0 : 0;
4481   dst->u[3] = src0->f[3] < src1->f[3] ? ~0 : 0;
4482}
4483
4484static void
4485micro_fsne(union tgsi_exec_channel *dst,
4486           const union tgsi_exec_channel *src0,
4487           const union tgsi_exec_channel *src1)
4488{
4489   dst->u[0] = src0->f[0] != src1->f[0] ? ~0 : 0;
4490   dst->u[1] = src0->f[1] != src1->f[1] ? ~0 : 0;
4491   dst->u[2] = src0->f[2] != src1->f[2] ? ~0 : 0;
4492   dst->u[3] = src0->f[3] != src1->f[3] ? ~0 : 0;
4493}
4494
4495static void
4496micro_idiv(union tgsi_exec_channel *dst,
4497           const union tgsi_exec_channel *src0,
4498           const union tgsi_exec_channel *src1)
4499{
4500   dst->i[0] = src1->i[0] ? src0->i[0] / src1->i[0] : 0;
4501   dst->i[1] = src1->i[1] ? src0->i[1] / src1->i[1] : 0;
4502   dst->i[2] = src1->i[2] ? src0->i[2] / src1->i[2] : 0;
4503   dst->i[3] = src1->i[3] ? src0->i[3] / src1->i[3] : 0;
4504}
4505
4506static void
4507micro_imax(union tgsi_exec_channel *dst,
4508           const union tgsi_exec_channel *src0,
4509           const union tgsi_exec_channel *src1)
4510{
4511   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
4512   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
4513   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
4514   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
4515}
4516
4517static void
4518micro_imin(union tgsi_exec_channel *dst,
4519           const union tgsi_exec_channel *src0,
4520           const union tgsi_exec_channel *src1)
4521{
4522   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
4523   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
4524   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
4525   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
4526}
4527
4528static void
4529micro_isge(union tgsi_exec_channel *dst,
4530           const union tgsi_exec_channel *src0,
4531           const union tgsi_exec_channel *src1)
4532{
4533   dst->i[0] = src0->i[0] >= src1->i[0] ? -1 : 0;
4534   dst->i[1] = src0->i[1] >= src1->i[1] ? -1 : 0;
4535   dst->i[2] = src0->i[2] >= src1->i[2] ? -1 : 0;
4536   dst->i[3] = src0->i[3] >= src1->i[3] ? -1 : 0;
4537}
4538
4539static void
4540micro_ishr(union tgsi_exec_channel *dst,
4541           const union tgsi_exec_channel *src0,
4542           const union tgsi_exec_channel *src1)
4543{
4544   unsigned masked_count;
4545   masked_count = src1->i[0] & 0x1f;
4546   dst->i[0] = src0->i[0] >> masked_count;
4547   masked_count = src1->i[1] & 0x1f;
4548   dst->i[1] = src0->i[1] >> masked_count;
4549   masked_count = src1->i[2] & 0x1f;
4550   dst->i[2] = src0->i[2] >> masked_count;
4551   masked_count = src1->i[3] & 0x1f;
4552   dst->i[3] = src0->i[3] >> masked_count;
4553}
4554
4555static void
4556micro_islt(union tgsi_exec_channel *dst,
4557           const union tgsi_exec_channel *src0,
4558           const union tgsi_exec_channel *src1)
4559{
4560   dst->i[0] = src0->i[0] < src1->i[0] ? -1 : 0;
4561   dst->i[1] = src0->i[1] < src1->i[1] ? -1 : 0;
4562   dst->i[2] = src0->i[2] < src1->i[2] ? -1 : 0;
4563   dst->i[3] = src0->i[3] < src1->i[3] ? -1 : 0;
4564}
4565
4566static void
4567micro_f2u(union tgsi_exec_channel *dst,
4568          const union tgsi_exec_channel *src)
4569{
4570   dst->u[0] = (uint)src->f[0];
4571   dst->u[1] = (uint)src->f[1];
4572   dst->u[2] = (uint)src->f[2];
4573   dst->u[3] = (uint)src->f[3];
4574}
4575
4576static void
4577micro_u2f(union tgsi_exec_channel *dst,
4578          const union tgsi_exec_channel *src)
4579{
4580   dst->f[0] = (float)src->u[0];
4581   dst->f[1] = (float)src->u[1];
4582   dst->f[2] = (float)src->u[2];
4583   dst->f[3] = (float)src->u[3];
4584}
4585
4586static void
4587micro_uadd(union tgsi_exec_channel *dst,
4588           const union tgsi_exec_channel *src0,
4589           const union tgsi_exec_channel *src1)
4590{
4591   dst->u[0] = src0->u[0] + src1->u[0];
4592   dst->u[1] = src0->u[1] + src1->u[1];
4593   dst->u[2] = src0->u[2] + src1->u[2];
4594   dst->u[3] = src0->u[3] + src1->u[3];
4595}
4596
4597static void
4598micro_udiv(union tgsi_exec_channel *dst,
4599           const union tgsi_exec_channel *src0,
4600           const union tgsi_exec_channel *src1)
4601{
4602   dst->u[0] = src1->u[0] ? src0->u[0] / src1->u[0] : ~0u;
4603   dst->u[1] = src1->u[1] ? src0->u[1] / src1->u[1] : ~0u;
4604   dst->u[2] = src1->u[2] ? src0->u[2] / src1->u[2] : ~0u;
4605   dst->u[3] = src1->u[3] ? src0->u[3] / src1->u[3] : ~0u;
4606}
4607
4608static void
4609micro_umad(union tgsi_exec_channel *dst,
4610           const union tgsi_exec_channel *src0,
4611           const union tgsi_exec_channel *src1,
4612           const union tgsi_exec_channel *src2)
4613{
4614   dst->u[0] = src0->u[0] * src1->u[0] + src2->u[0];
4615   dst->u[1] = src0->u[1] * src1->u[1] + src2->u[1];
4616   dst->u[2] = src0->u[2] * src1->u[2] + src2->u[2];
4617   dst->u[3] = src0->u[3] * src1->u[3] + src2->u[3];
4618}
4619
4620static void
4621micro_umax(union tgsi_exec_channel *dst,
4622           const union tgsi_exec_channel *src0,
4623           const union tgsi_exec_channel *src1)
4624{
4625   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
4626   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
4627   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
4628   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
4629}
4630
4631static void
4632micro_umin(union tgsi_exec_channel *dst,
4633           const union tgsi_exec_channel *src0,
4634           const union tgsi_exec_channel *src1)
4635{
4636   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
4637   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
4638   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
4639   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
4640}
4641
4642static void
4643micro_umod(union tgsi_exec_channel *dst,
4644           const union tgsi_exec_channel *src0,
4645           const union tgsi_exec_channel *src1)
4646{
4647   dst->u[0] = src1->u[0] ? src0->u[0] % src1->u[0] : ~0u;
4648   dst->u[1] = src1->u[1] ? src0->u[1] % src1->u[1] : ~0u;
4649   dst->u[2] = src1->u[2] ? src0->u[2] % src1->u[2] : ~0u;
4650   dst->u[3] = src1->u[3] ? src0->u[3] % src1->u[3] : ~0u;
4651}
4652
4653static void
4654micro_umul(union tgsi_exec_channel *dst,
4655           const union tgsi_exec_channel *src0,
4656           const union tgsi_exec_channel *src1)
4657{
4658   dst->u[0] = src0->u[0] * src1->u[0];
4659   dst->u[1] = src0->u[1] * src1->u[1];
4660   dst->u[2] = src0->u[2] * src1->u[2];
4661   dst->u[3] = src0->u[3] * src1->u[3];
4662}
4663
4664static void
4665micro_imul_hi(union tgsi_exec_channel *dst,
4666              const union tgsi_exec_channel *src0,
4667              const union tgsi_exec_channel *src1)
4668{
4669#define I64M(x, y) ((((int64_t)x) * ((int64_t)y)) >> 32)
4670   dst->i[0] = I64M(src0->i[0], src1->i[0]);
4671   dst->i[1] = I64M(src0->i[1], src1->i[1]);
4672   dst->i[2] = I64M(src0->i[2], src1->i[2]);
4673   dst->i[3] = I64M(src0->i[3], src1->i[3]);
4674#undef I64M
4675}
4676
4677static void
4678micro_umul_hi(union tgsi_exec_channel *dst,
4679              const union tgsi_exec_channel *src0,
4680              const union tgsi_exec_channel *src1)
4681{
4682#define U64M(x, y) ((((uint64_t)x) * ((uint64_t)y)) >> 32)
4683   dst->u[0] = U64M(src0->u[0], src1->u[0]);
4684   dst->u[1] = U64M(src0->u[1], src1->u[1]);
4685   dst->u[2] = U64M(src0->u[2], src1->u[2]);
4686   dst->u[3] = U64M(src0->u[3], src1->u[3]);
4687#undef U64M
4688}
4689
4690static void
4691micro_useq(union tgsi_exec_channel *dst,
4692           const union tgsi_exec_channel *src0,
4693           const union tgsi_exec_channel *src1)
4694{
4695   dst->u[0] = src0->u[0] == src1->u[0] ? ~0 : 0;
4696   dst->u[1] = src0->u[1] == src1->u[1] ? ~0 : 0;
4697   dst->u[2] = src0->u[2] == src1->u[2] ? ~0 : 0;
4698   dst->u[3] = src0->u[3] == src1->u[3] ? ~0 : 0;
4699}
4700
4701static void
4702micro_usge(union tgsi_exec_channel *dst,
4703           const union tgsi_exec_channel *src0,
4704           const union tgsi_exec_channel *src1)
4705{
4706   dst->u[0] = src0->u[0] >= src1->u[0] ? ~0 : 0;
4707   dst->u[1] = src0->u[1] >= src1->u[1] ? ~0 : 0;
4708   dst->u[2] = src0->u[2] >= src1->u[2] ? ~0 : 0;
4709   dst->u[3] = src0->u[3] >= src1->u[3] ? ~0 : 0;
4710}
4711
4712static void
4713micro_ushr(union tgsi_exec_channel *dst,
4714           const union tgsi_exec_channel *src0,
4715           const union tgsi_exec_channel *src1)
4716{
4717   unsigned masked_count;
4718   masked_count = src1->u[0] & 0x1f;
4719   dst->u[0] = src0->u[0] >> masked_count;
4720   masked_count = src1->u[1] & 0x1f;
4721   dst->u[1] = src0->u[1] >> masked_count;
4722   masked_count = src1->u[2] & 0x1f;
4723   dst->u[2] = src0->u[2] >> masked_count;
4724   masked_count = src1->u[3] & 0x1f;
4725   dst->u[3] = src0->u[3] >> masked_count;
4726}
4727
4728static void
4729micro_uslt(union tgsi_exec_channel *dst,
4730           const union tgsi_exec_channel *src0,
4731           const union tgsi_exec_channel *src1)
4732{
4733   dst->u[0] = src0->u[0] < src1->u[0] ? ~0 : 0;
4734   dst->u[1] = src0->u[1] < src1->u[1] ? ~0 : 0;
4735   dst->u[2] = src0->u[2] < src1->u[2] ? ~0 : 0;
4736   dst->u[3] = src0->u[3] < src1->u[3] ? ~0 : 0;
4737}
4738
4739static void
4740micro_usne(union tgsi_exec_channel *dst,
4741           const union tgsi_exec_channel *src0,
4742           const union tgsi_exec_channel *src1)
4743{
4744   dst->u[0] = src0->u[0] != src1->u[0] ? ~0 : 0;
4745   dst->u[1] = src0->u[1] != src1->u[1] ? ~0 : 0;
4746   dst->u[2] = src0->u[2] != src1->u[2] ? ~0 : 0;
4747   dst->u[3] = src0->u[3] != src1->u[3] ? ~0 : 0;
4748}
4749
4750static void
4751micro_uarl(union tgsi_exec_channel *dst,
4752           const union tgsi_exec_channel *src)
4753{
4754   dst->i[0] = src->u[0];
4755   dst->i[1] = src->u[1];
4756   dst->i[2] = src->u[2];
4757   dst->i[3] = src->u[3];
4758}
4759
4760/**
4761 * Signed bitfield extract (i.e. sign-extend the extracted bits)
4762 */
4763static void
4764micro_ibfe(union tgsi_exec_channel *dst,
4765           const union tgsi_exec_channel *src0,
4766           const union tgsi_exec_channel *src1,
4767           const union tgsi_exec_channel *src2)
4768{
4769   int i;
4770   for (i = 0; i < 4; i++) {
4771      int width = src2->i[i];
4772      int offset = src1->i[i] & 0x1f;
4773      if (width == 32 && offset == 0) {
4774         dst->i[i] = src0->i[i];
4775         continue;
4776      }
4777      width &= 0x1f;
4778      if (width == 0)
4779         dst->i[i] = 0;
4780      else if (width + offset < 32)
4781         dst->i[i] = (src0->i[i] << (32 - width - offset)) >> (32 - width);
4782      else
4783         dst->i[i] = src0->i[i] >> offset;
4784   }
4785}
4786
4787/**
4788 * Unsigned bitfield extract
4789 */
4790static void
4791micro_ubfe(union tgsi_exec_channel *dst,
4792           const union tgsi_exec_channel *src0,
4793           const union tgsi_exec_channel *src1,
4794           const union tgsi_exec_channel *src2)
4795{
4796   int i;
4797   for (i = 0; i < 4; i++) {
4798      int width = src2->u[i];
4799      int offset = src1->u[i] & 0x1f;
4800      if (width == 32 && offset == 0) {
4801         dst->u[i] = src0->u[i];
4802         continue;
4803      }
4804      width &= 0x1f;
4805      if (width == 0)
4806         dst->u[i] = 0;
4807      else if (width + offset < 32)
4808         dst->u[i] = (src0->u[i] << (32 - width - offset)) >> (32 - width);
4809      else
4810         dst->u[i] = src0->u[i] >> offset;
4811   }
4812}
4813
4814/**
4815 * Bitfield insert: copy low bits from src1 into a region of src0.
4816 */
4817static void
4818micro_bfi(union tgsi_exec_channel *dst,
4819          const union tgsi_exec_channel *src0,
4820          const union tgsi_exec_channel *src1,
4821          const union tgsi_exec_channel *src2,
4822          const union tgsi_exec_channel *src3)
4823{
4824   int i;
4825   for (i = 0; i < 4; i++) {
4826      int width = src3->u[i];
4827      int offset = src2->u[i] & 0x1f;
4828      if (width == 32) {
4829         dst->u[i] = src1->u[i];
4830      } else {
4831         int bitmask = ((1 << width) - 1) << offset;
4832         dst->u[i] = ((src1->u[i] << offset) & bitmask) | (src0->u[i] & ~bitmask);
4833      }
4834   }
4835}
4836
4837static void
4838micro_brev(union tgsi_exec_channel *dst,
4839           const union tgsi_exec_channel *src)
4840{
4841   dst->u[0] = util_bitreverse(src->u[0]);
4842   dst->u[1] = util_bitreverse(src->u[1]);
4843   dst->u[2] = util_bitreverse(src->u[2]);
4844   dst->u[3] = util_bitreverse(src->u[3]);
4845}
4846
4847static void
4848micro_popc(union tgsi_exec_channel *dst,
4849           const union tgsi_exec_channel *src)
4850{
4851   dst->u[0] = util_bitcount(src->u[0]);
4852   dst->u[1] = util_bitcount(src->u[1]);
4853   dst->u[2] = util_bitcount(src->u[2]);
4854   dst->u[3] = util_bitcount(src->u[3]);
4855}
4856
4857static void
4858micro_lsb(union tgsi_exec_channel *dst,
4859          const union tgsi_exec_channel *src)
4860{
4861   dst->i[0] = ffs(src->u[0]) - 1;
4862   dst->i[1] = ffs(src->u[1]) - 1;
4863   dst->i[2] = ffs(src->u[2]) - 1;
4864   dst->i[3] = ffs(src->u[3]) - 1;
4865}
4866
4867static void
4868micro_imsb(union tgsi_exec_channel *dst,
4869           const union tgsi_exec_channel *src)
4870{
4871   dst->i[0] = util_last_bit_signed(src->i[0]) - 1;
4872   dst->i[1] = util_last_bit_signed(src->i[1]) - 1;
4873   dst->i[2] = util_last_bit_signed(src->i[2]) - 1;
4874   dst->i[3] = util_last_bit_signed(src->i[3]) - 1;
4875}
4876
4877static void
4878micro_umsb(union tgsi_exec_channel *dst,
4879           const union tgsi_exec_channel *src)
4880{
4881   dst->i[0] = util_last_bit(src->u[0]) - 1;
4882   dst->i[1] = util_last_bit(src->u[1]) - 1;
4883   dst->i[2] = util_last_bit(src->u[2]) - 1;
4884   dst->i[3] = util_last_bit(src->u[3]) - 1;
4885}
4886
4887
4888static void
4889exec_interp_at_sample(struct tgsi_exec_machine *mach,
4890                      const struct tgsi_full_instruction *inst)
4891{
4892   union tgsi_exec_channel index;
4893   union tgsi_exec_channel index2D;
4894   union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
4895   const struct tgsi_full_src_register *reg = &inst->Src[0];
4896
4897   assert(reg->Register.File == TGSI_FILE_INPUT);
4898   assert(inst->Src[1].Register.File == TGSI_FILE_IMMEDIATE);
4899
4900   get_index_registers(mach, reg, &index, &index2D);
4901   float sample = mach->Imms[inst->Src[1].Register.Index][inst->Src[1].Register.SwizzleX];
4902
4903   /* Short cut: sample 0 is like a normal fetch */
4904   for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4905      if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
4906         continue;
4907
4908      fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
4909                             &result[chan]);
4910      if (sample != 0.0f) {
4911
4912      /* TODO: define the samples > 0, but so far we only do fake MSAA */
4913         float x = 0;
4914         float y = 0;
4915
4916         unsigned pos = index2D.i[chan] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[chan];
4917         assert(pos >= 0);
4918         assert(pos < TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS);
4919         mach->InputSampleOffsetApply[pos](mach, pos, chan, x, y, &result[chan]);
4920      }
4921      store_dest(mach, &result[chan], &inst->Dst[0], inst, chan);
4922   }
4923}
4924
4925
4926static void
4927exec_interp_at_offset(struct tgsi_exec_machine *mach,
4928                      const struct tgsi_full_instruction *inst)
4929{
4930   union tgsi_exec_channel index;
4931   union tgsi_exec_channel index2D;
4932   union tgsi_exec_channel ofsx;
4933   union tgsi_exec_channel ofsy;
4934   const struct tgsi_full_src_register *reg = &inst->Src[0];
4935
4936   assert(reg->Register.File == TGSI_FILE_INPUT);
4937
4938   get_index_registers(mach, reg, &index, &index2D);
4939   unsigned pos = index2D.i[0] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index.i[0];
4940
4941   fetch_source(mach, &ofsx, &inst->Src[1], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
4942   fetch_source(mach, &ofsy, &inst->Src[1], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
4943
4944   for (int chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4945      if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
4946         continue;
4947      union tgsi_exec_channel result;
4948      fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D, &result);
4949      mach->InputSampleOffsetApply[pos](mach, pos, chan, ofsx.f[chan], ofsy.f[chan], &result);
4950      store_dest(mach, &result, &inst->Dst[0], inst, chan);
4951   }
4952}
4953
4954
4955static void
4956exec_interp_at_centroid(struct tgsi_exec_machine *mach,
4957                        const struct tgsi_full_instruction *inst)
4958{
4959   union tgsi_exec_channel index;
4960   union tgsi_exec_channel index2D;
4961   union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
4962   const struct tgsi_full_src_register *reg = &inst->Src[0];
4963
4964   assert(reg->Register.File == TGSI_FILE_INPUT);
4965   get_index_registers(mach, reg, &index, &index2D);
4966
4967   for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
4968      if (!(inst->Dst[0].Register.WriteMask & (1 << chan)))
4969         continue;
4970
4971      /* Here we should add the change to use a sample that lies within the
4972       * primitive (Section 15.2):
4973       *
4974       * "When interpolating variables declared using centroid in ,
4975       * the variable is sampled at a location within the pixel covered
4976       * by the primitive generating the fragment.
4977       * ...
4978       * The built-in functions interpolateAtCentroid ... will sample
4979       * variables as though they were declared with the centroid ...
4980       * qualifier[s]."
4981       *
4982       * Since we only support 1 sample currently, this is just a pass-through.
4983       */
4984      fetch_src_file_channel(mach, TGSI_FILE_INPUT, chan, &index, &index2D,
4985                             &result[chan]);
4986      store_dest(mach, &result[chan], &inst->Dst[0], inst, chan);
4987   }
4988
4989}
4990
4991
4992/**
4993 * Execute a TGSI instruction.
4994 * Returns TRUE if a barrier instruction is hit,
4995 * otherwise FALSE.
4996 */
4997static boolean
4998exec_instruction(
4999   struct tgsi_exec_machine *mach,
5000   const struct tgsi_full_instruction *inst,
5001   int *pc )
5002{
5003   union tgsi_exec_channel r[10];
5004
5005   (*pc)++;
5006
5007   switch (inst->Instruction.Opcode) {
5008   case TGSI_OPCODE_ARL:
5009      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_FLOAT);
5010      break;
5011
5012   case TGSI_OPCODE_MOV:
5013      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_FLOAT);
5014      break;
5015
5016   case TGSI_OPCODE_LIT:
5017      exec_lit(mach, inst);
5018      break;
5019
5020   case TGSI_OPCODE_RCP:
5021      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT);
5022      break;
5023
5024   case TGSI_OPCODE_RSQ:
5025      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT);
5026      break;
5027
5028   case TGSI_OPCODE_EXP:
5029      exec_exp(mach, inst);
5030      break;
5031
5032   case TGSI_OPCODE_LOG:
5033      exec_log(mach, inst);
5034      break;
5035
5036   case TGSI_OPCODE_MUL:
5037      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT);
5038      break;
5039
5040   case TGSI_OPCODE_ADD:
5041      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT);
5042      break;
5043
5044   case TGSI_OPCODE_DP3:
5045      exec_dp3(mach, inst);
5046      break;
5047
5048   case TGSI_OPCODE_DP4:
5049      exec_dp4(mach, inst);
5050      break;
5051
5052   case TGSI_OPCODE_DST:
5053      exec_dst(mach, inst);
5054      break;
5055
5056   case TGSI_OPCODE_MIN:
5057      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT);
5058      break;
5059
5060   case TGSI_OPCODE_MAX:
5061      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT);
5062      break;
5063
5064   case TGSI_OPCODE_SLT:
5065      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT);
5066      break;
5067
5068   case TGSI_OPCODE_SGE:
5069      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT);
5070      break;
5071
5072   case TGSI_OPCODE_MAD:
5073      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT);
5074      break;
5075
5076   case TGSI_OPCODE_LRP:
5077      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT);
5078      break;
5079
5080   case TGSI_OPCODE_SQRT:
5081      exec_scalar_unary(mach, inst, micro_sqrt, TGSI_EXEC_DATA_FLOAT);
5082      break;
5083
5084   case TGSI_OPCODE_FRC:
5085      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT);
5086      break;
5087
5088   case TGSI_OPCODE_FLR:
5089      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT);
5090      break;
5091
5092   case TGSI_OPCODE_ROUND:
5093      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT);
5094      break;
5095
5096   case TGSI_OPCODE_EX2:
5097      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT);
5098      break;
5099
5100   case TGSI_OPCODE_LG2:
5101      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT);
5102      break;
5103
5104   case TGSI_OPCODE_POW:
5105      exec_scalar_binary(mach, inst, micro_pow, TGSI_EXEC_DATA_FLOAT);
5106      break;
5107
5108   case TGSI_OPCODE_LDEXP:
5109      exec_vector_binary(mach, inst, micro_ldexp, TGSI_EXEC_DATA_FLOAT);
5110      break;
5111
5112   case TGSI_OPCODE_COS:
5113      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT);
5114      break;
5115
5116   case TGSI_OPCODE_DDX_FINE:
5117      exec_vector_unary(mach, inst, micro_ddx_fine, TGSI_EXEC_DATA_FLOAT);
5118      break;
5119
5120   case TGSI_OPCODE_DDX:
5121      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT);
5122      break;
5123
5124   case TGSI_OPCODE_DDY_FINE:
5125      exec_vector_unary(mach, inst, micro_ddy_fine, TGSI_EXEC_DATA_FLOAT);
5126      break;
5127
5128   case TGSI_OPCODE_DDY:
5129      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT);
5130      break;
5131
5132   case TGSI_OPCODE_KILL:
5133      exec_kill (mach);
5134      break;
5135
5136   case TGSI_OPCODE_KILL_IF:
5137      exec_kill_if (mach, inst);
5138      break;
5139
5140   case TGSI_OPCODE_PK2H:
5141      exec_pk2h(mach, inst);
5142      break;
5143
5144   case TGSI_OPCODE_PK2US:
5145      assert (0);
5146      break;
5147
5148   case TGSI_OPCODE_PK4B:
5149      assert (0);
5150      break;
5151
5152   case TGSI_OPCODE_PK4UB:
5153      assert (0);
5154      break;
5155
5156   case TGSI_OPCODE_SEQ:
5157      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT);
5158      break;
5159
5160   case TGSI_OPCODE_SGT:
5161      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT);
5162      break;
5163
5164   case TGSI_OPCODE_SIN:
5165      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT);
5166      break;
5167
5168   case TGSI_OPCODE_SLE:
5169      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT);
5170      break;
5171
5172   case TGSI_OPCODE_SNE:
5173      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT);
5174      break;
5175
5176   case TGSI_OPCODE_TEX:
5177      /* simple texture lookup */
5178      /* src[0] = texcoord */
5179      /* src[1] = sampler unit */
5180      exec_tex(mach, inst, TEX_MODIFIER_NONE, 1);
5181      break;
5182
5183   case TGSI_OPCODE_TXB:
5184      /* Texture lookup with lod bias */
5185      /* src[0] = texcoord (src[0].w = LOD bias) */
5186      /* src[1] = sampler unit */
5187      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 1);
5188      break;
5189
5190   case TGSI_OPCODE_TXD:
5191      /* Texture lookup with explict partial derivatives */
5192      /* src[0] = texcoord */
5193      /* src[1] = d[strq]/dx */
5194      /* src[2] = d[strq]/dy */
5195      /* src[3] = sampler unit */
5196      exec_txd(mach, inst);
5197      break;
5198
5199   case TGSI_OPCODE_TXL:
5200      /* Texture lookup with explit LOD */
5201      /* src[0] = texcoord (src[0].w = LOD) */
5202      /* src[1] = sampler unit */
5203      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 1);
5204      break;
5205
5206   case TGSI_OPCODE_TXP:
5207      /* Texture lookup with projection */
5208      /* src[0] = texcoord (src[0].w = projection) */
5209      /* src[1] = sampler unit */
5210      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
5211      break;
5212
5213   case TGSI_OPCODE_TG4:
5214      /* src[0] = texcoord */
5215      /* src[1] = component */
5216      /* src[2] = sampler unit */
5217      exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
5218      break;
5219
5220   case TGSI_OPCODE_LODQ:
5221      /* src[0] = texcoord */
5222      /* src[1] = sampler unit */
5223      exec_lodq(mach, inst);
5224      break;
5225
5226   case TGSI_OPCODE_UP2H:
5227      exec_up2h(mach, inst);
5228      break;
5229
5230   case TGSI_OPCODE_UP2US:
5231      assert (0);
5232      break;
5233
5234   case TGSI_OPCODE_UP4B:
5235      assert (0);
5236      break;
5237
5238   case TGSI_OPCODE_UP4UB:
5239      assert (0);
5240      break;
5241
5242   case TGSI_OPCODE_ARR:
5243      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_FLOAT);
5244      break;
5245
5246   case TGSI_OPCODE_CAL:
5247      /* skip the call if no execution channels are enabled */
5248      if (mach->ExecMask) {
5249         /* do the call */
5250
5251         /* First, record the depths of the execution stacks.
5252          * This is important for deeply nested/looped return statements.
5253          * We have to unwind the stacks by the correct amount.  For a
5254          * real code generator, we could determine the number of entries
5255          * to pop off each stack with simple static analysis and avoid
5256          * implementing this data structure at run time.
5257          */
5258         mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
5259         mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
5260         mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
5261         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
5262         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
5263         /* note that PC was already incremented above */
5264         mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
5265
5266         mach->CallStackTop++;
5267
5268         /* Second, push the Cond, Loop, Cont, Func stacks */
5269         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5270         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5271         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5272         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
5273         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5274         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
5275
5276         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5277         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5278         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5279         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
5280         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5281         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
5282
5283         /* Finally, jump to the subroutine.  The label is a pointer
5284          * (an instruction number) to the BGNSUB instruction.
5285          */
5286         *pc = inst->Label.Label;
5287         assert(mach->Instructions[*pc].Instruction.Opcode
5288                == TGSI_OPCODE_BGNSUB);
5289      }
5290      break;
5291
5292   case TGSI_OPCODE_RET:
5293      mach->FuncMask &= ~mach->ExecMask;
5294      UPDATE_EXEC_MASK(mach);
5295
5296      if (mach->FuncMask == 0x0) {
5297         /* really return now (otherwise, keep executing */
5298
5299         if (mach->CallStackTop == 0) {
5300            /* returning from main() */
5301            mach->CondStackTop = 0;
5302            mach->LoopStackTop = 0;
5303            mach->ContStackTop = 0;
5304            mach->LoopLabelStackTop = 0;
5305            mach->SwitchStackTop = 0;
5306            mach->BreakStackTop = 0;
5307            *pc = -1;
5308            return FALSE;
5309         }
5310
5311         assert(mach->CallStackTop > 0);
5312         mach->CallStackTop--;
5313
5314         mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5315         mach->CondMask = mach->CondStack[mach->CondStackTop];
5316
5317         mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5318         mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5319
5320         mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5321         mach->ContMask = mach->ContStack[mach->ContStackTop];
5322
5323         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5324         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5325
5326         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5327         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5328
5329         assert(mach->FuncStackTop > 0);
5330         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5331
5332         *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5333
5334         UPDATE_EXEC_MASK(mach);
5335      }
5336      break;
5337
5338   case TGSI_OPCODE_SSG:
5339      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT);
5340      break;
5341
5342   case TGSI_OPCODE_CMP:
5343      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT);
5344      break;
5345
5346   case TGSI_OPCODE_DIV:
5347      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT);
5348      break;
5349
5350   case TGSI_OPCODE_DP2:
5351      exec_dp2(mach, inst);
5352      break;
5353
5354   case TGSI_OPCODE_IF:
5355      /* push CondMask */
5356      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5357      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5358      FETCH( &r[0], 0, TGSI_CHAN_X );
5359      for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
5360         if (!r[0].f[i])
5361            mach->CondMask &= ~(1 << i);
5362      }
5363      UPDATE_EXEC_MASK(mach);
5364      /* If no channels are taking the then branch, jump to ELSE. */
5365      if (!mach->CondMask)
5366         *pc = inst->Label.Label;
5367      break;
5368
5369   case TGSI_OPCODE_UIF:
5370      /* push CondMask */
5371      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
5372      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
5373      IFETCH( &r[0], 0, TGSI_CHAN_X );
5374      for (int i = 0; i < TGSI_QUAD_SIZE; i++) {
5375         if (!r[0].u[i])
5376            mach->CondMask &= ~(1 << i);
5377      }
5378      UPDATE_EXEC_MASK(mach);
5379      /* If no channels are taking the then branch, jump to ELSE. */
5380      if (!mach->CondMask)
5381         *pc = inst->Label.Label;
5382      break;
5383
5384   case TGSI_OPCODE_ELSE:
5385      /* invert CondMask wrt previous mask */
5386      {
5387         uint prevMask;
5388         assert(mach->CondStackTop > 0);
5389         prevMask = mach->CondStack[mach->CondStackTop - 1];
5390         mach->CondMask = ~mach->CondMask & prevMask;
5391         UPDATE_EXEC_MASK(mach);
5392
5393         /* If no channels are taking ELSE, jump to ENDIF */
5394         if (!mach->CondMask)
5395            *pc = inst->Label.Label;
5396      }
5397      break;
5398
5399   case TGSI_OPCODE_ENDIF:
5400      /* pop CondMask */
5401      assert(mach->CondStackTop > 0);
5402      mach->CondMask = mach->CondStack[--mach->CondStackTop];
5403      UPDATE_EXEC_MASK(mach);
5404      break;
5405
5406   case TGSI_OPCODE_END:
5407      /* make sure we end primitives which haven't
5408       * been explicitly emitted */
5409      conditional_emit_primitive(mach);
5410      /* halt execution */
5411      *pc = -1;
5412      break;
5413
5414   case TGSI_OPCODE_CEIL:
5415      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT);
5416      break;
5417
5418   case TGSI_OPCODE_I2F:
5419      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_INT);
5420      break;
5421
5422   case TGSI_OPCODE_NOT:
5423      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT);
5424      break;
5425
5426   case TGSI_OPCODE_TRUNC:
5427      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT);
5428      break;
5429
5430   case TGSI_OPCODE_SHL:
5431      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT);
5432      break;
5433
5434   case TGSI_OPCODE_AND:
5435      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT);
5436      break;
5437
5438   case TGSI_OPCODE_OR:
5439      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT);
5440      break;
5441
5442   case TGSI_OPCODE_MOD:
5443      exec_vector_binary(mach, inst, micro_mod, TGSI_EXEC_DATA_INT);
5444      break;
5445
5446   case TGSI_OPCODE_XOR:
5447      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT);
5448      break;
5449
5450   case TGSI_OPCODE_TXF:
5451      exec_txf(mach, inst);
5452      break;
5453
5454   case TGSI_OPCODE_TXQ:
5455      exec_txq(mach, inst);
5456      break;
5457
5458   case TGSI_OPCODE_EMIT:
5459      emit_vertex(mach, inst);
5460      break;
5461
5462   case TGSI_OPCODE_ENDPRIM:
5463      emit_primitive(mach, inst);
5464      break;
5465
5466   case TGSI_OPCODE_BGNLOOP:
5467      /* push LoopMask and ContMasks */
5468      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5469      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5470      assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
5471      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
5472
5473      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
5474      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
5475      mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
5476      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
5477      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
5478      break;
5479
5480   case TGSI_OPCODE_ENDLOOP:
5481      /* Restore ContMask, but don't pop */
5482      assert(mach->ContStackTop > 0);
5483      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
5484      UPDATE_EXEC_MASK(mach);
5485      if (mach->ExecMask) {
5486         /* repeat loop: jump to instruction just past BGNLOOP */
5487         assert(mach->LoopLabelStackTop > 0);
5488         *pc = mach->LoopLabelStack[mach->LoopLabelStackTop - 1] + 1;
5489      }
5490      else {
5491         /* exit loop: pop LoopMask */
5492         assert(mach->LoopStackTop > 0);
5493         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
5494         /* pop ContMask */
5495         assert(mach->ContStackTop > 0);
5496         mach->ContMask = mach->ContStack[--mach->ContStackTop];
5497         assert(mach->LoopLabelStackTop > 0);
5498         --mach->LoopLabelStackTop;
5499
5500         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
5501      }
5502      UPDATE_EXEC_MASK(mach);
5503      break;
5504
5505   case TGSI_OPCODE_BRK:
5506      exec_break(mach);
5507      break;
5508
5509   case TGSI_OPCODE_CONT:
5510      /* turn off cont channels for each enabled exec channel */
5511      mach->ContMask &= ~mach->ExecMask;
5512      /* Todo: if mach->LoopMask == 0, jump to end of loop */
5513      UPDATE_EXEC_MASK(mach);
5514      break;
5515
5516   case TGSI_OPCODE_BGNSUB:
5517      /* no-op */
5518      break;
5519
5520   case TGSI_OPCODE_ENDSUB:
5521      /*
5522       * XXX: This really should be a no-op. We should never reach this opcode.
5523       */
5524
5525      assert(mach->CallStackTop > 0);
5526      mach->CallStackTop--;
5527
5528      mach->CondStackTop = mach->CallStack[mach->CallStackTop].CondStackTop;
5529      mach->CondMask = mach->CondStack[mach->CondStackTop];
5530
5531      mach->LoopStackTop = mach->CallStack[mach->CallStackTop].LoopStackTop;
5532      mach->LoopMask = mach->LoopStack[mach->LoopStackTop];
5533
5534      mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
5535      mach->ContMask = mach->ContStack[mach->ContStackTop];
5536
5537      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
5538      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
5539
5540      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
5541      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
5542
5543      assert(mach->FuncStackTop > 0);
5544      mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
5545
5546      *pc = mach->CallStack[mach->CallStackTop].ReturnAddr;
5547
5548      UPDATE_EXEC_MASK(mach);
5549      break;
5550
5551   case TGSI_OPCODE_NOP:
5552      break;
5553
5554   case TGSI_OPCODE_F2I:
5555      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_FLOAT);
5556      break;
5557
5558   case TGSI_OPCODE_FSEQ:
5559      exec_vector_binary(mach, inst, micro_fseq, TGSI_EXEC_DATA_FLOAT);
5560      break;
5561
5562   case TGSI_OPCODE_FSGE:
5563      exec_vector_binary(mach, inst, micro_fsge, TGSI_EXEC_DATA_FLOAT);
5564      break;
5565
5566   case TGSI_OPCODE_FSLT:
5567      exec_vector_binary(mach, inst, micro_fslt, TGSI_EXEC_DATA_FLOAT);
5568      break;
5569
5570   case TGSI_OPCODE_FSNE:
5571      exec_vector_binary(mach, inst, micro_fsne, TGSI_EXEC_DATA_FLOAT);
5572      break;
5573
5574   case TGSI_OPCODE_IDIV:
5575      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT);
5576      break;
5577
5578   case TGSI_OPCODE_IMAX:
5579      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT);
5580      break;
5581
5582   case TGSI_OPCODE_IMIN:
5583      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT);
5584      break;
5585
5586   case TGSI_OPCODE_INEG:
5587      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT);
5588      break;
5589
5590   case TGSI_OPCODE_ISGE:
5591      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT);
5592      break;
5593
5594   case TGSI_OPCODE_ISHR:
5595      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT);
5596      break;
5597
5598   case TGSI_OPCODE_ISLT:
5599      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT);
5600      break;
5601
5602   case TGSI_OPCODE_F2U:
5603      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_FLOAT);
5604      break;
5605
5606   case TGSI_OPCODE_U2F:
5607      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_UINT);
5608      break;
5609
5610   case TGSI_OPCODE_UADD:
5611      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_INT);
5612      break;
5613
5614   case TGSI_OPCODE_UDIV:
5615      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT);
5616      break;
5617
5618   case TGSI_OPCODE_UMAD:
5619      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT);
5620      break;
5621
5622   case TGSI_OPCODE_UMAX:
5623      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT);
5624      break;
5625
5626   case TGSI_OPCODE_UMIN:
5627      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT);
5628      break;
5629
5630   case TGSI_OPCODE_UMOD:
5631      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT);
5632      break;
5633
5634   case TGSI_OPCODE_UMUL:
5635      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT);
5636      break;
5637
5638   case TGSI_OPCODE_IMUL_HI:
5639      exec_vector_binary(mach, inst, micro_imul_hi, TGSI_EXEC_DATA_INT);
5640      break;
5641
5642   case TGSI_OPCODE_UMUL_HI:
5643      exec_vector_binary(mach, inst, micro_umul_hi, TGSI_EXEC_DATA_UINT);
5644      break;
5645
5646   case TGSI_OPCODE_USEQ:
5647      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT);
5648      break;
5649
5650   case TGSI_OPCODE_USGE:
5651      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT);
5652      break;
5653
5654   case TGSI_OPCODE_USHR:
5655      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT);
5656      break;
5657
5658   case TGSI_OPCODE_USLT:
5659      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT);
5660      break;
5661
5662   case TGSI_OPCODE_USNE:
5663      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT);
5664      break;
5665
5666   case TGSI_OPCODE_SWITCH:
5667      exec_switch(mach, inst);
5668      break;
5669
5670   case TGSI_OPCODE_CASE:
5671      exec_case(mach, inst);
5672      break;
5673
5674   case TGSI_OPCODE_DEFAULT:
5675      exec_default(mach);
5676      break;
5677
5678   case TGSI_OPCODE_ENDSWITCH:
5679      exec_endswitch(mach);
5680      break;
5681
5682   case TGSI_OPCODE_SAMPLE_I:
5683      exec_txf(mach, inst);
5684      break;
5685
5686   case TGSI_OPCODE_SAMPLE_I_MS:
5687      exec_txf(mach, inst);
5688      break;
5689
5690   case TGSI_OPCODE_SAMPLE:
5691      exec_sample(mach, inst, TEX_MODIFIER_NONE, FALSE);
5692      break;
5693
5694   case TGSI_OPCODE_SAMPLE_B:
5695      exec_sample(mach, inst, TEX_MODIFIER_LOD_BIAS, FALSE);
5696      break;
5697
5698   case TGSI_OPCODE_SAMPLE_C:
5699      exec_sample(mach, inst, TEX_MODIFIER_NONE, TRUE);
5700      break;
5701
5702   case TGSI_OPCODE_SAMPLE_C_LZ:
5703      exec_sample(mach, inst, TEX_MODIFIER_LEVEL_ZERO, TRUE);
5704      break;
5705
5706   case TGSI_OPCODE_SAMPLE_D:
5707      exec_sample_d(mach, inst);
5708      break;
5709
5710   case TGSI_OPCODE_SAMPLE_L:
5711      exec_sample(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, FALSE);
5712      break;
5713
5714   case TGSI_OPCODE_GATHER4:
5715      exec_sample(mach, inst, TEX_MODIFIER_GATHER, FALSE);
5716      break;
5717
5718   case TGSI_OPCODE_SVIEWINFO:
5719      exec_txq(mach, inst);
5720      break;
5721
5722   case TGSI_OPCODE_SAMPLE_POS:
5723      assert(0);
5724      break;
5725
5726   case TGSI_OPCODE_SAMPLE_INFO:
5727      assert(0);
5728      break;
5729
5730   case TGSI_OPCODE_LOD:
5731      exec_lodq(mach, inst);
5732      break;
5733
5734   case TGSI_OPCODE_UARL:
5735      exec_vector_unary(mach, inst, micro_uarl, TGSI_EXEC_DATA_UINT);
5736      break;
5737
5738   case TGSI_OPCODE_UCMP:
5739      exec_ucmp(mach, inst);
5740      break;
5741
5742   case TGSI_OPCODE_IABS:
5743      exec_vector_unary(mach, inst, micro_iabs, TGSI_EXEC_DATA_INT);
5744      break;
5745
5746   case TGSI_OPCODE_ISSG:
5747      exec_vector_unary(mach, inst, micro_isgn, TGSI_EXEC_DATA_INT);
5748      break;
5749
5750   case TGSI_OPCODE_TEX2:
5751      /* simple texture lookup */
5752      /* src[0] = texcoord */
5753      /* src[1] = compare */
5754      /* src[2] = sampler unit */
5755      exec_tex(mach, inst, TEX_MODIFIER_NONE, 2);
5756      break;
5757   case TGSI_OPCODE_TXB2:
5758      /* simple texture lookup */
5759      /* src[0] = texcoord */
5760      /* src[1] = bias */
5761      /* src[2] = sampler unit */
5762      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS, 2);
5763      break;
5764   case TGSI_OPCODE_TXL2:
5765      /* simple texture lookup */
5766      /* src[0] = texcoord */
5767      /* src[1] = lod */
5768      /* src[2] = sampler unit */
5769      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD, 2);
5770      break;
5771
5772   case TGSI_OPCODE_IBFE:
5773      exec_vector_trinary(mach, inst, micro_ibfe, TGSI_EXEC_DATA_INT);
5774      break;
5775   case TGSI_OPCODE_UBFE:
5776      exec_vector_trinary(mach, inst, micro_ubfe, TGSI_EXEC_DATA_UINT);
5777      break;
5778   case TGSI_OPCODE_BFI:
5779      exec_vector_quaternary(mach, inst, micro_bfi, TGSI_EXEC_DATA_UINT);
5780      break;
5781   case TGSI_OPCODE_BREV:
5782      exec_vector_unary(mach, inst, micro_brev, TGSI_EXEC_DATA_UINT);
5783      break;
5784   case TGSI_OPCODE_POPC:
5785      exec_vector_unary(mach, inst, micro_popc, TGSI_EXEC_DATA_UINT);
5786      break;
5787   case TGSI_OPCODE_LSB:
5788      exec_vector_unary(mach, inst, micro_lsb, TGSI_EXEC_DATA_UINT);
5789      break;
5790   case TGSI_OPCODE_IMSB:
5791      exec_vector_unary(mach, inst, micro_imsb, TGSI_EXEC_DATA_INT);
5792      break;
5793   case TGSI_OPCODE_UMSB:
5794      exec_vector_unary(mach, inst, micro_umsb, TGSI_EXEC_DATA_UINT);
5795      break;
5796
5797   case TGSI_OPCODE_F2D:
5798      exec_t_2_64(mach, inst, micro_f2d, TGSI_EXEC_DATA_FLOAT);
5799      break;
5800
5801   case TGSI_OPCODE_D2F:
5802      exec_64_2_t(mach, inst, micro_d2f);
5803      break;
5804
5805   case TGSI_OPCODE_DABS:
5806      exec_double_unary(mach, inst, micro_dabs);
5807      break;
5808
5809   case TGSI_OPCODE_DNEG:
5810      exec_double_unary(mach, inst, micro_dneg);
5811      break;
5812
5813   case TGSI_OPCODE_DADD:
5814      exec_double_binary(mach, inst, micro_dadd, TGSI_EXEC_DATA_DOUBLE);
5815      break;
5816
5817   case TGSI_OPCODE_DDIV:
5818      exec_double_binary(mach, inst, micro_ddiv, TGSI_EXEC_DATA_DOUBLE);
5819      break;
5820
5821   case TGSI_OPCODE_DMUL:
5822      exec_double_binary(mach, inst, micro_dmul, TGSI_EXEC_DATA_DOUBLE);
5823      break;
5824
5825   case TGSI_OPCODE_DMAX:
5826      exec_double_binary(mach, inst, micro_dmax, TGSI_EXEC_DATA_DOUBLE);
5827      break;
5828
5829   case TGSI_OPCODE_DMIN:
5830      exec_double_binary(mach, inst, micro_dmin, TGSI_EXEC_DATA_DOUBLE);
5831      break;
5832
5833   case TGSI_OPCODE_DSLT:
5834      exec_double_binary(mach, inst, micro_dslt, TGSI_EXEC_DATA_UINT);
5835      break;
5836
5837   case TGSI_OPCODE_DSGE:
5838      exec_double_binary(mach, inst, micro_dsge, TGSI_EXEC_DATA_UINT);
5839      break;
5840
5841   case TGSI_OPCODE_DSEQ:
5842      exec_double_binary(mach, inst, micro_dseq, TGSI_EXEC_DATA_UINT);
5843      break;
5844
5845   case TGSI_OPCODE_DSNE:
5846      exec_double_binary(mach, inst, micro_dsne, TGSI_EXEC_DATA_UINT);
5847      break;
5848
5849   case TGSI_OPCODE_DRCP:
5850      exec_double_unary(mach, inst, micro_drcp);
5851      break;
5852
5853   case TGSI_OPCODE_DSQRT:
5854      exec_double_unary(mach, inst, micro_dsqrt);
5855      break;
5856
5857   case TGSI_OPCODE_DRSQ:
5858      exec_double_unary(mach, inst, micro_drsq);
5859      break;
5860
5861   case TGSI_OPCODE_DMAD:
5862      exec_double_trinary(mach, inst, micro_dmad);
5863      break;
5864
5865   case TGSI_OPCODE_DFRAC:
5866      exec_double_unary(mach, inst, micro_dfrac);
5867      break;
5868
5869   case TGSI_OPCODE_DFLR:
5870      exec_double_unary(mach, inst, micro_dflr);
5871      break;
5872
5873   case TGSI_OPCODE_DLDEXP:
5874      exec_dldexp(mach, inst);
5875      break;
5876
5877   case TGSI_OPCODE_DFRACEXP:
5878      exec_dfracexp(mach, inst);
5879      break;
5880
5881   case TGSI_OPCODE_I2D:
5882      exec_t_2_64(mach, inst, micro_i2d, TGSI_EXEC_DATA_FLOAT);
5883      break;
5884
5885   case TGSI_OPCODE_D2I:
5886      exec_64_2_t(mach, inst, micro_d2i);
5887      break;
5888
5889   case TGSI_OPCODE_U2D:
5890      exec_t_2_64(mach, inst, micro_u2d, TGSI_EXEC_DATA_FLOAT);
5891      break;
5892
5893   case TGSI_OPCODE_D2U:
5894      exec_64_2_t(mach, inst, micro_d2u);
5895      break;
5896
5897   case TGSI_OPCODE_LOAD:
5898      exec_load(mach, inst);
5899      break;
5900
5901   case TGSI_OPCODE_STORE:
5902      exec_store(mach, inst);
5903      break;
5904
5905   case TGSI_OPCODE_ATOMUADD:
5906   case TGSI_OPCODE_ATOMXCHG:
5907   case TGSI_OPCODE_ATOMCAS:
5908   case TGSI_OPCODE_ATOMAND:
5909   case TGSI_OPCODE_ATOMOR:
5910   case TGSI_OPCODE_ATOMXOR:
5911   case TGSI_OPCODE_ATOMUMIN:
5912   case TGSI_OPCODE_ATOMUMAX:
5913   case TGSI_OPCODE_ATOMIMIN:
5914   case TGSI_OPCODE_ATOMIMAX:
5915   case TGSI_OPCODE_ATOMFADD:
5916      exec_atomop(mach, inst);
5917      break;
5918
5919   case TGSI_OPCODE_RESQ:
5920      exec_resq(mach, inst);
5921      break;
5922   case TGSI_OPCODE_BARRIER:
5923   case TGSI_OPCODE_MEMBAR:
5924      return TRUE;
5925      break;
5926
5927   case TGSI_OPCODE_I64ABS:
5928      exec_double_unary(mach, inst, micro_i64abs);
5929      break;
5930
5931   case TGSI_OPCODE_I64SSG:
5932      exec_double_unary(mach, inst, micro_i64sgn);
5933      break;
5934
5935   case TGSI_OPCODE_I64NEG:
5936      exec_double_unary(mach, inst, micro_i64neg);
5937      break;
5938
5939   case TGSI_OPCODE_U64SEQ:
5940      exec_double_binary(mach, inst, micro_u64seq, TGSI_EXEC_DATA_UINT);
5941      break;
5942
5943   case TGSI_OPCODE_U64SNE:
5944      exec_double_binary(mach, inst, micro_u64sne, TGSI_EXEC_DATA_UINT);
5945      break;
5946
5947   case TGSI_OPCODE_I64SLT:
5948      exec_double_binary(mach, inst, micro_i64slt, TGSI_EXEC_DATA_UINT);
5949      break;
5950   case TGSI_OPCODE_U64SLT:
5951      exec_double_binary(mach, inst, micro_u64slt, TGSI_EXEC_DATA_UINT);
5952      break;
5953
5954   case TGSI_OPCODE_I64SGE:
5955      exec_double_binary(mach, inst, micro_i64sge, TGSI_EXEC_DATA_UINT);
5956      break;
5957   case TGSI_OPCODE_U64SGE:
5958      exec_double_binary(mach, inst, micro_u64sge, TGSI_EXEC_DATA_UINT);
5959      break;
5960
5961   case TGSI_OPCODE_I64MIN:
5962      exec_double_binary(mach, inst, micro_i64min, TGSI_EXEC_DATA_INT64);
5963      break;
5964   case TGSI_OPCODE_U64MIN:
5965      exec_double_binary(mach, inst, micro_u64min, TGSI_EXEC_DATA_UINT64);
5966      break;
5967   case TGSI_OPCODE_I64MAX:
5968      exec_double_binary(mach, inst, micro_i64max, TGSI_EXEC_DATA_INT64);
5969      break;
5970   case TGSI_OPCODE_U64MAX:
5971      exec_double_binary(mach, inst, micro_u64max, TGSI_EXEC_DATA_UINT64);
5972      break;
5973   case TGSI_OPCODE_U64ADD:
5974      exec_double_binary(mach, inst, micro_u64add, TGSI_EXEC_DATA_UINT64);
5975      break;
5976   case TGSI_OPCODE_U64MUL:
5977      exec_double_binary(mach, inst, micro_u64mul, TGSI_EXEC_DATA_UINT64);
5978      break;
5979   case TGSI_OPCODE_U64SHL:
5980      exec_arg0_64_arg1_32(mach, inst, micro_u64shl);
5981      break;
5982   case TGSI_OPCODE_I64SHR:
5983      exec_arg0_64_arg1_32(mach, inst, micro_i64shr);
5984      break;
5985   case TGSI_OPCODE_U64SHR:
5986      exec_arg0_64_arg1_32(mach, inst, micro_u64shr);
5987      break;
5988   case TGSI_OPCODE_U64DIV:
5989      exec_double_binary(mach, inst, micro_u64div, TGSI_EXEC_DATA_UINT64);
5990      break;
5991   case TGSI_OPCODE_I64DIV:
5992      exec_double_binary(mach, inst, micro_i64div, TGSI_EXEC_DATA_INT64);
5993      break;
5994   case TGSI_OPCODE_U64MOD:
5995      exec_double_binary(mach, inst, micro_u64mod, TGSI_EXEC_DATA_UINT64);
5996      break;
5997   case TGSI_OPCODE_I64MOD:
5998      exec_double_binary(mach, inst, micro_i64mod, TGSI_EXEC_DATA_INT64);
5999      break;
6000
6001   case TGSI_OPCODE_F2U64:
6002      exec_t_2_64(mach, inst, micro_f2u64, TGSI_EXEC_DATA_FLOAT);
6003      break;
6004
6005   case TGSI_OPCODE_F2I64:
6006      exec_t_2_64(mach, inst, micro_f2i64, TGSI_EXEC_DATA_FLOAT);
6007      break;
6008
6009   case TGSI_OPCODE_U2I64:
6010      exec_t_2_64(mach, inst, micro_u2i64, TGSI_EXEC_DATA_INT);
6011      break;
6012   case TGSI_OPCODE_I2I64:
6013      exec_t_2_64(mach, inst, micro_i2i64, TGSI_EXEC_DATA_INT);
6014      break;
6015
6016   case TGSI_OPCODE_D2U64:
6017      exec_double_unary(mach, inst, micro_d2u64);
6018      break;
6019
6020   case TGSI_OPCODE_D2I64:
6021      exec_double_unary(mach, inst, micro_d2i64);
6022      break;
6023
6024   case TGSI_OPCODE_U642F:
6025      exec_64_2_t(mach, inst, micro_u642f);
6026      break;
6027   case TGSI_OPCODE_I642F:
6028      exec_64_2_t(mach, inst, micro_i642f);
6029      break;
6030
6031   case TGSI_OPCODE_U642D:
6032      exec_double_unary(mach, inst, micro_u642d);
6033      break;
6034   case TGSI_OPCODE_I642D:
6035      exec_double_unary(mach, inst, micro_i642d);
6036      break;
6037   case TGSI_OPCODE_INTERP_SAMPLE:
6038      exec_interp_at_sample(mach, inst);
6039      break;
6040   case TGSI_OPCODE_INTERP_OFFSET:
6041      exec_interp_at_offset(mach, inst);
6042      break;
6043   case TGSI_OPCODE_INTERP_CENTROID:
6044      exec_interp_at_centroid(mach, inst);
6045      break;
6046   default:
6047      assert( 0 );
6048   }
6049   return FALSE;
6050}
6051
6052static void
6053tgsi_exec_machine_setup_masks(struct tgsi_exec_machine *mach)
6054{
6055   uint default_mask = 0xf;
6056
6057   mach->KillMask = 0;
6058   mach->OutputVertexOffset = 0;
6059
6060   if (mach->ShaderType == PIPE_SHADER_GEOMETRY) {
6061      for (unsigned i = 0; i < TGSI_MAX_VERTEX_STREAMS; i++) {
6062         mach->OutputPrimCount[i] = 0;
6063         mach->Primitives[i][0] = 0;
6064      }
6065      /* GS runs on a single primitive for now */
6066      default_mask = 0x1;
6067   }
6068
6069   if (mach->NonHelperMask == 0)
6070      mach->NonHelperMask = default_mask;
6071   mach->CondMask = default_mask;
6072   mach->LoopMask = default_mask;
6073   mach->ContMask = default_mask;
6074   mach->FuncMask = default_mask;
6075   mach->ExecMask = default_mask;
6076
6077   mach->Switch.mask = default_mask;
6078
6079   assert(mach->CondStackTop == 0);
6080   assert(mach->LoopStackTop == 0);
6081   assert(mach->ContStackTop == 0);
6082   assert(mach->SwitchStackTop == 0);
6083   assert(mach->BreakStackTop == 0);
6084   assert(mach->CallStackTop == 0);
6085}
6086
6087/**
6088 * Run TGSI interpreter.
6089 * \return bitmask of "alive" quad components
6090 */
6091uint
6092tgsi_exec_machine_run( struct tgsi_exec_machine *mach, int start_pc )
6093{
6094   uint i;
6095
6096   mach->pc = start_pc;
6097
6098   if (!start_pc) {
6099      tgsi_exec_machine_setup_masks(mach);
6100
6101      /* execute declarations (interpolants) */
6102      for (i = 0; i < mach->NumDeclarations; i++) {
6103         exec_declaration( mach, mach->Declarations+i );
6104      }
6105   }
6106
6107   {
6108#if DEBUG_EXECUTION
6109      struct tgsi_exec_vector temps[TGSI_EXEC_NUM_TEMPS];
6110      struct tgsi_exec_vector outputs[PIPE_MAX_ATTRIBS];
6111      uint inst = 1;
6112
6113      if (!start_pc) {
6114         memset(mach->Temps, 0, sizeof(temps));
6115         if (mach->Outputs)
6116            memset(mach->Outputs, 0, sizeof(outputs));
6117         memset(temps, 0, sizeof(temps));
6118         memset(outputs, 0, sizeof(outputs));
6119      }
6120#endif
6121
6122      /* execute instructions, until pc is set to -1 */
6123      while (mach->pc != -1) {
6124         boolean barrier_hit;
6125#if DEBUG_EXECUTION
6126         uint i;
6127
6128         tgsi_dump_instruction(&mach->Instructions[mach->pc], inst++);
6129#endif
6130
6131         assert(mach->pc < (int) mach->NumInstructions);
6132         barrier_hit = exec_instruction(mach, mach->Instructions + mach->pc, &mach->pc);
6133
6134         /* for compute shaders if we hit a barrier return now for later rescheduling */
6135         if (barrier_hit && mach->ShaderType == PIPE_SHADER_COMPUTE)
6136            return 0;
6137
6138#if DEBUG_EXECUTION
6139         for (i = 0; i < TGSI_EXEC_NUM_TEMPS; i++) {
6140            if (memcmp(&temps[i], &mach->Temps[i], sizeof(temps[i]))) {
6141               uint j;
6142
6143               memcpy(&temps[i], &mach->Temps[i], sizeof(temps[i]));
6144               debug_printf("TEMP[%2u] = ", i);
6145               for (j = 0; j < 4; j++) {
6146                  if (j > 0) {
6147                     debug_printf("           ");
6148                  }
6149                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6150                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
6151                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
6152                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
6153                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
6154               }
6155            }
6156         }
6157         if (mach->Outputs) {
6158            for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
6159               if (memcmp(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]))) {
6160                  uint j;
6161
6162                  memcpy(&outputs[i], &mach->Outputs[i], sizeof(outputs[i]));
6163                  debug_printf("OUT[%2u] =  ", i);
6164                  for (j = 0; j < 4; j++) {
6165                     if (j > 0) {
6166                        debug_printf("           ");
6167                     }
6168                     debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
6169                                  outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
6170                                  outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
6171                                  outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
6172                                  outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
6173                  }
6174               }
6175            }
6176         }
6177#endif
6178      }
6179   }
6180
6181#if 0
6182   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
6183   if (mach->ShaderType == PIPE_SHADER_FRAGMENT) {
6184      /*
6185       * Scale back depth component.
6186       */
6187      for (i = 0; i < 4; i++)
6188         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
6189   }
6190#endif
6191
6192   /* Strictly speaking, these assertions aren't really needed but they
6193    * can potentially catch some bugs in the control flow code.
6194    */
6195   assert(mach->CondStackTop == 0);
6196   assert(mach->LoopStackTop == 0);
6197   assert(mach->ContStackTop == 0);
6198   assert(mach->SwitchStackTop == 0);
6199   assert(mach->BreakStackTop == 0);
6200   assert(mach->CallStackTop == 0);
6201
6202   return ~mach->KillMask;
6203}
6204