1/*
2 * Copyright 2011 Joakim Sindholt <opensource@zhasha.com>
3 * Copyright 2013 Christoph Bumiller
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
23
24#include "nine_shader.h"
25
26#include "device9.h"
27#include "nine_debug.h"
28#include "nine_state.h"
29#include "vertexdeclaration9.h"
30
31#include "util/macros.h"
32#include "util/u_memory.h"
33#include "util/u_inlines.h"
34#include "pipe/p_shader_tokens.h"
35#include "tgsi/tgsi_ureg.h"
36#include "tgsi/tgsi_dump.h"
37
/* Debug channel used by the debug output of this file. */
#define DBG_CHANNEL DBG_SHADER

/* Forward printf-style arguments to _nine_debug_printf on DBG_CHANNEL, with
 * NULL as the second argument. The sm1_dump_* helpers use it to build one
 * output line out of several fragments. */
#define DUMP(args...) _nine_debug_printf(DBG_CHANNEL, NULL, args)
41
42
43struct shader_translator;
44
/* Signature of a per-opcode special handler (see sm1_op_info.handler): it
 * receives the translator and returns an HRESULT. */
typedef HRESULT (*translate_instruction_func)(struct shader_translator *);
46
47static inline const char *d3dsio_to_string(unsigned opcode);
48
49
/* Shader type tags; presumably the high word of the SM1 version token
 * (TODO confirm against the version parser). */
#define NINED3D_SM1_VS 0xfffe
#define NINED3D_SM1_PS 0xffff

/* Capacities of the cond_labels[] and loop_labels[]/rL[]/loop_or_rep[] arrays
 * in struct shader_translator. */
#define NINE_MAX_COND_DEPTH 64
#define NINE_MAX_LOOP_DEPTH 64

#define NINED3DSP_END 0x0000ffff

/* Values of sm1_src_param.type; they select which member of the immediate
 * union is meaningful (see sm1_dump_immediate). */
#define NINED3DSPTYPE_FLOAT4  0
#define NINED3DSPTYPE_INT4    1
#define NINED3DSPTYPE_BOOL    2

/* Extra register file value, one past D3DSPR_PREDICATE, marking a source
 * parameter that carries an immediate instead of a register. */
#define NINED3DSPR_IMMEDIATE (D3DSPR_PREDICATE + 1)

#define NINED3DSP_WRITEMASK_MASK  D3DSP_WRITEMASK_ALL
#define NINED3DSP_WRITEMASK_SHIFT 16

#define NINED3DSHADER_INST_PREDICATED (1 << 28)

#define NINED3DSHADER_REL_OP_GT 1
#define NINED3DSHADER_REL_OP_EQ 2
#define NINED3DSHADER_REL_OP_GE 3
#define NINED3DSHADER_REL_OP_LT 4
#define NINED3DSHADER_REL_OP_NE 5
#define NINED3DSHADER_REL_OP_LE 6

#define NINED3DSIO_OPCODE_FLAGS_SHIFT 16
#define NINED3DSIO_OPCODE_FLAGS_MASK  (0xff << NINED3DSIO_OPCODE_FLAGS_SHIFT)

/* Values of sm1_instruction.flags for D3DSIO_TEX (see sm1_dump_instruction). */
#define NINED3DSI_TEXLD_PROJECT 0x1
#define NINED3DSI_TEXLD_BIAS    0x2

/* Write mask bits, one per component, starting at bit 0. */
#define NINED3DSP_WRITEMASK_0   0x1
#define NINED3DSP_WRITEMASK_1   0x2
#define NINED3DSP_WRITEMASK_2   0x4
#define NINED3DSP_WRITEMASK_3   0x8
#define NINED3DSP_WRITEMASK_ALL 0xf

/* Identity swizzle: two bits per destination component selecting x, y, z, w
 * in order. */
#define NINED3DSP_NOSWIZZLE ((0 << 0) | (1 << 2) | (2 << 4) | (3 << 6))

/* Expand four component letters into the matching TGSI_SWIZZLE_* tokens. */
#define NINE_SWIZZLE4(x,y,z,w) \
   TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w

/* Replicate component s of src into all four components. */
#define NINE_APPLY_SWIZZLE(src, s) \
   ureg_swizzle(src, NINE_SWIZZLE4(s, s, s, s))

/* Destination modifier flags shifted down by D3DSP_DSTMOD_SHIFT; tested
 * against sm1_dst_param.mod. */
#define NINED3DSPDM_SATURATE (D3DSPDM_SATURATE >> D3DSP_DSTMOD_SHIFT)
#define NINED3DSPDM_PARTIALP (D3DSPDM_PARTIALPRECISION >> D3DSP_DSTMOD_SHIFT)
#define NINED3DSPDM_CENTROID (D3DSPDM_MSAMPCENTROID >> D3DSP_DSTMOD_SHIFT)
99
/*
 * Source modifiers, shifted down by D3DSP_SRCMOD_SHIFT so they can be used
 * as small indices (see sm1_mod_str).
 *
 * NEG     all, not ps: m3x2, m3x3, m3x4, m4x3, m4x4
 * BIAS    <= PS 1.4 (x-0.5)
 * BIASNEG <= PS 1.4 (-(x-0.5))
 * SIGN    <= PS 1.4 (2(x-0.5))
 * SIGNNEG <= PS 1.4 (-2(x-0.5))
 * COMP    <= PS 1.4 (1-x)
 * X2       = PS 1.4 (2x)
 * X2NEG    = PS 1.4 (-2x)
 * DZ      <= PS 1.4, tex{ld,crd} (.xy/.z), z=0 => .11
 * DW      <= PS 1.4, tex{ld,crd} (.xy/.w), w=0 => .11
 * ABS     >= SM 3.0 (abs(x))
 * ABSNEG  >= SM 3.0 (-abs(x))
 * NOT     >= SM 2.0 predication only
 */
#define NINED3DSPSM_NONE    (D3DSPSM_NONE    >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_NEG     (D3DSPSM_NEG     >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_BIAS    (D3DSPSM_BIAS    >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_BIASNEG (D3DSPSM_BIASNEG >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_SIGN    (D3DSPSM_SIGN    >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_SIGNNEG (D3DSPSM_SIGNNEG >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_COMP    (D3DSPSM_COMP    >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_X2      (D3DSPSM_X2      >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_X2NEG   (D3DSPSM_X2NEG   >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_DZ      (D3DSPSM_DZ      >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_DW      (D3DSPSM_DW      >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_ABS     (D3DSPSM_ABS     >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_ABSNEG  (D3DSPSM_ABSNEG  >> D3DSP_SRCMOD_SHIFT)
#define NINED3DSPSM_NOT     (D3DSPSM_NOT     >> D3DSP_SRCMOD_SHIFT)
129
/* Printable name of each source modifier, indexed by NINED3DSPSM_* value.
 * The table only extends up to NINED3DSPSM_NOT; lookups with a larger index
 * are out of bounds. */
static const char *sm1_mod_str[] =
{
    [NINED3DSPSM_NONE] = "",
    [NINED3DSPSM_NEG] = "-",
    [NINED3DSPSM_BIAS] = "bias",
    [NINED3DSPSM_BIASNEG] = "biasneg",
    [NINED3DSPSM_SIGN] = "sign",
    [NINED3DSPSM_SIGNNEG] = "signneg",
    [NINED3DSPSM_COMP] = "comp",
    [NINED3DSPSM_X2] = "x2",
    [NINED3DSPSM_X2NEG] = "x2neg",
    [NINED3DSPSM_DZ] = "dz",
    [NINED3DSPSM_DW] = "dw",
    [NINED3DSPSM_ABS] = "abs",
    [NINED3DSPSM_ABSNEG] = "-abs",
    [NINED3DSPSM_NOT] = "not"
};
147
148static void
149sm1_dump_writemask(BYTE mask)
150{
151    if (mask & 1) DUMP("x"); else DUMP("_");
152    if (mask & 2) DUMP("y"); else DUMP("_");
153    if (mask & 4) DUMP("z"); else DUMP("_");
154    if (mask & 8) DUMP("w"); else DUMP("_");
155}
156
157static void
158sm1_dump_swizzle(BYTE s)
159{
160    char c[4] = { 'x', 'y', 'z', 'w' };
161    DUMP("%c%c%c%c",
162         c[(s >> 0) & 3], c[(s >> 2) & 3], c[(s >> 4) & 3], c[(s >> 6) & 3]);
163}
164
/* One-character prefix used when printing a register, indexed by D3DSPR_*
 * register file. Some files share a letter ('c' for CONST..CONST4, 'D' for
 * ATTROUT and DEPTHOUT). The table ends at D3DSPR_PREDICATE, so it has no
 * entry for NINED3DSPR_IMMEDIATE. */
static const char sm1_file_char[] =
{
    [D3DSPR_TEMP] = 'r',
    [D3DSPR_INPUT] = 'v',
    [D3DSPR_CONST] = 'c',
    [D3DSPR_ADDR] = 'A',
    [D3DSPR_RASTOUT] = 'R',
    [D3DSPR_ATTROUT] = 'D',
    [D3DSPR_OUTPUT] = 'o',
    [D3DSPR_CONSTINT] = 'I',
    [D3DSPR_COLOROUT] = 'C',
    [D3DSPR_DEPTHOUT] = 'D',
    [D3DSPR_SAMPLER] = 's',
    [D3DSPR_CONST2] = 'c',
    [D3DSPR_CONST3] = 'c',
    [D3DSPR_CONST4] = 'c',
    [D3DSPR_CONSTBOOL] = 'B',
    [D3DSPR_LOOP] = 'L',
    [D3DSPR_TEMPFLOAT16] = 'h',
    [D3DSPR_MISCTYPE] = 'M',
    [D3DSPR_LABEL] = 'X',
    [D3DSPR_PREDICATE] = 'p'
};
188
189static void
190sm1_dump_reg(BYTE file, INT index)
191{
192    switch (file) {
193    case D3DSPR_LOOP:
194        DUMP("aL");
195        break;
196    case D3DSPR_COLOROUT:
197        DUMP("oC%i", index);
198        break;
199    case D3DSPR_DEPTHOUT:
200        DUMP("oDepth");
201        break;
202    case D3DSPR_RASTOUT:
203        DUMP("oRast%i", index);
204        break;
205    case D3DSPR_CONSTINT:
206        DUMP("iconst[%i]", index);
207        break;
208    case D3DSPR_CONSTBOOL:
209        DUMP("bconst[%i]", index);
210        break;
211    default:
212        DUMP("%c%i", sm1_file_char[file], index);
213        break;
214    }
215}
216
/* A parsed SM1 source parameter: either a register reference or, when file
 * is NINED3DSPR_IMMEDIATE, an immediate value. */
struct sm1_src_param
{
    INT idx;                   /* register index (offset when rel is set) */
    struct sm1_src_param *rel; /* relative-addressing register, or NULL */
    BYTE file;                 /* D3DSPR_* or NINED3DSPR_IMMEDIATE */
    BYTE swizzle;              /* 2 bits per component, see NINED3DSP_NOSWIZZLE */
    BYTE mod;                  /* NINED3DSPSM_* source modifier */
    BYTE type;                 /* NINED3DSPTYPE_*: which imm member is valid */
    union {
        DWORD d[4];
        float f[4];
        int i[4];
        BOOL b;
    } imm;                     /* immediate value */
};
232static void
233sm1_parse_immediate(struct shader_translator *, struct sm1_src_param *);
234
/* A parsed SM1 destination parameter. */
struct sm1_dst_param
{
    INT idx;                   /* register index (offset when rel is set) */
    struct sm1_src_param *rel; /* relative-addressing register, or NULL */
    BYTE file;                 /* D3DSPR_* register file */
    BYTE mask;                 /* NINED3DSP_WRITEMASK_* bits */
    BYTE mod;                  /* NINED3DSPDM_* flags */
    int8_t shift; /* sint4 */  /* dumped as "*2^shift" (>0) or "/2^-shift" (<0) */
    BYTE type;
};
245
246static inline void
247assert_replicate_swizzle(const struct ureg_src *reg)
248{
249    assert(reg->SwizzleY == reg->SwizzleX &&
250           reg->SwizzleZ == reg->SwizzleX &&
251           reg->SwizzleW == reg->SwizzleX);
252}
253
254static void
255sm1_dump_immediate(const struct sm1_src_param *param)
256{
257    switch (param->type) {
258    case NINED3DSPTYPE_FLOAT4:
259        DUMP("{ %f %f %f %f }",
260             param->imm.f[0], param->imm.f[1],
261             param->imm.f[2], param->imm.f[3]);
262        break;
263    case NINED3DSPTYPE_INT4:
264        DUMP("{ %i %i %i %i }",
265             param->imm.i[0], param->imm.i[1],
266             param->imm.i[2], param->imm.i[3]);
267        break;
268    case NINED3DSPTYPE_BOOL:
269        DUMP("%s", param->imm.b ? "TRUE" : "FALSE");
270        break;
271    default:
272        assert(0);
273        break;
274    }
275}
276
277static void
278sm1_dump_src_param(const struct sm1_src_param *param)
279{
280    if (param->file == NINED3DSPR_IMMEDIATE) {
281        assert(!param->mod &&
282               !param->rel &&
283               param->swizzle == NINED3DSP_NOSWIZZLE);
284        sm1_dump_immediate(param);
285        return;
286    }
287
288    if (param->mod)
289        DUMP("%s(", sm1_mod_str[param->mod]);
290    if (param->rel) {
291        DUMP("%c[", sm1_file_char[param->file]);
292        sm1_dump_src_param(param->rel);
293        DUMP("+%i]", param->idx);
294    } else {
295        sm1_dump_reg(param->file, param->idx);
296    }
297    if (param->mod)
298       DUMP(")");
299    if (param->swizzle != NINED3DSP_NOSWIZZLE) {
300       DUMP(".");
301       sm1_dump_swizzle(param->swizzle);
302    }
303}
304
305static void
306sm1_dump_dst_param(const struct sm1_dst_param *param)
307{
308   if (param->mod & NINED3DSPDM_SATURATE)
309      DUMP("sat ");
310   if (param->mod & NINED3DSPDM_PARTIALP)
311      DUMP("pp ");
312   if (param->mod & NINED3DSPDM_CENTROID)
313      DUMP("centroid ");
314   if (param->shift < 0)
315      DUMP("/%u ", 1 << -param->shift);
316   if (param->shift > 0)
317      DUMP("*%u ", 1 << param->shift);
318
319   if (param->rel) {
320      DUMP("%c[", sm1_file_char[param->file]);
321      sm1_dump_src_param(param->rel);
322      DUMP("+%i]", param->idx);
323   } else {
324      sm1_dump_reg(param->file, param->idx);
325   }
326   if (param->mask != NINED3DSP_WRITEMASK_ALL) {
327      DUMP(".");
328      sm1_dump_writemask(param->mask);
329   }
330}
331
/* A register declaration together with its semantic: the declared register,
 * a sampler type, and a D3D usage with its index. Filled in by
 * sm1_read_semantic (definition not visible in this part of the file). */
struct sm1_semantic
{
   struct sm1_dst_param reg;
   BYTE sampler_type;
   D3DDECLUSAGE usage;
   BYTE usage_idx;
};
339
/* Static description of one SM1 opcode: how it maps to TGSI, which shader
 * versions accept it, and how many operands it takes. */
struct sm1_op_info
{
    /* NOTE: 0 is a valid TGSI opcode, but if handler is set, this parameter
     * should be ignored completely */
    unsigned sio;
    unsigned opcode; /* TGSI_OPCODE_x */

    /* versions are still set even if handler is set */
    struct {
        unsigned min;
        unsigned max;
    } vert_version, frag_version;

    /* number of regs parsed outside of special handler */
    unsigned ndst;
    unsigned nsrc;

    /* some instructions don't map perfectly, so use a special handler */
    translate_instruction_func handler;
};
360
/* One decoded SM1 instruction with its operands. */
struct sm1_instruction
{
    D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode;
    BYTE flags;       /* opcode-specific flags, e.g. NINED3DSI_TEXLD_* for TEX */
    BOOL coissue;
    BOOL predicated;  /* when set, pred holds the predicate operand */
    BYTE ndst;        /* number of valid entries in dst[] */
    BYTE nsrc;        /* number of valid entries in src[] */
    struct sm1_src_param src[4];
    /* presumably the storage that src[i].rel points at - TODO confirm in the
     * instruction parser */
    struct sm1_src_param src_rel[4];
    struct sm1_src_param pred;
    /* presumably the storage that dst[0].rel points at - TODO confirm */
    struct sm1_src_param dst_rel[1];
    struct sm1_dst_param dst[1];

    const struct sm1_op_info *info;
};
377
378static void
379sm1_dump_instruction(struct sm1_instruction *insn, unsigned indent)
380{
381    unsigned i;
382
383    /* no info stored for these: */
384    if (insn->opcode == D3DSIO_DCL)
385        return;
386    for (i = 0; i < indent; ++i)
387        DUMP("  ");
388
389    if (insn->predicated) {
390        DUMP("@");
391        sm1_dump_src_param(&insn->pred);
392        DUMP(" ");
393    }
394    DUMP("%s", d3dsio_to_string(insn->opcode));
395    if (insn->flags) {
396        switch (insn->opcode) {
397        case D3DSIO_TEX:
398            DUMP(insn->flags == NINED3DSI_TEXLD_PROJECT ? "p" : "b");
399            break;
400        default:
401            DUMP("_%x", insn->flags);
402            break;
403        }
404    }
405    if (insn->coissue)
406        DUMP("_co");
407    DUMP(" ");
408
409    for (i = 0; i < insn->ndst && i < ARRAY_SIZE(insn->dst); ++i) {
410        sm1_dump_dst_param(&insn->dst[i]);
411        DUMP(" ");
412    }
413
414    for (i = 0; i < insn->nsrc && i < ARRAY_SIZE(insn->src); ++i) {
415        sm1_dump_src_param(&insn->src[i]);
416        DUMP(" ");
417    }
418    if (insn->opcode == D3DSIO_DEF ||
419        insn->opcode == D3DSIO_DEFI ||
420        insn->opcode == D3DSIO_DEFB)
421        sm1_dump_immediate(&insn->src[0]);
422
423    DUMP("\n");
424}
425
/* A constant defined inside the shader itself (see tx_set_lconstf/i/b):
 * the constant register index it overrides and the immediate that holds
 * its value. */
struct sm1_local_const
{
    INT idx;             /* constant register index */
    struct ureg_src reg; /* immediate created for the value */
    float f[4]; /* for indirect addressing of float constants */
};
432
/* All state of one SM1 -> TGSI translation: the byte code cursor, the ureg
 * program being built, lazily declared registers, control-flow bookkeeping,
 * locally defined constants and constant-slot usage tracking. */
struct shader_translator
{
    const DWORD *byte_code;
    const DWORD *parse;
    const DWORD *parse_next;

    struct ureg_program *ureg;

    /* shader version */
    struct {
        BYTE major;
        BYTE minor;
    } version;
    unsigned processor; /* PIPE_SHADER_VERTEX/FRAGMENT */
    /* upper bounds checked by tx_lconst{f,i,b} and tx_set_lconst{f,i,b} */
    unsigned num_constf_allowed;
    unsigned num_consti_allowed;
    unsigned num_constb_allowed;

    boolean native_integers;
    boolean inline_subroutines;
    boolean want_texcoord;
    boolean shift_wpos;
    boolean wpos_is_sysval;
    boolean face_is_sysval_integer;
    boolean mul_zero_wins;
    unsigned texcoord_sn;

    struct sm1_instruction insn; /* current instruction */

    /* TGSI registers; entries are left undefined until first needed */
    struct {
        struct ureg_dst *r;
        struct ureg_dst oPos;
        struct ureg_dst oPos_out; /* the real output when doing streamout */
        struct ureg_dst oFog;
        struct ureg_dst oPts;
        struct ureg_dst oCol[4];
        struct ureg_dst o[PIPE_MAX_SHADER_OUTPUTS];
        struct ureg_dst oDepth;
        struct ureg_src v[PIPE_MAX_SHADER_INPUTS];
        struct ureg_src v_consecutive; /* copy in temp array of ps inputs for rel addressing */
        struct ureg_src vPos;
        struct ureg_src vFace;
        struct ureg_src s;
        struct ureg_dst p;
        struct ureg_dst address;
        struct ureg_dst a0;
        struct ureg_dst predicate;
        struct ureg_dst predicate_tmp;
        struct ureg_dst predicate_dst;
        struct ureg_dst tS[8]; /* texture stage registers */
        struct ureg_dst tdst; /* scratch dst if we need extra modifiers */
        struct ureg_dst t[8]; /* scratch TEMPs */
        struct ureg_src vC[2]; /* PS color in */
        struct ureg_src vT[8]; /* PS texcoord in */
        struct ureg_dst rL[NINE_MAX_LOOP_DEPTH]; /* loop ctr */
    } regs;
    unsigned num_temp; /* ARRAY_SIZE(regs.r) */
    unsigned num_scratch; /* entries of regs.t handed out by tx_scratch */
    unsigned loop_depth;
    unsigned loop_depth_max; /* highest loop_depth reached (tx_bgnloop) */
    unsigned cond_depth;
    unsigned loop_labels[NINE_MAX_LOOP_DEPTH];
    unsigned cond_labels[NINE_MAX_COND_DEPTH];
    boolean loop_or_rep[NINE_MAX_LOOP_DEPTH]; /* true: loop, false: rep */
    boolean predicated_activated;

    unsigned *inst_labels; /* LABEL op */
    unsigned num_inst_labels;

    unsigned sampler_targets[NINE_MAX_SAMPLERS]; /* TGSI_TEXTURE_x */

    /* constants defined in the shader, grown in chunks by tx_set_lconst* */
    struct sm1_local_const *lconstf;
    unsigned num_lconstf;
    struct sm1_local_const *lconsti;
    unsigned num_lconsti;
    struct sm1_local_const *lconstb;
    unsigned num_lconstb;

    /* constant slot usage, updated by the nine_*_constant_src helpers */
    boolean slots_used[NINE_MAX_CONST_ALL];
    unsigned *slot_map; /* optional remap of constant slot indices */
    unsigned num_slots;

    boolean indirect_const_access;
    boolean failure; /* set when translation must be abandoned */

    /* filled by nine_record_outputs */
    struct nine_vs_output_info output_info[16];
    int num_outputs;

    struct nine_shader_info *info;

    int16_t op_info_map[D3DSIO_BREAKP + 1];
};
525
/* Shader stage tests; both expect a variable named tx in scope. */
#define IS_VS (tx->processor == PIPE_SHADER_VERTEX)
#define IS_PS (tx->processor == PIPE_SHADER_FRAGMENT)

/* Mark the translation as failed and return from a void function when cond
 * holds. Expands to a bare if statement and is invoked without a trailing
 * semicolon in this file, so it must not be followed directly by an else. */
#define FAILURE_VOID(cond) if ((cond)) {tx->failure=1;return;}
530
531static void
532sm1_read_semantic(struct shader_translator *, struct sm1_semantic *);
533
534static void
535sm1_instruction_check(const struct sm1_instruction *insn)
536{
537    if (insn->opcode == D3DSIO_CRS)
538    {
539        if (insn->dst[0].mask & NINED3DSP_WRITEMASK_3)
540        {
541            DBG("CRS.mask.w\n");
542        }
543    }
544}
545
546static void
547nine_record_outputs(struct shader_translator *tx, BYTE Usage, BYTE UsageIndex,
548                    int mask, int output_index)
549{
550    tx->output_info[tx->num_outputs].output_semantic = Usage;
551    tx->output_info[tx->num_outputs].output_semantic_index = UsageIndex;
552    tx->output_info[tx->num_outputs].mask = mask;
553    tx->output_info[tx->num_outputs].output_index = output_index;
554    tx->num_outputs++;
555}
556
557static struct ureg_src nine_float_constant_src(struct shader_translator *tx, int idx)
558{
559    struct ureg_src src;
560
561    if (tx->slot_map)
562        idx = tx->slot_map[idx];
563    /* vswp constant handling: we use two buffers
564     * to fit all the float constants. The special handling
565     * doesn't need to be elsewhere, because all the instructions
566     * accessing the constants directly are VS1, and swvp
567     * is VS >= 2 */
568    if (tx->info->swvp_on && idx >= 4096) {
569        /* TODO: swvp rel is broken if many constants are used */
570        src = ureg_src_register(TGSI_FILE_CONSTANT, idx - 4096);
571        src = ureg_src_dimension(src, 1);
572    } else {
573        src = ureg_src_register(TGSI_FILE_CONSTANT, idx);
574        src = ureg_src_dimension(src, 0);
575    }
576
577    if (!tx->info->swvp_on)
578        tx->slots_used[idx] = TRUE;
579    if (tx->info->const_float_slots < (idx + 1))
580        tx->info->const_float_slots = idx + 1;
581    if (tx->num_slots < (idx + 1))
582        tx->num_slots = idx + 1;
583
584    return src;
585}
586
587static struct ureg_src nine_integer_constant_src(struct shader_translator *tx, int idx)
588{
589    struct ureg_src src;
590
591    if (tx->info->swvp_on) {
592        src = ureg_src_register(TGSI_FILE_CONSTANT, idx);
593        src = ureg_src_dimension(src, 2);
594    } else {
595        unsigned slot_idx = tx->info->const_i_base + idx;
596        if (tx->slot_map)
597            slot_idx = tx->slot_map[slot_idx];
598        src = ureg_src_register(TGSI_FILE_CONSTANT, slot_idx);
599        src = ureg_src_dimension(src, 0);
600        tx->slots_used[slot_idx] = TRUE;
601        tx->info->int_slots_used[idx] = TRUE;
602        if (tx->num_slots < (slot_idx + 1))
603            tx->num_slots = slot_idx + 1;
604    }
605
606    if (tx->info->const_int_slots < (idx + 1))
607        tx->info->const_int_slots = idx + 1;
608
609    return src;
610}
611
612static struct ureg_src nine_boolean_constant_src(struct shader_translator *tx, int idx)
613{
614    struct ureg_src src;
615
616    char r = idx / 4;
617    char s = idx & 3;
618
619    if (tx->info->swvp_on) {
620        src = ureg_src_register(TGSI_FILE_CONSTANT, r);
621        src = ureg_src_dimension(src, 3);
622    } else {
623        unsigned slot_idx = tx->info->const_b_base + r;
624        if (tx->slot_map)
625            slot_idx = tx->slot_map[slot_idx];
626        src = ureg_src_register(TGSI_FILE_CONSTANT, slot_idx);
627        src = ureg_src_dimension(src, 0);
628        tx->slots_used[slot_idx] = TRUE;
629        tx->info->bool_slots_used[idx] = TRUE;
630        if (tx->num_slots < (slot_idx + 1))
631            tx->num_slots = slot_idx + 1;
632    }
633    src = ureg_swizzle(src, s, s, s, s);
634
635    if (tx->info->const_bool_slots < (idx + 1))
636        tx->info->const_bool_slots = idx + 1;
637
638    return src;
639}
640
641static boolean
642tx_lconstf(struct shader_translator *tx, struct ureg_src *src, INT index)
643{
644   INT i;
645
646   if (index < 0 || index >= tx->num_constf_allowed) {
647       tx->failure = TRUE;
648       return FALSE;
649   }
650   for (i = 0; i < tx->num_lconstf; ++i) {
651      if (tx->lconstf[i].idx == index) {
652         *src = tx->lconstf[i].reg;
653         return TRUE;
654      }
655   }
656   return FALSE;
657}
658static boolean
659tx_lconsti(struct shader_translator *tx, struct ureg_src *src, INT index)
660{
661   int i;
662
663   if (index < 0 || index >= tx->num_consti_allowed) {
664       tx->failure = TRUE;
665       return FALSE;
666   }
667   for (i = 0; i < tx->num_lconsti; ++i) {
668      if (tx->lconsti[i].idx == index) {
669         *src = tx->lconsti[i].reg;
670         return TRUE;
671      }
672   }
673   return FALSE;
674}
675static boolean
676tx_lconstb(struct shader_translator *tx, struct ureg_src *src, INT index)
677{
678   int i;
679
680   if (index < 0 || index >= tx->num_constb_allowed) {
681       tx->failure = TRUE;
682       return FALSE;
683   }
684   for (i = 0; i < tx->num_lconstb; ++i) {
685      if (tx->lconstb[i].idx == index) {
686         *src = tx->lconstb[i].reg;
687         return TRUE;
688      }
689   }
690   return FALSE;
691}
692
693static void
694tx_set_lconstf(struct shader_translator *tx, INT index, float f[4])
695{
696    unsigned n;
697
698    FAILURE_VOID(index < 0 || index >= tx->num_constf_allowed)
699
700    for (n = 0; n < tx->num_lconstf; ++n)
701        if (tx->lconstf[n].idx == index)
702            break;
703    if (n == tx->num_lconstf) {
704       if ((n % 8) == 0) {
705          tx->lconstf = REALLOC(tx->lconstf,
706                                (n + 0) * sizeof(tx->lconstf[0]),
707                                (n + 8) * sizeof(tx->lconstf[0]));
708          assert(tx->lconstf);
709       }
710       tx->num_lconstf++;
711    }
712    tx->lconstf[n].idx = index;
713    tx->lconstf[n].reg = ureg_imm4f(tx->ureg, f[0], f[1], f[2], f[3]);
714
715    memcpy(tx->lconstf[n].f, f, sizeof(tx->lconstf[n].f));
716}
717static void
718tx_set_lconsti(struct shader_translator *tx, INT index, int i[4])
719{
720    unsigned n;
721
722    FAILURE_VOID(index < 0 || index >= tx->num_consti_allowed)
723
724    for (n = 0; n < tx->num_lconsti; ++n)
725        if (tx->lconsti[n].idx == index)
726            break;
727    if (n == tx->num_lconsti) {
728       if ((n % 8) == 0) {
729          tx->lconsti = REALLOC(tx->lconsti,
730                                (n + 0) * sizeof(tx->lconsti[0]),
731                                (n + 8) * sizeof(tx->lconsti[0]));
732          assert(tx->lconsti);
733       }
734       tx->num_lconsti++;
735    }
736
737    tx->lconsti[n].idx = index;
738    tx->lconsti[n].reg = tx->native_integers ?
739       ureg_imm4i(tx->ureg, i[0], i[1], i[2], i[3]) :
740       ureg_imm4f(tx->ureg, i[0], i[1], i[2], i[3]);
741}
742static void
743tx_set_lconstb(struct shader_translator *tx, INT index, BOOL b)
744{
745    unsigned n;
746
747    FAILURE_VOID(index < 0 || index >= tx->num_constb_allowed)
748
749    for (n = 0; n < tx->num_lconstb; ++n)
750        if (tx->lconstb[n].idx == index)
751            break;
752    if (n == tx->num_lconstb) {
753       if ((n % 8) == 0) {
754          tx->lconstb = REALLOC(tx->lconstb,
755                                (n + 0) * sizeof(tx->lconstb[0]),
756                                (n + 8) * sizeof(tx->lconstb[0]));
757          assert(tx->lconstb);
758       }
759       tx->num_lconstb++;
760    }
761
762    tx->lconstb[n].idx = index;
763    tx->lconstb[n].reg = tx->native_integers ?
764       ureg_imm1u(tx->ureg, b ? 0xffffffff : 0) :
765       ureg_imm1f(tx->ureg, b ? 1.0f : 0.0f);
766}
767
768static inline struct ureg_dst
769tx_scratch(struct shader_translator *tx)
770{
771    if (tx->num_scratch >= ARRAY_SIZE(tx->regs.t)) {
772        tx->failure = TRUE;
773        return tx->regs.t[0];
774    }
775    if (ureg_dst_is_undef(tx->regs.t[tx->num_scratch]))
776        tx->regs.t[tx->num_scratch] = ureg_DECL_local_temporary(tx->ureg);
777    return tx->regs.t[tx->num_scratch++];
778}
779
780static inline struct ureg_dst
781tx_scratch_scalar(struct shader_translator *tx)
782{
783    return ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
784}
785
786static inline struct ureg_src
787tx_src_scalar(struct ureg_dst dst)
788{
789    struct ureg_src src = ureg_src(dst);
790    int c = ffs(dst.WriteMask) - 1;
791    if (dst.WriteMask == (1 << c))
792        src = ureg_scalar(src, c);
793    return src;
794}
795
796static inline void
797tx_temp_alloc(struct shader_translator *tx, INT idx)
798{
799    assert(idx >= 0);
800    if (idx >= tx->num_temp) {
801       unsigned k = tx->num_temp;
802       unsigned n = idx + 1;
803       tx->regs.r = REALLOC(tx->regs.r,
804                            k * sizeof(tx->regs.r[0]),
805                            n * sizeof(tx->regs.r[0]));
806       for (; k < n; ++k)
807          tx->regs.r[k] = ureg_dst_undef();
808       tx->num_temp = n;
809    }
810    if (ureg_dst_is_undef(tx->regs.r[idx]))
811        tx->regs.r[idx] = ureg_DECL_temporary(tx->ureg);
812}
813
814static inline void
815tx_addr_alloc(struct shader_translator *tx, INT idx)
816{
817    assert(idx == 0);
818    if (ureg_dst_is_undef(tx->regs.address))
819        tx->regs.address = ureg_DECL_address(tx->ureg);
820    if (ureg_dst_is_undef(tx->regs.a0))
821        tx->regs.a0 = ureg_DECL_temporary(tx->ureg);
822}
823
824/* NOTE: It's not very clear on which ps1.1-ps1.3 instructions
825 * the projection should be applied on the texture. It doesn't
826 * apply on texkill.
827 * The doc is very imprecise here (it says the projection is done
828 * before rasterization, thus in vs, which seems wrong since ps instructions
829 * are affected differently)
830 * For now we only apply to the ps TEX instruction and TEXBEM.
831 * Perhaps some other instructions would need it */
832static inline void
833apply_ps1x_projection(struct shader_translator *tx, struct ureg_dst dst,
834                      struct ureg_src src, INT idx)
835{
836    struct ureg_dst tmp;
837    unsigned dim = 1 + ((tx->info->projected >> (2 * idx)) & 3);
838
839    /* no projection */
840    if (dim == 1) {
841        ureg_MOV(tx->ureg, dst, src);
842    } else {
843        tmp = tx_scratch_scalar(tx);
844        ureg_RCP(tx->ureg, tmp, ureg_scalar(src, dim-1));
845        ureg_MUL(tx->ureg, dst, tx_src_scalar(tmp), src);
846    }
847}
848
849static inline void
850TEX_with_ps1x_projection(struct shader_translator *tx, struct ureg_dst dst,
851                         unsigned target, struct ureg_src src0,
852                         struct ureg_src src1, INT idx)
853{
854    unsigned dim = 1 + ((tx->info->projected >> (2 * idx)) & 3);
855    struct ureg_dst tmp;
856    boolean shadow = !!(tx->info->sampler_mask_shadow & (1 << idx));
857
858    /* dim == 1: no projection
859     * Looks like must be disabled when it makes no
860     * sense according the texture dimensions
861     */
862    if (dim == 1 || (dim <= target && !shadow)) {
863        ureg_TEX(tx->ureg, dst, target, src0, src1);
864    } else if (dim == 4) {
865        ureg_TXP(tx->ureg, dst, target, src0, src1);
866    } else {
867        tmp = tx_scratch(tx);
868        apply_ps1x_projection(tx, tmp, src0, idx);
869        ureg_TEX(tx->ureg, dst, target, ureg_src(tmp), src1);
870    }
871}
872
873static inline void
874tx_texcoord_alloc(struct shader_translator *tx, INT idx)
875{
876    assert(IS_PS);
877    assert(idx >= 0 && idx < ARRAY_SIZE(tx->regs.vT));
878    if (ureg_src_is_undef(tx->regs.vT[idx]))
879       tx->regs.vT[idx] = ureg_DECL_fs_input(tx->ureg, tx->texcoord_sn, idx,
880                                             TGSI_INTERPOLATE_PERSPECTIVE);
881}
882
883static inline unsigned *
884tx_bgnloop(struct shader_translator *tx)
885{
886    tx->loop_depth++;
887    if (tx->loop_depth_max < tx->loop_depth)
888        tx->loop_depth_max = tx->loop_depth;
889    assert(tx->loop_depth < NINE_MAX_LOOP_DEPTH);
890    return &tx->loop_labels[tx->loop_depth - 1];
891}
892
893static inline unsigned *
894tx_endloop(struct shader_translator *tx)
895{
896    assert(tx->loop_depth);
897    tx->loop_depth--;
898    ureg_fixup_label(tx->ureg, tx->loop_labels[tx->loop_depth],
899                     ureg_get_instruction_number(tx->ureg));
900    return &tx->loop_labels[tx->loop_depth];
901}
902
903static struct ureg_dst
904tx_get_loopctr(struct shader_translator *tx, boolean loop_or_rep)
905{
906    const unsigned l = tx->loop_depth - 1;
907
908    if (!tx->loop_depth)
909    {
910        DBG("loop counter requested outside of loop\n");
911        return ureg_dst_undef();
912    }
913
914    if (ureg_dst_is_undef(tx->regs.rL[l])) {
915        /* loop or rep ctr creation */
916        tx->regs.rL[l] = ureg_DECL_local_temporary(tx->ureg);
917        tx->loop_or_rep[l] = loop_or_rep;
918    }
919    /* loop - rep - endloop - endrep not allowed */
920    assert(tx->loop_or_rep[l] == loop_or_rep);
921
922    return tx->regs.rL[l];
923}
924
925static struct ureg_src
926tx_get_loopal(struct shader_translator *tx)
927{
928    int loop_level = tx->loop_depth - 1;
929
930    while (loop_level >= 0) {
931        /* handle loop - rep - endrep - endloop case */
932        if (tx->loop_or_rep[loop_level])
933            /* the value is in the loop counter y component (nine implementation) */
934            return ureg_scalar(ureg_src(tx->regs.rL[loop_level]), TGSI_SWIZZLE_Y);
935        loop_level--;
936    }
937
938    DBG("aL counter requested outside of loop\n");
939    return ureg_src_undef();
940}
941
942static inline unsigned *
943tx_cond(struct shader_translator *tx)
944{
945   assert(tx->cond_depth <= NINE_MAX_COND_DEPTH);
946   tx->cond_depth++;
947   return &tx->cond_labels[tx->cond_depth - 1];
948}
949
950static inline unsigned *
951tx_elsecond(struct shader_translator *tx)
952{
953   assert(tx->cond_depth);
954   return &tx->cond_labels[tx->cond_depth - 1];
955}
956
957static inline void
958tx_endcond(struct shader_translator *tx)
959{
960   assert(tx->cond_depth);
961   tx->cond_depth--;
962   ureg_fixup_label(tx->ureg, tx->cond_labels[tx->cond_depth],
963                    ureg_get_instruction_number(tx->ureg));
964}
965
966static inline struct ureg_dst
967nine_ureg_dst_register(unsigned file, int index)
968{
969    return ureg_dst(ureg_src_register(file, index));
970}
971
972static inline struct ureg_src
973nine_get_position_input(struct shader_translator *tx)
974{
975    struct ureg_program *ureg = tx->ureg;
976
977    if (tx->wpos_is_sysval)
978        return ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
979    else
980        return ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION,
981                                  0, TGSI_INTERPOLATE_LINEAR);
982}
983
984static struct ureg_src
985tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
986{
987    struct ureg_program *ureg = tx->ureg;
988    struct ureg_src src;
989    struct ureg_dst tmp;
990
991    assert(!param->rel || (IS_VS && param->file == D3DSPR_CONST) ||
992        (D3DSPR_ADDR && tx->version.major == 3));
993
994    switch (param->file)
995    {
996    case D3DSPR_TEMP:
997        tx_temp_alloc(tx, param->idx);
998        src = ureg_src(tx->regs.r[param->idx]);
999        break;
1000 /* case D3DSPR_TEXTURE: == D3DSPR_ADDR */
1001    case D3DSPR_ADDR:
1002        if (IS_VS) {
1003            assert(param->idx == 0);
1004            /* the address register (vs only) must be
1005             * assigned before use */
1006            assert(!ureg_dst_is_undef(tx->regs.a0));
1007            /* Round to lowest for vs1.1 (contrary to the doc), else
1008             * round to nearest */
1009            if (tx->version.major < 2 && tx->version.minor < 2)
1010                ureg_ARL(ureg, tx->regs.address, ureg_src(tx->regs.a0));
1011            else
1012                ureg_ARR(ureg, tx->regs.address, ureg_src(tx->regs.a0));
1013            src = ureg_src(tx->regs.address);
1014        } else {
1015            if (tx->version.major < 2 && tx->version.minor < 4) {
1016                /* no subroutines, so should be defined */
1017                src = ureg_src(tx->regs.tS[param->idx]);
1018            } else {
1019                tx_texcoord_alloc(tx, param->idx);
1020                src = tx->regs.vT[param->idx];
1021            }
1022        }
1023        break;
1024    case D3DSPR_INPUT:
1025        if (IS_VS) {
1026            src = ureg_src_register(TGSI_FILE_INPUT, param->idx);
1027        } else {
1028            if (tx->version.major < 3) {
1029                src = ureg_DECL_fs_input_cyl_centroid(
1030                    ureg, TGSI_SEMANTIC_COLOR, param->idx,
1031                    TGSI_INTERPOLATE_COLOR, 0,
1032                    tx->info->force_color_in_centroid ?
1033                      TGSI_INTERPOLATE_LOC_CENTROID : 0,
1034                    0, 1);
1035            } else {
1036                if(param->rel) {
1037                    /* Copy all inputs (non consecutive)
1038                     * to temp array (consecutive).
1039                     * This is not good for performance.
1040                     * A better way would be to have inputs
1041                     * consecutive (would need implement alternative
1042                     * way to match vs outputs and ps inputs).
1043                     * However even with the better way, the temp array
1044                     * copy would need to be used if some inputs
1045                     * are not GENERIC or if they have different
1046                     * interpolation flag. */
1047                    if (ureg_src_is_undef(tx->regs.v_consecutive)) {
1048                        int i;
1049                        tx->regs.v_consecutive = ureg_src(ureg_DECL_array_temporary(ureg, 10, 0));
1050                        for (i = 0; i < 10; i++) {
1051                            if (!ureg_src_is_undef(tx->regs.v[i]))
1052                                ureg_MOV(ureg, ureg_dst_array_offset(ureg_dst(tx->regs.v_consecutive), i), tx->regs.v[i]);
1053                            else
1054                                ureg_MOV(ureg, ureg_dst_array_offset(ureg_dst(tx->regs.v_consecutive), i), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
1055                        }
1056                    }
1057                    src = ureg_src_array_offset(tx->regs.v_consecutive, param->idx);
1058                } else {
1059                    assert(param->idx < ARRAY_SIZE(tx->regs.v));
1060                    src = tx->regs.v[param->idx];
1061                }
1062            }
1063        }
1064        if (param->rel)
1065            src = ureg_src_indirect(src, tx_src_param(tx, param->rel));
1066        break;
1067    case D3DSPR_PREDICATE:
1068        if (ureg_dst_is_undef(tx->regs.predicate)) {
1069            /* Forbidden to use the predicate register before being set */
1070            tx->failure = TRUE;
1071            tx->regs.predicate = ureg_DECL_temporary(tx->ureg);
1072        }
1073        src = ureg_src(tx->regs.predicate);
1074        break;
1075    case D3DSPR_SAMPLER:
1076        assert(param->mod == NINED3DSPSM_NONE);
1077        assert(param->swizzle == NINED3DSP_NOSWIZZLE);
1078        src = ureg_DECL_sampler(ureg, param->idx);
1079        break;
1080    case D3DSPR_CONST:
1081        if (param->rel || !tx_lconstf(tx, &src, param->idx)) {
1082            src = nine_float_constant_src(tx, param->idx);
1083            if (param->rel) {
1084                tx->indirect_const_access = TRUE;
1085                src = ureg_src_indirect(src, tx_src_param(tx, param->rel));
1086            }
1087        }
1088        if (!IS_VS && tx->version.major < 2) {
1089            /* ps 1.X clamps constants */
1090            tmp = tx_scratch(tx);
1091            ureg_MIN(ureg, tmp, src, ureg_imm1f(ureg, 1.0f));
1092            ureg_MAX(ureg, tmp, ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
1093            src = ureg_src(tmp);
1094        }
1095        break;
1096    case D3DSPR_CONST2:
1097    case D3DSPR_CONST3:
1098    case D3DSPR_CONST4:
1099        DBG("CONST2/3/4 should have been collapsed into D3DSPR_CONST !\n");
1100        assert(!"CONST2/3/4");
1101        src = ureg_imm1f(ureg, 0.0f);
1102        break;
1103    case D3DSPR_CONSTINT:
1104        /* relative adressing only possible for float constants in vs */
1105        if (!tx_lconsti(tx, &src, param->idx))
1106            src = nine_integer_constant_src(tx, param->idx);
1107        break;
1108    case D3DSPR_CONSTBOOL:
1109        if (!tx_lconstb(tx, &src, param->idx))
1110            src = nine_boolean_constant_src(tx, param->idx);
1111        break;
1112    case D3DSPR_LOOP:
1113        if (ureg_dst_is_undef(tx->regs.address))
1114            tx->regs.address = ureg_DECL_address(ureg);
1115        if (!tx->native_integers)
1116            ureg_ARR(ureg, tx->regs.address, tx_get_loopal(tx));
1117        else
1118            ureg_UARL(ureg, tx->regs.address, tx_get_loopal(tx));
1119        src = ureg_src(tx->regs.address);
1120        break;
1121    case D3DSPR_MISCTYPE:
1122        switch (param->idx) {
1123        case D3DSMO_POSITION:
1124           if (ureg_src_is_undef(tx->regs.vPos))
1125              tx->regs.vPos = nine_get_position_input(tx);
1126           if (tx->shift_wpos) {
1127               /* TODO: do this only once */
1128               struct ureg_dst wpos = tx_scratch(tx);
1129               ureg_ADD(ureg, wpos, tx->regs.vPos,
1130                        ureg_imm4f(ureg, -0.5f, -0.5f, 0.0f, 0.0f));
1131               src = ureg_src(wpos);
1132           } else {
1133               src = tx->regs.vPos;
1134           }
1135           break;
1136        case D3DSMO_FACE:
1137           if (ureg_src_is_undef(tx->regs.vFace)) {
1138               if (tx->face_is_sysval_integer) {
1139                   tmp = ureg_DECL_temporary(ureg);
1140                   tx->regs.vFace =
1141                       ureg_DECL_system_value(ureg, TGSI_SEMANTIC_FACE, 0);
1142
1143                   /* convert bool to float */
1144                   ureg_UCMP(ureg, tmp, ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X),
1145                             ureg_imm1f(ureg, 1), ureg_imm1f(ureg, -1));
1146                   tx->regs.vFace = ureg_src(tmp);
1147               } else {
1148                   tx->regs.vFace = ureg_DECL_fs_input(ureg,
1149                                                       TGSI_SEMANTIC_FACE, 0,
1150                                                       TGSI_INTERPOLATE_CONSTANT);
1151               }
1152               tx->regs.vFace = ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X);
1153           }
1154           src = tx->regs.vFace;
1155           break;
1156        default:
1157            assert(!"invalid src D3DSMO");
1158            break;
1159        }
1160        break;
1161    case D3DSPR_TEMPFLOAT16:
1162        break;
1163    default:
1164        assert(!"invalid src D3DSPR");
1165    }
1166
1167    switch (param->mod) {
1168    case NINED3DSPSM_DW:
1169        tmp = tx_scratch(tx);
1170        /* NOTE: app is not allowed to read w with this modifier */
1171        ureg_RCP(ureg, ureg_writemask(tmp, NINED3DSP_WRITEMASK_3), ureg_scalar(src, TGSI_SWIZZLE_W));
1172        ureg_MUL(ureg, tmp, src, ureg_swizzle(ureg_src(tmp), NINE_SWIZZLE4(W,W,W,W)));
1173        src = ureg_src(tmp);
1174        break;
1175    case NINED3DSPSM_DZ:
1176        tmp = tx_scratch(tx);
1177        /* NOTE: app is not allowed to read z with this modifier */
1178        ureg_RCP(ureg, ureg_writemask(tmp, NINED3DSP_WRITEMASK_2), ureg_scalar(src, TGSI_SWIZZLE_Z));
1179        ureg_MUL(ureg, tmp, src, ureg_swizzle(ureg_src(tmp), NINE_SWIZZLE4(Z,Z,Z,Z)));
1180        src = ureg_src(tmp);
1181        break;
1182    default:
1183        break;
1184    }
1185
1186    if (param->swizzle != NINED3DSP_NOSWIZZLE)
1187        src = ureg_swizzle(src,
1188                           (param->swizzle >> 0) & 0x3,
1189                           (param->swizzle >> 2) & 0x3,
1190                           (param->swizzle >> 4) & 0x3,
1191                           (param->swizzle >> 6) & 0x3);
1192
1193    switch (param->mod) {
1194    case NINED3DSPSM_ABS:
1195        src = ureg_abs(src);
1196        break;
1197    case NINED3DSPSM_ABSNEG:
1198        src = ureg_negate(ureg_abs(src));
1199        break;
1200    case NINED3DSPSM_NEG:
1201        src = ureg_negate(src);
1202        break;
1203    case NINED3DSPSM_BIAS:
1204        tmp = tx_scratch(tx);
1205        ureg_ADD(ureg, tmp, src, ureg_imm1f(ureg, -0.5f));
1206        src = ureg_src(tmp);
1207        break;
1208    case NINED3DSPSM_BIASNEG:
1209        tmp = tx_scratch(tx);
1210        ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 0.5f), ureg_negate(src));
1211        src = ureg_src(tmp);
1212        break;
1213    case NINED3DSPSM_NOT:
1214        if (tx->native_integers && param->file == D3DSPR_CONSTBOOL) {
1215            tmp = tx_scratch(tx);
1216            ureg_NOT(ureg, tmp, src);
1217            src = ureg_src(tmp);
1218            break;
1219        } else { /* predicate */
1220            tmp = tx_scratch(tx);
1221            ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(src));
1222            src = ureg_src(tmp);
1223        }
1224        /* fall through */
1225    case NINED3DSPSM_COMP:
1226        tmp = tx_scratch(tx);
1227        ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(src));
1228        src = ureg_src(tmp);
1229        break;
1230    case NINED3DSPSM_DZ:
1231    case NINED3DSPSM_DW:
1232        /* Already handled*/
1233        break;
1234    case NINED3DSPSM_SIGN:
1235        tmp = tx_scratch(tx);
1236        ureg_MAD(ureg, tmp, src, ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
1237        src = ureg_src(tmp);
1238        break;
1239    case NINED3DSPSM_SIGNNEG:
1240        tmp = tx_scratch(tx);
1241        ureg_MAD(ureg, tmp, src, ureg_imm1f(ureg, -2.0f), ureg_imm1f(ureg, 1.0f));
1242        src = ureg_src(tmp);
1243        break;
1244    case NINED3DSPSM_X2:
1245        tmp = tx_scratch(tx);
1246        ureg_ADD(ureg, tmp, src, src);
1247        src = ureg_src(tmp);
1248        break;
1249    case NINED3DSPSM_X2NEG:
1250        tmp = tx_scratch(tx);
1251        ureg_ADD(ureg, tmp, src, src);
1252        src = ureg_negate(ureg_src(tmp));
1253        break;
1254    default:
1255        assert(param->mod == NINED3DSPSM_NONE);
1256        break;
1257    }
1258
1259    return src;
1260}
1261
1262static struct ureg_dst
1263_tx_dst_param(struct shader_translator *tx, const struct sm1_dst_param *param)
1264{
1265    struct ureg_dst dst;
1266
1267    switch (param->file)
1268    {
1269    case D3DSPR_TEMP:
1270        assert(!param->rel);
1271        tx_temp_alloc(tx, param->idx);
1272        dst = tx->regs.r[param->idx];
1273        break;
1274 /* case D3DSPR_TEXTURE: == D3DSPR_ADDR */
1275    case D3DSPR_ADDR:
1276        assert(!param->rel);
1277        if (tx->version.major < 2 && !IS_VS) {
1278            if (ureg_dst_is_undef(tx->regs.tS[param->idx]))
1279                tx->regs.tS[param->idx] = ureg_DECL_temporary(tx->ureg);
1280            dst = tx->regs.tS[param->idx];
1281        } else
1282        if (!IS_VS && tx->insn.opcode == D3DSIO_TEXKILL) { /* maybe others, too */
1283            tx_texcoord_alloc(tx, param->idx);
1284            dst = ureg_dst(tx->regs.vT[param->idx]);
1285        } else {
1286            tx_addr_alloc(tx, param->idx);
1287            dst = tx->regs.a0;
1288        }
1289        break;
1290    case D3DSPR_RASTOUT:
1291        assert(!param->rel);
1292        switch (param->idx) {
1293        case 0:
1294            if (ureg_dst_is_undef(tx->regs.oPos))
1295                tx->regs.oPos =
1296                    ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_POSITION, 0);
1297            dst = tx->regs.oPos;
1298            break;
1299        case 1:
1300            if (ureg_dst_is_undef(tx->regs.oFog))
1301                tx->regs.oFog =
1302                    ureg_saturate(ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_GENERIC, 16));
1303            dst = tx->regs.oFog;
1304            break;
1305        case 2:
1306            if (ureg_dst_is_undef(tx->regs.oPts))
1307                tx->regs.oPts = ureg_DECL_temporary(tx->ureg);
1308            dst = tx->regs.oPts;
1309            break;
1310        default:
1311            assert(0);
1312            break;
1313        }
1314        break;
1315 /* case D3DSPR_TEXCRDOUT: == D3DSPR_OUTPUT */
1316    case D3DSPR_OUTPUT:
1317        if (tx->version.major < 3) {
1318            assert(!param->rel);
1319            dst = ureg_DECL_output(tx->ureg, tx->texcoord_sn, param->idx);
1320        } else {
1321            assert(!param->rel); /* TODO */
1322            assert(param->idx < ARRAY_SIZE(tx->regs.o));
1323            dst = tx->regs.o[param->idx];
1324        }
1325        break;
1326    case D3DSPR_ATTROUT: /* VS */
1327    case D3DSPR_COLOROUT: /* PS */
1328        assert(param->idx >= 0 && param->idx < 4);
1329        assert(!param->rel);
1330        tx->info->rt_mask |= 1 << param->idx;
1331        if (ureg_dst_is_undef(tx->regs.oCol[param->idx])) {
1332            /* ps < 3: oCol[0] will have fog blending afterward */
1333            if (!IS_VS && tx->version.major < 3 && param->idx == 0) {
1334                tx->regs.oCol[0] = ureg_DECL_temporary(tx->ureg);
1335            } else {
1336                tx->regs.oCol[param->idx] =
1337                    ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_COLOR, param->idx);
1338            }
1339        }
1340        dst = tx->regs.oCol[param->idx];
1341        if (IS_VS && tx->version.major < 3)
1342            dst = ureg_saturate(dst);
1343        break;
1344    case D3DSPR_DEPTHOUT:
1345        assert(!param->rel);
1346        if (ureg_dst_is_undef(tx->regs.oDepth))
1347           tx->regs.oDepth =
1348              ureg_DECL_output_masked(tx->ureg, TGSI_SEMANTIC_POSITION, 0,
1349                                      TGSI_WRITEMASK_Z, 0, 1);
1350        dst = tx->regs.oDepth; /* XXX: must write .z component */
1351        break;
1352    case D3DSPR_PREDICATE:
1353        if (ureg_dst_is_undef(tx->regs.predicate))
1354            tx->regs.predicate = ureg_DECL_temporary(tx->ureg);
1355        dst = tx->regs.predicate;
1356        break;
1357    case D3DSPR_TEMPFLOAT16:
1358        DBG("unhandled D3DSPR: %u\n", param->file);
1359        break;
1360    default:
1361        assert(!"invalid dst D3DSPR");
1362        break;
1363    }
1364    if (param->rel)
1365        dst = ureg_dst_indirect(dst, tx_src_param(tx, param->rel));
1366
1367    if (param->mask != NINED3DSP_WRITEMASK_ALL)
1368        dst = ureg_writemask(dst, param->mask);
1369    if (param->mod & NINED3DSPDM_SATURATE)
1370        dst = ureg_saturate(dst);
1371
1372    if (tx->predicated_activated) {
1373        tx->regs.predicate_dst = dst;
1374        dst = tx->regs.predicate_tmp;
1375    }
1376
1377    return dst;
1378}
1379
1380static struct ureg_dst
1381tx_dst_param(struct shader_translator *tx, const struct sm1_dst_param *param)
1382{
1383    if (param->shift) {
1384        tx->regs.tdst = ureg_writemask(tx_scratch(tx), param->mask);
1385        return tx->regs.tdst;
1386    }
1387    return _tx_dst_param(tx, param);
1388}
1389
1390static void
1391tx_apply_dst0_modifiers(struct shader_translator *tx)
1392{
1393    struct ureg_dst rdst;
1394    float f;
1395
1396    if (!tx->insn.ndst || !tx->insn.dst[0].shift || tx->insn.opcode == D3DSIO_TEXKILL)
1397        return;
1398    rdst = _tx_dst_param(tx, &tx->insn.dst[0]);
1399
1400    assert(rdst.File != TGSI_FILE_ADDRESS); /* this probably isn't possible */
1401
1402    if (tx->insn.dst[0].shift < 0)
1403        f = 1.0f / (1 << -tx->insn.dst[0].shift);
1404    else
1405        f = 1 << tx->insn.dst[0].shift;
1406
1407    ureg_MUL(tx->ureg, rdst, ureg_src(tx->regs.tdst), ureg_imm1f(tx->ureg, f));
1408}
1409
1410static struct ureg_src
1411tx_dst_param_as_src(struct shader_translator *tx, const struct sm1_dst_param *param)
1412{
1413    struct ureg_src src;
1414
1415    assert(!param->shift);
1416    assert(!(param->mod & NINED3DSPDM_SATURATE));
1417
1418    switch (param->file) {
1419    case D3DSPR_INPUT:
1420        if (IS_VS) {
1421            src = ureg_src_register(TGSI_FILE_INPUT, param->idx);
1422        } else {
1423            assert(!param->rel);
1424            assert(param->idx < ARRAY_SIZE(tx->regs.v));
1425            src = tx->regs.v[param->idx];
1426        }
1427        break;
1428    default:
1429        src = ureg_src(tx_dst_param(tx, param));
1430        break;
1431    }
1432    if (param->rel)
1433        src = ureg_src_indirect(src, tx_src_param(tx, param->rel));
1434
1435    if (!param->mask)
1436        WARN("mask is 0, using identity swizzle\n");
1437
1438    if (param->mask && param->mask != NINED3DSP_WRITEMASK_ALL) {
1439        char s[4];
1440        int n;
1441        int c;
1442        for (n = 0, c = 0; c < 4; ++c)
1443            if (param->mask & (1 << c))
1444                s[n++] = c;
1445        assert(n);
1446        for (c = n; c < 4; ++c)
1447            s[c] = s[n - 1];
1448        src = ureg_swizzle(src, s[0], s[1], s[2], s[3]);
1449    }
1450    return src;
1451}
1452
1453static HRESULT
1454NineTranslateInstruction_Mkxn(struct shader_translator *tx, const unsigned k, const unsigned n)
1455{
1456    struct ureg_program *ureg = tx->ureg;
1457    struct ureg_dst dst;
1458    struct ureg_src src[2];
1459    struct sm1_src_param *src_mat = &tx->insn.src[1];
1460    unsigned i;
1461
1462    dst = tx_dst_param(tx, &tx->insn.dst[0]);
1463    src[0] = tx_src_param(tx, &tx->insn.src[0]);
1464
1465    for (i = 0; i < n; i++)
1466    {
1467        const unsigned m = (1 << i);
1468
1469        src[1] = tx_src_param(tx, src_mat);
1470        src_mat->idx++;
1471
1472        if (!(dst.WriteMask & m))
1473            continue;
1474
1475        /* XXX: src == dst case ? */
1476
1477        switch (k) {
1478        case 3:
1479            ureg_DP3(ureg, ureg_writemask(dst, m), src[0], src[1]);
1480            break;
1481        case 4:
1482            ureg_DP4(ureg, ureg_writemask(dst, m), src[0], src[1]);
1483            break;
1484        default:
1485            DBG("invalid operation: M%ux%u\n", m, n);
1486            break;
1487        }
1488    }
1489
1490    return D3D_OK;
1491}
1492
/* Shader version helpers.
 * V(maj, min) packs a version into one integer, major in bits 8 and up,
 * minor OR-ed into the low bits.
 * VNOTSUPPORTED expands to a pair of zeros. */
#define VNOTSUPPORTED   0, 0
#define V(maj, min)     ((min) | ((maj) << 8))
1495
1496static inline const char *
1497d3dsio_to_string( unsigned opcode )
1498{
1499    static const char *names[] = {
1500        "NOP",
1501        "MOV",
1502        "ADD",
1503        "SUB",
1504        "MAD",
1505        "MUL",
1506        "RCP",
1507        "RSQ",
1508        "DP3",
1509        "DP4",
1510        "MIN",
1511        "MAX",
1512        "SLT",
1513        "SGE",
1514        "EXP",
1515        "LOG",
1516        "LIT",
1517        "DST",
1518        "LRP",
1519        "FRC",
1520        "M4x4",
1521        "M4x3",
1522        "M3x4",
1523        "M3x3",
1524        "M3x2",
1525        "CALL",
1526        "CALLNZ",
1527        "LOOP",
1528        "RET",
1529        "ENDLOOP",
1530        "LABEL",
1531        "DCL",
1532        "POW",
1533        "CRS",
1534        "SGN",
1535        "ABS",
1536        "NRM",
1537        "SINCOS",
1538        "REP",
1539        "ENDREP",
1540        "IF",
1541        "IFC",
1542        "ELSE",
1543        "ENDIF",
1544        "BREAK",
1545        "BREAKC",
1546        "MOVA",
1547        "DEFB",
1548        "DEFI",
1549        NULL,
1550        NULL,
1551        NULL,
1552        NULL,
1553        NULL,
1554        NULL,
1555        NULL,
1556        NULL,
1557        NULL,
1558        NULL,
1559        NULL,
1560        NULL,
1561        NULL,
1562        NULL,
1563        NULL,
1564        "TEXCOORD",
1565        "TEXKILL",
1566        "TEX",
1567        "TEXBEM",
1568        "TEXBEML",
1569        "TEXREG2AR",
1570        "TEXREG2GB",
1571        "TEXM3x2PAD",
1572        "TEXM3x2TEX",
1573        "TEXM3x3PAD",
1574        "TEXM3x3TEX",
1575        NULL,
1576        "TEXM3x3SPEC",
1577        "TEXM3x3VSPEC",
1578        "EXPP",
1579        "LOGP",
1580        "CND",
1581        "DEF",
1582        "TEXREG2RGB",
1583        "TEXDP3TEX",
1584        "TEXM3x2DEPTH",
1585        "TEXDP3",
1586        "TEXM3x3",
1587        "TEXDEPTH",
1588        "CMP",
1589        "BEM",
1590        "DP2ADD",
1591        "DSX",
1592        "DSY",
1593        "TEXLDD",
1594        "SETP",
1595        "TEXLDL",
1596        "BREAKP"
1597    };
1598
1599    if (opcode < ARRAY_SIZE(names)) return names[opcode];
1600
1601    switch (opcode) {
1602    case D3DSIO_PHASE: return "PHASE";
1603    case D3DSIO_COMMENT: return "COMMENT";
1604    case D3DSIO_END: return "END";
1605    default:
1606        return NULL;
1607    }
1608}
1609
1610#define NULL_INSTRUCTION            { 0, { 0, 0 }, { 0, 0 }, 0, 0, NULL }
1611#define IS_VALID_INSTRUCTION(inst)  ((inst).vert_version.min | \
1612                                     (inst).vert_version.max | \
1613                                     (inst).frag_version.min | \
1614                                     (inst).frag_version.max)
1615
1616#define SPECIAL(name) \
1617    NineTranslateInstruction_##name
1618
1619#define DECL_SPECIAL(name) \
1620    static HRESULT \
1621    NineTranslateInstruction_##name( struct shader_translator *tx )
1622
1623static HRESULT
1624NineTranslateInstruction_Generic(struct shader_translator *);
1625
1626DECL_SPECIAL(NOP)
1627{
1628    /* Nothing to do. NOP was used to avoid hangs
1629     * with very old d3d drivers. */
1630    return D3D_OK;
1631}
1632
1633DECL_SPECIAL(SUB)
1634{
1635    struct ureg_program *ureg = tx->ureg;
1636    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1637    struct ureg_src src0 = tx_src_param(tx, &tx->insn.src[0]);
1638    struct ureg_src src1 = tx_src_param(tx, &tx->insn.src[1]);
1639
1640    ureg_ADD(ureg, dst, src0, ureg_negate(src1));
1641    return D3D_OK;
1642}
1643
1644DECL_SPECIAL(ABS)
1645{
1646    struct ureg_program *ureg = tx->ureg;
1647    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1648    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
1649
1650    ureg_MOV(ureg, dst, ureg_abs(src));
1651    return D3D_OK;
1652}
1653
1654DECL_SPECIAL(XPD)
1655{
1656    struct ureg_program *ureg = tx->ureg;
1657    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1658    struct ureg_src src0 = tx_src_param(tx, &tx->insn.src[0]);
1659    struct ureg_src src1 = tx_src_param(tx, &tx->insn.src[1]);
1660
1661    ureg_MUL(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYZ),
1662             ureg_swizzle(src0, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z,
1663                          TGSI_SWIZZLE_X, 0),
1664             ureg_swizzle(src1, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X,
1665                          TGSI_SWIZZLE_Y, 0));
1666    ureg_MAD(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYZ),
1667             ureg_swizzle(src0, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X,
1668                          TGSI_SWIZZLE_Y, 0),
1669             ureg_negate(ureg_swizzle(src1, TGSI_SWIZZLE_Y,
1670                                      TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X, 0)),
1671             ureg_src(dst));
1672    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_W),
1673             ureg_imm1f(ureg, 1));
1674    return D3D_OK;
1675}
1676
1677DECL_SPECIAL(M4x4)
1678{
1679    return NineTranslateInstruction_Mkxn(tx, 4, 4);
1680}
1681
1682DECL_SPECIAL(M4x3)
1683{
1684    return NineTranslateInstruction_Mkxn(tx, 4, 3);
1685}
1686
1687DECL_SPECIAL(M3x4)
1688{
1689    return NineTranslateInstruction_Mkxn(tx, 3, 4);
1690}
1691
1692DECL_SPECIAL(M3x3)
1693{
1694    return NineTranslateInstruction_Mkxn(tx, 3, 3);
1695}
1696
1697DECL_SPECIAL(M3x2)
1698{
1699    return NineTranslateInstruction_Mkxn(tx, 3, 2);
1700}
1701
1702DECL_SPECIAL(CMP)
1703{
1704    ureg_CMP(tx->ureg, tx_dst_param(tx, &tx->insn.dst[0]),
1705             tx_src_param(tx, &tx->insn.src[0]),
1706             tx_src_param(tx, &tx->insn.src[2]),
1707             tx_src_param(tx, &tx->insn.src[1]));
1708    return D3D_OK;
1709}
1710
1711DECL_SPECIAL(CND)
1712{
1713    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1714    struct ureg_dst cgt;
1715    struct ureg_src cnd;
1716
1717    /* the coissue flag was a tip for compilers to advise to
1718     * execute two operations at the same time, in cases
1719     * the two executions had same dst with different channels.
1720     * It has no effect on current hw. However it seems CND
1721     * is affected. The handling of this very specific case
1722     * handled below mimick wine behaviour */
1723    if (tx->insn.coissue && tx->version.major == 1 && tx->version.minor < 4 && tx->insn.dst[0].mask != NINED3DSP_WRITEMASK_3) {
1724        ureg_MOV(tx->ureg,
1725                 dst, tx_src_param(tx, &tx->insn.src[1]));
1726        return D3D_OK;
1727    }
1728
1729    cnd = tx_src_param(tx, &tx->insn.src[0]);
1730    cgt = tx_scratch(tx);
1731
1732    if (tx->version.major == 1 && tx->version.minor < 4)
1733        cnd = ureg_scalar(cnd, TGSI_SWIZZLE_W);
1734
1735    ureg_SGT(tx->ureg, cgt, cnd, ureg_imm1f(tx->ureg, 0.5f));
1736
1737    ureg_CMP(tx->ureg, dst, ureg_negate(ureg_src(cgt)),
1738             tx_src_param(tx, &tx->insn.src[1]),
1739             tx_src_param(tx, &tx->insn.src[2]));
1740    return D3D_OK;
1741}
1742
1743DECL_SPECIAL(CALL)
1744{
1745    assert(tx->insn.src[0].idx < tx->num_inst_labels);
1746    ureg_CAL(tx->ureg, &tx->inst_labels[tx->insn.src[0].idx]);
1747    return D3D_OK;
1748}
1749
1750DECL_SPECIAL(CALLNZ)
1751{
1752    struct ureg_program *ureg = tx->ureg;
1753    struct ureg_src src = tx_src_param(tx, &tx->insn.src[1]);
1754
1755    if (!tx->native_integers)
1756        ureg_IF(ureg, src, tx_cond(tx));
1757    else
1758        ureg_UIF(ureg, src, tx_cond(tx));
1759    ureg_CAL(ureg, &tx->inst_labels[tx->insn.src[0].idx]);
1760    tx_endcond(tx);
1761    ureg_ENDIF(ureg);
1762    return D3D_OK;
1763}
1764
1765DECL_SPECIAL(LOOP)
1766{
1767    struct ureg_program *ureg = tx->ureg;
1768    unsigned *label;
1769    struct ureg_src src = tx_src_param(tx, &tx->insn.src[1]);
1770    struct ureg_dst ctr;
1771    struct ureg_dst tmp;
1772    struct ureg_src ctrx;
1773
1774    label = tx_bgnloop(tx);
1775    ctr = tx_get_loopctr(tx, TRUE);
1776    ctrx = ureg_scalar(ureg_src(ctr), TGSI_SWIZZLE_X);
1777
1778    /* src: num_iterations - start_value of al - step for al - 0 */
1779    ureg_MOV(ureg, ctr, src);
1780    ureg_BGNLOOP(tx->ureg, label);
1781    tmp = tx_scratch_scalar(tx);
1782    /* Initially ctr.x contains the number of iterations.
1783     * ctr.y will contain the updated value of al.
1784     * We decrease ctr.x at the end of every iteration,
1785     * and stop when it reaches 0. */
1786
1787    if (!tx->native_integers) {
1788        /* case src and ctr contain floats */
1789        /* to avoid precision issue, we stop when ctr <= 0.5 */
1790        ureg_SGE(ureg, tmp, ureg_imm1f(ureg, 0.5f), ctrx);
1791        ureg_IF(ureg, tx_src_scalar(tmp), tx_cond(tx));
1792    } else {
1793        /* case src and ctr contain integers */
1794        ureg_ISGE(ureg, tmp, ureg_imm1i(ureg, 0), ctrx);
1795        ureg_UIF(ureg, tx_src_scalar(tmp), tx_cond(tx));
1796    }
1797    ureg_BRK(ureg);
1798    tx_endcond(tx);
1799    ureg_ENDIF(ureg);
1800    return D3D_OK;
1801}
1802
1803DECL_SPECIAL(RET)
1804{
1805    ureg_RET(tx->ureg);
1806    return D3D_OK;
1807}
1808
1809DECL_SPECIAL(ENDLOOP)
1810{
1811    struct ureg_program *ureg = tx->ureg;
1812    struct ureg_dst ctr = tx_get_loopctr(tx, TRUE);
1813    struct ureg_dst dst_ctrx, dst_al;
1814    struct ureg_src src_ctr, al_counter;
1815
1816    dst_ctrx = ureg_writemask(ctr, NINED3DSP_WRITEMASK_0);
1817    dst_al = ureg_writemask(ctr, NINED3DSP_WRITEMASK_1);
1818    src_ctr = ureg_src(ctr);
1819    al_counter = ureg_scalar(src_ctr, TGSI_SWIZZLE_Z);
1820
1821    /* ctr.x -= 1
1822     * ctr.y (aL) += step */
1823    if (!tx->native_integers) {
1824        ureg_ADD(ureg, dst_ctrx, src_ctr, ureg_imm1f(ureg, -1.0f));
1825        ureg_ADD(ureg, dst_al, src_ctr, al_counter);
1826    } else {
1827        ureg_UADD(ureg, dst_ctrx, src_ctr, ureg_imm1i(ureg, -1));
1828        ureg_UADD(ureg, dst_al, src_ctr, al_counter);
1829    }
1830    ureg_ENDLOOP(tx->ureg, tx_endloop(tx));
1831    return D3D_OK;
1832}
1833
1834DECL_SPECIAL(LABEL)
1835{
1836    unsigned k = tx->num_inst_labels;
1837    unsigned n = tx->insn.src[0].idx;
1838    assert(n < 2048);
1839    if (n >= k)
1840       tx->inst_labels = REALLOC(tx->inst_labels,
1841                                 k * sizeof(tx->inst_labels[0]),
1842                                 n * sizeof(tx->inst_labels[0]));
1843
1844    tx->inst_labels[n] = ureg_get_instruction_number(tx->ureg);
1845    return D3D_OK;
1846}
1847
1848DECL_SPECIAL(SINCOS)
1849{
1850    struct ureg_program *ureg = tx->ureg;
1851    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1852    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
1853    struct ureg_dst tmp = tx_scratch_scalar(tx);
1854
1855    assert(!(dst.WriteMask & 0xc));
1856
1857    /* Copying to a temporary register avoids src/dst aliasing.
1858     * src is supposed to have replicated swizzle. */
1859    ureg_MOV(ureg, tmp, src);
1860
1861    /* z undefined, w untouched */
1862    ureg_COS(ureg, ureg_writemask(dst, TGSI_WRITEMASK_X),
1863             tx_src_scalar(tmp));
1864    ureg_SIN(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Y),
1865             tx_src_scalar(tmp));
1866    return D3D_OK;
1867}
1868
1869DECL_SPECIAL(SGN)
1870{
1871    ureg_SSG(tx->ureg,
1872             tx_dst_param(tx, &tx->insn.dst[0]),
1873             tx_src_param(tx, &tx->insn.src[0]));
1874    return D3D_OK;
1875}
1876
1877DECL_SPECIAL(REP)
1878{
1879    struct ureg_program *ureg = tx->ureg;
1880    unsigned *label;
1881    struct ureg_src rep = tx_src_param(tx, &tx->insn.src[0]);
1882    struct ureg_dst ctr;
1883    struct ureg_dst tmp;
1884    struct ureg_src ctrx;
1885
1886    label = tx_bgnloop(tx);
1887    ctr = ureg_writemask(tx_get_loopctr(tx, FALSE), NINED3DSP_WRITEMASK_0);
1888    ctrx = ureg_scalar(ureg_src(ctr), TGSI_SWIZZLE_X);
1889
1890    /* NOTE: rep must be constant, so we don't have to save the count */
1891    assert(rep.File == TGSI_FILE_CONSTANT || rep.File == TGSI_FILE_IMMEDIATE);
1892
1893    /* rep: num_iterations - 0 - 0 - 0 */
1894    ureg_MOV(ureg, ctr, rep);
1895    ureg_BGNLOOP(ureg, label);
1896    tmp = tx_scratch_scalar(tx);
1897    /* Initially ctr.x contains the number of iterations.
1898     * We decrease ctr.x at the end of every iteration,
1899     * and stop when it reaches 0. */
1900
1901    if (!tx->native_integers) {
1902        /* case src and ctr contain floats */
1903        /* to avoid precision issue, we stop when ctr <= 0.5 */
1904        ureg_SGE(ureg, tmp, ureg_imm1f(ureg, 0.5f), ctrx);
1905        ureg_IF(ureg, tx_src_scalar(tmp), tx_cond(tx));
1906    } else {
1907        /* case src and ctr contain integers */
1908        ureg_ISGE(ureg, tmp, ureg_imm1i(ureg, 0), ctrx);
1909        ureg_UIF(ureg, tx_src_scalar(tmp), tx_cond(tx));
1910    }
1911    ureg_BRK(ureg);
1912    tx_endcond(tx);
1913    ureg_ENDIF(ureg);
1914
1915    return D3D_OK;
1916}
1917
1918DECL_SPECIAL(ENDREP)
1919{
1920    struct ureg_program *ureg = tx->ureg;
1921    struct ureg_dst ctr = tx_get_loopctr(tx, FALSE);
1922    struct ureg_dst dst_ctrx = ureg_writemask(ctr, NINED3DSP_WRITEMASK_0);
1923    struct ureg_src src_ctr = ureg_src(ctr);
1924
1925    /* ctr.x -= 1 */
1926    if (!tx->native_integers)
1927        ureg_ADD(ureg, dst_ctrx, src_ctr, ureg_imm1f(ureg, -1.0f));
1928    else
1929        ureg_UADD(ureg, dst_ctrx, src_ctr, ureg_imm1i(ureg, -1));
1930
1931    ureg_ENDLOOP(tx->ureg, tx_endloop(tx));
1932    return D3D_OK;
1933}
1934
1935DECL_SPECIAL(ENDIF)
1936{
1937    tx_endcond(tx);
1938    ureg_ENDIF(tx->ureg);
1939    return D3D_OK;
1940}
1941
1942DECL_SPECIAL(IF)
1943{
1944    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
1945
1946    if (tx->native_integers && tx->insn.src[0].file == D3DSPR_CONSTBOOL)
1947        ureg_UIF(tx->ureg, src, tx_cond(tx));
1948    else
1949        ureg_IF(tx->ureg, src, tx_cond(tx));
1950
1951    return D3D_OK;
1952}
1953
1954static inline unsigned
1955sm1_insn_flags_to_tgsi_setop(BYTE flags)
1956{
1957    switch (flags) {
1958    case NINED3DSHADER_REL_OP_GT: return TGSI_OPCODE_SGT;
1959    case NINED3DSHADER_REL_OP_EQ: return TGSI_OPCODE_SEQ;
1960    case NINED3DSHADER_REL_OP_GE: return TGSI_OPCODE_SGE;
1961    case NINED3DSHADER_REL_OP_LT: return TGSI_OPCODE_SLT;
1962    case NINED3DSHADER_REL_OP_NE: return TGSI_OPCODE_SNE;
1963    case NINED3DSHADER_REL_OP_LE: return TGSI_OPCODE_SLE;
1964    default:
1965        assert(!"invalid comparison flags");
1966        return TGSI_OPCODE_SGT;
1967    }
1968}
1969
1970DECL_SPECIAL(IFC)
1971{
1972    const unsigned cmp_op = sm1_insn_flags_to_tgsi_setop(tx->insn.flags);
1973    struct ureg_src src[2];
1974    struct ureg_dst tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
1975    src[0] = tx_src_param(tx, &tx->insn.src[0]);
1976    src[1] = tx_src_param(tx, &tx->insn.src[1]);
1977    ureg_insn(tx->ureg, cmp_op, &tmp, 1, src, 2, 0);
1978    ureg_IF(tx->ureg, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), tx_cond(tx));
1979    return D3D_OK;
1980}
1981
1982DECL_SPECIAL(ELSE)
1983{
1984    ureg_ELSE(tx->ureg, tx_elsecond(tx));
1985    return D3D_OK;
1986}
1987
1988DECL_SPECIAL(BREAKC)
1989{
1990    const unsigned cmp_op = sm1_insn_flags_to_tgsi_setop(tx->insn.flags);
1991    struct ureg_src src[2];
1992    struct ureg_dst tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
1993    src[0] = tx_src_param(tx, &tx->insn.src[0]);
1994    src[1] = tx_src_param(tx, &tx->insn.src[1]);
1995    ureg_insn(tx->ureg, cmp_op, &tmp, 1, src, 2, 0);
1996    ureg_IF(tx->ureg, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), tx_cond(tx));
1997    ureg_BRK(tx->ureg);
1998    tx_endcond(tx);
1999    ureg_ENDIF(tx->ureg);
2000    return D3D_OK;
2001}
2002
2003static const char *sm1_declusage_names[] =
2004{
2005    [D3DDECLUSAGE_POSITION] = "POSITION",
2006    [D3DDECLUSAGE_BLENDWEIGHT] = "BLENDWEIGHT",
2007    [D3DDECLUSAGE_BLENDINDICES] = "BLENDINDICES",
2008    [D3DDECLUSAGE_NORMAL] = "NORMAL",
2009    [D3DDECLUSAGE_PSIZE] = "PSIZE",
2010    [D3DDECLUSAGE_TEXCOORD] = "TEXCOORD",
2011    [D3DDECLUSAGE_TANGENT] = "TANGENT",
2012    [D3DDECLUSAGE_BINORMAL] = "BINORMAL",
2013    [D3DDECLUSAGE_TESSFACTOR] = "TESSFACTOR",
2014    [D3DDECLUSAGE_POSITIONT] = "POSITIONT",
2015    [D3DDECLUSAGE_COLOR] = "COLOR",
2016    [D3DDECLUSAGE_FOG] = "FOG",
2017    [D3DDECLUSAGE_DEPTH] = "DEPTH",
2018    [D3DDECLUSAGE_SAMPLE] = "SAMPLE"
2019};
2020
2021static inline unsigned
2022sm1_to_nine_declusage(struct sm1_semantic *dcl)
2023{
2024    return nine_d3d9_to_nine_declusage(dcl->usage, dcl->usage_idx);
2025}
2026
2027static void
2028sm1_declusage_to_tgsi(struct tgsi_declaration_semantic *sem,
2029                      boolean tc,
2030                      struct sm1_semantic *dcl)
2031{
2032    BYTE index = dcl->usage_idx;
2033
2034    /* For everything that is not matching to a TGSI_SEMANTIC_****,
2035     * we match to a TGSI_SEMANTIC_GENERIC with index.
2036     *
2037     * The index can be anything UINT16 and usage_idx is BYTE,
2038     * so we can fit everything. It doesn't matter if indices
2039     * are close together or low.
2040     *
2041     *
2042     * POSITION >= 1: 10 * index + 7
2043     * COLOR >= 2: 10 * (index-1) + 8
2044     * FOG: 16
2045     * TEXCOORD[0..15]: index
2046     * BLENDWEIGHT: 10 * index + 19
2047     * BLENDINDICES: 10 * index + 20
2048     * NORMAL: 10 * index + 21
2049     * TANGENT: 10 * index + 22
2050     * BINORMAL: 10 * index + 23
2051     * TESSFACTOR: 10 * index + 24
2052     */
2053
2054    switch (dcl->usage) {
2055    case D3DDECLUSAGE_POSITION:
2056    case D3DDECLUSAGE_POSITIONT:
2057    case D3DDECLUSAGE_DEPTH:
2058        if (index == 0) {
2059            sem->Name = TGSI_SEMANTIC_POSITION;
2060            sem->Index = 0;
2061        } else {
2062            sem->Name = TGSI_SEMANTIC_GENERIC;
2063            sem->Index = 10 * index + 7;
2064        }
2065        break;
2066    case D3DDECLUSAGE_COLOR:
2067        if (index < 2) {
2068            sem->Name = TGSI_SEMANTIC_COLOR;
2069            sem->Index = index;
2070        } else {
2071            sem->Name = TGSI_SEMANTIC_GENERIC;
2072            sem->Index = 10 * (index-1) + 8;
2073        }
2074        break;
2075    case D3DDECLUSAGE_FOG:
2076        assert(index == 0);
2077        sem->Name = TGSI_SEMANTIC_GENERIC;
2078        sem->Index = 16;
2079        break;
2080    case D3DDECLUSAGE_PSIZE:
2081        assert(index == 0);
2082        sem->Name = TGSI_SEMANTIC_PSIZE;
2083        sem->Index = 0;
2084        break;
2085    case D3DDECLUSAGE_TEXCOORD:
2086        assert(index < 16);
2087        if (index < 8 && tc)
2088            sem->Name = TGSI_SEMANTIC_TEXCOORD;
2089        else
2090            sem->Name = TGSI_SEMANTIC_GENERIC;
2091        sem->Index = index;
2092        break;
2093    case D3DDECLUSAGE_BLENDWEIGHT:
2094        sem->Name = TGSI_SEMANTIC_GENERIC;
2095        sem->Index = 10 * index + 19;
2096        break;
2097    case D3DDECLUSAGE_BLENDINDICES:
2098        sem->Name = TGSI_SEMANTIC_GENERIC;
2099        sem->Index = 10 * index + 20;
2100        break;
2101    case D3DDECLUSAGE_NORMAL:
2102        sem->Name = TGSI_SEMANTIC_GENERIC;
2103        sem->Index = 10 * index + 21;
2104        break;
2105    case D3DDECLUSAGE_TANGENT:
2106        sem->Name = TGSI_SEMANTIC_GENERIC;
2107        sem->Index = 10 * index + 22;
2108        break;
2109    case D3DDECLUSAGE_BINORMAL:
2110        sem->Name = TGSI_SEMANTIC_GENERIC;
2111        sem->Index = 10 * index + 23;
2112        break;
2113    case D3DDECLUSAGE_TESSFACTOR:
2114        sem->Name = TGSI_SEMANTIC_GENERIC;
2115        sem->Index = 10 * index + 24;
2116        break;
2117    case D3DDECLUSAGE_SAMPLE:
2118        sem->Name = TGSI_SEMANTIC_COUNT;
2119        sem->Index = 0;
2120        break;
2121    default:
2122        unreachable("Invalid DECLUSAGE.");
2123        break;
2124    }
2125}
2126
2127#define NINED3DSTT_1D     (D3DSTT_1D >> D3DSP_TEXTURETYPE_SHIFT)
2128#define NINED3DSTT_2D     (D3DSTT_2D >> D3DSP_TEXTURETYPE_SHIFT)
2129#define NINED3DSTT_VOLUME (D3DSTT_VOLUME >> D3DSP_TEXTURETYPE_SHIFT)
2130#define NINED3DSTT_CUBE   (D3DSTT_CUBE >> D3DSP_TEXTURETYPE_SHIFT)
2131static inline unsigned
2132d3dstt_to_tgsi_tex(BYTE sampler_type)
2133{
2134    switch (sampler_type) {
2135    case NINED3DSTT_1D:     return TGSI_TEXTURE_1D;
2136    case NINED3DSTT_2D:     return TGSI_TEXTURE_2D;
2137    case NINED3DSTT_VOLUME: return TGSI_TEXTURE_3D;
2138    case NINED3DSTT_CUBE:   return TGSI_TEXTURE_CUBE;
2139    default:
2140        assert(0);
2141        return TGSI_TEXTURE_UNKNOWN;
2142    }
2143}
2144static inline unsigned
2145d3dstt_to_tgsi_tex_shadow(BYTE sampler_type)
2146{
2147    switch (sampler_type) {
2148    case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW1D;
2149    case NINED3DSTT_2D: return TGSI_TEXTURE_SHADOW2D;
2150    case NINED3DSTT_VOLUME:
2151    case NINED3DSTT_CUBE:
2152    default:
2153        assert(0);
2154        return TGSI_TEXTURE_UNKNOWN;
2155    }
2156}
2157static inline unsigned
2158ps1x_sampler_type(const struct nine_shader_info *info, unsigned stage)
2159{
2160    boolean shadow = !!(info->sampler_mask_shadow & (1 << stage));
2161    switch ((info->sampler_ps1xtypes >> (stage * 2)) & 0x3) {
2162    case 1: return shadow ? TGSI_TEXTURE_SHADOW1D : TGSI_TEXTURE_1D;
2163    case 0: return shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D;
2164    case 3: return TGSI_TEXTURE_3D;
2165    default:
2166        return TGSI_TEXTURE_CUBE;
2167    }
2168}
2169
2170static const char *
2171sm1_sampler_type_name(BYTE sampler_type)
2172{
2173    switch (sampler_type) {
2174    case NINED3DSTT_1D:     return "1D";
2175    case NINED3DSTT_2D:     return "2D";
2176    case NINED3DSTT_VOLUME: return "VOLUME";
2177    case NINED3DSTT_CUBE:   return "CUBE";
2178    default:
2179        return "(D3DSTT_?)";
2180    }
2181}
2182
2183static inline unsigned
2184nine_tgsi_to_interp_mode(struct tgsi_declaration_semantic *sem)
2185{
2186    switch (sem->Name) {
2187    case TGSI_SEMANTIC_POSITION:
2188    case TGSI_SEMANTIC_NORMAL:
2189        return TGSI_INTERPOLATE_LINEAR;
2190    case TGSI_SEMANTIC_BCOLOR:
2191    case TGSI_SEMANTIC_COLOR:
2192        return TGSI_INTERPOLATE_COLOR;
2193    case TGSI_SEMANTIC_FOG:
2194    case TGSI_SEMANTIC_GENERIC:
2195    case TGSI_SEMANTIC_TEXCOORD:
2196    case TGSI_SEMANTIC_CLIPDIST:
2197    case TGSI_SEMANTIC_CLIPVERTEX:
2198        return TGSI_INTERPOLATE_PERSPECTIVE;
2199    case TGSI_SEMANTIC_EDGEFLAG:
2200    case TGSI_SEMANTIC_FACE:
2201    case TGSI_SEMANTIC_INSTANCEID:
2202    case TGSI_SEMANTIC_PCOORD:
2203    case TGSI_SEMANTIC_PRIMID:
2204    case TGSI_SEMANTIC_PSIZE:
2205    case TGSI_SEMANTIC_VERTEXID:
2206        return TGSI_INTERPOLATE_CONSTANT;
2207    default:
2208        assert(0);
2209        return TGSI_INTERPOLATE_CONSTANT;
2210    }
2211}
2212
2213DECL_SPECIAL(DCL)
2214{
2215    struct ureg_program *ureg = tx->ureg;
2216    boolean is_input;
2217    boolean is_sampler;
2218    struct tgsi_declaration_semantic tgsi;
2219    struct sm1_semantic sem;
2220    sm1_read_semantic(tx, &sem);
2221
2222    is_input = sem.reg.file == D3DSPR_INPUT;
2223    is_sampler =
2224        sem.usage == D3DDECLUSAGE_SAMPLE || sem.reg.file == D3DSPR_SAMPLER;
2225
2226    DUMP("DCL ");
2227    sm1_dump_dst_param(&sem.reg);
2228    if (is_sampler)
2229        DUMP(" %s\n", sm1_sampler_type_name(sem.sampler_type));
2230    else
2231    if (tx->version.major >= 3)
2232        DUMP(" %s%i\n", sm1_declusage_names[sem.usage], sem.usage_idx);
2233    else
2234    if (sem.usage | sem.usage_idx)
2235        DUMP(" %u[%u]\n", sem.usage, sem.usage_idx);
2236    else
2237        DUMP("\n");
2238
2239    if (is_sampler) {
2240        const unsigned m = 1 << sem.reg.idx;
2241        ureg_DECL_sampler(ureg, sem.reg.idx);
2242        tx->info->sampler_mask |= m;
2243        tx->sampler_targets[sem.reg.idx] = (tx->info->sampler_mask_shadow & m) ?
2244            d3dstt_to_tgsi_tex_shadow(sem.sampler_type) :
2245            d3dstt_to_tgsi_tex(sem.sampler_type);
2246        return D3D_OK;
2247    }
2248
2249    sm1_declusage_to_tgsi(&tgsi, tx->want_texcoord, &sem);
2250    if (IS_VS) {
2251        if (is_input) {
2252            /* linkage outside of shader with vertex declaration */
2253            ureg_DECL_vs_input(ureg, sem.reg.idx);
2254            assert(sem.reg.idx < ARRAY_SIZE(tx->info->input_map));
2255            tx->info->input_map[sem.reg.idx] = sm1_to_nine_declusage(&sem);
2256            tx->info->num_inputs = MAX2(tx->info->num_inputs, sem.reg.idx + 1);
2257            /* NOTE: preserving order in case of indirect access */
2258        } else
2259        if (tx->version.major >= 3) {
2260            /* SM2 output semantic determined by file */
2261            assert(sem.reg.mask != 0);
2262            if (sem.usage == D3DDECLUSAGE_POSITIONT)
2263                tx->info->position_t = TRUE;
2264            assert(sem.reg.idx < ARRAY_SIZE(tx->regs.o));
2265            assert(ureg_dst_is_undef(tx->regs.o[sem.reg.idx]) && "Nine doesn't support yet packing");
2266            tx->regs.o[sem.reg.idx] = ureg_DECL_output_masked(
2267                ureg, tgsi.Name, tgsi.Index, sem.reg.mask, 0, 1);
2268            nine_record_outputs(tx, sem.usage, sem.usage_idx, sem.reg.mask, sem.reg.idx);
2269            if (tx->info->process_vertices && sem.usage == D3DDECLUSAGE_POSITION && sem.usage_idx == 0) {
2270                tx->regs.oPos_out = tx->regs.o[sem.reg.idx];
2271                tx->regs.o[sem.reg.idx] = ureg_DECL_temporary(ureg);
2272                tx->regs.oPos = tx->regs.o[sem.reg.idx];
2273            }
2274
2275            if (tgsi.Name == TGSI_SEMANTIC_PSIZE) {
2276                tx->regs.o[sem.reg.idx] = ureg_DECL_temporary(ureg);
2277                tx->regs.oPts = tx->regs.o[sem.reg.idx];
2278            }
2279        }
2280    } else {
2281        if (is_input && tx->version.major >= 3) {
2282            unsigned interp_location = 0;
2283            /* SM3 only, SM2 input semantic determined by file */
2284            assert(sem.reg.idx < ARRAY_SIZE(tx->regs.v));
2285            assert(ureg_src_is_undef(tx->regs.v[sem.reg.idx]) && "Nine doesn't support yet packing");
2286            /* PositionT and tessfactor forbidden */
2287            if (sem.usage == D3DDECLUSAGE_POSITIONT || sem.usage == D3DDECLUSAGE_TESSFACTOR)
2288                return D3DERR_INVALIDCALL;
2289
2290            if (tgsi.Name == TGSI_SEMANTIC_POSITION) {
2291                /* Position0 is forbidden (likely because vPos already does that) */
2292                if (sem.usage == D3DDECLUSAGE_POSITION)
2293                    return D3DERR_INVALIDCALL;
2294                /* Following code is for depth */
2295                tx->regs.v[sem.reg.idx] = nine_get_position_input(tx);
2296                return D3D_OK;
2297            }
2298
2299            if (sem.reg.mod & NINED3DSPDM_CENTROID ||
2300                (tgsi.Name == TGSI_SEMANTIC_COLOR && tx->info->force_color_in_centroid))
2301                interp_location = TGSI_INTERPOLATE_LOC_CENTROID;
2302
2303            tx->regs.v[sem.reg.idx] = ureg_DECL_fs_input_cyl_centroid(
2304                ureg, tgsi.Name, tgsi.Index,
2305                nine_tgsi_to_interp_mode(&tgsi),
2306                0, /* cylwrap */
2307                interp_location, 0, 1);
2308        } else
2309        if (!is_input && 0) { /* declare in COLOROUT/DEPTHOUT case */
2310            /* FragColor or FragDepth */
2311            assert(sem.reg.mask != 0);
2312            ureg_DECL_output_masked(ureg, tgsi.Name, tgsi.Index, sem.reg.mask,
2313                                    0, 1);
2314        }
2315    }
2316    return D3D_OK;
2317}
2318
2319DECL_SPECIAL(DEF)
2320{
2321    tx_set_lconstf(tx, tx->insn.dst[0].idx, tx->insn.src[0].imm.f);
2322    return D3D_OK;
2323}
2324
2325DECL_SPECIAL(DEFB)
2326{
2327    tx_set_lconstb(tx, tx->insn.dst[0].idx, tx->insn.src[0].imm.b);
2328    return D3D_OK;
2329}
2330
2331DECL_SPECIAL(DEFI)
2332{
2333    tx_set_lconsti(tx, tx->insn.dst[0].idx, tx->insn.src[0].imm.i);
2334    return D3D_OK;
2335}
2336
2337DECL_SPECIAL(POW)
2338{
2339    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2340    struct ureg_src src[2] = {
2341        tx_src_param(tx, &tx->insn.src[0]),
2342        tx_src_param(tx, &tx->insn.src[1])
2343    };
2344    ureg_POW(tx->ureg, dst, ureg_abs(src[0]), src[1]);
2345    return D3D_OK;
2346}
2347
2348/* Tests results on Win 10:
2349 * NV (NVIDIA GeForce GT 635M)
2350 * AMD (AMD Radeon HD 7730M)
2351 * INTEL (Intel(R) HD Graphics 4000)
2352 * PS2 and PS3:
2353 * RCP and RSQ can generate inf on NV and AMD.
2354 * RCP and RSQ are clamped on INTEL (+- FLT_MAX),
2355 * NV: log not clamped
2356 * AMD: log(0) is -FLT_MAX (but log(inf) is inf)
2357 * INTEL: log(0) is -FLT_MAX and log(inf) is 127
2358 * All devices have 0*anything = 0
2359 *
2360 * INTEL VS2 and VS3: same behaviour.
2361 * Some differences VS2 and VS3 for constants defined with inf/NaN.
2362 * While PS3, VS3 and PS2 keep NaN and Inf shader constants without change,
2363 * VS2 seems to clamp to zero (may be test failure).
2364 * AMD VS2: unknown, VS3: very likely behaviour of PS3
2365 * NV VS2 and VS3: very likely behaviour of PS3
 * For both, Inf in VS becomes NaN in PS
2367 * "Very likely" because the test was less extensive.
2368 *
2369 * Thus all clamping can be removed for shaders 2 and 3,
2370 * as long as 0*anything = 0.
 * Otherwise, clamp so that 0*anything = 0 still holds (anything then
 * being neither inf nor NaN, since the user is unlikely to pass them
 * as constants).
2374 * The status for VS1 and PS1 is unknown.
2375 */
2376
2377DECL_SPECIAL(RCP)
2378{
2379    struct ureg_program *ureg = tx->ureg;
2380    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2381    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2382    struct ureg_dst tmp = tx->mul_zero_wins ? dst : tx_scratch(tx);
2383    ureg_RCP(ureg, tmp, src);
2384    if (!tx->mul_zero_wins) {
2385        /* FLT_MAX has issues with Rayman */
2386        ureg_MIN(ureg, tmp, ureg_imm1f(ureg, FLT_MAX/2.f), ureg_src(tmp));
2387        ureg_MAX(ureg, dst, ureg_imm1f(ureg, -FLT_MAX/2.f), ureg_src(tmp));
2388    }
2389    return D3D_OK;
2390}
2391
2392DECL_SPECIAL(RSQ)
2393{
2394    struct ureg_program *ureg = tx->ureg;
2395    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2396    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2397    struct ureg_dst tmp = tx->mul_zero_wins ? dst : tx_scratch(tx);
2398    ureg_RSQ(ureg, tmp, ureg_abs(src));
2399    if (!tx->mul_zero_wins)
2400        ureg_MIN(ureg, dst, ureg_imm1f(ureg, FLT_MAX), ureg_src(tmp));
2401    return D3D_OK;
2402}
2403
2404DECL_SPECIAL(LOG)
2405{
2406    struct ureg_program *ureg = tx->ureg;
2407    struct ureg_dst tmp = tx_scratch_scalar(tx);
2408    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2409    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2410    ureg_LG2(ureg, tmp, ureg_abs(src));
2411    if (tx->mul_zero_wins) {
2412        ureg_MOV(ureg, dst, tx_src_scalar(tmp));
2413    } else {
2414        ureg_MAX(ureg, dst, ureg_imm1f(ureg, -FLT_MAX), tx_src_scalar(tmp));
2415    }
2416    return D3D_OK;
2417}
2418
2419DECL_SPECIAL(LIT)
2420{
2421    struct ureg_program *ureg = tx->ureg;
2422    struct ureg_dst tmp = tx_scratch(tx);
2423    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2424    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2425    ureg_LIT(ureg, tmp, src);
2426    /* d3d9 LIT is the same than gallium LIT. One difference is that d3d9
2427     * states that dst.z is 0 when src.y <= 0. Gallium definition can assign
2428     * it 0^0 if src.w=0, which value is driver dependent. */
2429    ureg_CMP(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Z),
2430             ureg_negate(ureg_scalar(src, TGSI_SWIZZLE_Y)),
2431             ureg_src(tmp), ureg_imm1f(ureg, 0.0f));
2432    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYW), ureg_src(tmp));
2433    return D3D_OK;
2434}
2435
2436DECL_SPECIAL(NRM)
2437{
2438    struct ureg_program *ureg = tx->ureg;
2439    struct ureg_dst tmp = tx_scratch_scalar(tx);
2440    struct ureg_src nrm = tx_src_scalar(tmp);
2441    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2442    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2443    ureg_DP3(ureg, tmp, src, src);
2444    ureg_RSQ(ureg, tmp, nrm);
2445    if (!tx->mul_zero_wins)
2446        ureg_MIN(ureg, tmp, ureg_imm1f(ureg, FLT_MAX), nrm);
2447    ureg_MUL(ureg, dst, src, nrm);
2448    return D3D_OK;
2449}
2450
2451DECL_SPECIAL(DP2ADD)
2452{
2453    struct ureg_dst tmp = tx_scratch_scalar(tx);
2454    struct ureg_src dp2 = tx_src_scalar(tmp);
2455    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2456    struct ureg_src src[3];
2457    int i;
2458    for (i = 0; i < 3; ++i)
2459        src[i] = tx_src_param(tx, &tx->insn.src[i]);
2460    assert_replicate_swizzle(&src[2]);
2461
2462    ureg_DP2(tx->ureg, tmp, src[0], src[1]);
2463    ureg_ADD(tx->ureg, dst, src[2], dp2);
2464
2465    return D3D_OK;
2466}
2467
2468DECL_SPECIAL(TEXCOORD)
2469{
2470    struct ureg_program *ureg = tx->ureg;
2471    const unsigned s = tx->insn.dst[0].idx;
2472    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2473
2474    tx_texcoord_alloc(tx, s);
2475    ureg_MOV(ureg, ureg_writemask(ureg_saturate(dst), TGSI_WRITEMASK_XYZ), tx->regs.vT[s]);
2476    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_W), ureg_imm1f(tx->ureg, 1.0f));
2477
2478    return D3D_OK;
2479}
2480
2481DECL_SPECIAL(TEXCOORD_ps14)
2482{
2483    struct ureg_program *ureg = tx->ureg;
2484    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2485    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2486
2487    assert(tx->insn.src[0].file == D3DSPR_TEXTURE);
2488
2489    ureg_MOV(ureg, dst, src);
2490
2491    return D3D_OK;
2492}
2493
2494DECL_SPECIAL(TEXKILL)
2495{
2496    struct ureg_src reg;
2497
2498    if (tx->version.major > 1 || tx->version.minor > 3) {
2499        reg = tx_dst_param_as_src(tx, &tx->insn.dst[0]);
2500    } else {
2501        tx_texcoord_alloc(tx, tx->insn.dst[0].idx);
2502        reg = tx->regs.vT[tx->insn.dst[0].idx];
2503    }
2504    if (tx->version.major < 2)
2505        reg = ureg_swizzle(reg, NINE_SWIZZLE4(X,Y,Z,Z));
2506    ureg_KILL_IF(tx->ureg, reg);
2507
2508    return D3D_OK;
2509}
2510
2511DECL_SPECIAL(TEXBEM)
2512{
2513    struct ureg_program *ureg = tx->ureg;
2514    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2515    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2516    struct ureg_dst tmp, tmp2, texcoord;
2517    struct ureg_src sample, m00, m01, m10, m11, c8m, c16m2;
2518    struct ureg_src bumpenvlscale, bumpenvloffset;
2519    const int m = tx->insn.dst[0].idx;
2520
2521    assert(tx->version.major == 1);
2522
2523    sample = ureg_DECL_sampler(ureg, m);
2524    tx->info->sampler_mask |= 1 << m;
2525
2526    tx_texcoord_alloc(tx, m);
2527
2528    tmp = tx_scratch(tx);
2529    tmp2 = tx_scratch(tx);
2530    texcoord = tx_scratch(tx);
2531    /*
2532     * Bump-env-matrix:
2533     * 00 is X
2534     * 01 is Y
2535     * 10 is Z
2536     * 11 is W
2537     */
2538    c8m = nine_float_constant_src(tx, 8+m);
2539    c16m2 = nine_float_constant_src(tx, 8+8+m/2);
2540
2541    m00 = NINE_APPLY_SWIZZLE(c8m, X);
2542    m01 = NINE_APPLY_SWIZZLE(c8m, Y);
2543    m10 = NINE_APPLY_SWIZZLE(c8m, Z);
2544    m11 = NINE_APPLY_SWIZZLE(c8m, W);
2545
2546    /* These two attributes are packed as X=scale0 Y=offset0 Z=scale1 W=offset1 etc */
2547    if (m % 2 == 0) {
2548        bumpenvlscale = NINE_APPLY_SWIZZLE(c16m2, X);
2549        bumpenvloffset = NINE_APPLY_SWIZZLE(c16m2, Y);
2550    } else {
2551        bumpenvlscale = NINE_APPLY_SWIZZLE(c16m2, Z);
2552        bumpenvloffset = NINE_APPLY_SWIZZLE(c16m2, W);
2553    }
2554
2555    apply_ps1x_projection(tx, texcoord, tx->regs.vT[m], m);
2556
2557    /* u' = TextureCoordinates(stage m)u + D3DTSS_BUMPENVMAT00(stage m)*t(n)R  */
2558    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m00,
2559             NINE_APPLY_SWIZZLE(src, X), ureg_src(texcoord));
2560    /* u' = u' + D3DTSS_BUMPENVMAT10(stage m)*t(n)G */
2561    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m10,
2562             NINE_APPLY_SWIZZLE(src, Y),
2563             NINE_APPLY_SWIZZLE(ureg_src(tmp), X));
2564
2565    /* v' = TextureCoordinates(stage m)v + D3DTSS_BUMPENVMAT01(stage m)*t(n)R */
2566    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m01,
2567             NINE_APPLY_SWIZZLE(src, X), ureg_src(texcoord));
2568    /* v' = v' + D3DTSS_BUMPENVMAT11(stage m)*t(n)G*/
2569    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m11,
2570             NINE_APPLY_SWIZZLE(src, Y),
2571             NINE_APPLY_SWIZZLE(ureg_src(tmp), Y));
2572
2573    /* Now the texture coordinates are in tmp.xy */
2574
2575    if (tx->insn.opcode == D3DSIO_TEXBEM) {
2576        ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_src(tmp), sample);
2577    } else if (tx->insn.opcode == D3DSIO_TEXBEML) {
2578        /* t(m)RGBA = t(m)RGBA * [(t(n)B * D3DTSS_BUMPENVLSCALE(stage m)) + D3DTSS_BUMPENVLOFFSET(stage m)] */
2579        ureg_TEX(ureg, tmp, ps1x_sampler_type(tx->info, m), ureg_src(tmp), sample);
2580        ureg_MAD(ureg, tmp2, NINE_APPLY_SWIZZLE(src, Z),
2581                 bumpenvlscale, bumpenvloffset);
2582        ureg_MUL(ureg, dst, ureg_src(tmp), ureg_src(tmp2));
2583    }
2584
2585    tx->info->bumpenvmat_needed = 1;
2586
2587    return D3D_OK;
2588}
2589
2590DECL_SPECIAL(TEXREG2AR)
2591{
2592    struct ureg_program *ureg = tx->ureg;
2593    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2594    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2595    struct ureg_src sample;
2596    const int m = tx->insn.dst[0].idx;
2597    const int n = tx->insn.src[0].idx;
2598    assert(m >= 0 && m > n);
2599
2600    sample = ureg_DECL_sampler(ureg, m);
2601    tx->info->sampler_mask |= 1 << m;
2602    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_swizzle(src, NINE_SWIZZLE4(W,X,X,X)), sample);
2603
2604    return D3D_OK;
2605}
2606
2607DECL_SPECIAL(TEXREG2GB)
2608{
2609    struct ureg_program *ureg = tx->ureg;
2610    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2611    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2612    struct ureg_src sample;
2613    const int m = tx->insn.dst[0].idx;
2614    const int n = tx->insn.src[0].idx;
2615    assert(m >= 0 && m > n);
2616
2617    sample = ureg_DECL_sampler(ureg, m);
2618    tx->info->sampler_mask |= 1 << m;
2619    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_swizzle(src, NINE_SWIZZLE4(Y,Z,Z,Z)), sample);
2620
2621    return D3D_OK;
2622}
2623
2624DECL_SPECIAL(TEXM3x2PAD)
2625{
2626    return D3D_OK; /* this is just padding */
2627}
2628
2629DECL_SPECIAL(TEXM3x2TEX)
2630{
2631    struct ureg_program *ureg = tx->ureg;
2632    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2633    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2634    struct ureg_src sample;
2635    const int m = tx->insn.dst[0].idx - 1;
2636    const int n = tx->insn.src[0].idx;
2637    assert(m >= 0 && m > n);
2638
2639    tx_texcoord_alloc(tx, m);
2640    tx_texcoord_alloc(tx, m+1);
2641
2642    /* performs the matrix multiplication */
2643    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2644    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Y), tx->regs.vT[m+1], src);
2645
2646    sample = ureg_DECL_sampler(ureg, m + 1);
2647    tx->info->sampler_mask |= 1 << (m + 1);
2648    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m + 1), ureg_src(dst), sample);
2649
2650    return D3D_OK;
2651}
2652
2653DECL_SPECIAL(TEXM3x3PAD)
2654{
2655    return D3D_OK; /* this is just padding */
2656}
2657
2658DECL_SPECIAL(TEXM3x3SPEC)
2659{
2660    struct ureg_program *ureg = tx->ureg;
2661    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2662    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2663    struct ureg_src E = tx_src_param(tx, &tx->insn.src[1]);
2664    struct ureg_src sample;
2665    struct ureg_dst tmp;
2666    const int m = tx->insn.dst[0].idx - 2;
2667    const int n = tx->insn.src[0].idx;
2668    assert(m >= 0 && m > n);
2669
2670    tx_texcoord_alloc(tx, m);
2671    tx_texcoord_alloc(tx, m+1);
2672    tx_texcoord_alloc(tx, m+2);
2673
2674    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2675    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Y), tx->regs.vT[m+1], src);
2676    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Z), tx->regs.vT[m+2], src);
2677
2678    sample = ureg_DECL_sampler(ureg, m + 2);
2679    tx->info->sampler_mask |= 1 << (m + 2);
2680    tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_XYZ);
2681
2682    /* At this step, dst = N = (u', w', z').
2683     * We want dst to be the texture sampled at (u'', w'', z''), with
2684     * (u'', w'', z'') = 2 * (N.E / N.N) * N - E */
2685    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_src(dst), ureg_src(dst));
2686    ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
2687    /* at this step tmp.x = 1/N.N */
2688    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_src(dst), E);
2689    /* at this step tmp.y = N.E */
2690    ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y));
2691    /* at this step tmp.x = N.E/N.N */
2692    ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_imm1f(ureg, 2.0f));
2693    ureg_MUL(ureg, tmp, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_src(dst));
2694    /* at this step tmp.xyz = 2 * (N.E / N.N) * N */
2695    ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_negate(E));
2696    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m + 2), ureg_src(tmp), sample);
2697
2698    return D3D_OK;
2699}
2700
2701DECL_SPECIAL(TEXREG2RGB)
2702{
2703    struct ureg_program *ureg = tx->ureg;
2704    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2705    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2706    struct ureg_src sample;
2707    const int m = tx->insn.dst[0].idx;
2708    const int n = tx->insn.src[0].idx;
2709    assert(m >= 0 && m > n);
2710
2711    sample = ureg_DECL_sampler(ureg, m);
2712    tx->info->sampler_mask |= 1 << m;
2713    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), src, sample);
2714
2715    return D3D_OK;
2716}
2717
2718DECL_SPECIAL(TEXDP3TEX)
2719{
2720    struct ureg_program *ureg = tx->ureg;
2721    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2722    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2723    struct ureg_dst tmp;
2724    struct ureg_src sample;
2725    const int m = tx->insn.dst[0].idx;
2726    const int n = tx->insn.src[0].idx;
2727    assert(m >= 0 && m > n);
2728
2729    tx_texcoord_alloc(tx, m);
2730
2731    tmp = tx_scratch(tx);
2732    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2733    ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_YZ), ureg_imm1f(ureg, 0.0f));
2734
2735    sample = ureg_DECL_sampler(ureg, m);
2736    tx->info->sampler_mask |= 1 << m;
2737    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_src(tmp), sample);
2738
2739    return D3D_OK;
2740}
2741
2742DECL_SPECIAL(TEXM3x2DEPTH)
2743{
2744    struct ureg_program *ureg = tx->ureg;
2745    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2746    struct ureg_dst tmp;
2747    const int m = tx->insn.dst[0].idx - 1;
2748    const int n = tx->insn.src[0].idx;
2749    assert(m >= 0 && m > n);
2750
2751    tx_texcoord_alloc(tx, m);
2752    tx_texcoord_alloc(tx, m+1);
2753
2754    tmp = tx_scratch(tx);
2755
2756    /* performs the matrix multiplication */
2757    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2758    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), tx->regs.vT[m+1], src);
2759
2760    ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Z), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y));
2761    /* tmp.x = 'z', tmp.y = 'w', tmp.z = 1/'w'. */
2762    ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Z));
2763    /* res = 'w' == 0 ? 1.0 : z/w */
2764    ureg_CMP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_negate(ureg_abs(ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y))),
2765             ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_imm1f(ureg, 1.0f));
2766    /* replace the depth for depth testing with the result */
2767    tx->regs.oDepth = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_POSITION, 0,
2768                                              TGSI_WRITEMASK_Z, 0, 1);
2769    ureg_MOV(ureg, tx->regs.oDepth, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
2770    /* note that we write nothing to the destination, since it's disallowed to use it afterward */
2771    return D3D_OK;
2772}
2773
2774DECL_SPECIAL(TEXDP3)
2775{
2776    struct ureg_program *ureg = tx->ureg;
2777    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2778    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2779    const int m = tx->insn.dst[0].idx;
2780    const int n = tx->insn.src[0].idx;
2781    assert(m >= 0 && m > n);
2782
2783    tx_texcoord_alloc(tx, m);
2784
2785    ureg_DP3(ureg, dst, tx->regs.vT[m], src);
2786
2787    return D3D_OK;
2788}
2789
2790DECL_SPECIAL(TEXM3x3)
2791{
2792    struct ureg_program *ureg = tx->ureg;
2793    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2794    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2795    struct ureg_src sample;
2796    struct ureg_dst E, tmp;
2797    const int m = tx->insn.dst[0].idx - 2;
2798    const int n = tx->insn.src[0].idx;
2799    assert(m >= 0 && m > n);
2800
2801    tx_texcoord_alloc(tx, m);
2802    tx_texcoord_alloc(tx, m+1);
2803    tx_texcoord_alloc(tx, m+2);
2804
2805    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2806    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Y), tx->regs.vT[m+1], src);
2807    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Z), tx->regs.vT[m+2], src);
2808
2809    switch (tx->insn.opcode) {
2810    case D3DSIO_TEXM3x3:
2811        ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
2812        break;
2813    case D3DSIO_TEXM3x3TEX:
2814        sample = ureg_DECL_sampler(ureg, m + 2);
2815        tx->info->sampler_mask |= 1 << (m + 2);
2816        ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m + 2), ureg_src(dst), sample);
2817        break;
2818    case D3DSIO_TEXM3x3VSPEC:
2819        sample = ureg_DECL_sampler(ureg, m + 2);
2820        tx->info->sampler_mask |= 1 << (m + 2);
2821        E = tx_scratch(tx);
2822        tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_XYZ);
2823        ureg_MOV(ureg, ureg_writemask(E, TGSI_WRITEMASK_X), ureg_scalar(tx->regs.vT[m], TGSI_SWIZZLE_W));
2824        ureg_MOV(ureg, ureg_writemask(E, TGSI_WRITEMASK_Y), ureg_scalar(tx->regs.vT[m+1], TGSI_SWIZZLE_W));
2825        ureg_MOV(ureg, ureg_writemask(E, TGSI_WRITEMASK_Z), ureg_scalar(tx->regs.vT[m+2], TGSI_SWIZZLE_W));
2826        /* At this step, dst = N = (u', w', z').
2827         * We want dst to be the texture sampled at (u'', w'', z''), with
2828         * (u'', w'', z'') = 2 * (N.E / N.N) * N - E */
2829        ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_src(dst), ureg_src(dst));
2830        ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
2831        /* at this step tmp.x = 1/N.N */
2832        ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_src(dst), ureg_src(E));
2833        /* at this step tmp.y = N.E */
2834        ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y));
2835        /* at this step tmp.x = N.E/N.N */
2836        ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_imm1f(ureg, 2.0f));
2837        ureg_MUL(ureg, tmp, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_src(dst));
2838        /* at this step tmp.xyz = 2 * (N.E / N.N) * N */
2839        ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_negate(ureg_src(E)));
2840        ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m + 2), ureg_src(tmp), sample);
2841        break;
2842    default:
2843        return D3DERR_INVALIDCALL;
2844    }
2845    return D3D_OK;
2846}
2847
2848DECL_SPECIAL(TEXDEPTH)
2849{
2850    struct ureg_program *ureg = tx->ureg;
2851    struct ureg_dst r5;
2852    struct ureg_src r5r, r5g;
2853
2854    assert(tx->insn.dst[0].idx == 5); /* instruction must get r5 here */
2855
2856    /* we must replace the depth by r5.g == 0 ? 1.0f : r5.r/r5.g.
2857     * r5 won't be used afterward, thus we can use r5.ba */
2858    r5 = tx->regs.r[5];
2859    r5r = ureg_scalar(ureg_src(r5), TGSI_SWIZZLE_X);
2860    r5g = ureg_scalar(ureg_src(r5), TGSI_SWIZZLE_Y);
2861
2862    ureg_RCP(ureg, ureg_writemask(r5, TGSI_WRITEMASK_Z), r5g);
2863    ureg_MUL(ureg, ureg_writemask(r5, TGSI_WRITEMASK_X), r5r, ureg_scalar(ureg_src(r5), TGSI_SWIZZLE_Z));
2864    /* r5.r = r/g */
2865    ureg_CMP(ureg, ureg_writemask(r5, TGSI_WRITEMASK_X), ureg_negate(ureg_abs(r5g)),
2866             r5r, ureg_imm1f(ureg, 1.0f));
2867    /* replace the depth for depth testing with the result */
2868    tx->regs.oDepth = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_POSITION, 0,
2869                                              TGSI_WRITEMASK_Z, 0, 1);
2870    ureg_MOV(ureg, tx->regs.oDepth, r5r);
2871
2872    return D3D_OK;
2873}
2874
2875DECL_SPECIAL(BEM)
2876{
2877    struct ureg_program *ureg = tx->ureg;
2878    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2879    struct ureg_src src0 = tx_src_param(tx, &tx->insn.src[0]);
2880    struct ureg_src src1 = tx_src_param(tx, &tx->insn.src[1]);
2881    struct ureg_src m00, m01, m10, m11, c8m;
2882    const int m = tx->insn.dst[0].idx;
2883    struct ureg_dst tmp;
2884    /*
2885     * Bump-env-matrix:
2886     * 00 is X
2887     * 01 is Y
2888     * 10 is Z
2889     * 11 is W
2890     */
2891    c8m = nine_float_constant_src(tx, 8+m);
2892    m00 = NINE_APPLY_SWIZZLE(c8m, X);
2893    m01 = NINE_APPLY_SWIZZLE(c8m, Y);
2894    m10 = NINE_APPLY_SWIZZLE(c8m, Z);
2895    m11 = NINE_APPLY_SWIZZLE(c8m, W);
2896    /* dest.r = src0.r + D3DTSS_BUMPENVMAT00(stage n) * src1.r  */
2897    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m00,
2898             NINE_APPLY_SWIZZLE(src1, X), NINE_APPLY_SWIZZLE(src0, X));
2899    /* dest.r = dest.r + D3DTSS_BUMPENVMAT10(stage n) * src1.g; */
2900    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m10,
2901             NINE_APPLY_SWIZZLE(src1, Y), NINE_APPLY_SWIZZLE(ureg_src(tmp), X));
2902
2903    /* dest.g = src0.g + D3DTSS_BUMPENVMAT01(stage n) * src1.r */
2904    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m01,
2905             NINE_APPLY_SWIZZLE(src1, X), src0);
2906    /* dest.g = dest.g + D3DTSS_BUMPENVMAT11(stage n) * src1.g */
2907    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m11,
2908             NINE_APPLY_SWIZZLE(src1, Y), NINE_APPLY_SWIZZLE(ureg_src(tmp), Y));
2909    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XY), ureg_src(tmp));
2910
2911    tx->info->bumpenvmat_needed = 1;
2912
2913    return D3D_OK;
2914}
2915
2916DECL_SPECIAL(TEXLD)
2917{
2918    struct ureg_program *ureg = tx->ureg;
2919    unsigned target;
2920    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2921    struct ureg_src src[2] = {
2922        tx_src_param(tx, &tx->insn.src[0]),
2923        tx_src_param(tx, &tx->insn.src[1])
2924    };
2925    assert(tx->insn.src[1].idx >= 0 &&
2926           tx->insn.src[1].idx < ARRAY_SIZE(tx->sampler_targets));
2927    target = tx->sampler_targets[tx->insn.src[1].idx];
2928
2929    switch (tx->insn.flags) {
2930    case 0:
2931        ureg_TEX(ureg, dst, target, src[0], src[1]);
2932        break;
2933    case NINED3DSI_TEXLD_PROJECT:
2934        ureg_TXP(ureg, dst, target, src[0], src[1]);
2935        break;
2936    case NINED3DSI_TEXLD_BIAS:
2937        ureg_TXB(ureg, dst, target, src[0], src[1]);
2938        break;
2939    default:
2940        assert(0);
2941        return D3DERR_INVALIDCALL;
2942    }
2943    return D3D_OK;
2944}
2945
2946DECL_SPECIAL(TEXLD_14)
2947{
2948    struct ureg_program *ureg = tx->ureg;
2949    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2950    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2951    const unsigned s = tx->insn.dst[0].idx;
2952    const unsigned t = ps1x_sampler_type(tx->info, s);
2953
2954    tx->info->sampler_mask |= 1 << s;
2955    ureg_TEX(ureg, dst, t, src, ureg_DECL_sampler(ureg, s));
2956
2957    return D3D_OK;
2958}
2959
2960DECL_SPECIAL(TEX)
2961{
2962    struct ureg_program *ureg = tx->ureg;
2963    const unsigned s = tx->insn.dst[0].idx;
2964    const unsigned t = ps1x_sampler_type(tx->info, s);
2965    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2966    struct ureg_src src[2];
2967
2968    tx_texcoord_alloc(tx, s);
2969
2970    src[0] = tx->regs.vT[s];
2971    src[1] = ureg_DECL_sampler(ureg, s);
2972    tx->info->sampler_mask |= 1 << s;
2973
2974    TEX_with_ps1x_projection(tx, dst, t, src[0], src[1], s);
2975
2976    return D3D_OK;
2977}
2978
2979DECL_SPECIAL(TEXLDD)
2980{
2981    unsigned target;
2982    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2983    struct ureg_src src[4] = {
2984        tx_src_param(tx, &tx->insn.src[0]),
2985        tx_src_param(tx, &tx->insn.src[1]),
2986        tx_src_param(tx, &tx->insn.src[2]),
2987        tx_src_param(tx, &tx->insn.src[3])
2988    };
2989    assert(tx->insn.src[1].idx >= 0 &&
2990           tx->insn.src[1].idx < ARRAY_SIZE(tx->sampler_targets));
2991    target = tx->sampler_targets[tx->insn.src[1].idx];
2992
2993    ureg_TXD(tx->ureg, dst, target, src[0], src[2], src[3], src[1]);
2994    return D3D_OK;
2995}
2996
2997DECL_SPECIAL(TEXLDL)
2998{
2999    unsigned target;
3000    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
3001    struct ureg_src src[2] = {
3002       tx_src_param(tx, &tx->insn.src[0]),
3003       tx_src_param(tx, &tx->insn.src[1])
3004    };
3005    assert(tx->insn.src[1].idx >= 0 &&
3006           tx->insn.src[1].idx < ARRAY_SIZE(tx->sampler_targets));
3007    target = tx->sampler_targets[tx->insn.src[1].idx];
3008
3009    ureg_TXL(tx->ureg, dst, target, src[0], src[1]);
3010    return D3D_OK;
3011}
3012
3013DECL_SPECIAL(SETP)
3014{
3015    const unsigned cmp_op = sm1_insn_flags_to_tgsi_setop(tx->insn.flags);
3016    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
3017    struct ureg_src src[2] = {
3018       tx_src_param(tx, &tx->insn.src[0]),
3019       tx_src_param(tx, &tx->insn.src[1])
3020    };
3021    ureg_insn(tx->ureg, cmp_op, &dst, 1, src, 2, 0);
3022    return D3D_OK;
3023}
3024
3025DECL_SPECIAL(BREAKP)
3026{
3027    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
3028    ureg_IF(tx->ureg, src, tx_cond(tx));
3029    ureg_BRK(tx->ureg);
3030    tx_endcond(tx);
3031    ureg_ENDIF(tx->ureg);
3032    return D3D_OK;
3033}
3034
3035DECL_SPECIAL(PHASE)
3036{
3037    return D3D_OK; /* we don't care about phase */
3038}
3039
3040DECL_SPECIAL(COMMENT)
3041{
3042    return D3D_OK; /* nothing to do */
3043}
3044
3045
/* Builds one struct sm1_op_info initializer:
 *   o        D3DSIO_* opcode suffix
 *   t        TGSI_OPCODE_* suffix
 *   vv1,vv2  first pair of version bounds
 *   pv1,pv2  second pair of version bounds
 *   d, s, h  the three trailing members
 * NOTE(review): the initializer is positional; judging by how the struct is
 * used in this file the members are presumably sio, opcode, vert_version
 * {min,max}, frag_version {min,max}, ndst, nsrc and handler, in that order.
 * Confirm against the struct definition. */
#define _OPI(o,t,vv1,vv2,pv1,pv2,d,s,h) \
    { D3DSIO_##o, TGSI_OPCODE_##t, { vv1, vv2 }, { pv1, pv2, }, d, s, h }
3048
/* Table of the SM1 instructions known to the translator, one _OPI() entry
 * per (opcode, version range) combination.
 *
 * An opcode may appear several times with different version ranges (e.g.
 * SINCOS, TEX, TEXCOORD, EXPP). create_op_info_map() picks, per opcode, the
 * entry whose range contains the version of the shader being translated;
 * when several entries match, the one latest in the table is used.
 *
 * Entries with a NULL last argument have no dedicated handler and are
 * translated by NineTranslateInstruction_Generic() using the TGSI opcode of
 * the entry. */
static const struct sm1_op_info inst_table[] =
{
    _OPI(NOP, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(NOP)), /* 0 */
    _OPI(MOV, MOV, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL),
    _OPI(ADD, ADD, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 2 */
    _OPI(SUB, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(SUB)), /* 3 */
    _OPI(MAD, MAD, V(0,0), V(3,0), V(0,0), V(3,0), 1, 3, NULL), /* 4 */
    _OPI(MUL, MUL, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 5 */
    _OPI(RCP, RCP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(RCP)), /* 6 */
    _OPI(RSQ, RSQ, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(RSQ)), /* 7 */
    _OPI(DP3, DP3, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 8 */
    _OPI(DP4, DP4, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 9 */
    _OPI(MIN, MIN, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 10 */
    _OPI(MAX, MAX, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 11 */
    _OPI(SLT, SLT, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 12 */
    _OPI(SGE, SGE, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 13 */
    _OPI(EXP, EX2, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL), /* 14 */
    _OPI(LOG, LG2, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(LOG)), /* 15 */
    _OPI(LIT, LIT, V(0,0), V(3,0), V(0,0), V(0,0), 1, 1, SPECIAL(LIT)), /* 16 */
    _OPI(DST, DST, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 17 */
    _OPI(LRP, LRP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 3, NULL), /* 18 */
    _OPI(FRC, FRC, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL), /* 19 */

    _OPI(M4x4, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M4x4)),
    _OPI(M4x3, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M4x3)),
    _OPI(M3x4, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M3x4)),
    _OPI(M3x3, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M3x3)),
    _OPI(M3x2, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M3x2)),

    _OPI(CALL,    CAL,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(CALL)),
    _OPI(CALLNZ,  CAL,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 2, SPECIAL(CALLNZ)),
    _OPI(LOOP,    BGNLOOP, V(2,0), V(3,0), V(3,0), V(3,0), 0, 2, SPECIAL(LOOP)),
    _OPI(RET,     RET,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(RET)),
    _OPI(ENDLOOP, ENDLOOP, V(2,0), V(3,0), V(3,0), V(3,0), 0, 0, SPECIAL(ENDLOOP)),
    _OPI(LABEL,   NOP,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(LABEL)),

    _OPI(DCL, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(DCL)),

    _OPI(POW, POW, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(POW)),
    _OPI(CRS, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(XPD)), /* XXX: .w */
    _OPI(SGN, SSG, V(2,0), V(3,0), V(0,0), V(0,0), 1, 3, SPECIAL(SGN)), /* ignore src1,2 */
    _OPI(ABS, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(ABS)),
    _OPI(NRM, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(NRM)), /* NRM doesn't fit */

    _OPI(SINCOS, NOP, V(2,0), V(2,1), V(2,0), V(2,1), 1, 3, SPECIAL(SINCOS)),
    _OPI(SINCOS, NOP, V(3,0), V(3,0), V(3,0), V(3,0), 1, 1, SPECIAL(SINCOS)),

    /* More flow control */
    _OPI(REP,    NOP,    V(2,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(REP)),
    _OPI(ENDREP, NOP,    V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(ENDREP)),
    _OPI(IF,     IF,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(IF)),
    _OPI(IFC,    IF,     V(2,1), V(3,0), V(2,1), V(3,0), 0, 2, SPECIAL(IFC)),
    _OPI(ELSE,   ELSE,   V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(ELSE)),
    _OPI(ENDIF,  ENDIF,  V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(ENDIF)),
    _OPI(BREAK,  BRK,    V(2,1), V(3,0), V(2,1), V(3,0), 0, 0, NULL),
    _OPI(BREAKC, NOP,    V(2,1), V(3,0), V(2,1), V(3,0), 0, 2, SPECIAL(BREAKC)),
    /* we don't write to the address register, but a normal register (copied
     * when needed to the address register), thus we don't use ARR */
    _OPI(MOVA, MOV, V(2,0), V(3,0), V(0,0), V(0,0), 1, 1, NULL),

    _OPI(DEFB, NOP, V(0,0), V(3,0) , V(0,0), V(3,0) , 1, 0, SPECIAL(DEFB)),
    _OPI(DEFI, NOP, V(0,0), V(3,0) , V(0,0), V(3,0) , 1, 0, SPECIAL(DEFI)),

    _OPI(TEXCOORD,     NOP, V(0,0), V(0,0), V(0,0), V(1,3), 1, 0, SPECIAL(TEXCOORD)),
    _OPI(TEXCOORD,     MOV, V(0,0), V(0,0), V(1,4), V(1,4), 1, 1, SPECIAL(TEXCOORD_ps14)),
    _OPI(TEXKILL,      KILL_IF, V(0,0), V(0,0), V(0,0), V(3,0), 1, 0, SPECIAL(TEXKILL)),
    _OPI(TEX,          TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 0, SPECIAL(TEX)),
    _OPI(TEX,          TEX, V(0,0), V(0,0), V(1,4), V(1,4), 1, 1, SPECIAL(TEXLD_14)),
    _OPI(TEX,          TEX, V(0,0), V(0,0), V(2,0), V(3,0), 1, 2, SPECIAL(TEXLD)),
    _OPI(TEXBEM,       TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXBEM)),
    _OPI(TEXBEML,      TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXBEM)),
    _OPI(TEXREG2AR,    TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXREG2AR)),
    _OPI(TEXREG2GB,    TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXREG2GB)),
    _OPI(TEXM3x2PAD,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x2PAD)),
    _OPI(TEXM3x2TEX,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x2TEX)),
    _OPI(TEXM3x3PAD,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x3PAD)),
    _OPI(TEXM3x3TEX,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x3)),
    _OPI(TEXM3x3SPEC,  TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 2, SPECIAL(TEXM3x3SPEC)),
    _OPI(TEXM3x3VSPEC, TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x3)),

    _OPI(EXPP, EXP, V(0,0), V(1,1), V(0,0), V(0,0), 1, 1, NULL),
    _OPI(EXPP, EX2, V(2,0), V(3,0), V(0,0), V(0,0), 1, 1, NULL),
    _OPI(LOGP, LG2, V(0,0), V(3,0), V(0,0), V(0,0), 1, 1, SPECIAL(LOG)),
    _OPI(CND,  NOP, V(0,0), V(0,0), V(0,0), V(1,4), 1, 3, SPECIAL(CND)),

    _OPI(DEF, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 0, SPECIAL(DEF)),

    /* More tex stuff */
    _OPI(TEXREG2RGB,   TEX, V(0,0), V(0,0), V(1,2), V(1,3), 1, 1, SPECIAL(TEXREG2RGB)),
    _OPI(TEXDP3TEX,    TEX, V(0,0), V(0,0), V(1,2), V(1,3), 1, 1, SPECIAL(TEXDP3TEX)),
    _OPI(TEXM3x2DEPTH, TEX, V(0,0), V(0,0), V(1,3), V(1,3), 1, 1, SPECIAL(TEXM3x2DEPTH)),
    _OPI(TEXDP3,       TEX, V(0,0), V(0,0), V(1,2), V(1,3), 1, 1, SPECIAL(TEXDP3)),
    _OPI(TEXM3x3,      TEX, V(0,0), V(0,0), V(1,2), V(1,3), 1, 1, SPECIAL(TEXM3x3)),
    _OPI(TEXDEPTH,     TEX, V(0,0), V(0,0), V(1,4), V(1,4), 1, 0, SPECIAL(TEXDEPTH)),

    /* Misc */
    _OPI(CMP,    CMP,  V(0,0), V(0,0), V(1,2), V(3,0), 1, 3, SPECIAL(CMP)), /* reversed */
    _OPI(BEM,    NOP,  V(0,0), V(0,0), V(1,4), V(1,4), 1, 2, SPECIAL(BEM)),
    _OPI(DP2ADD, NOP,  V(0,0), V(0,0), V(2,0), V(3,0), 1, 3, SPECIAL(DP2ADD)),
    _OPI(DSX,    DDX,  V(0,0), V(0,0), V(2,1), V(3,0), 1, 1, NULL),
    _OPI(DSY,    DDY,  V(0,0), V(0,0), V(2,1), V(3,0), 1, 1, NULL),
    _OPI(TEXLDD, TXD,  V(0,0), V(0,0), V(2,1), V(3,0), 1, 4, SPECIAL(TEXLDD)),
    _OPI(SETP,   NOP,  V(0,0), V(3,0), V(2,1), V(3,0), 1, 2, SPECIAL(SETP)),
    _OPI(TEXLDL, TXL,  V(3,0), V(3,0), V(3,0), V(3,0), 1, 2, SPECIAL(TEXLDL)),
    _OPI(BREAKP, BRK,  V(0,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(BREAKP))
};
3155
/* Stand-alone op info for D3DSIO_PHASE. It is not part of inst_table;
 * sm1_parse_instruction() selects it directly when the opcode value lies
 * outside the range covered by op_info_map. */
static const struct sm1_op_info inst_phase =
    _OPI(PHASE, NOP, V(0,0), V(0,0), V(1,4), V(1,4), 0, 0, SPECIAL(PHASE));
3158
/* Stand-alone op info for D3DSIO_COMMENT. It is not part of inst_table;
 * sm1_parse_instruction() selects it directly when the opcode value lies
 * outside the range covered by op_info_map. */
static const struct sm1_op_info inst_comment =
    _OPI(COMMENT, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(COMMENT));
3161
3162static void
3163create_op_info_map(struct shader_translator *tx)
3164{
3165    const unsigned version = (tx->version.major << 8) | tx->version.minor;
3166    unsigned i;
3167
3168    for (i = 0; i < ARRAY_SIZE(tx->op_info_map); ++i)
3169        tx->op_info_map[i] = -1;
3170
3171    if (tx->processor == PIPE_SHADER_VERTEX) {
3172        for (i = 0; i < ARRAY_SIZE(inst_table); ++i) {
3173            assert(inst_table[i].sio < ARRAY_SIZE(tx->op_info_map));
3174            if (inst_table[i].vert_version.min <= version &&
3175                inst_table[i].vert_version.max >= version)
3176                tx->op_info_map[inst_table[i].sio] = i;
3177        }
3178    } else {
3179        for (i = 0; i < ARRAY_SIZE(inst_table); ++i) {
3180            assert(inst_table[i].sio < ARRAY_SIZE(tx->op_info_map));
3181            if (inst_table[i].frag_version.min <= version &&
3182                inst_table[i].frag_version.max >= version)
3183                tx->op_info_map[inst_table[i].sio] = i;
3184        }
3185    }
3186}
3187
3188static inline HRESULT
3189NineTranslateInstruction_Generic(struct shader_translator *tx)
3190{
3191    struct ureg_dst dst[1];
3192    struct ureg_src src[4];
3193    unsigned i;
3194
3195    for (i = 0; i < tx->insn.ndst && i < ARRAY_SIZE(dst); ++i)
3196        dst[i] = tx_dst_param(tx, &tx->insn.dst[i]);
3197    for (i = 0; i < tx->insn.nsrc && i < ARRAY_SIZE(src); ++i)
3198        src[i] = tx_src_param(tx, &tx->insn.src[i]);
3199
3200    ureg_insn(tx->ureg, tx->insn.info->opcode,
3201              dst, tx->insn.ndst,
3202              src, tx->insn.nsrc, 0);
3203    return D3D_OK;
3204}
3205
3206static inline DWORD
3207TOKEN_PEEK(struct shader_translator *tx)
3208{
3209    return *(tx->parse);
3210}
3211
3212static inline DWORD
3213TOKEN_NEXT(struct shader_translator *tx)
3214{
3215    return *(tx->parse)++;
3216}
3217
3218static inline void
3219TOKEN_JUMP(struct shader_translator *tx)
3220{
3221    if (tx->parse_next && tx->parse != tx->parse_next) {
3222        WARN("parse(%p) != parse_next(%p) !\n", tx->parse, tx->parse_next);
3223        tx->parse = tx->parse_next;
3224    }
3225}
3226
3227static inline boolean
3228sm1_parse_eof(struct shader_translator *tx)
3229{
3230    return TOKEN_PEEK(tx) == NINED3DSP_END;
3231}
3232
3233static void
3234sm1_read_version(struct shader_translator *tx)
3235{
3236    const DWORD tok = TOKEN_NEXT(tx);
3237
3238    tx->version.major = D3DSHADER_VERSION_MAJOR(tok);
3239    tx->version.minor = D3DSHADER_VERSION_MINOR(tok);
3240
3241    switch (tok >> 16) {
3242    case NINED3D_SM1_VS: tx->processor = PIPE_SHADER_VERTEX; break;
3243    case NINED3D_SM1_PS: tx->processor = PIPE_SHADER_FRAGMENT; break;
3244    default:
3245       DBG("Invalid shader type: %x\n", tok);
3246       tx->processor = ~0;
3247       break;
3248    }
3249}
3250
3251/* This is just to check if we parsed the instruction properly. */
3252static void
3253sm1_parse_get_skip(struct shader_translator *tx)
3254{
3255    const DWORD tok = TOKEN_PEEK(tx);
3256
3257    if (tx->version.major >= 2) {
3258        tx->parse_next = tx->parse + 1 /* this */ +
3259            ((tok & D3DSI_INSTLENGTH_MASK) >> D3DSI_INSTLENGTH_SHIFT);
3260    } else {
3261        tx->parse_next = NULL; /* TODO: determine from param count */
3262    }
3263}
3264
3265static void
3266sm1_print_comment(const char *comment, UINT size)
3267{
3268    if (!size)
3269        return;
3270    /* TODO */
3271}
3272
3273static void
3274sm1_parse_comments(struct shader_translator *tx, BOOL print)
3275{
3276    DWORD tok = TOKEN_PEEK(tx);
3277
3278    while ((tok & D3DSI_OPCODE_MASK) == D3DSIO_COMMENT)
3279    {
3280        const char *comment = "";
3281        UINT size = (tok & D3DSI_COMMENTSIZE_MASK) >> D3DSI_COMMENTSIZE_SHIFT;
3282        tx->parse += size + 1;
3283
3284        if (print)
3285            sm1_print_comment(comment, size);
3286
3287        tok = TOKEN_PEEK(tx);
3288    }
3289}
3290
3291static void
3292sm1_parse_get_param(struct shader_translator *tx, DWORD *reg, DWORD *rel)
3293{
3294    *reg = TOKEN_NEXT(tx);
3295
3296    if (*reg & D3DSHADER_ADDRMODE_RELATIVE)
3297    {
3298        if (tx->version.major < 2)
3299            *rel = (1 << 31) |
3300                ((D3DSPR_ADDR << D3DSP_REGTYPE_SHIFT2) & D3DSP_REGTYPE_MASK2) |
3301                ((D3DSPR_ADDR << D3DSP_REGTYPE_SHIFT)  & D3DSP_REGTYPE_MASK) |
3302                D3DSP_NOSWIZZLE;
3303        else
3304            *rel = TOKEN_NEXT(tx);
3305    }
3306}
3307
3308static void
3309sm1_parse_dst_param(struct sm1_dst_param *dst, DWORD tok)
3310{
3311    int8_t shift;
3312    dst->file =
3313        (tok & D3DSP_REGTYPE_MASK)  >> D3DSP_REGTYPE_SHIFT |
3314        (tok & D3DSP_REGTYPE_MASK2) >> D3DSP_REGTYPE_SHIFT2;
3315    dst->type = TGSI_RETURN_TYPE_FLOAT;
3316    dst->idx = tok & D3DSP_REGNUM_MASK;
3317    dst->rel = NULL;
3318    dst->mask = (tok & NINED3DSP_WRITEMASK_MASK) >> NINED3DSP_WRITEMASK_SHIFT;
3319    dst->mod = (tok & D3DSP_DSTMOD_MASK) >> D3DSP_DSTMOD_SHIFT;
3320    shift = (tok & D3DSP_DSTSHIFT_MASK) >> D3DSP_DSTSHIFT_SHIFT;
3321    dst->shift = (shift & 0x7) - (shift & 0x8);
3322}
3323
3324static void
3325sm1_parse_src_param(struct sm1_src_param *src, DWORD tok)
3326{
3327    src->file =
3328        ((tok & D3DSP_REGTYPE_MASK)  >> D3DSP_REGTYPE_SHIFT) |
3329        ((tok & D3DSP_REGTYPE_MASK2) >> D3DSP_REGTYPE_SHIFT2);
3330    src->type = TGSI_RETURN_TYPE_FLOAT;
3331    src->idx = tok & D3DSP_REGNUM_MASK;
3332    src->rel = NULL;
3333    src->swizzle = (tok & D3DSP_SWIZZLE_MASK) >> D3DSP_SWIZZLE_SHIFT;
3334    src->mod = (tok & D3DSP_SRCMOD_MASK) >> D3DSP_SRCMOD_SHIFT;
3335
3336    switch (src->file) {
3337    case D3DSPR_CONST2: src->file = D3DSPR_CONST; src->idx += 2048; break;
3338    case D3DSPR_CONST3: src->file = D3DSPR_CONST; src->idx += 4096; break;
3339    case D3DSPR_CONST4: src->file = D3DSPR_CONST; src->idx += 6144; break;
3340    default:
3341        break;
3342    }
3343}
3344
3345static void
3346sm1_parse_immediate(struct shader_translator *tx,
3347                    struct sm1_src_param *imm)
3348{
3349    imm->file = NINED3DSPR_IMMEDIATE;
3350    imm->idx = INT_MIN;
3351    imm->rel = NULL;
3352    imm->swizzle = NINED3DSP_NOSWIZZLE;
3353    imm->mod = 0;
3354    switch (tx->insn.opcode) {
3355    case D3DSIO_DEF:
3356        imm->type = NINED3DSPTYPE_FLOAT4;
3357        memcpy(&imm->imm.d[0], tx->parse, 4 * sizeof(DWORD));
3358        tx->parse += 4;
3359        break;
3360    case D3DSIO_DEFI:
3361        imm->type = NINED3DSPTYPE_INT4;
3362        memcpy(&imm->imm.d[0], tx->parse, 4 * sizeof(DWORD));
3363        tx->parse += 4;
3364        break;
3365    case D3DSIO_DEFB:
3366        imm->type = NINED3DSPTYPE_BOOL;
3367        memcpy(&imm->imm.d[0], tx->parse, 1 * sizeof(DWORD));
3368        tx->parse += 1;
3369        break;
3370    default:
3371       assert(0);
3372       break;
3373    }
3374}
3375
3376static void
3377sm1_read_dst_param(struct shader_translator *tx,
3378                   struct sm1_dst_param *dst,
3379                   struct sm1_src_param *rel)
3380{
3381    DWORD tok_dst, tok_rel = 0;
3382
3383    sm1_parse_get_param(tx, &tok_dst, &tok_rel);
3384    sm1_parse_dst_param(dst, tok_dst);
3385    if (tok_dst & D3DSHADER_ADDRMODE_RELATIVE) {
3386        sm1_parse_src_param(rel, tok_rel);
3387        dst->rel = rel;
3388    }
3389}
3390
3391static void
3392sm1_read_src_param(struct shader_translator *tx,
3393                   struct sm1_src_param *src,
3394                   struct sm1_src_param *rel)
3395{
3396    DWORD tok_src, tok_rel = 0;
3397
3398    sm1_parse_get_param(tx, &tok_src, &tok_rel);
3399    sm1_parse_src_param(src, tok_src);
3400    if (tok_src & D3DSHADER_ADDRMODE_RELATIVE) {
3401        assert(rel);
3402        sm1_parse_src_param(rel, tok_rel);
3403        src->rel = rel;
3404    }
3405}
3406
3407static void
3408sm1_read_semantic(struct shader_translator *tx,
3409                  struct sm1_semantic *sem)
3410{
3411    const DWORD tok_usg = TOKEN_NEXT(tx);
3412    const DWORD tok_dst = TOKEN_NEXT(tx);
3413
3414    sem->sampler_type = (tok_usg & D3DSP_TEXTURETYPE_MASK) >> D3DSP_TEXTURETYPE_SHIFT;
3415    sem->usage = (tok_usg & D3DSP_DCL_USAGE_MASK) >> D3DSP_DCL_USAGE_SHIFT;
3416    sem->usage_idx = (tok_usg & D3DSP_DCL_USAGEINDEX_MASK) >> D3DSP_DCL_USAGEINDEX_SHIFT;
3417
3418    sm1_parse_dst_param(&sem->reg, tok_dst);
3419}
3420
/* Parses one instruction from the token stream into tx->insn and translates
 * it.
 *
 * Steps: skip leading comments, remember where the next instruction should
 * start, decode the opcode token, look up the matching op info, read the
 * destination / predicate / source parameters (plus the immediate of
 * DEF/DEFI/DEFB), then call the instruction's handler, or the generic
 * translation when it has none.
 *
 * A handler result other than D3D_OK sets tx->failure. An unknown opcode or
 * an opcode outside its supported version range is only logged; tx->failure
 * is not set on those two paths. */
static void
sm1_parse_instruction(struct shader_translator *tx)
{
    struct sm1_instruction *insn = &tx->insn;
    HRESULT hr;
    DWORD tok;
    const struct sm1_op_info *info = NULL;
    unsigned i;

    sm1_parse_comments(tx, TRUE);
    sm1_parse_get_skip(tx);

    tok = TOKEN_NEXT(tx);

    /* split the opcode token into its fields */
    insn->opcode = tok & D3DSI_OPCODE_MASK;
    insn->flags = (tok & NINED3DSIO_OPCODE_FLAGS_MASK) >> NINED3DSIO_OPCODE_FLAGS_SHIFT;
    insn->coissue = !!(tok & D3DSI_COISSUE);
    insn->predicated = !!(tok & NINED3DSHADER_INST_PREDICATED);

    /* opcodes within the map are resolved through it; PHASE and COMMENT are
     * only considered for opcode values beyond the map */
    if (insn->opcode < ARRAY_SIZE(tx->op_info_map)) {
        int k = tx->op_info_map[insn->opcode];
        if (k >= 0) {
            assert(k < ARRAY_SIZE(inst_table));
            info = &inst_table[k];
        }
    } else {
       if (insn->opcode == D3DSIO_PHASE)   info = &inst_phase;
       if (insn->opcode == D3DSIO_COMMENT) info = &inst_comment;
    }
    if (!info) {
       DBG("illegal or unhandled opcode: %08x\n", insn->opcode);
       /* skip to the next instruction when its position is known */
       TOKEN_JUMP(tx);
       return;
    }
    insn->info = info;
    insn->ndst = info->ndst;
    insn->nsrc = info->nsrc;

    /* check version */
    {
        unsigned min = IS_VS ? info->vert_version.min : info->frag_version.min;
        unsigned max = IS_VS ? info->vert_version.max : info->frag_version.max;
        unsigned ver = (tx->version.major << 8) | tx->version.minor;
        if (ver < min || ver > max) {
            DBG("opcode not supported in this shader version: %x <= %x <= %x\n",
                min, ver, max);
            /* note: returns without TOKEN_JUMP, only the opcode token has
             * been consumed at this point */
            return;
        }
    }

    /* token order in the stream: destinations, predicate (if any), sources */
    for (i = 0; i < insn->ndst; ++i)
        sm1_read_dst_param(tx, &insn->dst[i], &insn->dst_rel[i]);
    if (insn->predicated)
        sm1_read_src_param(tx, &insn->pred, NULL);
    for (i = 0; i < insn->nsrc; ++i)
        sm1_read_src_param(tx, &insn->src[i], &insn->src_rel[i]);

    /* parse here so we can dump them before processing */
    if (insn->opcode == D3DSIO_DEF ||
        insn->opcode == D3DSIO_DEFI ||
        insn->opcode == D3DSIO_DEFB)
        sm1_parse_immediate(tx, &tx->insn.src[0]);

    sm1_dump_instruction(insn, tx->cond_depth + tx->loop_depth);
    sm1_instruction_check(insn);

    if (insn->predicated) {
        tx->predicated_activated = true;
        /* the two predicate temporaries are declared on first use only */
        if (ureg_dst_is_undef(tx->regs.predicate_tmp)) {
            tx->regs.predicate_tmp = ureg_DECL_temporary(tx->ureg);
            tx->regs.predicate_dst = ureg_DECL_temporary(tx->ureg);
        }
    }

    /* a NULL handler means the instruction maps directly to a TGSI opcode */
    if (info->handler)
        hr = info->handler(tx);
    else
        hr = NineTranslateInstruction_Generic(tx);
    tx_apply_dst0_modifiers(tx);

    if (insn->predicated) {
        tx->predicated_activated = false;
        /* TODO: predicate might be allowed on outputs,
         * which cannot be src. Workaround it. */
        /* NOTE(review): presumably keeps the previous predicate_dst value
         * wherever the predicate is not set and takes predicate_tmp
         * elsewhere; confirm against the TGSI CMP semantics. */
        ureg_CMP(tx->ureg, tx->regs.predicate_dst,
                 ureg_negate(tx_src_param(tx, &insn->pred)),
                 ureg_src(tx->regs.predicate_tmp),
                 ureg_src(tx->regs.predicate_dst));
    }

    if (hr != D3D_OK)
        tx->failure = TRUE;
    tx->num_scratch = 0; /* reset */

    /* resynchronize with the recorded start of the next instruction */
    TOKEN_JUMP(tx);
}
3517
/* Capability query shorthands. Both expand to a call on a variable named
 * `screen` that must be in scope at the point of use; GET_SHADER_CAP
 * additionally reads `info->type` from the enclosing scope to select the
 * shader stage being queried. */
#define GET_CAP(n) screen->get_param( \
      screen, PIPE_CAP_##n)
#define GET_SHADER_CAP(n) screen->get_shader_param( \
      screen, info->type, PIPE_SHADER_CAP_##n)
3522
3523static HRESULT
3524tx_ctor(struct shader_translator *tx, struct pipe_screen *screen, struct nine_shader_info *info)
3525{
3526    unsigned i;
3527
3528    memset(tx, 0, sizeof(*tx));
3529
3530    tx->info = info;
3531
3532    tx->byte_code = info->byte_code;
3533    tx->parse = info->byte_code;
3534
3535    for (i = 0; i < ARRAY_SIZE(info->input_map); ++i)
3536        info->input_map[i] = NINE_DECLUSAGE_NONE;
3537    info->num_inputs = 0;
3538
3539    info->position_t = FALSE;
3540    info->point_size = FALSE;
3541
3542    memset(tx->slots_used, 0, sizeof(tx->slots_used));
3543    memset(info->int_slots_used, 0, sizeof(info->int_slots_used));
3544    memset(info->bool_slots_used, 0, sizeof(info->bool_slots_used));
3545
3546    tx->info->const_float_slots = 0;
3547    tx->info->const_int_slots = 0;
3548    tx->info->const_bool_slots = 0;
3549
3550    info->sampler_mask = 0x0;
3551    info->rt_mask = 0x0;
3552
3553    info->lconstf.data = NULL;
3554    info->lconstf.ranges = NULL;
3555
3556    info->bumpenvmat_needed = 0;
3557
3558    for (i = 0; i < ARRAY_SIZE(tx->regs.rL); ++i) {
3559        tx->regs.rL[i] = ureg_dst_undef();
3560    }
3561    tx->regs.address = ureg_dst_undef();
3562    tx->regs.a0 = ureg_dst_undef();
3563    tx->regs.p = ureg_dst_undef();
3564    tx->regs.oDepth = ureg_dst_undef();
3565    tx->regs.vPos = ureg_src_undef();
3566    tx->regs.vFace = ureg_src_undef();
3567    for (i = 0; i < ARRAY_SIZE(tx->regs.o); ++i)
3568        tx->regs.o[i] = ureg_dst_undef();
3569    for (i = 0; i < ARRAY_SIZE(tx->regs.oCol); ++i)
3570        tx->regs.oCol[i] = ureg_dst_undef();
3571    for (i = 0; i < ARRAY_SIZE(tx->regs.vC); ++i)
3572        tx->regs.vC[i] = ureg_src_undef();
3573    for (i = 0; i < ARRAY_SIZE(tx->regs.vT); ++i)
3574        tx->regs.vT[i] = ureg_src_undef();
3575
3576    sm1_read_version(tx);
3577
3578    info->version = (tx->version.major << 4) | tx->version.minor;
3579
3580    tx->num_outputs = 0;
3581
3582    create_op_info_map(tx);
3583
3584    tx->ureg = ureg_create(info->type);
3585    if (!tx->ureg) {
3586        return E_OUTOFMEMORY;
3587    }
3588
3589    tx->native_integers = GET_SHADER_CAP(INTEGERS);
3590    tx->inline_subroutines = !GET_SHADER_CAP(SUBROUTINES);
3591    tx->want_texcoord = GET_CAP(TGSI_TEXCOORD);
3592    tx->shift_wpos = !GET_CAP(TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
3593    tx->texcoord_sn = tx->want_texcoord ?
3594        TGSI_SEMANTIC_TEXCOORD : TGSI_SEMANTIC_GENERIC;
3595    tx->wpos_is_sysval = GET_CAP(TGSI_FS_POSITION_IS_SYSVAL);
3596    tx->face_is_sysval_integer = GET_CAP(TGSI_FS_FACE_IS_INTEGER_SYSVAL);
3597
3598    if (IS_VS) {
3599        tx->num_constf_allowed = NINE_MAX_CONST_F;
3600    } else if (tx->version.major < 2) {/* IS_PS v1 */
3601        tx->num_constf_allowed = 8;
3602    } else if (tx->version.major == 2) {/* IS_PS v2 */
3603        tx->num_constf_allowed = 32;
3604    } else {/* IS_PS v3 */
3605        tx->num_constf_allowed = NINE_MAX_CONST_F_PS3;
3606    }
3607
3608    if (tx->version.major < 2) {
3609        tx->num_consti_allowed = 0;
3610        tx->num_constb_allowed = 0;
3611    } else {
3612        tx->num_consti_allowed = NINE_MAX_CONST_I;
3613        tx->num_constb_allowed = NINE_MAX_CONST_B;
3614    }
3615
3616    if (info->swvp_on && tx->version.major >= 2) {
3617        tx->num_constf_allowed = 8192;
3618        tx->num_consti_allowed = 2048;
3619        tx->num_constb_allowed = 2048;
3620    }
3621
3622    /* VS must always write position. Declare it here to make it the 1st output.
3623     * (Some drivers like nv50 are buggy and rely on that.)
3624     */
3625    if (IS_VS) {
3626        tx->regs.oPos = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_POSITION, 0);
3627    } else {
3628        ureg_property(tx->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN, TGSI_FS_COORD_ORIGIN_UPPER_LEFT);
3629        if (!tx->shift_wpos)
3630            ureg_property(tx->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER, TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
3631    }
3632
3633    tx->mul_zero_wins = GET_CAP(TGSI_MUL_ZERO_WINS);
3634    if (tx->mul_zero_wins)
3635       ureg_property(tx->ureg, TGSI_PROPERTY_MUL_ZERO_WINS, 1);
3636
3637    /* Add additional definition of constants */
3638    if (info->add_constants_defs.c_combination) {
3639        unsigned i;
3640
3641        assert(info->add_constants_defs.int_const_added);
3642        assert(info->add_constants_defs.bool_const_added);
3643        /* We only add constants that are used by the shader
3644         * and that are not defined in the shader */
3645        for (i = 0; i < NINE_MAX_CONST_I; ++i) {
3646            if ((*info->add_constants_defs.int_const_added)[i]) {
3647                DBG("Defining const i%i : { %i %i %i %i }\n", i,
3648                    info->add_constants_defs.c_combination->const_i[i][0],
3649                    info->add_constants_defs.c_combination->const_i[i][1],
3650                    info->add_constants_defs.c_combination->const_i[i][2],
3651                    info->add_constants_defs.c_combination->const_i[i][3]);
3652                tx_set_lconsti(tx, i, info->add_constants_defs.c_combination->const_i[i]);
3653            }
3654        }
3655        for (i = 0; i < NINE_MAX_CONST_B; ++i) {
3656            if ((*info->add_constants_defs.bool_const_added)[i]) {
3657                DBG("Defining const b%i : %i\n", i, (int)(info->add_constants_defs.c_combination->const_b[i] != 0));
3658                tx_set_lconstb(tx, i, info->add_constants_defs.c_combination->const_b[i]);
3659            }
3660        }
3661    }
3662    return D3D_OK;
3663}
3664
3665static void
3666tx_dtor(struct shader_translator *tx)
3667{
3668    if (tx->slot_map)
3669        FREE(tx->slot_map);
3670    if (tx->num_inst_labels)
3671        FREE(tx->inst_labels);
3672    FREE(tx->lconstf);
3673    FREE(tx->regs.r);
3674    FREE(tx);
3675}
3676
3677/* CONST[0].xyz = width/2, -height/2, zmax-zmin
3678 * CONST[1].xyz = x+width/2, y+height/2, zmin */
3679static void
3680shader_add_vs_viewport_transform(struct shader_translator *tx)
3681{
3682    struct ureg_program *ureg = tx->ureg;
3683    struct ureg_src c0 = ureg_src_register(TGSI_FILE_CONSTANT, 0);
3684    struct ureg_src c1 = ureg_src_register(TGSI_FILE_CONSTANT, 1);
3685    /* struct ureg_dst pos_tmp = ureg_DECL_temporary(ureg);*/
3686
3687    c0 = ureg_src_dimension(c0, 4);
3688    c1 = ureg_src_dimension(c1, 4);
3689    /* TODO: find out when we need to apply the viewport transformation or not.
3690     * Likely will be XYZ vs XYZRHW in vdecl_out
3691     * ureg_MUL(ureg, ureg_writemask(pos_tmp, TGSI_WRITEMASK_XYZ), ureg_src(tx->regs.oPos), c0);
3692     * ureg_ADD(ureg, ureg_writemask(tx->regs.oPos_out, TGSI_WRITEMASK_XYZ), ureg_src(pos_tmp), c1);
3693     */
3694    ureg_MOV(ureg, ureg_writemask(tx->regs.oPos_out, TGSI_WRITEMASK_XYZ), ureg_src(tx->regs.oPos));
3695}
3696
3697static void
3698shader_add_ps_fog_stage(struct shader_translator *tx, struct ureg_src src_col)
3699{
3700    struct ureg_program *ureg = tx->ureg;
3701    struct ureg_dst oCol0 = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
3702    struct ureg_src fog_end, fog_coeff, fog_density, fog_params;
3703    struct ureg_src fog_vs, fog_color;
3704    struct ureg_dst fog_factor, depth;
3705
3706    if (!tx->info->fog_enable) {
3707        ureg_MOV(ureg, oCol0, src_col);
3708        return;
3709    }
3710
3711    if (tx->info->fog_mode != D3DFOG_NONE) {
3712        depth = tx_scratch_scalar(tx);
3713        /* Depth used for fog is perspective interpolated */
3714        ureg_RCP(ureg, depth, ureg_scalar(nine_get_position_input(tx), TGSI_SWIZZLE_W));
3715        ureg_MUL(ureg, depth, ureg_src(depth), ureg_scalar(nine_get_position_input(tx), TGSI_SWIZZLE_Z));
3716    }
3717
3718    fog_color = nine_float_constant_src(tx, 32);
3719    fog_params = nine_float_constant_src(tx, 33);
3720    fog_factor = tx_scratch_scalar(tx);
3721
3722    if (tx->info->fog_mode == D3DFOG_LINEAR) {
3723        fog_end = NINE_APPLY_SWIZZLE(fog_params, X);
3724        fog_coeff = NINE_APPLY_SWIZZLE(fog_params, Y);
3725        ureg_ADD(ureg, fog_factor, fog_end, ureg_negate(ureg_src(depth)));
3726        ureg_MUL(ureg, ureg_saturate(fog_factor), tx_src_scalar(fog_factor), fog_coeff);
3727    } else if (tx->info->fog_mode == D3DFOG_EXP) {
3728        fog_density = NINE_APPLY_SWIZZLE(fog_params, X);
3729        ureg_MUL(ureg, fog_factor, ureg_src(depth), fog_density);
3730        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), ureg_imm1f(ureg, -1.442695f));
3731        ureg_EX2(ureg, fog_factor, tx_src_scalar(fog_factor));
3732    } else if (tx->info->fog_mode == D3DFOG_EXP2) {
3733        fog_density = NINE_APPLY_SWIZZLE(fog_params, X);
3734        ureg_MUL(ureg, fog_factor, ureg_src(depth), fog_density);
3735        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), tx_src_scalar(fog_factor));
3736        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), ureg_imm1f(ureg, -1.442695f));
3737        ureg_EX2(ureg, fog_factor, tx_src_scalar(fog_factor));
3738    } else {
3739        fog_vs = ureg_scalar(ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 16,
3740                                            TGSI_INTERPOLATE_PERSPECTIVE),
3741                                            TGSI_SWIZZLE_X);
3742        ureg_MOV(ureg, fog_factor, fog_vs);
3743    }
3744
3745    ureg_LRP(ureg, ureg_writemask(oCol0, TGSI_WRITEMASK_XYZ),
3746             tx_src_scalar(fog_factor), src_col, fog_color);
3747    ureg_MOV(ureg, ureg_writemask(oCol0, TGSI_WRITEMASK_W), src_col);
3748}
3749
3750static void parse_shader(struct shader_translator *tx)
3751{
3752    struct nine_shader_info *info = tx->info;
3753
3754    while (!sm1_parse_eof(tx) && !tx->failure)
3755        sm1_parse_instruction(tx);
3756    tx->parse++; /* for byte_size */
3757
3758    if (tx->failure)
3759        return;
3760
3761    if (IS_PS && tx->version.major < 3) {
3762        if (tx->version.major < 2) {
3763            assert(tx->num_temp); /* there must be color output */
3764            info->rt_mask |= 0x1;
3765            shader_add_ps_fog_stage(tx, ureg_src(tx->regs.r[0]));
3766        } else {
3767            shader_add_ps_fog_stage(tx, ureg_src(tx->regs.oCol[0]));
3768        }
3769    }
3770
3771    if (IS_VS && tx->version.major < 3 && ureg_dst_is_undef(tx->regs.oFog) && info->fog_enable) {
3772        tx->regs.oFog = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_GENERIC, 16);
3773        ureg_MOV(tx->ureg, ureg_writemask(tx->regs.oFog, TGSI_WRITEMASK_X), ureg_imm1f(tx->ureg, 0.0f));
3774    }
3775
3776    if (info->position_t)
3777        ureg_property(tx->ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
3778
3779    if (IS_VS && !ureg_dst_is_undef(tx->regs.oPts)) {
3780        struct ureg_dst oPts = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_PSIZE, 0);
3781        ureg_MAX(tx->ureg, tx->regs.oPts, ureg_src(tx->regs.oPts), ureg_imm1f(tx->ureg, info->point_size_min));
3782        ureg_MIN(tx->ureg, oPts, ureg_src(tx->regs.oPts), ureg_imm1f(tx->ureg, info->point_size_max));
3783        info->point_size = TRUE;
3784    }
3785
3786    if (info->process_vertices)
3787        shader_add_vs_viewport_transform(tx);
3788
3789    ureg_END(tx->ureg);
3790}
3791
3792HRESULT
3793nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info, struct pipe_context *pipe)
3794{
3795    struct shader_translator *tx;
3796    HRESULT hr = D3D_OK;
3797    const unsigned processor = info->type;
3798    struct pipe_screen *screen = info->process_vertices ? device->screen_sw : device->screen;
3799    unsigned *const_ranges = NULL;
3800
3801    user_assert(processor != ~0, D3DERR_INVALIDCALL);
3802
3803    tx = MALLOC_STRUCT(shader_translator);
3804    if (!tx)
3805        return E_OUTOFMEMORY;
3806
3807    if (tx_ctor(tx, screen, info) == E_OUTOFMEMORY) {
3808        hr = E_OUTOFMEMORY;
3809        goto out;
3810    }
3811
3812    assert(IS_VS || !info->swvp_on);
3813
3814    if (((tx->version.major << 16) | tx->version.minor) > 0x00030000) {
3815        hr = D3DERR_INVALIDCALL;
3816        DBG("Unsupported shader version: %u.%u !\n",
3817            tx->version.major, tx->version.minor);
3818        goto out;
3819    }
3820    if (tx->processor != processor) {
3821        hr = D3DERR_INVALIDCALL;
3822        DBG("Shader type mismatch: %u / %u !\n", tx->processor, processor);
3823        goto out;
3824    }
3825    DUMP("%s%u.%u\n", processor == PIPE_SHADER_VERTEX ? "VS" : "PS",
3826         tx->version.major, tx->version.minor);
3827
3828    parse_shader(tx);
3829
3830    if (tx->failure) {
3831        /* For VS shaders, we print the warning later,
3832         * we first try with swvp. */
3833        if (IS_PS)
3834            ERR("Encountered buggy shader\n");
3835        ureg_destroy(tx->ureg);
3836        hr = D3DERR_INVALIDCALL;
3837        goto out;
3838    }
3839
3840    /* Recompile after compacting constant slots if possible */
3841    if (!tx->indirect_const_access && !info->swvp_on && tx->num_slots > 0) {
3842        unsigned *slot_map;
3843        unsigned c;
3844        int i, j, num_ranges, prev;
3845
3846        DBG("Recompiling shader for constant compaction\n");
3847        ureg_destroy(tx->ureg);
3848
3849        if (tx->num_inst_labels)
3850            FREE(tx->inst_labels);
3851        FREE(tx->lconstf);
3852        FREE(tx->regs.r);
3853
3854        num_ranges = 0;
3855        prev = -2;
3856        for (i = 0; i < NINE_MAX_CONST_ALL; i++) {
3857            if (tx->slots_used[i]) {
3858                if (prev != i - 1)
3859                    num_ranges++;
3860                prev = i;
3861            }
3862        }
3863        slot_map = MALLOC(NINE_MAX_CONST_ALL * sizeof(unsigned));
3864        const_ranges = CALLOC(num_ranges + 1, 2 * sizeof(unsigned)); /* ranges stop when last is of size 0 */
3865        if (!slot_map || !const_ranges) {
3866            hr = E_OUTOFMEMORY;
3867            goto out;
3868        }
3869        c = 0;
3870        j = -1;
3871        prev = -2;
3872        for (i = 0; i < NINE_MAX_CONST_ALL; i++) {
3873            if (tx->slots_used[i]) {
3874                if (prev != i - 1)
3875                    j++;
3876                /* Initialize first slot of the range */
3877                if (!const_ranges[2*j+1])
3878                    const_ranges[2*j] = i;
3879                const_ranges[2*j+1]++;
3880                prev = i;
3881                slot_map[i] = c++;
3882            }
3883        }
3884
3885        if (tx_ctor(tx, screen, info) == E_OUTOFMEMORY) {
3886            hr = E_OUTOFMEMORY;
3887            goto out;
3888        }
3889        tx->slot_map = slot_map;
3890        parse_shader(tx);
3891        assert(!tx->failure);
3892#if !defined(NDEBUG)
3893        i = 0;
3894        j = 0;
3895        while (const_ranges[i*2+1] != 0) {
3896            j += const_ranges[i*2+1];
3897            i++;
3898        }
3899        assert(j == tx->num_slots);
3900#endif
3901    }
3902
3903    /* record local constants */
3904    if (tx->num_lconstf && tx->indirect_const_access) {
3905        struct nine_range *ranges;
3906        float *data;
3907        int *indices;
3908        unsigned i, k, n;
3909
3910        hr = E_OUTOFMEMORY;
3911
3912        data = MALLOC(tx->num_lconstf * 4 * sizeof(float));
3913        if (!data)
3914            goto out;
3915        info->lconstf.data = data;
3916
3917        indices = MALLOC(tx->num_lconstf * sizeof(indices[0]));
3918        if (!indices)
3919            goto out;
3920
3921        /* lazy sort, num_lconstf should be small */
3922        for (n = 0; n < tx->num_lconstf; ++n) {
3923            for (k = 0, i = 0; i < tx->num_lconstf; ++i) {
3924                if (tx->lconstf[i].idx < tx->lconstf[k].idx)
3925                    k = i;
3926            }
3927            indices[n] = tx->lconstf[k].idx;
3928            memcpy(&data[n * 4], &tx->lconstf[k].f[0], 4 * sizeof(float));
3929            tx->lconstf[k].idx = INT_MAX;
3930        }
3931
3932        /* count ranges */
3933        for (n = 1, i = 1; i < tx->num_lconstf; ++i)
3934            if (indices[i] != indices[i - 1] + 1)
3935                ++n;
3936        ranges = MALLOC(n * sizeof(ranges[0]));
3937        if (!ranges) {
3938            FREE(indices);
3939            goto out;
3940        }
3941        info->lconstf.ranges = ranges;
3942
3943        k = 0;
3944        ranges[k].bgn = indices[0];
3945        for (i = 1; i < tx->num_lconstf; ++i) {
3946            if (indices[i] != indices[i - 1] + 1) {
3947                ranges[k].next = &ranges[k + 1];
3948                ranges[k].end = indices[i - 1] + 1;
3949                ++k;
3950                ranges[k].bgn = indices[i];
3951            }
3952        }
3953        ranges[k].end = indices[i - 1] + 1;
3954        ranges[k].next = NULL;
3955        assert(n == (k + 1));
3956
3957        FREE(indices);
3958        hr = D3D_OK;
3959    }
3960
3961    /* r500 */
3962    if (info->const_float_slots > device->max_vs_const_f &&
3963        (info->const_int_slots || info->const_bool_slots) &&
3964        !info->swvp_on)
3965        ERR("Overlapping constant slots. The shader is likely to be buggy\n");
3966
3967
3968    if (tx->indirect_const_access) { /* vs only */
3969        info->const_float_slots = device->max_vs_const_f;
3970        tx->num_slots = MAX2(tx->num_slots, device->max_vs_const_f);
3971    }
3972
3973    if (!info->swvp_on) {
3974        info->const_used_size = sizeof(float[4]) * tx->num_slots;
3975        if (tx->num_slots)
3976            ureg_DECL_constant2D(tx->ureg, 0, tx->num_slots-1, 0);
3977    } else {
3978         ureg_DECL_constant2D(tx->ureg, 0, 4095, 0);
3979         ureg_DECL_constant2D(tx->ureg, 0, 4095, 1);
3980         ureg_DECL_constant2D(tx->ureg, 0, 2047, 2);
3981         ureg_DECL_constant2D(tx->ureg, 0, 511, 3);
3982    }
3983
3984    if (info->process_vertices)
3985        ureg_DECL_constant2D(tx->ureg, 0, 2, 4); /* Viewport data */
3986
3987    if (debug_get_bool_option("NINE_TGSI_DUMP", FALSE)) {
3988        const struct tgsi_token *toks = ureg_get_tokens(tx->ureg, NULL);
3989        tgsi_dump(toks, 0);
3990        ureg_free_tokens(toks);
3991    }
3992
3993    if (info->process_vertices) {
3994        NineVertexDeclaration9_FillStreamOutputInfo(info->vdecl_out,
3995                                                    tx->output_info,
3996                                                    tx->num_outputs,
3997                                                    &(info->so));
3998        info->cso = ureg_create_shader_with_so_and_destroy(tx->ureg, pipe, &(info->so));
3999    } else
4000        info->cso = ureg_create_shader_and_destroy(tx->ureg, pipe);
4001    if (!info->cso) {
4002        hr = D3DERR_DRIVERINTERNALERROR;
4003        FREE(info->lconstf.data);
4004        FREE(info->lconstf.ranges);
4005        goto out;
4006    }
4007
4008    info->const_ranges = const_ranges;
4009    const_ranges = NULL;
4010    info->byte_size = (tx->parse - tx->byte_code) * sizeof(DWORD);
4011out:
4012    if (const_ranges)
4013        FREE(const_ranges);
4014    tx_dtor(tx);
4015    return hr;
4016}
4017