nine_ff.c revision 7ec681f3
1
2/* FF is big and ugly so feel free to write lines as long as you like.
3 * Aieeeeeeeee !
4 *
5 * Let me make that clearer:
6 * Aieeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ! !! !!!
7 */
8
9#include "device9.h"
10#include "basetexture9.h"
11#include "vertexdeclaration9.h"
12#include "vertexshader9.h"
13#include "pixelshader9.h"
14#include "nine_ff.h"
15#include "nine_defines.h"
16#include "nine_helpers.h"
17#include "nine_pipe.h"
18#include "nine_dump.h"
19
20#include "pipe/p_context.h"
21#include "tgsi/tgsi_ureg.h"
22#include "tgsi/tgsi_dump.h"
23#include "util/u_box.h"
24#include "util/u_hash_table.h"
25#include "util/u_upload_mgr.h"
26
27#define DBG_CHANNEL DBG_FF
28
29#define NINE_FF_NUM_VS_CONST 196
30#define NINE_FF_NUM_PS_CONST 24
31
/* Four-component single-precision vector (x, y, z, w), stored contiguously. */
struct fvec4
{
    float x;
    float y;
    float z;
    float w;
};
36
/* Key selecting one fixed-function vertex shader variant.
 *
 * The anonymous bitfield struct and the value32/value64 arrays overlay the
 * same storage: the named fields describe the state, the arrays give
 * word-wise access to the whole key (the hash and compare callbacks use them).
 * The padN members round each group of bitfields up to a full 32 bits.
 */
struct nine_ff_vs_key
{
    union {
        struct {
            /* --- 28 flag bits + 4 pad bits --- */
            uint32_t position_t          : 1;
            uint32_t lighting            : 1;
            uint32_t darkness            : 1; /* lighting enabled but no active lights */
            uint32_t localviewer         : 1;
            uint32_t vertexpointsize     : 1;
            uint32_t pointscale          : 1;
            uint32_t vertexblend         : 3;
            uint32_t vertexblend_indexed : 1;
            uint32_t vertextween         : 1;
            /* material sources: 0 = material, 1 = color1, 2 = color2 */
            uint32_t mtl_diffuse         : 2;
            uint32_t mtl_ambient         : 2;
            uint32_t mtl_specular        : 2;
            uint32_t mtl_emissive        : 2;
            uint32_t fog_mode            : 2;
            uint32_t fog_range           : 1;
            uint32_t color0in_one        : 1;
            uint32_t color1in_zero       : 1;
            uint32_t has_normal          : 1;
            uint32_t fog                 : 1;
            uint32_t normalizenormals    : 1;
            uint32_t ucp                 : 1;
            uint32_t pad1                : 4;

            /* --- per-texcoord fields, one packed entry per output 0..7 --- */
            uint32_t tc_dim_input        : 16; /* 8 * 2 bits */
            uint32_t pad2                : 16;
            uint32_t tc_dim_output       : 24; /* 8 * 3 bits */
            uint32_t pad3                : 8;
            uint32_t tc_gen              : 24; /* 8 * 3 bits */
            uint32_t pad4                : 8;
            uint32_t tc_idx              : 24; /* 8 * 3 bits */
            uint32_t pad5                : 8;

            /* bitmask tested with (1 << NINE_DECLUSAGE_x) */
            uint32_t passthrough;
        };
        uint64_t value64[3]; /* don't forget to resize VertexShader9.ff_key */
        uint32_t value32[6];
    };
};
77
/* Texture stage state:
 *
 * COLOROP       D3DTOP 5 bit
 * ALPHAOP       D3DTOP 5 bit
 * COLORARG0     D3DTA  3 bit
 * COLORARG1     D3DTA  3 bit
 * COLORARG2     D3DTA  3 bit
 * ALPHAARG0     D3DTA  3 bit
 * ALPHAARG1     D3DTA  3 bit
 * ALPHAARG2     D3DTA  3 bit
 * RESULTARG     D3DTA  1 bit (CURRENT:0 or TEMP:1)
 * TEXTARGET     1D/2D/3D/CUBE 2 bit (the key stores textarget here, not TEXCOORDINDEX)
 * padding              1 bit
 * ===========================
 *                     32 bit per stage
 */
/* Key selecting one fixed-function pixel shader variant.
 *
 * The named fields and the value32/value64 arrays overlay the same storage;
 * the arrays give word-wise access to the whole key (the hash and compare
 * callbacks use them).
 */
struct nine_ff_ps_key
{
    union {
        struct {
            /* One 32-bit entry per texture stage. */
            struct {
                uint32_t colorop   : 5; /* D3DTOP */
                uint32_t alphaop   : 5; /* D3DTOP */
                uint32_t colorarg0 : 3; /* D3DTA */
                uint32_t colorarg1 : 3;
                uint32_t colorarg2 : 3;
                uint32_t alphaarg0 : 3;
                uint32_t alphaarg1 : 3;
                uint32_t alphaarg2 : 3;
                uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
                uint32_t pad       : 1;
                /* that's 32 bit exactly */
            } ts[8];

            /* Global flags: 21 used bits + 11 pad bits. */
            uint32_t projected  : 16;
            uint32_t fog        : 1; /* for vFog coming from VS */
            uint32_t fog_mode   : 2;
            uint32_t fog_source : 1; /* 0: Z, 1: W */
            uint32_t specular   : 1;
            uint32_t pad1       : 11; /* 9 32-bit words with this */

            /* NOTE(review): presumably extra argument bits kept outside the
             * 3-bit per-stage fields - confirm against the code filling the key. */
            uint8_t colorarg_b4[3];
            uint8_t colorarg_b5[3];
            uint8_t alphaarg_b4[3]; /* 11 32-bit words plus a byte */
            uint8_t pad2[3];        /* rounds the key up to 12 32-bit words */
        };
        uint64_t value64[6]; /* don't forget to resize PixelShader9.ff_key */
        uint32_t value32[12];
    };
};
126
127static uint32_t nine_ff_vs_key_hash(const void *key)
128{
129    const struct nine_ff_vs_key *vs = key;
130    unsigned i;
131    uint32_t hash = vs->value32[0];
132    for (i = 1; i < ARRAY_SIZE(vs->value32); ++i)
133        hash ^= vs->value32[i];
134    return hash;
135}
136static bool nine_ff_vs_key_comp(const void *key1, const void *key2)
137{
138    struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
139    struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;
140
141    return memcmp(a->value64, b->value64, sizeof(a->value64)) == 0;
142}
143static uint32_t nine_ff_ps_key_hash(const void *key)
144{
145    const struct nine_ff_ps_key *ps = key;
146    unsigned i;
147    uint32_t hash = ps->value32[0];
148    for (i = 1; i < ARRAY_SIZE(ps->value32); ++i)
149        hash ^= ps->value32[i];
150    return hash;
151}
152static bool nine_ff_ps_key_comp(const void *key1, const void *key2)
153{
154    struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
155    struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;
156
157    return memcmp(a->value64, b->value64, sizeof(a->value64)) == 0;
158}
159static uint32_t nine_ff_fvf_key_hash(const void *key)
160{
161    return *(DWORD *)key;
162}
163static bool nine_ff_fvf_key_comp(const void *key1, const void *key2)
164{
165    return *(DWORD *)key1 == *(DWORD *)key2;
166}
167
/* Forward declarations: both helpers have internal linkage and take the device. */
static void nine_ff_prune_vs(struct NineDevice9 *device);
static void nine_ff_prune_ps(struct NineDevice9 *device);
170
171static void nine_ureg_tgsi_dump(struct ureg_program *ureg, boolean override)
172{
173    if (debug_get_bool_option("NINE_FF_DUMP", FALSE) || override) {
174        const struct tgsi_token *toks = ureg_get_tokens(ureg, NULL);
175        tgsi_dump(toks, 0);
176        ureg_free_tokens(toks);
177    }
178}
179
/* Single-component selection on a source operand (struct ureg_src),
 * done through ureg_scalar(). */
#define _XXXX(r) ureg_scalar(r, TGSI_SWIZZLE_X)
#define _YYYY(r) ureg_scalar(r, TGSI_SWIZZLE_Y)
#define _ZZZZ(r) ureg_scalar(r, TGSI_SWIZZLE_Z)
#define _WWWW(r) ureg_scalar(r, TGSI_SWIZZLE_W)

/* Identity: use the operand with all four components as-is. */
#define _XYZW(r) (r)

/* Same selection for a destination register (struct ureg_dst): it is first
 * converted to a source operand with ureg_src(). */
#define _X(r) _XXXX(ureg_src(r))
#define _Y(r) _YYYY(ureg_src(r))
#define _Z(r) _ZZZZ(ureg_src(r))
#define _W(r) _WWWW(ureg_src(r))

/* Declare/reference constant n; expects a variable named `ureg` in scope. */
#define _CONST(n) ureg_DECL_constant(ureg, n)

/* Material constants start at CONST[19]. */
#define MATERIAL_CONST(i) \
    _CONST(19 + (i))

/* AL should contain base address of lights table.
 * Expects variables named `ureg` and `AL` in scope. */
#define LIGHT_CONST(i)                                                \
    ureg_src_indirect(_CONST(i), _X(AL))
200
201/* VS FF constants layout:
202 *
203 * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
204 * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
205 * CONST[ 8..11] D3DTS_PROJECTION
206 * CONST[12..15] D3DTS_VIEW^(-1)
207 * CONST[16..18] Normal matrix
208 *
209 * CONST[19].xyz  MATERIAL.Emissive + Material.Ambient * RS.Ambient
210 * CONST[20]      MATERIAL.Diffuse
211 * CONST[21]      MATERIAL.Ambient
212 * CONST[22]      MATERIAL.Specular
213 * CONST[23].x___ MATERIAL.Power
214 * CONST[24]      MATERIAL.Emissive
215 * CONST[25]      RS.Ambient
216 *
217 * CONST[26].x___ RS.PointSizeMin
218 * CONST[26]._y__ RS.PointSizeMax
219 * CONST[26].__z_ RS.PointSize
220 * CONST[26].___w RS.PointScaleA
221 * CONST[27].x___ RS.PointScaleB
222 * CONST[27]._y__ RS.PointScaleC
223 *
224 * CONST[28].x___ RS.FogEnd
225 * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
226 * CONST[28].__z_ RS.FogDensity
227
228 * CONST[30].x___ TWEENFACTOR
229 *
230 * CONST[32].x___ LIGHT[0].Type
231 * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
232 * CONST[33]      LIGHT[0].Diffuse
233 * CONST[34]      LIGHT[0].Specular
234 * CONST[35]      LIGHT[0].Ambient
235 * CONST[36].xyz_ LIGHT[0].Position
236 * CONST[36].___w LIGHT[0].Range
237 * CONST[37].xyz_ LIGHT[0].Direction
238 * CONST[37].___w LIGHT[0].Falloff
239 * CONST[38].x___ cos(LIGHT[0].Theta / 2)
240 * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
241 * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(Light[0].Phi / 2))
242 * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
243 * CONST[39].___w 1 if this is the last active light, 0 if not
244 * CONST[40]      LIGHT[1]
245 * CONST[48]      LIGHT[2]
246 * CONST[56]      LIGHT[3]
247 * CONST[64]      LIGHT[4]
248 * CONST[72]      LIGHT[5]
249 * CONST[80]      LIGHT[6]
250 * CONST[88]      LIGHT[7]
251 * NOTE: no lighting code is generated if there are no active lights
252 *
253 * CONST[100].x___ Viewport 2/width
254 * CONST[100]._y__ Viewport 2/height
255 * CONST[100].__z_ Viewport 1/(zmax - zmin)
256 * CONST[100].___w Viewport width
257 * CONST[101].x___ Viewport x0
258 * CONST[101]._y__ Viewport y0
259 * CONST[101].__z_ Viewport z0
260 *
261 * CONST[128..131] D3DTS_TEXTURE0
262 * CONST[132..135] D3DTS_TEXTURE1
263 * CONST[136..139] D3DTS_TEXTURE2
264 * CONST[140..143] D3DTS_TEXTURE3
265 * CONST[144..147] D3DTS_TEXTURE4
266 * CONST[148..151] D3DTS_TEXTURE5
267 * CONST[152..155] D3DTS_TEXTURE6
268 * CONST[156..159] D3DTS_TEXTURE7
269 *
270 * CONST[160] D3DTS_WORLDMATRIX[0] * D3DTS_VIEW
271 * CONST[164] D3DTS_WORLDMATRIX[1] * D3DTS_VIEW
272 * ...
273 * CONST[192] D3DTS_WORLDMATRIX[8] * D3DTS_VIEW
274 */
275struct vs_build_ctx
276{
277    struct ureg_program *ureg;
278    const struct nine_ff_vs_key *key;
279
280    uint16_t input[PIPE_MAX_ATTRIBS];
281    unsigned num_inputs;
282
283    struct ureg_src aVtx;
284    struct ureg_src aNrm;
285    struct ureg_src aCol[2];
286    struct ureg_src aTex[8];
287    struct ureg_src aPsz;
288    struct ureg_src aInd;
289    struct ureg_src aWgt;
290
291    struct ureg_src aVtx1; /* tweening */
292    struct ureg_src aNrm1;
293
294    struct ureg_src mtlA;
295    struct ureg_src mtlD;
296    struct ureg_src mtlS;
297    struct ureg_src mtlE;
298};
299
300static inline unsigned
301get_texcoord_sn(struct pipe_screen *screen)
302{
303    if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
304        return TGSI_SEMANTIC_TEXCOORD;
305    return TGSI_SEMANTIC_GENERIC;
306}
307
308static inline struct ureg_src
309build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
310{
311    const unsigned i = vs->num_inputs++;
312    assert(i < PIPE_MAX_ATTRIBS);
313    vs->input[i] = ndecl;
314    return ureg_DECL_vs_input(vs->ureg, i);
315}
316
317/* NOTE: dst may alias src */
318static inline void
319ureg_normalize3(struct ureg_program *ureg,
320                struct ureg_dst dst, struct ureg_src src)
321{
322    struct ureg_dst tmp = ureg_DECL_temporary(ureg);
323    struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
324
325    ureg_DP3(ureg, tmp_x, src, src);
326    ureg_RSQ(ureg, tmp_x, _X(tmp));
327    ureg_MUL(ureg, dst, src, _X(tmp));
328    ureg_release_temporary(ureg, tmp);
329}
330
331static void *
332nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
333{
334    const struct nine_ff_vs_key *key = vs->key;
335    struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
336    struct ureg_dst oPos, oCol[2], oPsz, oFog;
337    struct ureg_dst AR;
338    unsigned i, c;
339    unsigned label[32], l = 0;
340    boolean need_aNrm = key->lighting || key->passthrough & (1 << NINE_DECLUSAGE_NORMAL);
341    boolean has_aNrm;
342    boolean need_aVtx = key->lighting || key->fog_mode || key->pointscale || key->ucp;
343    const unsigned texcoord_sn = get_texcoord_sn(device->screen);
344
345    vs->ureg = ureg;
346
347    /* Check which inputs we should transform. */
348    for (i = 0; i < 8 * 3; i += 3) {
349        switch ((key->tc_gen >> i) & 0x7) {
350        case NINED3DTSS_TCI_CAMERASPACENORMAL:
351            need_aNrm = TRUE;
352            break;
353        case NINED3DTSS_TCI_CAMERASPACEPOSITION:
354            need_aVtx = TRUE;
355            break;
356        case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
357            need_aVtx = need_aNrm = TRUE;
358            break;
359        case NINED3DTSS_TCI_SPHEREMAP:
360            need_aVtx = need_aNrm = TRUE;
361            break;
362        default:
363            break;
364        }
365    }
366
367    has_aNrm = need_aNrm && key->has_normal;
368
369    /* Declare and record used inputs (needed for linkage with vertex format):
370     * (texture coordinates handled later)
371     */
372    vs->aVtx = build_vs_add_input(vs,
373        key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);
374
375    vs->aNrm = ureg_imm1f(ureg, 0.0f);
376    if (has_aNrm)
377        vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);
378
379    vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
380    vs->aCol[1] = ureg_imm1f(ureg, 0.0f);
381
382    if (key->lighting || key->darkness) {
383        const unsigned mask = key->mtl_diffuse | key->mtl_specular |
384                              key->mtl_ambient | key->mtl_emissive;
385        if ((mask & 0x1) && !key->color0in_one)
386            vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
387        if ((mask & 0x2) && !key->color1in_zero)
388            vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
389
390        vs->mtlD = MATERIAL_CONST(1);
391        vs->mtlA = MATERIAL_CONST(2);
392        vs->mtlS = MATERIAL_CONST(3);
393        vs->mtlE = MATERIAL_CONST(5);
394        if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
395        if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
396        if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
397        if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
398        if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
399        if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
400        if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
401        if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
402    } else {
403        if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
404        if (!key->color1in_zero) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
405    }
406
407    if (key->vertexpointsize)
408        vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
409
410    if (key->vertexblend_indexed || key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES))
411        vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
412    if (key->vertexblend || key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT))
413        vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
414    if (key->vertextween) {
415        vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
416        vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
417    }
418
419    /* Declare outputs:
420     */
421    oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
422    oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
423    oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
424    if (key->fog || key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
425        oFog = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 16);
426        oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
427    }
428
429    if (key->vertexpointsize || key->pointscale) {
430        oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
431                                       TGSI_WRITEMASK_X, 0, 1);
432        oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
433    }
434
435    if (key->lighting || key->vertexblend)
436        AR = ureg_DECL_address(ureg);
437
438    /* === Vertex transformation / vertex blending:
439     */
440
441    if (key->position_t) {
442        if (device->driver_caps.window_space_position_support) {
443            ureg_MOV(ureg, oPos, vs->aVtx);
444        } else {
445            struct ureg_dst tmp = ureg_DECL_temporary(ureg);
446            /* vs->aVtx contains the coordinates buffer wise.
447            * later in the pipeline, clipping, viewport and division
448            * by w (rhw = 1/w) are going to be applied, so do the reverse
449            * of these transformations (except clipping) to have the good
450            * position at the end.*/
451            ureg_MOV(ureg, tmp, vs->aVtx);
452            /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
453            ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), ureg_negate(_CONST(101)));
454            ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
455            ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
456            /* Y needs to be reversed */
457            ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
458            /* inverse rhw */
459            ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
460            /* multiply X, Y, Z by w */
461            ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
462            ureg_MOV(ureg, oPos, ureg_src(tmp));
463            ureg_release_temporary(ureg, tmp);
464        }
465    } else if (key->vertexblend) {
466        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
467        struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
468        struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
469        struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
470        struct ureg_dst sum_blendweights = ureg_DECL_temporary(ureg);
471        struct ureg_src cWM[4];
472
473        for (i = 160; i <= 195; ++i)
474            ureg_DECL_constant(ureg, i);
475
476        /* translate world matrix index to constant file index */
477        if (key->vertexblend_indexed) {
478            ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 160.0f));
479            ureg_ARL(ureg, AR, ureg_src(tmp));
480        }
481
482        ureg_MOV(ureg, aVtx_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
483        ureg_MOV(ureg, aNrm_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
484        ureg_MOV(ureg, sum_blendweights, ureg_imm4f(ureg, 1.0f, 1.0f, 1.0f, 1.0f));
485
486        for (i = 0; i < key->vertexblend; ++i) {
487            for (c = 0; c < 4; ++c) {
488                cWM[c] = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, (160 + i * 4) * !key->vertexblend_indexed + c), 0);
489                if (key->vertexblend_indexed)
490                    cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
491            }
492
493            /* multiply by WORLD(index) */
494            ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), cWM[0]);
495            ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), cWM[1], ureg_src(tmp));
496            ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), cWM[2], ureg_src(tmp));
497            ureg_MAD(ureg, tmp, _WWWW(vs->aVtx), cWM[3], ureg_src(tmp));
498
499            if (has_aNrm) {
500                /* Note: the spec says the transpose of the inverse of the
501                 * WorldView matrices should be used, but all tests show
502                 * otherwise.
503                 * Only case unknown: D3DVBF_0WEIGHTS */
504                ureg_MUL(ureg, tmp2, _XXXX(vs->aNrm), cWM[0]);
505                ureg_MAD(ureg, tmp2, _YYYY(vs->aNrm), cWM[1], ureg_src(tmp2));
506                ureg_MAD(ureg, tmp2, _ZZZZ(vs->aNrm), cWM[2], ureg_src(tmp2));
507            }
508
509            if (i < (key->vertexblend - 1)) {
510                /* accumulate weighted position value */
511                ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(vs->aWgt, i), ureg_src(aVtx_dst));
512                if (has_aNrm)
513                    ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(vs->aWgt, i), ureg_src(aNrm_dst));
514                /* subtract weighted position value for last value */
515                ureg_ADD(ureg, sum_blendweights, ureg_src(sum_blendweights), ureg_negate(ureg_scalar(vs->aWgt, i)));
516            }
517        }
518
519        /* the last weighted position is always 1 - sum_of_previous_weights */
520        ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aVtx_dst));
521        if (has_aNrm)
522            ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aNrm_dst));
523
524        /* multiply by VIEW_PROJ */
525        ureg_MUL(ureg, tmp, _X(aVtx_dst), _CONST(8));
526        ureg_MAD(ureg, tmp, _Y(aVtx_dst), _CONST(9),  ureg_src(tmp));
527        ureg_MAD(ureg, tmp, _Z(aVtx_dst), _CONST(10), ureg_src(tmp));
528        ureg_MAD(ureg, oPos, _W(aVtx_dst), _CONST(11), ureg_src(tmp));
529
530        if (need_aVtx)
531            vs->aVtx = ureg_src(aVtx_dst);
532
533        ureg_release_temporary(ureg, tmp);
534        ureg_release_temporary(ureg, tmp2);
535        ureg_release_temporary(ureg, sum_blendweights);
536        if (!need_aVtx)
537            ureg_release_temporary(ureg, aVtx_dst);
538
539        if (has_aNrm) {
540            if (key->normalizenormals)
541               ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
542            vs->aNrm = ureg_src(aNrm_dst);
543        } else
544            ureg_release_temporary(ureg, aNrm_dst);
545    } else {
546        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
547
548        if (key->vertextween) {
549            struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
550            ureg_LRP(ureg, aVtx_dst, _XXXX(_CONST(30)), vs->aVtx1, vs->aVtx);
551            vs->aVtx = ureg_src(aVtx_dst);
552            if (has_aNrm) {
553                struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
554                ureg_LRP(ureg, aNrm_dst, _XXXX(_CONST(30)), vs->aNrm1, vs->aNrm);
555                vs->aNrm = ureg_src(aNrm_dst);
556            }
557        }
558
559        /* position = vertex * WORLD_VIEW_PROJ */
560        ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(0));
561        ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(1), ureg_src(tmp));
562        ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(2), ureg_src(tmp));
563        ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(tmp));
564        ureg_release_temporary(ureg, tmp);
565
566        if (need_aVtx) {
567            struct ureg_dst aVtx_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
568            ureg_MUL(ureg, aVtx_dst, _XXXX(vs->aVtx), _CONST(4));
569            ureg_MAD(ureg, aVtx_dst, _YYYY(vs->aVtx), _CONST(5), ureg_src(aVtx_dst));
570            ureg_MAD(ureg, aVtx_dst, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(aVtx_dst));
571            ureg_MAD(ureg, aVtx_dst, _WWWW(vs->aVtx), _CONST(7), ureg_src(aVtx_dst));
572            vs->aVtx = ureg_src(aVtx_dst);
573        }
574        if (has_aNrm) {
575            struct ureg_dst aNrm_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
576            ureg_MUL(ureg, aNrm_dst, _XXXX(vs->aNrm), _CONST(16));
577            ureg_MAD(ureg, aNrm_dst, _YYYY(vs->aNrm), _CONST(17), ureg_src(aNrm_dst));
578            ureg_MAD(ureg, aNrm_dst, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(aNrm_dst));
579            if (key->normalizenormals)
580               ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
581            vs->aNrm = ureg_src(aNrm_dst);
582        }
583    }
584
585    /* === Process point size:
586     */
587    if (key->vertexpointsize || key->pointscale) {
588        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
589        struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
590        struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
591        struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
592        if (key->vertexpointsize) {
593            struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
594            ureg_MAX(ureg, tmp_z, _XXXX(vs->aPsz), _XXXX(cPsz1));
595            ureg_MIN(ureg, tmp_z, _Z(tmp), _YYYY(cPsz1));
596        } else {
597            struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
598            ureg_MOV(ureg, tmp_z, _ZZZZ(cPsz1));
599        }
600
601        if (key->pointscale) {
602            struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
603            struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
604
605            ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
606            ureg_RSQ(ureg, tmp_y, _X(tmp));
607            ureg_MUL(ureg, tmp_y, _Y(tmp), _X(tmp));
608            ureg_CMP(ureg, tmp_y, ureg_negate(_Y(tmp)), _Y(tmp), ureg_imm1f(ureg, 0.0f));
609            ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
610            ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
611            ureg_RSQ(ureg, tmp_x, _X(tmp));
612            ureg_MUL(ureg, tmp_x, _X(tmp), _Z(tmp));
613            ureg_MUL(ureg, tmp_x, _X(tmp), _WWWW(_CONST(100)));
614            ureg_MAX(ureg, tmp_x, _X(tmp), _XXXX(cPsz1));
615            ureg_MIN(ureg, tmp_z, _X(tmp), _YYYY(cPsz1));
616        }
617
618        ureg_MOV(ureg, oPsz, _Z(tmp));
619        ureg_release_temporary(ureg, tmp);
620    }
621
622    for (i = 0; i < 8; ++i) {
623        struct ureg_dst tmp, tmp_x, tmp2;
624        struct ureg_dst oTex, input_coord, transformed, t, aVtx_normed;
625        unsigned c, writemask;
626        const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
627        const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
628        unsigned dim_input = 1 + ((key->tc_dim_input >> (i * 2)) & 0x3);
629        const unsigned dim_output = (key->tc_dim_output >> (i * 3)) & 0x7;
630
631        /* No texture output of index s */
632        if (tci == NINED3DTSS_TCI_DISABLE)
633            continue;
634        oTex = ureg_DECL_output(ureg, texcoord_sn, i);
635        tmp = ureg_DECL_temporary(ureg);
636        tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
637        input_coord = ureg_DECL_temporary(ureg);
638        transformed = ureg_DECL_temporary(ureg);
639
640        /* Get the coordinate */
641        switch (tci) {
642        case NINED3DTSS_TCI_PASSTHRU:
643            /* NINED3DTSS_TCI_PASSTHRU => Use texcoord coming from index idx *
644             * Else the idx is used only to determine wrapping mode. */
645            vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
646            ureg_MOV(ureg, input_coord, vs->aTex[idx]);
647            break;
648        case NINED3DTSS_TCI_CAMERASPACENORMAL:
649            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aNrm);
650            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
651            dim_input = 4;
652            break;
653        case NINED3DTSS_TCI_CAMERASPACEPOSITION:
654            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aVtx);
655            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
656            dim_input = 4;
657            break;
658        case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
659            tmp.WriteMask = TGSI_WRITEMASK_XYZ;
660            aVtx_normed = ureg_DECL_temporary(ureg);
661            ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
662            ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
663            ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
664            ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
665            ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
666            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
667            ureg_release_temporary(ureg, aVtx_normed);
668            dim_input = 4;
669            tmp.WriteMask = TGSI_WRITEMASK_XYZW;
670            break;
671        case NINED3DTSS_TCI_SPHEREMAP:
672            /* Implement the formula of GL_SPHERE_MAP */
673            tmp.WriteMask = TGSI_WRITEMASK_XYZ;
674            aVtx_normed = ureg_DECL_temporary(ureg);
675            tmp2 = ureg_DECL_temporary(ureg);
676            ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
677            ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
678            ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
679            ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
680            ureg_ADD(ureg, tmp, ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
681            /* now tmp = normed(Vtx) - 2 dot3(normed(Vtx), Nrm) Nrm */
682            ureg_MOV(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_XYZ), ureg_src(tmp));
683            ureg_MUL(ureg, tmp2, ureg_src(tmp2), ureg_src(tmp2));
684            ureg_DP3(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_src(tmp2));
685            ureg_RSQ(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2));
686            ureg_MUL(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_imm1f(ureg, 0.5f));
687            /* tmp2 = 0.5 / sqrt(tmp.x^2 + tmp.y^2 + (tmp.z+1)^2)
688             * TODO: z coordinates are a bit different gl vs d3d, should the formula be adapted ? */
689            ureg_MUL(ureg, tmp, ureg_src(tmp), _X(tmp2));
690            ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
691            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_ZW), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
692            ureg_release_temporary(ureg, aVtx_normed);
693            ureg_release_temporary(ureg, tmp2);
694            dim_input = 4;
695            tmp.WriteMask = TGSI_WRITEMASK_XYZW;
696            break;
697        default:
698            assert(0);
699            break;
700        }
701
702        /* Apply the transformation */
703        /* dim_output == 0 => do not transform the components.
704         * XYZRHW also disables transformation */
705        if (!dim_output || key->position_t) {
706            ureg_release_temporary(ureg, transformed);
707            transformed = input_coord;
708            writemask = TGSI_WRITEMASK_XYZW;
709        } else {
710            for (c = 0; c < dim_output; c++) {
711                t = ureg_writemask(transformed, 1 << c);
712                switch (dim_input) {
713                /* dim_input = 1 2 3: -> we add trailing 1 to input*/
714                case 1: ureg_MAD(ureg, t, _X(input_coord), _XXXX(_CONST(128 + i * 4 + c)), _YYYY(_CONST(128 + i * 4 + c)));
715                        break;
716                case 2: ureg_DP2(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
717                        ureg_ADD(ureg, t, ureg_src(transformed), _ZZZZ(_CONST(128 + i * 4 + c)));
718                        break;
719                case 3: ureg_DP3(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
720                        ureg_ADD(ureg, t, ureg_src(transformed), _WWWW(_CONST(128 + i * 4 + c)));
721                        break;
722                case 4: ureg_DP4(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c)); break;
723                default:
724                    assert(0);
725                }
726            }
727            writemask = (1 << dim_output) - 1;
728            ureg_release_temporary(ureg, input_coord);
729        }
730
731        ureg_MOV(ureg, ureg_writemask(oTex, writemask), ureg_src(transformed));
732        ureg_release_temporary(ureg, transformed);
733        ureg_release_temporary(ureg, tmp);
734    }
735
736    /* === Lighting:
737     *
738     * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
739     * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
740     * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
741     *
742     * vec3 normal = normalize(in.Normal * NormalMatrix);
743     * vec3 hitDir = light.direction;
744     * float atten = 1.0;
745     *
746     * if (light.type != DIRECTIONAL)
747     * {
748     *     vec3 hitVec = light.position - eyeVertex;
749     *     float d = length(hitVec);
750     *     hitDir = hitVec / d;
751     *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
752     * }
753     *
754     * if (light.type == SPOTLIGHT)
755     * {
756     *     float rho = dp3(-hitVec, light.direction);
757     *     if (rho < cos(light.phi / 2))
758     *         atten = 0;
759     *     if (rho < cos(light.theta / 2))
760     *         atten *= pow(some_func(rho), light.falloff);
761     * }
762     *
763     * float nDotHit = dp3_sat(normal, hitVec);
764     * float powFact = 0.0;
765     *
766     * if (nDotHit > 0.0)
767     * {
768     *     vec3 midVec = normalize(hitDir + eye);
769     *     float nDotMid = dp3_sat(normal, midVec);
     *     powFact = pow(nDotMid, material.power);
771     * }
772     *
773     * ambient += light.ambient * atten;
774     * diffuse += light.diffuse * atten * nDotHit;
775     * specular += light.specular * atten * powFact;
776     */
777    if (key->lighting) {
778        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
779        struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
780        struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
781        struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
782        struct ureg_dst rAtt = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
783        struct ureg_dst rHit = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
784        struct ureg_dst rMid = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
785
786        struct ureg_dst rCtr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
787
788        struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);
789
790        /* Light.*.Alpha is not used. */
791        struct ureg_dst rD = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
792        struct ureg_dst rA = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
793        struct ureg_dst rS = ureg_DECL_temporary(ureg);
794
795        struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));
796
797        struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
798        struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
799        struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
800        struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
801        struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
802        struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
803        struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
804        struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
805        struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
806        struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
807        struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
808        struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
809        struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
810        struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
811        struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));
812
813        const unsigned loop_label = l++;
814
        /* Declare all light constants to allow indirect addressing */
816        for (i = 32; i < 96; i++)
817            ureg_DECL_constant(ureg, i);
818
819        ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
820        ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
821        ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
822        ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
823
824        /* loop management */
825        ureg_BGNLOOP(ureg, &label[loop_label]);
826        ureg_ARL(ureg, AL, _W(rCtr));
827
828        /* if (not DIRECTIONAL light): */
829        ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
830        ureg_MOV(ureg, rHit, ureg_negate(cLDir));
831        ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
832        ureg_IF(ureg, _X(tmp), &label[l++]);
833        {
834            /* hitDir = light.position - eyeVtx
835             * d = length(hitDir)
836             */
837            ureg_ADD(ureg, rHit, cLPos, ureg_negate(vs->aVtx));
838            ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
839            ureg_RSQ(ureg, tmp_y, _X(tmp));
840            ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
841
842            /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
843            ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
844            ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
845            ureg_RCP(ureg, rAtt, _W(rAtt));
846            /* cut-off if distance exceeds Light.Range */
847            ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
848            ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
849        }
850        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
851        ureg_ENDIF(ureg);
852
853        /* normalize hitDir */
854        ureg_normalize3(ureg, rHit, ureg_src(rHit));
855
856        /* if (SPOT light) */
857        ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
858        ureg_IF(ureg, _X(tmp), &label[l++]);
859        {
860            /* rho = dp3(-hitDir, light.spotDir)
861             *
862             * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
863             *     spotAtt = 1
864             * else
865             * if (rho <= light.cphi2)
866             *     spotAtt = 0
867             * else
868             *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
869             */
870            ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
871            ureg_ADD(ureg, tmp_x, _Y(tmp), ureg_negate(cLPhi));
872            ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
873            ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
874            ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
875            ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
876            ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
877            ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
878        }
879        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
880        ureg_ENDIF(ureg);
881
882        /* directional factors, let's not use LIT because of clarity */
883
884        if (has_aNrm) {
885            if (key->localviewer) {
886                ureg_normalize3(ureg, rMid, vs->aVtx);
887                ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
888            } else {
889                ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, -1.0f));
890            }
891            ureg_normalize3(ureg, rMid, ureg_src(rMid));
892            ureg_DP3(ureg, ureg_saturate(tmp_x), vs->aNrm, ureg_src(rHit));
893            ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
894            ureg_MUL(ureg, tmp_z, _X(tmp), _Y(tmp));
895            /* Tests show that specular is computed only if (dp3(normal,hitDir) > 0).
896             * For front facing, it is more restrictive than test (dp3(normal,mid) > 0).
897             * No tests were made for backfacing, so add the two conditions */
898            ureg_IF(ureg, _Z(tmp), &label[l++]);
899            {
900                ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
901                ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);
902                ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
903                ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
904            }
905            ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
906            ureg_ENDIF(ureg);
907
908            ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
909            ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
910        }
911
912        ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */
913
914        /* break if this was the last light */
915        ureg_IF(ureg, cLLast, &label[l++]);
916        ureg_BRK(ureg);
917        ureg_ENDIF(ureg);
918        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
919
920        ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
921        ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
922        ureg_ENDLOOP(ureg, &label[loop_label]);
923
924        /* Apply to material:
925         *
926         * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
927         *           material.ambient * ambient +
928         *           material.diffuse * diffuse +
929         * oCol[1] = material.specular * specular;
930         */
931        if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
932            ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), vs->mtlA, _CONST(19));
933        else {
934            ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
935            ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
936        }
937
938        ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), ureg_src(rD), vs->mtlD, ureg_src(tmp));
939        ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
940        ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
941        ureg_release_temporary(ureg, rAtt);
942        ureg_release_temporary(ureg, rHit);
943        ureg_release_temporary(ureg, rMid);
944        ureg_release_temporary(ureg, rCtr);
945        ureg_release_temporary(ureg, rD);
946        ureg_release_temporary(ureg, rA);
947        ureg_release_temporary(ureg, rS);
948        ureg_release_temporary(ureg, rAtt);
949        ureg_release_temporary(ureg, tmp);
950    } else
951    /* COLOR */
952    if (key->darkness) {
953        if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
954            ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _CONST(19));
955        else
956            ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
957        ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
958        ureg_MOV(ureg, oCol[1], ureg_imm1f(ureg, 0.0f));
959    } else {
960        ureg_MOV(ureg, oCol[0], vs->aCol[0]);
961        ureg_MOV(ureg, oCol[1], vs->aCol[1]);
962    }
963
964    /* === Process fog.
965     *
966     * exp(x) = ex2(log2(e) * x)
967     */
968    if (key->fog_mode) {
969        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
970        struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
971        struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
972        if (key->fog_range) {
973            ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
974            ureg_RSQ(ureg, tmp_z, _X(tmp));
975            ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
976        } else {
977            ureg_MOV(ureg, tmp_z, ureg_abs(_ZZZZ(vs->aVtx)));
978        }
979
980        if (key->fog_mode == D3DFOG_EXP) {
981            ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
982            ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
983            ureg_EX2(ureg, tmp_x, _X(tmp));
984        } else
985        if (key->fog_mode == D3DFOG_EXP2) {
986            ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
987            ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
988            ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
989            ureg_EX2(ureg, tmp_x, _X(tmp));
990        } else
991        if (key->fog_mode == D3DFOG_LINEAR) {
992            ureg_ADD(ureg, tmp_x, _XXXX(_CONST(28)), ureg_negate(_Z(tmp)));
993            ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
994        }
995        ureg_MOV(ureg, oFog, _X(tmp));
996        ureg_release_temporary(ureg, tmp);
997    } else if (key->fog && !(key->passthrough & (1 << NINE_DECLUSAGE_FOG))) {
998        ureg_MOV(ureg, oFog, ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
999    }
1000
1001    if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT)) {
1002        struct ureg_src input;
1003        struct ureg_dst output;
1004        input = vs->aWgt;
1005        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 19);
1006        ureg_MOV(ureg, output, input);
1007    }
1008    if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES)) {
1009        struct ureg_src input;
1010        struct ureg_dst output;
1011        input = vs->aInd;
1012        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 20);
1013        ureg_MOV(ureg, output, input);
1014    }
1015    if (key->passthrough & (1 << NINE_DECLUSAGE_NORMAL)) {
1016        struct ureg_src input;
1017        struct ureg_dst output;
1018        input = vs->aNrm;
1019        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 21);
1020        ureg_MOV(ureg, output, input);
1021    }
1022    if (key->passthrough & (1 << NINE_DECLUSAGE_TANGENT)) {
1023        struct ureg_src input;
1024        struct ureg_dst output;
1025        input = build_vs_add_input(vs, NINE_DECLUSAGE_TANGENT);
1026        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 22);
1027        ureg_MOV(ureg, output, input);
1028    }
1029    if (key->passthrough & (1 << NINE_DECLUSAGE_BINORMAL)) {
1030        struct ureg_src input;
1031        struct ureg_dst output;
1032        input = build_vs_add_input(vs, NINE_DECLUSAGE_BINORMAL);
1033        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 23);
1034        ureg_MOV(ureg, output, input);
1035    }
1036    if (key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
1037        struct ureg_src input;
1038        struct ureg_dst output;
1039        input = build_vs_add_input(vs, NINE_DECLUSAGE_FOG);
1040        input = ureg_scalar(input, TGSI_SWIZZLE_X);
1041        output = oFog;
1042        ureg_MOV(ureg, output, input);
1043    }
1044    if (key->passthrough & (1 << NINE_DECLUSAGE_DEPTH)) {
1045        (void) 0; /* TODO: replace z of position output ? */
1046    }
1047
1048    /* ucp for ff applies on world coordinates.
1049     * aVtx is in worldview coordinates. */
1050    if (key->ucp) {
1051        struct ureg_dst clipVect = ureg_DECL_output(ureg, TGSI_SEMANTIC_CLIPVERTEX, 0);
1052        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1053        ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(12));
1054        ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(13),  ureg_src(tmp));
1055        ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(14), ureg_src(tmp));
1056        ureg_ADD(ureg, clipVect, _CONST(15), ureg_src(tmp));
1057        ureg_release_temporary(ureg, tmp);
1058    }
1059
1060    if (key->position_t && device->driver_caps.window_space_position_support)
1061        ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
1062
1063    ureg_END(ureg);
1064    nine_ureg_tgsi_dump(ureg, FALSE);
1065    return nine_create_shader_with_so_and_destroy(ureg, device->context.pipe, NULL);
1066}
1067
1068/* PS FF constants layout:
1069 *
1070 * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
1071 * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
1072 * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
1073 * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
1074 * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
1075 * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
1076 * CONST[17..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
1077 *
1078 * CONST[20] D3DRS_TEXTUREFACTOR
1079 * CONST[21] D3DRS_FOGCOLOR
1080 * CONST[22].x___ RS.FogEnd
1081 * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
1082 * CONST[22].__z_ RS.FogDensity
1083 */
1084struct ps_build_ctx
1085{
1086    struct ureg_program *ureg;
1087
1088    struct ureg_src vC[2]; /* DIFFUSE, SPECULAR */
1089    struct ureg_src vT[8]; /* TEXCOORD[i] */
1090    struct ureg_dst rCur; /* D3DTA_CURRENT */
1091    struct ureg_dst rMod;
1092    struct ureg_src rCurSrc;
1093    struct ureg_dst rTmp; /* D3DTA_TEMP */
1094    struct ureg_src rTmpSrc;
1095    struct ureg_dst rTex;
1096    struct ureg_src rTexSrc;
1097    struct ureg_src cBEM[8];
1098    struct ureg_src s[8];
1099
1100    struct {
1101        unsigned index;
1102        unsigned index_pre_mod;
1103    } stage;
1104};
1105
1106static struct ureg_src
1107ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
1108{
1109    struct ureg_src reg;
1110
1111    switch (ta & D3DTA_SELECTMASK) {
1112    case D3DTA_CONSTANT:
1113        reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
1114        break;
1115    case D3DTA_CURRENT:
1116        reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
1117        break;
1118    case D3DTA_DIFFUSE:
1119        reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
1120        break;
1121    case D3DTA_SPECULAR:
1122        reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
1123        break;
1124    case D3DTA_TEMP:
1125        reg = ps->rTmpSrc;
1126        break;
1127    case D3DTA_TEXTURE:
1128        reg = ps->rTexSrc;
1129        break;
1130    case D3DTA_TFACTOR:
1131        reg = ureg_DECL_constant(ps->ureg, 20);
1132        break;
1133    default:
1134        assert(0);
1135        reg = ureg_src_undef();
1136        break;
1137    }
1138    if (ta & D3DTA_COMPLEMENT) {
1139        struct ureg_dst dst = ureg_DECL_temporary(ps->ureg);
1140        ureg_ADD(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), ureg_negate(reg));
1141        reg = ureg_src(dst);
1142    }
1143    if (ta & D3DTA_ALPHAREPLICATE)
1144        reg = _WWWW(reg);
1145    return reg;
1146}
1147
1148static struct ureg_dst
1149ps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
1150{
1151    assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));
1152
1153    switch (ta & D3DTA_SELECTMASK) {
1154    case D3DTA_CURRENT:
1155        return ps->rCur;
1156    case D3DTA_TEMP:
1157        return ps->rTmp;
1158    default:
1159        assert(0);
1160        return ureg_dst_undef();
1161    }
1162}
1163
1164static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
1165{
1166    switch (top) {
1167    case D3DTOP_DISABLE:
1168        return 0x0;
1169    case D3DTOP_SELECTARG1:
1170    case D3DTOP_PREMODULATE:
1171        return 0x2;
1172    case D3DTOP_SELECTARG2:
1173        return 0x4;
1174    case D3DTOP_MULTIPLYADD:
1175    case D3DTOP_LERP:
1176        return 0x7;
1177    default:
1178        return 0x6;
1179    }
1180}
1181
1182static inline boolean
1183is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
1184{
1185    return !dst.WriteMask ||
1186        (dst.File == src.File &&
1187         dst.Index == src.Index &&
1188         !dst.Indirect &&
1189         !dst.Saturate &&
1190         !src.Indirect &&
1191         !src.Negate &&
1192         !src.Absolute &&
1193         (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
1194         (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
1195         (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
1196         (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));
1197
1198}
1199
1200static void
1201ps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
1202{
1203    struct ureg_program *ureg = ps->ureg;
1204    struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1205    struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
1206    struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
1207
1208    tmp.WriteMask = dst.WriteMask;
1209
1210    if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
1211        top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
1212        top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
1213        top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
1214        top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
1215        top != D3DTOP_LERP)
1216        dst = ureg_saturate(dst);
1217
1218    switch (top) {
1219    case D3DTOP_SELECTARG1:
1220        if (!is_MOV_no_op(dst, arg[1]))
1221            ureg_MOV(ureg, dst, arg[1]);
1222        break;
1223    case D3DTOP_SELECTARG2:
1224        if (!is_MOV_no_op(dst, arg[2]))
1225            ureg_MOV(ureg, dst, arg[2]);
1226        break;
1227    case D3DTOP_MODULATE:
1228        ureg_MUL(ureg, dst, arg[1], arg[2]);
1229        break;
1230    case D3DTOP_MODULATE2X:
1231        ureg_MUL(ureg, tmp, arg[1], arg[2]);
1232        ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
1233        break;
1234    case D3DTOP_MODULATE4X:
1235        ureg_MUL(ureg, tmp, arg[1], arg[2]);
1236        ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
1237        break;
1238    case D3DTOP_ADD:
1239        ureg_ADD(ureg, dst, arg[1], arg[2]);
1240        break;
1241    case D3DTOP_ADDSIGNED:
1242        ureg_ADD(ureg, tmp, arg[1], arg[2]);
1243        ureg_ADD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, -0.5f));
1244        break;
1245    case D3DTOP_ADDSIGNED2X:
1246        ureg_ADD(ureg, tmp, arg[1], arg[2]);
1247        ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
1248        break;
1249    case D3DTOP_SUBTRACT:
1250        ureg_ADD(ureg, dst, arg[1], ureg_negate(arg[2]));
1251        break;
1252    case D3DTOP_ADDSMOOTH:
1253        ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
1254        ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
1255        break;
1256    case D3DTOP_BLENDDIFFUSEALPHA:
1257        ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
1258        break;
1259    case D3DTOP_BLENDTEXTUREALPHA:
1260        /* XXX: alpha taken from previous stage, texture or result ? */
1261        ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
1262        break;
1263    case D3DTOP_BLENDFACTORALPHA:
1264        ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
1265        break;
1266    case D3DTOP_BLENDTEXTUREALPHAPM:
1267        ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_W(ps->rTex)));
1268        ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
1269        break;
1270    case D3DTOP_BLENDCURRENTALPHA:
1271        ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
1272        break;
1273    case D3DTOP_PREMODULATE:
1274        ureg_MOV(ureg, dst, arg[1]);
1275        ps->stage.index_pre_mod = ps->stage.index + 1;
1276        break;
1277    case D3DTOP_MODULATEALPHA_ADDCOLOR:
1278        ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
1279        break;
1280    case D3DTOP_MODULATECOLOR_ADDALPHA:
1281        ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
1282        break;
1283    case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
1284        ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_WWWW(arg[1])));
1285        ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
1286        break;
1287    case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
1288        ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
1289        ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
1290        break;
1291    case D3DTOP_BUMPENVMAP:
1292        break;
1293    case D3DTOP_BUMPENVMAPLUMINANCE:
1294        break;
1295    case D3DTOP_DOTPRODUCT3:
1296        ureg_ADD(ureg, tmp, arg[1], ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
1297        ureg_ADD(ureg, tmp2, arg[2] , ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
1298        ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
1299        ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
1300        break;
1301    case D3DTOP_MULTIPLYADD:
1302        ureg_MAD(ureg, dst, arg[1], arg[2], arg[0]);
1303        break;
1304    case D3DTOP_LERP:
1305        ureg_LRP(ureg, dst, arg[0], arg[1], arg[2]);
1306        break;
1307    case D3DTOP_DISABLE:
1308        /* no-op ? */
1309        break;
1310    default:
1311        assert(!"invalid D3DTOP");
1312        break;
1313    }
1314    ureg_release_temporary(ureg, tmp);
1315    ureg_release_temporary(ureg, tmp2);
1316}
1317
1318static void *
1319nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
1320{
1321    struct ps_build_ctx ps;
1322    struct ureg_program *ureg = ureg_create(PIPE_SHADER_FRAGMENT);
1323    struct ureg_dst oCol;
1324    unsigned s;
1325    const unsigned texcoord_sn = get_texcoord_sn(device->screen);
1326
1327    memset(&ps, 0, sizeof(ps));
1328    ps.ureg = ureg;
1329    ps.stage.index_pre_mod = -1;
1330
1331    ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
1332
1333    ps.rCur = ureg_DECL_temporary(ureg);
1334    ps.rTmp = ureg_DECL_temporary(ureg);
1335    ps.rTex = ureg_DECL_temporary(ureg);
1336    ps.rCurSrc = ureg_src(ps.rCur);
1337    ps.rTmpSrc = ureg_src(ps.rTmp);
1338    ps.rTexSrc = ureg_src(ps.rTex);
1339
1340    /* Initial values */
1341    ureg_MOV(ureg, ps.rCur, ps.vC[0]);
1342    ureg_MOV(ureg, ps.rTmp, ureg_imm1f(ureg, 0.0f));
1343    ureg_MOV(ureg, ps.rTex, ureg_imm1f(ureg, 0.0f));
1344
1345    for (s = 0; s < 8; ++s) {
1346        ps.s[s] = ureg_src_undef();
1347
1348        if (key->ts[s].colorop != D3DTOP_DISABLE) {
1349            if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
1350                key->ts[s].colorarg1 == D3DTA_SPECULAR ||
1351                key->ts[s].colorarg2 == D3DTA_SPECULAR)
1352                ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
1353
1354            if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
1355                key->ts[s].colorarg1 == D3DTA_TEXTURE ||
1356                key->ts[s].colorarg2 == D3DTA_TEXTURE ||
1357                key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHA ||
1358                key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHAPM) {
1359                ps.s[s] = ureg_DECL_sampler(ureg, s);
1360                ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
1361            }
1362            if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
1363                      key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
1364                ps.s[s] = ureg_DECL_sampler(ureg, s);
1365        }
1366
1367        if (key->ts[s].alphaop != D3DTOP_DISABLE) {
1368            if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
1369                key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
1370                key->ts[s].alphaarg2 == D3DTA_SPECULAR)
1371                ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
1372
1373            if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
1374                key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
1375                key->ts[s].alphaarg2 == D3DTA_TEXTURE ||
1376                key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHA ||
1377                key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHAPM) {
1378                ps.s[s] = ureg_DECL_sampler(ureg, s);
1379                ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
1380            }
1381        }
1382    }
1383    if (key->specular)
1384        ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
1385
1386    oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
1387
1388    /* Run stages.
1389     */
1390    for (s = 0; s < 8; ++s) {
1391        unsigned colorarg[3];
1392        unsigned alphaarg[3];
1393        const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
1394        const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
1395        struct ureg_dst dst;
1396        struct ureg_src arg[3];
1397
1398        if (key->ts[s].colorop == D3DTOP_DISABLE) {
1399            assert (key->ts[s].alphaop == D3DTOP_DISABLE);
1400            continue;
1401        }
1402        ps.stage.index = s;
1403
1404        DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
1405            nine_D3DTOP_to_str(key->ts[s].colorop),
1406            nine_D3DTOP_to_str(key->ts[s].alphaop));
1407
1408        if (!ureg_src_is_undef(ps.s[s])) {
1409            unsigned target;
1410            struct ureg_src texture_coord = ps.vT[s];
1411            struct ureg_dst delta;
1412            switch (key->ts[s].textarget) {
1413            case 0: target = TGSI_TEXTURE_1D; break;
1414            case 1: target = TGSI_TEXTURE_2D; break;
1415            case 2: target = TGSI_TEXTURE_3D; break;
1416            case 3: target = TGSI_TEXTURE_CUBE; break;
1417            /* this is a 2 bit bitfield, do I really need a default case ? */
1418            }
1419
1420            /* Modify coordinates */
1421            if (s >= 1 &&
1422                (key->ts[s-1].colorop == D3DTOP_BUMPENVMAP ||
1423                 key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)) {
1424                delta = ureg_DECL_temporary(ureg);
1425                /* Du' = D3DTSS_BUMPENVMAT00(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT10(stage s-1)*t(s-1)G */
1426                ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _X(ps.rTex), _XXXX(_CONST(8 + s - 1)));
1427                ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _Y(ps.rTex), _ZZZZ(_CONST(8 + s - 1)), ureg_src(delta));
1428                /* Dv' = D3DTSS_BUMPENVMAT01(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT11(stage s-1)*t(s-1)G */
1429                ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _X(ps.rTex), _YYYY(_CONST(8 + s - 1)));
1430                ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _Y(ps.rTex), _WWWW(_CONST(8 + s - 1)), ureg_src(delta));
1431                texture_coord = ureg_src(ureg_DECL_temporary(ureg));
1432                ureg_MOV(ureg, ureg_writemask(ureg_dst(texture_coord), ureg_dst(ps.vT[s]).WriteMask), ps.vT[s]);
1433                ureg_ADD(ureg, ureg_writemask(ureg_dst(texture_coord), TGSI_WRITEMASK_XY), texture_coord, ureg_src(delta));
1434                /* Prepare luminance multiplier
1435                 * t(s)RGBA = t(s)RGBA * clamp[(t(s-1)B * D3DTSS_BUMPENVLSCALE(stage s-1)) + D3DTSS_BUMPENVLOFFSET(stage s-1)] */
1436                if (key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
1437                    struct ureg_src bumpenvlscale = ((s-1) & 1) ? _ZZZZ(_CONST(16 + (s-1) / 2)) : _XXXX(_CONST(16 + (s-1) / 2));
1438                    struct ureg_src bumpenvloffset = ((s-1) & 1) ? _WWWW(_CONST(16 + (s-1) / 2)) : _YYYY(_CONST(16 + (s-1) / 2));
1439
1440                    ureg_MAD(ureg, ureg_saturate(ureg_writemask(delta, TGSI_WRITEMASK_X)), _Z(ps.rTex), bumpenvlscale, bumpenvloffset);
1441                }
1442            }
1443            if (key->projected & (3 << (s *2))) {
1444                unsigned dim = 1 + ((key->projected >> (2 * s)) & 3);
1445                if (dim == 4)
1446                    ureg_TXP(ureg, ps.rTex, target, texture_coord, ps.s[s]);
1447                else {
1448                    struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1449                    ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(texture_coord, dim-1));
1450                    ureg_MUL(ureg, ps.rTmp, _X(tmp), texture_coord);
1451                    ureg_TEX(ureg, ps.rTex, target, ps.rTmpSrc, ps.s[s]);
1452                    ureg_release_temporary(ureg, tmp);
1453                }
1454            } else {
1455                ureg_TEX(ureg, ps.rTex, target, texture_coord, ps.s[s]);
1456            }
1457            if (s >= 1 && key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
1458                ureg_MUL(ureg, ps.rTex, ureg_src(ps.rTex), _X(delta));
1459        }
1460
1461        if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
1462            key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
1463            continue;
1464
1465        dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);
1466
1467        if (ps.stage.index_pre_mod == ps.stage.index) {
1468            ps.rMod = ureg_DECL_temporary(ureg);
1469            ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
1470        }
1471
1472        colorarg[0] = (key->ts[s].colorarg0 | (((key->colorarg_b4[0] >> s) & 0x1) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
1473        colorarg[1] = (key->ts[s].colorarg1 | (((key->colorarg_b4[1] >> s) & 0x1) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
1474        colorarg[2] = (key->ts[s].colorarg2 | (((key->colorarg_b4[2] >> s) & 0x1) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
1475        alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
1476        alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
1477        alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;
1478
1479        if (key->ts[s].colorop != key->ts[s].alphaop ||
1480            colorarg[0] != alphaarg[0] ||
1481            colorarg[1] != alphaarg[1] ||
1482            colorarg[2] != alphaarg[2])
1483            dst.WriteMask = TGSI_WRITEMASK_XYZ;
1484
1485        /* Special DOTPRODUCT behaviour (see wine tests) */
1486        if (key->ts[s].colorop == D3DTOP_DOTPRODUCT3)
1487            dst.WriteMask = TGSI_WRITEMASK_XYZW;
1488
1489        if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
1490        if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
1491        if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
1492        ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);
1493
1494        if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
1495            dst.WriteMask = TGSI_WRITEMASK_W;
1496
1497            if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
1498            if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
1499            if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
1500            ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
1501        }
1502    }
1503
1504    if (key->specular)
1505        ureg_ADD(ureg, ureg_writemask(ps.rCur, TGSI_WRITEMASK_XYZ), ps.rCurSrc, ps.vC[1]);
1506
1507    /* Fog.
1508     */
1509    if (key->fog_mode) {
1510        struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
1511        struct ureg_src vPos;
1512        if (device->screen->get_param(device->screen,
1513                                      PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
1514            vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
1515        } else {
1516            vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
1517                                      TGSI_INTERPOLATE_LINEAR);
1518        }
1519
1520        /* Source is either W or Z.
1521         * When we use vs ff,
1522         * Z is when an orthogonal projection matrix is detected,
1523         * W (WFOG) else.
1524         * Z is used for programmable vs.
1525         * Note: Tests indicate that the projection matrix coefficients do
1526         * actually affect pixel fog (and not vertex fog) when vs ff is used,
1527         * which justifies taking the position's w instead of taking the z coordinate
1528         * before the projection in the vs shader.
1529         */
1530        if (!key->fog_source)
1531            ureg_MOV(ureg, rFog, _ZZZZ(vPos));
1532        else
1533            /* Position's w is 1/w */
1534            ureg_RCP(ureg, rFog, _WWWW(vPos));
1535
1536        if (key->fog_mode == D3DFOG_EXP) {
1537            ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
1538            ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
1539            ureg_EX2(ureg, rFog, _X(rFog));
1540        } else
1541        if (key->fog_mode == D3DFOG_EXP2) {
1542            ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
1543            ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
1544            ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
1545            ureg_EX2(ureg, rFog, _X(rFog));
1546        } else
1547        if (key->fog_mode == D3DFOG_LINEAR) {
1548            ureg_ADD(ureg, rFog, _XXXX(_CONST(22)), ureg_negate(_X(rFog)));
1549            ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
1550        }
1551        ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
1552        ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
1553    } else
1554    if (key->fog) {
1555        struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 16, TGSI_INTERPOLATE_PERSPECTIVE);
1556        ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
1557        ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
1558    } else {
1559        ureg_MOV(ureg, oCol, ps.rCurSrc);
1560    }
1561
1562    ureg_END(ureg);
1563    nine_ureg_tgsi_dump(ureg, FALSE);
1564    return nine_create_shader_with_so_and_destroy(ureg, device->context.pipe, NULL);
1565}
1566
1567static struct NineVertexShader9 *
1568nine_ff_get_vs(struct NineDevice9 *device)
1569{
1570    const struct nine_context *context = &device->context;
1571    struct NineVertexShader9 *vs;
1572    struct vs_build_ctx bld;
1573    struct nine_ff_vs_key key;
1574    unsigned s, i;
1575    boolean has_indexes = false;
1576    boolean has_weights = false;
1577    char input_texture_coord[8];
1578
1579    assert(sizeof(key) <= sizeof(key.value32));
1580
1581    memset(&key, 0, sizeof(key));
1582    memset(&bld, 0, sizeof(bld));
1583    memset(&input_texture_coord, 0, sizeof(input_texture_coord));
1584
1585    bld.key = &key;
1586
1587    /* FIXME: this shouldn't be NULL, but it is on init */
1588    if (context->vdecl) {
1589        key.color0in_one = 1;
1590        key.color1in_zero = 1;
1591        for (i = 0; i < context->vdecl->nelems; i++) {
1592            uint16_t usage = context->vdecl->usage_map[i];
1593            if (usage == NINE_DECLUSAGE_POSITIONT)
1594                key.position_t = 1;
1595            else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
1596                key.color0in_one = 0;
1597            else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
1598                key.color1in_zero = 0;
1599            else if (usage == NINE_DECLUSAGE_i(BLENDINDICES, 0)) {
1600                has_indexes = true;
1601                key.passthrough |= 1 << usage;
1602            } else if (usage == NINE_DECLUSAGE_i(BLENDWEIGHT, 0)) {
1603                has_weights = true;
1604                key.passthrough |= 1 << usage;
1605            } else if (usage == NINE_DECLUSAGE_i(NORMAL, 0)) {
1606                key.has_normal = 1;
1607                key.passthrough |= 1 << usage;
1608            } else if (usage == NINE_DECLUSAGE_PSIZE)
1609                key.vertexpointsize = 1;
1610            else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
1611                s = usage / NINE_DECLUSAGE_COUNT;
1612                if (s < 8)
1613                    input_texture_coord[s] = nine_decltype_get_dim(context->vdecl->decls[i].Type);
1614                else
1615                    DBG("FF given texture coordinate >= 8. Ignoring\n");
1616            } else if (usage < NINE_DECLUSAGE_NONE)
1617                key.passthrough |= 1 << usage;
1618        }
1619    }
1620    /* ff vs + ps 3.0: some elements are passed to the ps (wine test).
1621     * We do restrict to indices 0 */
1622    key.passthrough &= ~((1 << NINE_DECLUSAGE_POSITION) | (1 << NINE_DECLUSAGE_PSIZE) |
1623                         (1 << NINE_DECLUSAGE_TEXCOORD) | (1 << NINE_DECLUSAGE_POSITIONT) |
1624                         (1 << NINE_DECLUSAGE_TESSFACTOR) | (1 << NINE_DECLUSAGE_SAMPLE));
1625    if (!key.position_t)
1626        key.passthrough = 0;
1627    key.pointscale = !!context->rs[D3DRS_POINTSCALEENABLE];
1628
1629    key.lighting = !!context->rs[D3DRS_LIGHTING] &&  context->ff.num_lights_active;
1630    key.darkness = !!context->rs[D3DRS_LIGHTING] && !context->ff.num_lights_active;
1631    if (key.position_t) {
1632        key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
1633        key.lighting = 0;
1634    }
1635    if ((key.lighting | key.darkness) && context->rs[D3DRS_COLORVERTEX]) {
1636        uint32_t mask = (key.color0in_one ? 0 : 1) | (key.color1in_zero ? 0 : 2);
1637        key.mtl_diffuse = context->rs[D3DRS_DIFFUSEMATERIALSOURCE] & mask;
1638        key.mtl_ambient = context->rs[D3DRS_AMBIENTMATERIALSOURCE] & mask;
1639        key.mtl_specular = context->rs[D3DRS_SPECULARMATERIALSOURCE] & mask;
1640        key.mtl_emissive = context->rs[D3DRS_EMISSIVEMATERIALSOURCE] & mask;
1641    }
1642    key.fog = !!context->rs[D3DRS_FOGENABLE];
1643    key.fog_mode = (!key.position_t && context->rs[D3DRS_FOGENABLE]) ? context->rs[D3DRS_FOGVERTEXMODE] : 0;
1644    if (key.fog_mode)
1645        key.fog_range = context->rs[D3DRS_RANGEFOGENABLE];
1646
1647    key.localviewer = !!context->rs[D3DRS_LOCALVIEWER];
1648    key.normalizenormals = !!context->rs[D3DRS_NORMALIZENORMALS];
1649    key.ucp = !!context->rs[D3DRS_CLIPPLANEENABLE];
1650
1651    if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
1652        key.vertexblend_indexed = !!context->rs[D3DRS_INDEXEDVERTEXBLENDENABLE] && has_indexes;
1653
1654        switch (context->rs[D3DRS_VERTEXBLEND]) {
1655        case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
1656        case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
1657        case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
1658        case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
1659        case D3DVBF_TWEENING: key.vertextween = 1; break;
1660        default:
1661            assert(!"invalid D3DVBF");
1662            break;
1663        }
1664        if (!has_weights && context->rs[D3DRS_VERTEXBLEND] != D3DVBF_0WEIGHTS)
1665            key.vertexblend = 0; /* TODO: if key.vertexblend_indexed, perhaps it should use 1.0 as weight, or revert to D3DVBF_0WEIGHTS */
1666    }
1667
1668    for (s = 0; s < 8; ++s) {
1669        unsigned gen = (context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
1670        unsigned idx = context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7;
1671        unsigned dim;
1672
1673        if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
1674            gen = NINED3DTSS_TCI_PASSTHRU;
1675
1676        if (!input_texture_coord[idx] && gen == NINED3DTSS_TCI_PASSTHRU)
1677            gen = NINED3DTSS_TCI_DISABLE;
1678
1679        key.tc_gen |= gen << (s * 3);
1680        key.tc_idx |= idx << (s * 3);
1681        key.tc_dim_input |= ((input_texture_coord[idx]-1) & 0x3) << (s * 2);
1682
1683        dim = context->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
1684        if (dim > 4)
1685            dim = input_texture_coord[idx];
1686        if (dim == 1) /* NV behaviour */
1687            dim = 0;
1688        key.tc_dim_output |= dim << (s * 3);
1689    }
1690
1691    DBG("VS ff key hash: %x\n", nine_ff_vs_key_hash(&key));
1692    vs = util_hash_table_get(device->ff.ht_vs, &key);
1693    if (vs)
1694        return vs;
1695    NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));
1696
1697    nine_ff_prune_vs(device);
1698    if (vs) {
1699        unsigned n;
1700
1701        memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
1702
1703        _mesa_hash_table_insert(device->ff.ht_vs, &vs->ff_key, vs);
1704        device->ff.num_vs++;
1705
1706        vs->num_inputs = bld.num_inputs;
1707        for (n = 0; n < bld.num_inputs; ++n)
1708            vs->input_map[n].ndecl = bld.input[n];
1709
1710        vs->position_t = key.position_t;
1711        vs->point_size = key.vertexpointsize | key.pointscale;
1712    }
1713    return vs;
1714}
1715
/* Fetch the matrix for transform state D3DTS_<n> through
 * nine_state_access_transform(), passing FALSE as its last argument.
 * Expects a variable named `context` to be in scope at the point of use. */
#define GET_D3DTS(n) nine_state_access_transform(&context->ff, D3DTS_##n, FALSE)
/* Non-zero when the bit for transform state D3DTS_<n> is set in
 * (s)->ff.changed.transform[], which is indexed as a bitset of 32-bit words.
 * The mask is built from an unsigned 1 so that selecting bit 31 does not
 * shift into the sign bit of an int (undefined behaviour in C). */
#define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1u << ((D3DTS_##n) % 32)))
1718
1719static struct NinePixelShader9 *
1720nine_ff_get_ps(struct NineDevice9 *device)
1721{
1722    struct nine_context *context = &device->context;
1723    D3DMATRIX *projection_matrix = GET_D3DTS(PROJECTION);
1724    struct NinePixelShader9 *ps;
1725    struct nine_ff_ps_key key;
1726    unsigned s;
1727    uint8_t sampler_mask = 0;
1728
1729    assert(sizeof(key) <= sizeof(key.value32));
1730
1731    memset(&key, 0, sizeof(key));
1732    for (s = 0; s < 8; ++s) {
1733        key.ts[s].colorop = context->ff.tex_stage[s][D3DTSS_COLOROP];
1734        key.ts[s].alphaop = context->ff.tex_stage[s][D3DTSS_ALPHAOP];
1735        const uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
1736        const uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
1737        /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages.
1738         * ALPHAOP cannot be enabled if COLOROP is disabled.
1739         * Verified on Windows. */
1740        if (key.ts[s].colorop == D3DTOP_DISABLE) {
1741            key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
1742            break;
1743        }
1744
1745        if (!context->texture[s].enabled &&
1746            ((context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE &&
1747              used_c & 0x1) ||
1748             (context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE &&
1749              used_c & 0x2) ||
1750             (context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE &&
1751              used_c & 0x4))) {
1752            /* Tested on Windows: Invalid texture read disables the stage
1753             * and the subsequent ones, but only for colorop. For alpha,
1754             * it's as if the texture had alpha of 1.0, which is what
1755             * has our dummy texture in that case. Invalid color also
1756             * disabled the following alpha stages. */
1757            key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
1758            break;
1759        }
1760
1761        if (context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE ||
1762            context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE ||
1763            context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE ||
1764            context->ff.tex_stage[s][D3DTSS_ALPHAARG0] == D3DTA_TEXTURE ||
1765            context->ff.tex_stage[s][D3DTSS_ALPHAARG1] == D3DTA_TEXTURE ||
1766            context->ff.tex_stage[s][D3DTSS_ALPHAARG2] == D3DTA_TEXTURE)
1767            sampler_mask |= (1 << s);
1768
1769        if (key.ts[s].colorop != D3DTOP_DISABLE) {
1770            if (used_c & 0x1) key.ts[s].colorarg0 = context->ff.tex_stage[s][D3DTSS_COLORARG0] & 0x7;
1771            if (used_c & 0x2) key.ts[s].colorarg1 = context->ff.tex_stage[s][D3DTSS_COLORARG1] & 0x7;
1772            if (used_c & 0x4) key.ts[s].colorarg2 = context->ff.tex_stage[s][D3DTSS_COLORARG2] & 0x7;
1773            if (used_c & 0x1) key.colorarg_b4[0] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) & 0x1) << s;
1774            if (used_c & 0x1) key.colorarg_b5[0] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) & 0x1) << s;
1775            if (used_c & 0x2) key.colorarg_b4[1] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) & 0x1) << s;
1776            if (used_c & 0x2) key.colorarg_b5[1] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) & 0x1) << s;
1777            if (used_c & 0x4) key.colorarg_b4[2] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) & 0x1) << s;
1778            if (used_c & 0x4) key.colorarg_b5[2] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) & 0x1) << s;
1779        }
1780        if (key.ts[s].alphaop != D3DTOP_DISABLE) {
1781            if (used_a & 0x1) key.ts[s].alphaarg0 = context->ff.tex_stage[s][D3DTSS_ALPHAARG0] & 0x7;
1782            if (used_a & 0x2) key.ts[s].alphaarg1 = context->ff.tex_stage[s][D3DTSS_ALPHAARG1] & 0x7;
1783            if (used_a & 0x4) key.ts[s].alphaarg2 = context->ff.tex_stage[s][D3DTSS_ALPHAARG2] & 0x7;
1784            if (used_a & 0x1) key.alphaarg_b4[0] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) & 0x1) << s;
1785            if (used_a & 0x2) key.alphaarg_b4[1] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) & 0x1) << s;
1786            if (used_a & 0x4) key.alphaarg_b4[2] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) & 0x1) << s;
1787        }
1788        key.ts[s].resultarg = context->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
1789
1790        if (context->texture[s].enabled) {
1791            switch (context->texture[s].type) {
1792            case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
1793            case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
1794            case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
1795            default:
1796                assert(!"unexpected texture type");
1797                break;
1798            }
1799        } else {
1800            key.ts[s].textarget = 1;
1801        }
1802    }
1803
1804    /* Note: If colorop is D3DTOP_DISABLE for the first stage
1805     * (which implies alphaop is too), nothing particular happens,
1806     * that is, current is equal to diffuse (which is the case anyway,
1807     * because it is how it is initialized).
1808     * Special case seems if alphaop is D3DTOP_DISABLE and not colorop,
1809     * because then if the resultarg is TEMP, then diffuse alpha is written
1810     * to it. */
1811    if (key.ts[0].colorop != D3DTOP_DISABLE &&
1812        key.ts[0].alphaop == D3DTOP_DISABLE &&
1813        key.ts[0].resultarg != 0) {
1814        key.ts[0].alphaop = D3DTOP_SELECTARG1;
1815        key.ts[0].alphaarg1 = D3DTA_DIFFUSE;
1816    }
1817    /* When no alpha stage writes to current, diffuse alpha is taken.
1818     * Since we initialize current to diffuse, we have the behaviour. */
1819
1820    /* Last stage always writes to Current */
1821    if (s >= 1)
1822        key.ts[s-1].resultarg = 0;
1823
1824    key.projected = nine_ff_get_projected_key_ff(context);
1825    key.specular = !!context->rs[D3DRS_SPECULARENABLE];
1826
1827    for (; s < 8; ++s)
1828        key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
1829    if (context->rs[D3DRS_FOGENABLE])
1830        key.fog_mode = context->rs[D3DRS_FOGTABLEMODE];
1831    key.fog = !!context->rs[D3DRS_FOGENABLE];
1832    /* Pixel fog (with WFOG advertised): source is either Z or W.
1833     * W is the source if vs ff is used, and the
1834     * projection matrix is not orthogonal.
1835     * Tests on Win 10 seem to indicate _34
1836     * and _33 are checked against 0, 1. */
1837    if (key.fog_mode && key.fog)
1838        key.fog_source = !context->programmable_vs &&
1839            !(projection_matrix->_34 == 0.0f &&
1840              projection_matrix->_44 == 1.0f);
1841
1842    DBG("PS ff key hash: %x\n", nine_ff_ps_key_hash(&key));
1843    ps = util_hash_table_get(device->ff.ht_ps, &key);
1844    if (ps)
1845        return ps;
1846    NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));
1847
1848    nine_ff_prune_ps(device);
1849    if (ps) {
1850        memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
1851
1852        _mesa_hash_table_insert(device->ff.ht_ps, &ps->ff_key, ps);
1853        device->ff.num_ps++;
1854
1855        ps->rt_mask = 0x1;
1856        ps->sampler_mask = sampler_mask;
1857    }
1858    return ps;
1859}
1860
1861static void
1862nine_ff_load_vs_transforms(struct NineDevice9 *device)
1863{
1864    struct nine_context *context = &device->context;
1865    D3DMATRIX T;
1866    D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
1867    unsigned i;
1868
1869    /* TODO: make this nicer, and only upload the ones we need */
1870    /* TODO: use ff.vs_const as storage of W, V, P matrices */
1871
1872    if (IS_D3DTS_DIRTY(context, WORLD) ||
1873        IS_D3DTS_DIRTY(context, VIEW) ||
1874        IS_D3DTS_DIRTY(context, PROJECTION)) {
1875        /* WVP, WV matrices */
1876        nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
1877        nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));
1878
1879        /* normal matrix == transpose(inverse(WV)) */
1880        nine_d3d_matrix_inverse(&T, &M[1]);
1881        nine_d3d_matrix_transpose(&M[4], &T);
1882
1883        /* P matrix */
1884        M[2] = *GET_D3DTS(PROJECTION);
1885
1886        /* V and W matrix */
1887        nine_d3d_matrix_inverse(&M[3], GET_D3DTS(VIEW));
1888        M[40] = M[1];
1889    }
1890
1891    if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
1892        /* load other world matrices */
1893        for (i = 1; i <= 8; ++i) {
1894            nine_d3d_matrix_matrix_mul(&M[40 + i], GET_D3DTS(WORLDMATRIX(i)), GET_D3DTS(VIEW));
1895        }
1896    }
1897
1898    device->ff.vs_const[30 * 4] = asfloat(context->rs[D3DRS_TWEENFACTOR]);
1899}
1900
1901static void
1902nine_ff_load_lights(struct NineDevice9 *device)
1903{
1904    struct nine_context *context = &device->context;
1905    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
1906    unsigned l;
1907
1908    if (context->changed.group & NINE_STATE_FF_MATERIAL) {
1909        const D3DMATERIAL9 *mtl = &context->ff.material;
1910
1911        memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
1912        memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
1913        memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
1914        dst[23].x = mtl->Power;
1915        memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
1916        d3dcolor_to_rgba(&dst[25].x, context->rs[D3DRS_AMBIENT]);
1917        dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
1918        dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
1919        dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
1920    }
1921
1922    if (!(context->changed.group & NINE_STATE_FF_LIGHTING))
1923        return;
1924
1925    for (l = 0; l < context->ff.num_lights_active; ++l) {
1926        const D3DLIGHT9 *light = &context->ff.light[context->ff.active_light[l]];
1927
1928        dst[32 + l * 8].x = light->Type;
1929        dst[32 + l * 8].y = light->Attenuation0;
1930        dst[32 + l * 8].z = light->Attenuation1;
1931        dst[32 + l * 8].w = light->Attenuation2;
1932        memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
1933        memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
1934        memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
1935        nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
1936        nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
1937        dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
1938        dst[37 + l * 8].w = light->Falloff;
1939        dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
1940        dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
1941        dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
1942        dst[39 + l * 8].w = (float)((l + 1) == context->ff.num_lights_active);
1943    }
1944}
1945
1946static void
1947nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
1948{
1949    struct nine_context *context = &device->context;
1950    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
1951
1952    if (!(context->changed.group & NINE_STATE_FF_VS_OTHER))
1953        return;
1954    dst[26].x = asfloat(context->rs[D3DRS_POINTSIZE_MIN]);
1955    dst[26].y = asfloat(context->rs[D3DRS_POINTSIZE_MAX]);
1956    dst[26].z = asfloat(context->rs[D3DRS_POINTSIZE]);
1957    dst[26].w = asfloat(context->rs[D3DRS_POINTSCALE_A]);
1958    dst[27].x = asfloat(context->rs[D3DRS_POINTSCALE_B]);
1959    dst[27].y = asfloat(context->rs[D3DRS_POINTSCALE_C]);
1960    dst[28].x = asfloat(context->rs[D3DRS_FOGEND]);
1961    dst[28].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
1962    if (isinf(dst[28].y))
1963        dst[28].y = 0.0f;
1964    dst[28].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
1965}
1966
1967static void
1968nine_ff_load_tex_matrices(struct NineDevice9 *device)
1969{
1970    struct nine_context *context = &device->context;
1971    D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
1972    unsigned s;
1973
1974    if (!(context->ff.changed.transform[0] & 0xff0000))
1975        return;
1976    for (s = 0; s < 8; ++s) {
1977        if (IS_D3DTS_DIRTY(context, TEXTURE0 + s))
1978            nine_d3d_matrix_transpose(&M[32 + s], nine_state_access_transform(&context->ff, D3DTS_TEXTURE0 + s, FALSE));
1979    }
1980}
1981
1982static void
1983nine_ff_load_ps_params(struct NineDevice9 *device)
1984{
1985    struct nine_context *context = &device->context;
1986    struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
1987    unsigned s;
1988
1989    if (!(context->changed.group & NINE_STATE_FF_PS_CONSTS))
1990        return;
1991
1992    for (s = 0; s < 8; ++s)
1993        d3dcolor_to_rgba(&dst[s].x, context->ff.tex_stage[s][D3DTSS_CONSTANT]);
1994
1995    for (s = 0; s < 8; ++s) {
1996        dst[8 + s].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
1997        dst[8 + s].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
1998        dst[8 + s].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
1999        dst[8 + s].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
2000        if (s & 1) {
2001            dst[16 + s / 2].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
2002            dst[16 + s / 2].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
2003        } else {
2004            dst[16 + s / 2].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
2005            dst[16 + s / 2].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
2006        }
2007    }
2008
2009    d3dcolor_to_rgba(&dst[20].x, context->rs[D3DRS_TEXTUREFACTOR]);
2010    d3dcolor_to_rgba(&dst[21].x, context->rs[D3DRS_FOGCOLOR]);
2011    dst[22].x = asfloat(context->rs[D3DRS_FOGEND]);
2012    dst[22].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
2013    dst[22].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
2014}
2015
2016static void
2017nine_ff_load_viewport_info(struct NineDevice9 *device)
2018{
2019    D3DVIEWPORT9 *viewport = &device->context.viewport;
2020    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
2021    float diffZ = viewport->MaxZ - viewport->MinZ;
2022
2023    /* Note: the other functions avoids to fill the const again if nothing changed.
2024     * But we don't have much to fill, and adding code to allow that may be complex
2025     * so just fill it always */
2026    dst[100].x = 2.0f / (float)(viewport->Width);
2027    dst[100].y = 2.0f / (float)(viewport->Height);
2028    dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
2029    dst[100].w = (float)(viewport->Width);
2030    dst[101].x = (float)(viewport->X);
2031    dst[101].y = (float)(viewport->Y);
2032    dst[101].z = (float)(viewport->MinZ);
2033}
2034
2035void
2036nine_ff_update(struct NineDevice9 *device)
2037{
2038    struct nine_context *context = &device->context;
2039    struct pipe_constant_buffer cb;
2040
2041    DBG("vs=%p ps=%p\n", context->vs, context->ps);
2042
2043    /* NOTE: the only reference belongs to the hash table */
2044    if (!context->programmable_vs) {
2045        device->ff.vs = nine_ff_get_vs(device);
2046        context->changed.group |= NINE_STATE_VS;
2047    }
2048    if (!context->ps) {
2049        device->ff.ps = nine_ff_get_ps(device);
2050        context->changed.group |= NINE_STATE_PS;
2051    }
2052
2053    if (!context->programmable_vs) {
2054        nine_ff_load_vs_transforms(device);
2055        nine_ff_load_tex_matrices(device);
2056        nine_ff_load_lights(device);
2057        nine_ff_load_point_and_fog_params(device);
2058        nine_ff_load_viewport_info(device);
2059
2060        memset(context->ff.changed.transform, 0, sizeof(context->ff.changed.transform));
2061
2062        cb.buffer_offset = 0;
2063        cb.buffer = NULL;
2064        cb.user_buffer = device->ff.vs_const;
2065        cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
2066
2067        context->pipe_data.cb_vs_ff = cb;
2068        context->commit |= NINE_STATE_COMMIT_CONST_VS;
2069
2070        context->changed.group &= ~NINE_STATE_FF_VS;
2071    }
2072
2073    if (!context->ps) {
2074        nine_ff_load_ps_params(device);
2075
2076        cb.buffer_offset = 0;
2077        cb.buffer = NULL;
2078        cb.user_buffer = device->ff.ps_const;
2079        cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
2080
2081        context->pipe_data.cb_ps_ff = cb;
2082        context->commit |= NINE_STATE_COMMIT_CONST_PS;
2083
2084        context->changed.group &= ~NINE_STATE_FF_PS;
2085    }
2086}
2087
2088
2089boolean
2090nine_ff_init(struct NineDevice9 *device)
2091{
2092    device->ff.ht_vs = _mesa_hash_table_create(NULL, nine_ff_vs_key_hash,
2093                                               nine_ff_vs_key_comp);
2094    device->ff.ht_ps = _mesa_hash_table_create(NULL, nine_ff_ps_key_hash,
2095                                               nine_ff_ps_key_comp);
2096
2097    device->ff.ht_fvf = _mesa_hash_table_create(NULL, nine_ff_fvf_key_hash,
2098                                                nine_ff_fvf_key_comp);
2099
2100    device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
2101    device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));
2102
2103    return device->ff.ht_vs && device->ff.ht_ps &&
2104        device->ff.ht_fvf &&
2105        device->ff.vs_const && device->ff.ps_const;
2106}
2107
2108static enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
2109{
2110    NineUnknown_Unbind(NineUnknown(value));
2111    return PIPE_OK;
2112}
2113
2114void
2115nine_ff_fini(struct NineDevice9 *device)
2116{
2117    if (device->ff.ht_vs) {
2118        util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
2119        _mesa_hash_table_destroy(device->ff.ht_vs, NULL);
2120    }
2121    if (device->ff.ht_ps) {
2122        util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
2123        _mesa_hash_table_destroy(device->ff.ht_ps, NULL);
2124    }
2125    if (device->ff.ht_fvf) {
2126        util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
2127        _mesa_hash_table_destroy(device->ff.ht_fvf, NULL);
2128    }
2129    device->ff.vs = NULL; /* destroyed by unbinding from hash table */
2130    device->ff.ps = NULL;
2131
2132    FREE(device->ff.vs_const);
2133    FREE(device->ff.ps_const);
2134}
2135
2136static void
2137nine_ff_prune_vs(struct NineDevice9 *device)
2138{
2139    struct nine_context *context = &device->context;
2140
2141    if (device->ff.num_vs > 1024) {
2142        /* could destroy the bound one here, so unbind */
2143        context->pipe->bind_vs_state(context->pipe, NULL);
2144        util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
2145        _mesa_hash_table_clear(device->ff.ht_vs, NULL);
2146        device->ff.num_vs = 0;
2147        context->changed.group |= NINE_STATE_VS;
2148    }
2149}
2150static void
2151nine_ff_prune_ps(struct NineDevice9 *device)
2152{
2153    struct nine_context *context = &device->context;
2154
2155    if (device->ff.num_ps > 1024) {
2156        /* could destroy the bound one here, so unbind */
2157        context->pipe->bind_fs_state(context->pipe, NULL);
2158        util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
2159        _mesa_hash_table_clear(device->ff.ht_ps, NULL);
2160        device->ff.num_ps = 0;
2161        context->changed.group |= NINE_STATE_PS;
2162    }
2163}
2164
2165/* ========================================================================== */
2166
2167/* Matrix multiplication:
2168 *
2169 * in memory: 0 1 2 3 (row major)
2170 *            4 5 6 7
2171 *            8 9 a b
2172 *            c d e f
2173 *
2174 *    cA cB cC cD
2175 * r0             = (r0 * cA) (r0 * cB) . .
2176 * r1             = (r1 * cA) (r1 * cB)
2177 * r2             = (r2 * cA) .
2178 * r3             = (r3 * cA) .
2179 *
2180 *               r: (11) (12) (13) (14)
2181 *                  (21) (22) (23) (24)
2182 *                  (31) (32) (33) (34)
2183 *                  (41) (42) (43) (44)
2184 * l: (11 12 13 14)
2185 *    (21 22 23 24)
2186 *    (31 32 33 34)
2187 *    (41 42 43 44)
2188 *
2189 * v: (x  y  z  1 )
2190 *
2191 * t.xyzw = MUL(v.xxxx, r[0]);
2192 * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
2193 * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
2194 * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
2195 *
2196 * v.x = DP4(v, c[0]);
2197 * v.y = DP4(v, c[1]);
2198 * v.z = DP4(v, c[2]);
2199 * v.w = DP4(v, c[3]) = 1
2200 */
2201
2202/*
2203static void
2204nine_D3DMATRIX_print(const D3DMATRIX *M)
2205{
2206    DBG("\n(%f %f %f %f)\n"
2207        "(%f %f %f %f)\n"
2208        "(%f %f %f %f)\n"
2209        "(%f %f %f %f)\n",
2210        M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
2211        M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
2212        M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
2213        M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
2214}
2215*/
2216
2217static inline float
2218nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
2219{
2220    return A->m[r][0] * B->m[0][c] +
2221           A->m[r][1] * B->m[1][c] +
2222           A->m[r][2] * B->m[2][c] +
2223           A->m[r][3] * B->m[3][c];
2224}
2225
2226static inline float
2227nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
2228{
2229    return v->x * M->m[0][c] +
2230           v->y * M->m[1][c] +
2231           v->z * M->m[2][c] +
2232           1.0f * M->m[3][c];
2233}
2234
2235static inline float
2236nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
2237{
2238    return v->x * M->m[0][c] +
2239           v->y * M->m[1][c] +
2240           v->z * M->m[2][c];
2241}
2242
2243void
2244nine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
2245{
2246    D->_11 = nine_DP4_row_col(L, 0, R, 0);
2247    D->_12 = nine_DP4_row_col(L, 0, R, 1);
2248    D->_13 = nine_DP4_row_col(L, 0, R, 2);
2249    D->_14 = nine_DP4_row_col(L, 0, R, 3);
2250
2251    D->_21 = nine_DP4_row_col(L, 1, R, 0);
2252    D->_22 = nine_DP4_row_col(L, 1, R, 1);
2253    D->_23 = nine_DP4_row_col(L, 1, R, 2);
2254    D->_24 = nine_DP4_row_col(L, 1, R, 3);
2255
2256    D->_31 = nine_DP4_row_col(L, 2, R, 0);
2257    D->_32 = nine_DP4_row_col(L, 2, R, 1);
2258    D->_33 = nine_DP4_row_col(L, 2, R, 2);
2259    D->_34 = nine_DP4_row_col(L, 2, R, 3);
2260
2261    D->_41 = nine_DP4_row_col(L, 3, R, 0);
2262    D->_42 = nine_DP4_row_col(L, 3, R, 1);
2263    D->_43 = nine_DP4_row_col(L, 3, R, 2);
2264    D->_44 = nine_DP4_row_col(L, 3, R, 3);
2265}
2266
2267void
2268nine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
2269{
2270    d->x = nine_DP4_vec_col(v, M, 0);
2271    d->y = nine_DP4_vec_col(v, M, 1);
2272    d->z = nine_DP4_vec_col(v, M, 2);
2273}
2274
2275void
2276nine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
2277{
2278    d->x = nine_DP3_vec_col(v, M, 0);
2279    d->y = nine_DP3_vec_col(v, M, 1);
2280    d->z = nine_DP3_vec_col(v, M, 2);
2281}
2282
2283void
2284nine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
2285{
2286    unsigned i, j;
2287    for (i = 0; i < 4; ++i)
2288    for (j = 0; j < 4; ++j)
2289        D->m[i][j] = M->m[j][i];
2290}
2291
2292#define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
2293    float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
2294    if (t > 0.0f) pos += t; else neg += t; } while(0)
2295
2296#define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
2297    float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
2298    if (t > 0.0f) neg -= t; else pos -= t; } while(0)
2299float
2300nine_d3d_matrix_det(const D3DMATRIX *M)
2301{
2302    float pos = 0.0f;
2303    float neg = 0.0f;
2304
2305    _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
2306    _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
2307    _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);
2308
2309    _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
2310    _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
2311    _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);
2312
2313    _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
2314    _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
2315    _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);
2316
2317    _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
2318    _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
2319    _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);
2320
2321    _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
2322    _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
2323    _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);
2324
2325    _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
2326    _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
2327    _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);
2328
2329    _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
2330    _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
2331    _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);
2332
2333    _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
2334    _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
2335    _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);
2336
2337    return pos + neg;
2338}
2339
2340/* XXX: Probably better to just use src/mesa/math/m_matrix.c because
2341 * I have no idea where this code came from.
2342 */
2343void
2344nine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
2345{
2346    int i, k;
2347    float det;
2348
2349    D->m[0][0] =
2350        M->m[1][1] * M->m[2][2] * M->m[3][3] -
2351        M->m[1][1] * M->m[3][2] * M->m[2][3] -
2352        M->m[1][2] * M->m[2][1] * M->m[3][3] +
2353        M->m[1][2] * M->m[3][1] * M->m[2][3] +
2354        M->m[1][3] * M->m[2][1] * M->m[3][2] -
2355        M->m[1][3] * M->m[3][1] * M->m[2][2];
2356
2357    D->m[0][1] =
2358       -M->m[0][1] * M->m[2][2] * M->m[3][3] +
2359        M->m[0][1] * M->m[3][2] * M->m[2][3] +
2360        M->m[0][2] * M->m[2][1] * M->m[3][3] -
2361        M->m[0][2] * M->m[3][1] * M->m[2][3] -
2362        M->m[0][3] * M->m[2][1] * M->m[3][2] +
2363        M->m[0][3] * M->m[3][1] * M->m[2][2];
2364
2365    D->m[0][2] =
2366        M->m[0][1] * M->m[1][2] * M->m[3][3] -
2367        M->m[0][1] * M->m[3][2] * M->m[1][3] -
2368        M->m[0][2] * M->m[1][1] * M->m[3][3] +
2369        M->m[0][2] * M->m[3][1] * M->m[1][3] +
2370        M->m[0][3] * M->m[1][1] * M->m[3][2] -
2371        M->m[0][3] * M->m[3][1] * M->m[1][2];
2372
2373    D->m[0][3] =
2374       -M->m[0][1] * M->m[1][2] * M->m[2][3] +
2375        M->m[0][1] * M->m[2][2] * M->m[1][3] +
2376        M->m[0][2] * M->m[1][1] * M->m[2][3] -
2377        M->m[0][2] * M->m[2][1] * M->m[1][3] -
2378        M->m[0][3] * M->m[1][1] * M->m[2][2] +
2379        M->m[0][3] * M->m[2][1] * M->m[1][2];
2380
2381    D->m[1][0] =
2382       -M->m[1][0] * M->m[2][2] * M->m[3][3] +
2383        M->m[1][0] * M->m[3][2] * M->m[2][3] +
2384        M->m[1][2] * M->m[2][0] * M->m[3][3] -
2385        M->m[1][2] * M->m[3][0] * M->m[2][3] -
2386        M->m[1][3] * M->m[2][0] * M->m[3][2] +
2387        M->m[1][3] * M->m[3][0] * M->m[2][2];
2388
2389    D->m[1][1] =
2390        M->m[0][0] * M->m[2][2] * M->m[3][3] -
2391        M->m[0][0] * M->m[3][2] * M->m[2][3] -
2392        M->m[0][2] * M->m[2][0] * M->m[3][3] +
2393        M->m[0][2] * M->m[3][0] * M->m[2][3] +
2394        M->m[0][3] * M->m[2][0] * M->m[3][2] -
2395        M->m[0][3] * M->m[3][0] * M->m[2][2];
2396
2397    D->m[1][2] =
2398       -M->m[0][0] * M->m[1][2] * M->m[3][3] +
2399        M->m[0][0] * M->m[3][2] * M->m[1][3] +
2400        M->m[0][2] * M->m[1][0] * M->m[3][3] -
2401        M->m[0][2] * M->m[3][0] * M->m[1][3] -
2402        M->m[0][3] * M->m[1][0] * M->m[3][2] +
2403        M->m[0][3] * M->m[3][0] * M->m[1][2];
2404
2405    D->m[1][3] =
2406        M->m[0][0] * M->m[1][2] * M->m[2][3] -
2407        M->m[0][0] * M->m[2][2] * M->m[1][3] -
2408        M->m[0][2] * M->m[1][0] * M->m[2][3] +
2409        M->m[0][2] * M->m[2][0] * M->m[1][3] +
2410        M->m[0][3] * M->m[1][0] * M->m[2][2] -
2411        M->m[0][3] * M->m[2][0] * M->m[1][2];
2412
2413    D->m[2][0] =
2414        M->m[1][0] * M->m[2][1] * M->m[3][3] -
2415        M->m[1][0] * M->m[3][1] * M->m[2][3] -
2416        M->m[1][1] * M->m[2][0] * M->m[3][3] +
2417        M->m[1][1] * M->m[3][0] * M->m[2][3] +
2418        M->m[1][3] * M->m[2][0] * M->m[3][1] -
2419        M->m[1][3] * M->m[3][0] * M->m[2][1];
2420
2421    D->m[2][1] =
2422       -M->m[0][0] * M->m[2][1] * M->m[3][3] +
2423        M->m[0][0] * M->m[3][1] * M->m[2][3] +
2424        M->m[0][1] * M->m[2][0] * M->m[3][3] -
2425        M->m[0][1] * M->m[3][0] * M->m[2][3] -
2426        M->m[0][3] * M->m[2][0] * M->m[3][1] +
2427        M->m[0][3] * M->m[3][0] * M->m[2][1];
2428
2429    D->m[2][2] =
2430        M->m[0][0] * M->m[1][1] * M->m[3][3] -
2431        M->m[0][0] * M->m[3][1] * M->m[1][3] -
2432        M->m[0][1] * M->m[1][0] * M->m[3][3] +
2433        M->m[0][1] * M->m[3][0] * M->m[1][3] +
2434        M->m[0][3] * M->m[1][0] * M->m[3][1] -
2435        M->m[0][3] * M->m[3][0] * M->m[1][1];
2436
2437    D->m[2][3] =
2438       -M->m[0][0] * M->m[1][1] * M->m[2][3] +
2439        M->m[0][0] * M->m[2][1] * M->m[1][3] +
2440        M->m[0][1] * M->m[1][0] * M->m[2][3] -
2441        M->m[0][1] * M->m[2][0] * M->m[1][3] -
2442        M->m[0][3] * M->m[1][0] * M->m[2][1] +
2443        M->m[0][3] * M->m[2][0] * M->m[1][1];
2444
2445    D->m[3][0] =
2446       -M->m[1][0] * M->m[2][1] * M->m[3][2] +
2447        M->m[1][0] * M->m[3][1] * M->m[2][2] +
2448        M->m[1][1] * M->m[2][0] * M->m[3][2] -
2449        M->m[1][1] * M->m[3][0] * M->m[2][2] -
2450        M->m[1][2] * M->m[2][0] * M->m[3][1] +
2451        M->m[1][2] * M->m[3][0] * M->m[2][1];
2452
2453    D->m[3][1] =
2454        M->m[0][0] * M->m[2][1] * M->m[3][2] -
2455        M->m[0][0] * M->m[3][1] * M->m[2][2] -
2456        M->m[0][1] * M->m[2][0] * M->m[3][2] +
2457        M->m[0][1] * M->m[3][0] * M->m[2][2] +
2458        M->m[0][2] * M->m[2][0] * M->m[3][1] -
2459        M->m[0][2] * M->m[3][0] * M->m[2][1];
2460
2461    D->m[3][2] =
2462       -M->m[0][0] * M->m[1][1] * M->m[3][2] +
2463        M->m[0][0] * M->m[3][1] * M->m[1][2] +
2464        M->m[0][1] * M->m[1][0] * M->m[3][2] -
2465        M->m[0][1] * M->m[3][0] * M->m[1][2] -
2466        M->m[0][2] * M->m[1][0] * M->m[3][1] +
2467        M->m[0][2] * M->m[3][0] * M->m[1][1];
2468
2469    D->m[3][3] =
2470        M->m[0][0] * M->m[1][1] * M->m[2][2] -
2471        M->m[0][0] * M->m[2][1] * M->m[1][2] -
2472        M->m[0][1] * M->m[1][0] * M->m[2][2] +
2473        M->m[0][1] * M->m[2][0] * M->m[1][2] +
2474        M->m[0][2] * M->m[1][0] * M->m[2][1] -
2475        M->m[0][2] * M->m[2][0] * M->m[1][1];
2476
2477    det =
2478        M->m[0][0] * D->m[0][0] +
2479        M->m[1][0] * D->m[0][1] +
2480        M->m[2][0] * D->m[0][2] +
2481        M->m[3][0] * D->m[0][3];
2482
2483    if (fabsf(det) < 1e-30) {/* non inversible */
2484        *D = *M; /* wine tests */
2485        return;
2486    }
2487
2488    det = 1.0 / det;
2489
2490    for (i = 0; i < 4; i++)
2491    for (k = 0; k < 4; k++)
2492        D->m[i][k] *= det;
2493
2494#if defined(DEBUG) || !defined(NDEBUG)
2495    {
2496        D3DMATRIX I;
2497
2498        nine_d3d_matrix_matrix_mul(&I, D, M);
2499
2500        for (i = 0; i < 4; ++i)
2501        for (k = 0; k < 4; ++k)
2502            if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
2503                DBG("Matrix inversion check FAILED !\n");
2504    }
2505#endif
2506}
2507