17ec681f3Smrg
27ec681f3Smrg/* FF is big and ugly so feel free to write lines as long as you like.
37ec681f3Smrg * Aieeeeeeeee !
47ec681f3Smrg *
57ec681f3Smrg * Let me make that clearer:
67ec681f3Smrg * Aieeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ! !! !!!
77ec681f3Smrg */
87ec681f3Smrg
97ec681f3Smrg#include "device9.h"
107ec681f3Smrg#include "basetexture9.h"
117ec681f3Smrg#include "vertexdeclaration9.h"
127ec681f3Smrg#include "vertexshader9.h"
137ec681f3Smrg#include "pixelshader9.h"
147ec681f3Smrg#include "nine_ff.h"
157ec681f3Smrg#include "nine_defines.h"
167ec681f3Smrg#include "nine_helpers.h"
177ec681f3Smrg#include "nine_pipe.h"
187ec681f3Smrg#include "nine_dump.h"
197ec681f3Smrg
207ec681f3Smrg#include "pipe/p_context.h"
217ec681f3Smrg#include "tgsi/tgsi_ureg.h"
227ec681f3Smrg#include "tgsi/tgsi_dump.h"
237ec681f3Smrg#include "util/u_box.h"
247ec681f3Smrg#include "util/u_hash_table.h"
257ec681f3Smrg#include "util/u_upload_mgr.h"
267ec681f3Smrg
277ec681f3Smrg#define DBG_CHANNEL DBG_FF
287ec681f3Smrg
297ec681f3Smrg#define NINE_FF_NUM_VS_CONST 196
307ec681f3Smrg#define NINE_FF_NUM_PS_CONST 24
317ec681f3Smrg
/* Four packed floats (x, y, z, w). */
struct fvec4
{
    float x;
    float y;
    float z;
    float w;
};
367ec681f3Smrg
/*
 * Key describing one fixed-function vertex shader variant.
 *
 * The named bitfields share storage with value32[] / value64[], so the
 * key can be treated as a flat array of words.  The padN members round
 * every group of bitfields up to a full 32 bits.
 */
struct nine_ff_vs_key
{
    union {
        struct {
            /* --- 32 bits of flags and small enums --- */
            uint32_t position_t          : 1;
            uint32_t lighting            : 1;
            uint32_t darkness            : 1; /* lighting enabled but no active lights */
            uint32_t localviewer         : 1;
            uint32_t vertexpointsize     : 1;
            uint32_t pointscale          : 1;
            uint32_t vertexblend         : 3;
            uint32_t vertexblend_indexed : 1;
            uint32_t vertextween         : 1;
            /* material sources: 0 = material, 1 = color1, 2 = color2 */
            uint32_t mtl_diffuse         : 2;
            uint32_t mtl_ambient         : 2;
            uint32_t mtl_specular        : 2;
            uint32_t mtl_emissive        : 2;
            uint32_t fog_mode            : 2;
            uint32_t fog_range           : 1;
            uint32_t color0in_one        : 1;
            uint32_t color1in_zero       : 1;
            uint32_t has_normal          : 1;
            uint32_t fog                 : 1;
            uint32_t normalizenormals    : 1;
            uint32_t ucp                 : 1;
            uint32_t pad1                : 4;

            /* --- per-texcoord fields, one packed entry per stage --- */
            uint32_t tc_dim_input        : 16; /* 8 * 2 bits */
            uint32_t pad2                : 16;
            uint32_t tc_dim_output       : 24; /* 8 * 3 bits */
            uint32_t pad3                : 8;
            uint32_t tc_gen              : 24; /* 8 * 3 bits */
            uint32_t pad4                : 8;
            uint32_t tc_idx              : 24;
            uint32_t pad5                : 8;

            /* bitmask tested against (1 << NINE_DECLUSAGE_*) */
            uint32_t passthrough;
        };
        /* Raw views of the key. */
        uint64_t value64[3]; /* don't forget to resize VertexShader9.ff_key */
        uint32_t value32[6];
    };
};
777ec681f3Smrg
787ec681f3Smrg/* Texture stage state:
797ec681f3Smrg *
807ec681f3Smrg * COLOROP       D3DTOP 5 bit
817ec681f3Smrg * ALPHAOP       D3DTOP 5 bit
827ec681f3Smrg * COLORARG0     D3DTA  3 bit
837ec681f3Smrg * COLORARG1     D3DTA  3 bit
847ec681f3Smrg * COLORARG2     D3DTA  3 bit
857ec681f3Smrg * ALPHAARG0     D3DTA  3 bit
867ec681f3Smrg * ALPHAARG1     D3DTA  3 bit
877ec681f3Smrg * ALPHAARG2     D3DTA  3 bit
887ec681f3Smrg * RESULTARG     D3DTA  1 bit (CURRENT:0 or TEMP:1)
 * TEXTARGET     1D/2D/3D/CUBE 2 bit
 * (padding)            1 bit
907ec681f3Smrg * ===========================
917ec681f3Smrg *                     32 bit per stage
927ec681f3Smrg */
/*
 * Key describing one fixed-function pixel shader variant.
 *
 * The named members share storage with value32[] / value64[], so the key
 * can be treated as a flat array of words.
 */
struct nine_ff_ps_key
{
    union {
        struct {
            /* One 32-bit record per texture stage. */
            struct {
                uint32_t colorop   : 5;
                uint32_t alphaop   : 5;
                uint32_t colorarg0 : 3;
                uint32_t colorarg1 : 3;
                uint32_t colorarg2 : 3;
                uint32_t alphaarg0 : 3;
                uint32_t alphaarg1 : 3;
                uint32_t alphaarg2 : 3;
                uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
                uint32_t pad       : 1;
                /* that's 32 bit exactly */
            } ts[8];

            /* Global (non per-stage) state. */
            uint32_t projected  : 16;
            uint32_t fog        : 1; /* for vFog coming from VS */
            uint32_t fog_mode   : 2;
            uint32_t fog_source : 1; /* 0: Z, 1: W */
            uint32_t specular   : 1;
            uint32_t pad1       : 11; /* 9 32-bit words with this */

            /* Extra argument bits kept outside the per-stage records. */
            uint8_t colorarg_b4[3];
            uint8_t colorarg_b5[3];
            uint8_t alphaarg_b4[3]; /* 11 32-bit words plus a byte */
            uint8_t pad2[3];
        };
        /* Raw views of the key. */
        uint64_t value64[6]; /* don't forget to resize PixelShader9.ff_key */
        uint32_t value32[12];
    };
};
1267ec681f3Smrg
1277ec681f3Smrgstatic uint32_t nine_ff_vs_key_hash(const void *key)
1287ec681f3Smrg{
1297ec681f3Smrg    const struct nine_ff_vs_key *vs = key;
1307ec681f3Smrg    unsigned i;
1317ec681f3Smrg    uint32_t hash = vs->value32[0];
1327ec681f3Smrg    for (i = 1; i < ARRAY_SIZE(vs->value32); ++i)
1337ec681f3Smrg        hash ^= vs->value32[i];
1347ec681f3Smrg    return hash;
1357ec681f3Smrg}
1367ec681f3Smrgstatic bool nine_ff_vs_key_comp(const void *key1, const void *key2)
1377ec681f3Smrg{
1387ec681f3Smrg    struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
1397ec681f3Smrg    struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;
1407ec681f3Smrg
1417ec681f3Smrg    return memcmp(a->value64, b->value64, sizeof(a->value64)) == 0;
1427ec681f3Smrg}
1437ec681f3Smrgstatic uint32_t nine_ff_ps_key_hash(const void *key)
1447ec681f3Smrg{
1457ec681f3Smrg    const struct nine_ff_ps_key *ps = key;
1467ec681f3Smrg    unsigned i;
1477ec681f3Smrg    uint32_t hash = ps->value32[0];
1487ec681f3Smrg    for (i = 1; i < ARRAY_SIZE(ps->value32); ++i)
1497ec681f3Smrg        hash ^= ps->value32[i];
1507ec681f3Smrg    return hash;
1517ec681f3Smrg}
1527ec681f3Smrgstatic bool nine_ff_ps_key_comp(const void *key1, const void *key2)
1537ec681f3Smrg{
1547ec681f3Smrg    struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
1557ec681f3Smrg    struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;
1567ec681f3Smrg
1577ec681f3Smrg    return memcmp(a->value64, b->value64, sizeof(a->value64)) == 0;
1587ec681f3Smrg}
1597ec681f3Smrgstatic uint32_t nine_ff_fvf_key_hash(const void *key)
1607ec681f3Smrg{
1617ec681f3Smrg    return *(DWORD *)key;
1627ec681f3Smrg}
1637ec681f3Smrgstatic bool nine_ff_fvf_key_comp(const void *key1, const void *key2)
1647ec681f3Smrg{
1657ec681f3Smrg    return *(DWORD *)key1 == *(DWORD *)key2;
1667ec681f3Smrg}
1677ec681f3Smrg
/* Forward declarations; the definitions are not in this part of the file. */
static void nine_ff_prune_vs(struct NineDevice9 *device);
static void nine_ff_prune_ps(struct NineDevice9 *device);
1707ec681f3Smrg
1717ec681f3Smrgstatic void nine_ureg_tgsi_dump(struct ureg_program *ureg, boolean override)
1727ec681f3Smrg{
1737ec681f3Smrg    if (debug_get_bool_option("NINE_FF_DUMP", FALSE) || override) {
1747ec681f3Smrg        const struct tgsi_token *toks = ureg_get_tokens(ureg, NULL);
1757ec681f3Smrg        tgsi_dump(toks, 0);
1767ec681f3Smrg        ureg_free_tokens(toks);
1777ec681f3Smrg    }
1787ec681f3Smrg}
1797ec681f3Smrg
/* Replicate one component of a ureg_dst register, read back as a source. */
#define _X(r) ureg_scalar(ureg_src((r)), TGSI_SWIZZLE_X)
#define _Y(r) ureg_scalar(ureg_src((r)), TGSI_SWIZZLE_Y)
#define _Z(r) ureg_scalar(ureg_src((r)), TGSI_SWIZZLE_Z)
#define _W(r) ureg_scalar(ureg_src((r)), TGSI_SWIZZLE_W)

/* Replicate one component of a value that is already a ureg_src. */
#define _XXXX(r) ureg_scalar((r), TGSI_SWIZZLE_X)
#define _YYYY(r) ureg_scalar((r), TGSI_SWIZZLE_Y)
#define _ZZZZ(r) ureg_scalar((r), TGSI_SWIZZLE_Z)
#define _WWWW(r) ureg_scalar((r), TGSI_SWIZZLE_W)

/* Identity swizzle. */
#define _XYZW(r) (r)

/* AL should contain base address of lights table.
 * Expects `ureg` and `AL` to be visible at the point of use. */
#define LIGHT_CONST(i) \
    ureg_src_indirect(ureg_DECL_constant(ureg, (i)), _X(AL))

/* Material constants are declared at index 19 + i. */
#define MATERIAL_CONST(i) \
    ureg_DECL_constant(ureg, 19 + (i))

/* Declare (or reference) constant n of the `ureg` in scope. */
#define _CONST(n) ureg_DECL_constant(ureg, (n))
2007ec681f3Smrg
2017ec681f3Smrg/* VS FF constants layout:
2027ec681f3Smrg *
2037ec681f3Smrg * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
2047ec681f3Smrg * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
2057ec681f3Smrg * CONST[ 8..11] D3DTS_PROJECTION
2067ec681f3Smrg * CONST[12..15] D3DTS_VIEW^(-1)
2077ec681f3Smrg * CONST[16..18] Normal matrix
2087ec681f3Smrg *
2097ec681f3Smrg * CONST[19].xyz  MATERIAL.Emissive + Material.Ambient * RS.Ambient
2107ec681f3Smrg * CONST[20]      MATERIAL.Diffuse
2117ec681f3Smrg * CONST[21]      MATERIAL.Ambient
2127ec681f3Smrg * CONST[22]      MATERIAL.Specular
2137ec681f3Smrg * CONST[23].x___ MATERIAL.Power
2147ec681f3Smrg * CONST[24]      MATERIAL.Emissive
2157ec681f3Smrg * CONST[25]      RS.Ambient
2167ec681f3Smrg *
2177ec681f3Smrg * CONST[26].x___ RS.PointSizeMin
2187ec681f3Smrg * CONST[26]._y__ RS.PointSizeMax
2197ec681f3Smrg * CONST[26].__z_ RS.PointSize
2207ec681f3Smrg * CONST[26].___w RS.PointScaleA
2217ec681f3Smrg * CONST[27].x___ RS.PointScaleB
2227ec681f3Smrg * CONST[27]._y__ RS.PointScaleC
2237ec681f3Smrg *
2247ec681f3Smrg * CONST[28].x___ RS.FogEnd
2257ec681f3Smrg * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
2267ec681f3Smrg * CONST[28].__z_ RS.FogDensity
 *
2287ec681f3Smrg * CONST[30].x___ TWEENFACTOR
2297ec681f3Smrg *
2307ec681f3Smrg * CONST[32].x___ LIGHT[0].Type
2317ec681f3Smrg * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
2327ec681f3Smrg * CONST[33]      LIGHT[0].Diffuse
2337ec681f3Smrg * CONST[34]      LIGHT[0].Specular
2347ec681f3Smrg * CONST[35]      LIGHT[0].Ambient
2357ec681f3Smrg * CONST[36].xyz_ LIGHT[0].Position
2367ec681f3Smrg * CONST[36].___w LIGHT[0].Range
2377ec681f3Smrg * CONST[37].xyz_ LIGHT[0].Direction
2387ec681f3Smrg * CONST[37].___w LIGHT[0].Falloff
2397ec681f3Smrg * CONST[38].x___ cos(LIGHT[0].Theta / 2)
2407ec681f3Smrg * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
2417ec681f3Smrg * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(Light[0].Phi / 2))
2427ec681f3Smrg * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
2437ec681f3Smrg * CONST[39].___w 1 if this is the last active light, 0 if not
2447ec681f3Smrg * CONST[40]      LIGHT[1]
2457ec681f3Smrg * CONST[48]      LIGHT[2]
2467ec681f3Smrg * CONST[56]      LIGHT[3]
2477ec681f3Smrg * CONST[64]      LIGHT[4]
2487ec681f3Smrg * CONST[72]      LIGHT[5]
2497ec681f3Smrg * CONST[80]      LIGHT[6]
2507ec681f3Smrg * CONST[88]      LIGHT[7]
2517ec681f3Smrg * NOTE: no lighting code is generated if there are no active lights
2527ec681f3Smrg *
2537ec681f3Smrg * CONST[100].x___ Viewport 2/width
2547ec681f3Smrg * CONST[100]._y__ Viewport 2/height
2557ec681f3Smrg * CONST[100].__z_ Viewport 1/(zmax - zmin)
2567ec681f3Smrg * CONST[100].___w Viewport width
2577ec681f3Smrg * CONST[101].x___ Viewport x0
2587ec681f3Smrg * CONST[101]._y__ Viewport y0
2597ec681f3Smrg * CONST[101].__z_ Viewport z0
2607ec681f3Smrg *
2617ec681f3Smrg * CONST[128..131] D3DTS_TEXTURE0
2627ec681f3Smrg * CONST[132..135] D3DTS_TEXTURE1
2637ec681f3Smrg * CONST[136..139] D3DTS_TEXTURE2
2647ec681f3Smrg * CONST[140..143] D3DTS_TEXTURE3
2657ec681f3Smrg * CONST[144..147] D3DTS_TEXTURE4
2667ec681f3Smrg * CONST[148..151] D3DTS_TEXTURE5
2677ec681f3Smrg * CONST[152..155] D3DTS_TEXTURE6
2687ec681f3Smrg * CONST[156..159] D3DTS_TEXTURE7
2697ec681f3Smrg *
2707ec681f3Smrg * CONST[160] D3DTS_WORLDMATRIX[0] * D3DTS_VIEW
2717ec681f3Smrg * CONST[164] D3DTS_WORLDMATRIX[1] * D3DTS_VIEW
2727ec681f3Smrg * ...
2737ec681f3Smrg * CONST[192] D3DTS_WORLDMATRIX[8] * D3DTS_VIEW
2747ec681f3Smrg */
2757ec681f3Smrgstruct vs_build_ctx
2767ec681f3Smrg{
2777ec681f3Smrg    struct ureg_program *ureg;
2787ec681f3Smrg    const struct nine_ff_vs_key *key;
2797ec681f3Smrg
2807ec681f3Smrg    uint16_t input[PIPE_MAX_ATTRIBS];
2817ec681f3Smrg    unsigned num_inputs;
2827ec681f3Smrg
2837ec681f3Smrg    struct ureg_src aVtx;
2847ec681f3Smrg    struct ureg_src aNrm;
2857ec681f3Smrg    struct ureg_src aCol[2];
2867ec681f3Smrg    struct ureg_src aTex[8];
2877ec681f3Smrg    struct ureg_src aPsz;
2887ec681f3Smrg    struct ureg_src aInd;
2897ec681f3Smrg    struct ureg_src aWgt;
2907ec681f3Smrg
2917ec681f3Smrg    struct ureg_src aVtx1; /* tweening */
2927ec681f3Smrg    struct ureg_src aNrm1;
2937ec681f3Smrg
2947ec681f3Smrg    struct ureg_src mtlA;
2957ec681f3Smrg    struct ureg_src mtlD;
2967ec681f3Smrg    struct ureg_src mtlS;
2977ec681f3Smrg    struct ureg_src mtlE;
2987ec681f3Smrg};
2997ec681f3Smrg
3007ec681f3Smrgstatic inline unsigned
3017ec681f3Smrgget_texcoord_sn(struct pipe_screen *screen)
3027ec681f3Smrg{
3037ec681f3Smrg    if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
3047ec681f3Smrg        return TGSI_SEMANTIC_TEXCOORD;
3057ec681f3Smrg    return TGSI_SEMANTIC_GENERIC;
3067ec681f3Smrg}
3077ec681f3Smrg
3087ec681f3Smrgstatic inline struct ureg_src
3097ec681f3Smrgbuild_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
3107ec681f3Smrg{
3117ec681f3Smrg    const unsigned i = vs->num_inputs++;
3127ec681f3Smrg    assert(i < PIPE_MAX_ATTRIBS);
3137ec681f3Smrg    vs->input[i] = ndecl;
3147ec681f3Smrg    return ureg_DECL_vs_input(vs->ureg, i);
3157ec681f3Smrg}
3167ec681f3Smrg
3177ec681f3Smrg/* NOTE: dst may alias src */
3187ec681f3Smrgstatic inline void
3197ec681f3Smrgureg_normalize3(struct ureg_program *ureg,
3207ec681f3Smrg                struct ureg_dst dst, struct ureg_src src)
3217ec681f3Smrg{
3227ec681f3Smrg    struct ureg_dst tmp = ureg_DECL_temporary(ureg);
3237ec681f3Smrg    struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
3247ec681f3Smrg
3257ec681f3Smrg    ureg_DP3(ureg, tmp_x, src, src);
3267ec681f3Smrg    ureg_RSQ(ureg, tmp_x, _X(tmp));
3277ec681f3Smrg    ureg_MUL(ureg, dst, src, _X(tmp));
3287ec681f3Smrg    ureg_release_temporary(ureg, tmp);
3297ec681f3Smrg}
3307ec681f3Smrg
3317ec681f3Smrgstatic void *
3327ec681f3Smrgnine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
3337ec681f3Smrg{
3347ec681f3Smrg    const struct nine_ff_vs_key *key = vs->key;
3357ec681f3Smrg    struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
3367ec681f3Smrg    struct ureg_dst oPos, oCol[2], oPsz, oFog;
3377ec681f3Smrg    struct ureg_dst AR;
3387ec681f3Smrg    unsigned i, c;
3397ec681f3Smrg    unsigned label[32], l = 0;
3407ec681f3Smrg    boolean need_aNrm = key->lighting || key->passthrough & (1 << NINE_DECLUSAGE_NORMAL);
3417ec681f3Smrg    boolean has_aNrm;
3427ec681f3Smrg    boolean need_aVtx = key->lighting || key->fog_mode || key->pointscale || key->ucp;
3437ec681f3Smrg    const unsigned texcoord_sn = get_texcoord_sn(device->screen);
3447ec681f3Smrg
3457ec681f3Smrg    vs->ureg = ureg;
3467ec681f3Smrg
3477ec681f3Smrg    /* Check which inputs we should transform. */
3487ec681f3Smrg    for (i = 0; i < 8 * 3; i += 3) {
3497ec681f3Smrg        switch ((key->tc_gen >> i) & 0x7) {
3507ec681f3Smrg        case NINED3DTSS_TCI_CAMERASPACENORMAL:
3517ec681f3Smrg            need_aNrm = TRUE;
3527ec681f3Smrg            break;
3537ec681f3Smrg        case NINED3DTSS_TCI_CAMERASPACEPOSITION:
3547ec681f3Smrg            need_aVtx = TRUE;
3557ec681f3Smrg            break;
3567ec681f3Smrg        case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
3577ec681f3Smrg            need_aVtx = need_aNrm = TRUE;
3587ec681f3Smrg            break;
3597ec681f3Smrg        case NINED3DTSS_TCI_SPHEREMAP:
3607ec681f3Smrg            need_aVtx = need_aNrm = TRUE;
3617ec681f3Smrg            break;
3627ec681f3Smrg        default:
3637ec681f3Smrg            break;
3647ec681f3Smrg        }
3657ec681f3Smrg    }
3667ec681f3Smrg
3677ec681f3Smrg    has_aNrm = need_aNrm && key->has_normal;
3687ec681f3Smrg
3697ec681f3Smrg    /* Declare and record used inputs (needed for linkage with vertex format):
3707ec681f3Smrg     * (texture coordinates handled later)
3717ec681f3Smrg     */
3727ec681f3Smrg    vs->aVtx = build_vs_add_input(vs,
3737ec681f3Smrg        key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);
3747ec681f3Smrg
3757ec681f3Smrg    vs->aNrm = ureg_imm1f(ureg, 0.0f);
3767ec681f3Smrg    if (has_aNrm)
3777ec681f3Smrg        vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);
3787ec681f3Smrg
3797ec681f3Smrg    vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
3807ec681f3Smrg    vs->aCol[1] = ureg_imm1f(ureg, 0.0f);
3817ec681f3Smrg
3827ec681f3Smrg    if (key->lighting || key->darkness) {
3837ec681f3Smrg        const unsigned mask = key->mtl_diffuse | key->mtl_specular |
3847ec681f3Smrg                              key->mtl_ambient | key->mtl_emissive;
3857ec681f3Smrg        if ((mask & 0x1) && !key->color0in_one)
3867ec681f3Smrg            vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
3877ec681f3Smrg        if ((mask & 0x2) && !key->color1in_zero)
3887ec681f3Smrg            vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
3897ec681f3Smrg
3907ec681f3Smrg        vs->mtlD = MATERIAL_CONST(1);
3917ec681f3Smrg        vs->mtlA = MATERIAL_CONST(2);
3927ec681f3Smrg        vs->mtlS = MATERIAL_CONST(3);
3937ec681f3Smrg        vs->mtlE = MATERIAL_CONST(5);
3947ec681f3Smrg        if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
3957ec681f3Smrg        if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
3967ec681f3Smrg        if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
3977ec681f3Smrg        if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
3987ec681f3Smrg        if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
3997ec681f3Smrg        if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
4007ec681f3Smrg        if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
4017ec681f3Smrg        if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
4027ec681f3Smrg    } else {
4037ec681f3Smrg        if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
4047ec681f3Smrg        if (!key->color1in_zero) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
4057ec681f3Smrg    }
4067ec681f3Smrg
4077ec681f3Smrg    if (key->vertexpointsize)
4087ec681f3Smrg        vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
4097ec681f3Smrg
4107ec681f3Smrg    if (key->vertexblend_indexed || key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES))
4117ec681f3Smrg        vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
4127ec681f3Smrg    if (key->vertexblend || key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT))
4137ec681f3Smrg        vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
4147ec681f3Smrg    if (key->vertextween) {
4157ec681f3Smrg        vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
4167ec681f3Smrg        vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
4177ec681f3Smrg    }
4187ec681f3Smrg
4197ec681f3Smrg    /* Declare outputs:
4207ec681f3Smrg     */
4217ec681f3Smrg    oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
4227ec681f3Smrg    oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
4237ec681f3Smrg    oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
4247ec681f3Smrg    if (key->fog || key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
4257ec681f3Smrg        oFog = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 16);
4267ec681f3Smrg        oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
4277ec681f3Smrg    }
4287ec681f3Smrg
4297ec681f3Smrg    if (key->vertexpointsize || key->pointscale) {
4307ec681f3Smrg        oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
4317ec681f3Smrg                                       TGSI_WRITEMASK_X, 0, 1);
4327ec681f3Smrg        oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
4337ec681f3Smrg    }
4347ec681f3Smrg
4357ec681f3Smrg    if (key->lighting || key->vertexblend)
4367ec681f3Smrg        AR = ureg_DECL_address(ureg);
4377ec681f3Smrg
4387ec681f3Smrg    /* === Vertex transformation / vertex blending:
4397ec681f3Smrg     */
4407ec681f3Smrg
4417ec681f3Smrg    if (key->position_t) {
4427ec681f3Smrg        if (device->driver_caps.window_space_position_support) {
4437ec681f3Smrg            ureg_MOV(ureg, oPos, vs->aVtx);
4447ec681f3Smrg        } else {
4457ec681f3Smrg            struct ureg_dst tmp = ureg_DECL_temporary(ureg);
4467ec681f3Smrg            /* vs->aVtx contains the coordinates buffer wise.
4477ec681f3Smrg            * later in the pipeline, clipping, viewport and division
4487ec681f3Smrg            * by w (rhw = 1/w) are going to be applied, so do the reverse
4497ec681f3Smrg            * of these transformations (except clipping) to have the good
4507ec681f3Smrg            * position at the end.*/
4517ec681f3Smrg            ureg_MOV(ureg, tmp, vs->aVtx);
4527ec681f3Smrg            /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
4537ec681f3Smrg            ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), ureg_negate(_CONST(101)));
4547ec681f3Smrg            ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
4557ec681f3Smrg            ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
4567ec681f3Smrg            /* Y needs to be reversed */
4577ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
4587ec681f3Smrg            /* inverse rhw */
4597ec681f3Smrg            ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
4607ec681f3Smrg            /* multiply X, Y, Z by w */
4617ec681f3Smrg            ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
4627ec681f3Smrg            ureg_MOV(ureg, oPos, ureg_src(tmp));
4637ec681f3Smrg            ureg_release_temporary(ureg, tmp);
4647ec681f3Smrg        }
4657ec681f3Smrg    } else if (key->vertexblend) {
4667ec681f3Smrg        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
4677ec681f3Smrg        struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
4687ec681f3Smrg        struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
4697ec681f3Smrg        struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
4707ec681f3Smrg        struct ureg_dst sum_blendweights = ureg_DECL_temporary(ureg);
4717ec681f3Smrg        struct ureg_src cWM[4];
4727ec681f3Smrg
4737ec681f3Smrg        for (i = 160; i <= 195; ++i)
4747ec681f3Smrg            ureg_DECL_constant(ureg, i);
4757ec681f3Smrg
4767ec681f3Smrg        /* translate world matrix index to constant file index */
4777ec681f3Smrg        if (key->vertexblend_indexed) {
4787ec681f3Smrg            ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 160.0f));
4797ec681f3Smrg            ureg_ARL(ureg, AR, ureg_src(tmp));
4807ec681f3Smrg        }
4817ec681f3Smrg
4827ec681f3Smrg        ureg_MOV(ureg, aVtx_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
4837ec681f3Smrg        ureg_MOV(ureg, aNrm_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
4847ec681f3Smrg        ureg_MOV(ureg, sum_blendweights, ureg_imm4f(ureg, 1.0f, 1.0f, 1.0f, 1.0f));
4857ec681f3Smrg
4867ec681f3Smrg        for (i = 0; i < key->vertexblend; ++i) {
4877ec681f3Smrg            for (c = 0; c < 4; ++c) {
4887ec681f3Smrg                cWM[c] = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, (160 + i * 4) * !key->vertexblend_indexed + c), 0);
4897ec681f3Smrg                if (key->vertexblend_indexed)
4907ec681f3Smrg                    cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
4917ec681f3Smrg            }
4927ec681f3Smrg
4937ec681f3Smrg            /* multiply by WORLD(index) */
4947ec681f3Smrg            ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), cWM[0]);
4957ec681f3Smrg            ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), cWM[1], ureg_src(tmp));
4967ec681f3Smrg            ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), cWM[2], ureg_src(tmp));
4977ec681f3Smrg            ureg_MAD(ureg, tmp, _WWWW(vs->aVtx), cWM[3], ureg_src(tmp));
4987ec681f3Smrg
4997ec681f3Smrg            if (has_aNrm) {
5007ec681f3Smrg                /* Note: the spec says the transpose of the inverse of the
5017ec681f3Smrg                 * WorldView matrices should be used, but all tests show
5027ec681f3Smrg                 * otherwise.
5037ec681f3Smrg                 * Only case unknown: D3DVBF_0WEIGHTS */
5047ec681f3Smrg                ureg_MUL(ureg, tmp2, _XXXX(vs->aNrm), cWM[0]);
5057ec681f3Smrg                ureg_MAD(ureg, tmp2, _YYYY(vs->aNrm), cWM[1], ureg_src(tmp2));
5067ec681f3Smrg                ureg_MAD(ureg, tmp2, _ZZZZ(vs->aNrm), cWM[2], ureg_src(tmp2));
5077ec681f3Smrg            }
5087ec681f3Smrg
5097ec681f3Smrg            if (i < (key->vertexblend - 1)) {
5107ec681f3Smrg                /* accumulate weighted position value */
5117ec681f3Smrg                ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(vs->aWgt, i), ureg_src(aVtx_dst));
5127ec681f3Smrg                if (has_aNrm)
5137ec681f3Smrg                    ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(vs->aWgt, i), ureg_src(aNrm_dst));
5147ec681f3Smrg                /* subtract weighted position value for last value */
5157ec681f3Smrg                ureg_ADD(ureg, sum_blendweights, ureg_src(sum_blendweights), ureg_negate(ureg_scalar(vs->aWgt, i)));
5167ec681f3Smrg            }
5177ec681f3Smrg        }
5187ec681f3Smrg
5197ec681f3Smrg        /* the last weighted position is always 1 - sum_of_previous_weights */
5207ec681f3Smrg        ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aVtx_dst));
5217ec681f3Smrg        if (has_aNrm)
5227ec681f3Smrg            ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aNrm_dst));
5237ec681f3Smrg
5247ec681f3Smrg        /* multiply by VIEW_PROJ */
5257ec681f3Smrg        ureg_MUL(ureg, tmp, _X(aVtx_dst), _CONST(8));
5267ec681f3Smrg        ureg_MAD(ureg, tmp, _Y(aVtx_dst), _CONST(9),  ureg_src(tmp));
5277ec681f3Smrg        ureg_MAD(ureg, tmp, _Z(aVtx_dst), _CONST(10), ureg_src(tmp));
5287ec681f3Smrg        ureg_MAD(ureg, oPos, _W(aVtx_dst), _CONST(11), ureg_src(tmp));
5297ec681f3Smrg
5307ec681f3Smrg        if (need_aVtx)
5317ec681f3Smrg            vs->aVtx = ureg_src(aVtx_dst);
5327ec681f3Smrg
5337ec681f3Smrg        ureg_release_temporary(ureg, tmp);
5347ec681f3Smrg        ureg_release_temporary(ureg, tmp2);
5357ec681f3Smrg        ureg_release_temporary(ureg, sum_blendweights);
5367ec681f3Smrg        if (!need_aVtx)
5377ec681f3Smrg            ureg_release_temporary(ureg, aVtx_dst);
5387ec681f3Smrg
5397ec681f3Smrg        if (has_aNrm) {
5407ec681f3Smrg            if (key->normalizenormals)
5417ec681f3Smrg               ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
5427ec681f3Smrg            vs->aNrm = ureg_src(aNrm_dst);
5437ec681f3Smrg        } else
5447ec681f3Smrg            ureg_release_temporary(ureg, aNrm_dst);
5457ec681f3Smrg    } else {
5467ec681f3Smrg        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
5477ec681f3Smrg
5487ec681f3Smrg        if (key->vertextween) {
5497ec681f3Smrg            struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
5507ec681f3Smrg            ureg_LRP(ureg, aVtx_dst, _XXXX(_CONST(30)), vs->aVtx1, vs->aVtx);
5517ec681f3Smrg            vs->aVtx = ureg_src(aVtx_dst);
5527ec681f3Smrg            if (has_aNrm) {
5537ec681f3Smrg                struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
5547ec681f3Smrg                ureg_LRP(ureg, aNrm_dst, _XXXX(_CONST(30)), vs->aNrm1, vs->aNrm);
5557ec681f3Smrg                vs->aNrm = ureg_src(aNrm_dst);
5567ec681f3Smrg            }
5577ec681f3Smrg        }
5587ec681f3Smrg
5597ec681f3Smrg        /* position = vertex * WORLD_VIEW_PROJ */
5607ec681f3Smrg        ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(0));
5617ec681f3Smrg        ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(1), ureg_src(tmp));
5627ec681f3Smrg        ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(2), ureg_src(tmp));
5637ec681f3Smrg        ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(tmp));
5647ec681f3Smrg        ureg_release_temporary(ureg, tmp);
5657ec681f3Smrg
5667ec681f3Smrg        if (need_aVtx) {
5677ec681f3Smrg            struct ureg_dst aVtx_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
5687ec681f3Smrg            ureg_MUL(ureg, aVtx_dst, _XXXX(vs->aVtx), _CONST(4));
5697ec681f3Smrg            ureg_MAD(ureg, aVtx_dst, _YYYY(vs->aVtx), _CONST(5), ureg_src(aVtx_dst));
5707ec681f3Smrg            ureg_MAD(ureg, aVtx_dst, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(aVtx_dst));
5717ec681f3Smrg            ureg_MAD(ureg, aVtx_dst, _WWWW(vs->aVtx), _CONST(7), ureg_src(aVtx_dst));
5727ec681f3Smrg            vs->aVtx = ureg_src(aVtx_dst);
5737ec681f3Smrg        }
5747ec681f3Smrg        if (has_aNrm) {
5757ec681f3Smrg            struct ureg_dst aNrm_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
5767ec681f3Smrg            ureg_MUL(ureg, aNrm_dst, _XXXX(vs->aNrm), _CONST(16));
5777ec681f3Smrg            ureg_MAD(ureg, aNrm_dst, _YYYY(vs->aNrm), _CONST(17), ureg_src(aNrm_dst));
5787ec681f3Smrg            ureg_MAD(ureg, aNrm_dst, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(aNrm_dst));
5797ec681f3Smrg            if (key->normalizenormals)
5807ec681f3Smrg               ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
5817ec681f3Smrg            vs->aNrm = ureg_src(aNrm_dst);
5827ec681f3Smrg        }
5837ec681f3Smrg    }
5847ec681f3Smrg
5857ec681f3Smrg    /* === Process point size:
5867ec681f3Smrg     */
5877ec681f3Smrg    if (key->vertexpointsize || key->pointscale) {
5887ec681f3Smrg        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
5897ec681f3Smrg        struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
5907ec681f3Smrg        struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
5917ec681f3Smrg        struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
5927ec681f3Smrg        if (key->vertexpointsize) {
5937ec681f3Smrg            struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
5947ec681f3Smrg            ureg_MAX(ureg, tmp_z, _XXXX(vs->aPsz), _XXXX(cPsz1));
5957ec681f3Smrg            ureg_MIN(ureg, tmp_z, _Z(tmp), _YYYY(cPsz1));
5967ec681f3Smrg        } else {
5977ec681f3Smrg            struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
5987ec681f3Smrg            ureg_MOV(ureg, tmp_z, _ZZZZ(cPsz1));
5997ec681f3Smrg        }
6007ec681f3Smrg
6017ec681f3Smrg        if (key->pointscale) {
6027ec681f3Smrg            struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
6037ec681f3Smrg            struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
6047ec681f3Smrg
6057ec681f3Smrg            ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
6067ec681f3Smrg            ureg_RSQ(ureg, tmp_y, _X(tmp));
6077ec681f3Smrg            ureg_MUL(ureg, tmp_y, _Y(tmp), _X(tmp));
6087ec681f3Smrg            ureg_CMP(ureg, tmp_y, ureg_negate(_Y(tmp)), _Y(tmp), ureg_imm1f(ureg, 0.0f));
6097ec681f3Smrg            ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
6107ec681f3Smrg            ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
6117ec681f3Smrg            ureg_RSQ(ureg, tmp_x, _X(tmp));
6127ec681f3Smrg            ureg_MUL(ureg, tmp_x, _X(tmp), _Z(tmp));
6137ec681f3Smrg            ureg_MUL(ureg, tmp_x, _X(tmp), _WWWW(_CONST(100)));
6147ec681f3Smrg            ureg_MAX(ureg, tmp_x, _X(tmp), _XXXX(cPsz1));
6157ec681f3Smrg            ureg_MIN(ureg, tmp_z, _X(tmp), _YYYY(cPsz1));
6167ec681f3Smrg        }
6177ec681f3Smrg
6187ec681f3Smrg        ureg_MOV(ureg, oPsz, _Z(tmp));
6197ec681f3Smrg        ureg_release_temporary(ureg, tmp);
6207ec681f3Smrg    }
6217ec681f3Smrg
6227ec681f3Smrg    for (i = 0; i < 8; ++i) {
6237ec681f3Smrg        struct ureg_dst tmp, tmp_x, tmp2;
6247ec681f3Smrg        struct ureg_dst oTex, input_coord, transformed, t, aVtx_normed;
6257ec681f3Smrg        unsigned c, writemask;
6267ec681f3Smrg        const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
6277ec681f3Smrg        const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
6287ec681f3Smrg        unsigned dim_input = 1 + ((key->tc_dim_input >> (i * 2)) & 0x3);
6297ec681f3Smrg        const unsigned dim_output = (key->tc_dim_output >> (i * 3)) & 0x7;
6307ec681f3Smrg
6317ec681f3Smrg        /* No texture output of index s */
6327ec681f3Smrg        if (tci == NINED3DTSS_TCI_DISABLE)
6337ec681f3Smrg            continue;
6347ec681f3Smrg        oTex = ureg_DECL_output(ureg, texcoord_sn, i);
6357ec681f3Smrg        tmp = ureg_DECL_temporary(ureg);
6367ec681f3Smrg        tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
6377ec681f3Smrg        input_coord = ureg_DECL_temporary(ureg);
6387ec681f3Smrg        transformed = ureg_DECL_temporary(ureg);
6397ec681f3Smrg
6407ec681f3Smrg        /* Get the coordinate */
6417ec681f3Smrg        switch (tci) {
6427ec681f3Smrg        case NINED3DTSS_TCI_PASSTHRU:
6437ec681f3Smrg            /* NINED3DTSS_TCI_PASSTHRU => Use texcoord coming from index idx *
6447ec681f3Smrg             * Else the idx is used only to determine wrapping mode. */
6457ec681f3Smrg            vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
6467ec681f3Smrg            ureg_MOV(ureg, input_coord, vs->aTex[idx]);
6477ec681f3Smrg            break;
6487ec681f3Smrg        case NINED3DTSS_TCI_CAMERASPACENORMAL:
6497ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aNrm);
6507ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
6517ec681f3Smrg            dim_input = 4;
6527ec681f3Smrg            break;
6537ec681f3Smrg        case NINED3DTSS_TCI_CAMERASPACEPOSITION:
6547ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aVtx);
6557ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
6567ec681f3Smrg            dim_input = 4;
6577ec681f3Smrg            break;
6587ec681f3Smrg        case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
6597ec681f3Smrg            tmp.WriteMask = TGSI_WRITEMASK_XYZ;
6607ec681f3Smrg            aVtx_normed = ureg_DECL_temporary(ureg);
6617ec681f3Smrg            ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
6627ec681f3Smrg            ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
6637ec681f3Smrg            ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
6647ec681f3Smrg            ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
6657ec681f3Smrg            ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
6667ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
6677ec681f3Smrg            ureg_release_temporary(ureg, aVtx_normed);
6687ec681f3Smrg            dim_input = 4;
6697ec681f3Smrg            tmp.WriteMask = TGSI_WRITEMASK_XYZW;
6707ec681f3Smrg            break;
6717ec681f3Smrg        case NINED3DTSS_TCI_SPHEREMAP:
6727ec681f3Smrg            /* Implement the formula of GL_SPHERE_MAP */
6737ec681f3Smrg            tmp.WriteMask = TGSI_WRITEMASK_XYZ;
6747ec681f3Smrg            aVtx_normed = ureg_DECL_temporary(ureg);
6757ec681f3Smrg            tmp2 = ureg_DECL_temporary(ureg);
6767ec681f3Smrg            ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
6777ec681f3Smrg            ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
6787ec681f3Smrg            ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
6797ec681f3Smrg            ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
6807ec681f3Smrg            ureg_ADD(ureg, tmp, ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
6817ec681f3Smrg            /* now tmp = normed(Vtx) - 2 dot3(normed(Vtx), Nrm) Nrm */
6827ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_XYZ), ureg_src(tmp));
6837ec681f3Smrg            ureg_MUL(ureg, tmp2, ureg_src(tmp2), ureg_src(tmp2));
6847ec681f3Smrg            ureg_DP3(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_src(tmp2));
6857ec681f3Smrg            ureg_RSQ(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2));
6867ec681f3Smrg            ureg_MUL(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_imm1f(ureg, 0.5f));
6877ec681f3Smrg            /* tmp2 = 0.5 / sqrt(tmp.x^2 + tmp.y^2 + (tmp.z+1)^2)
6887ec681f3Smrg             * TODO: z coordinates are a bit different gl vs d3d, should the formula be adapted ? */
6897ec681f3Smrg            ureg_MUL(ureg, tmp, ureg_src(tmp), _X(tmp2));
6907ec681f3Smrg            ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
6917ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_ZW), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
6927ec681f3Smrg            ureg_release_temporary(ureg, aVtx_normed);
6937ec681f3Smrg            ureg_release_temporary(ureg, tmp2);
6947ec681f3Smrg            dim_input = 4;
6957ec681f3Smrg            tmp.WriteMask = TGSI_WRITEMASK_XYZW;
6967ec681f3Smrg            break;
6977ec681f3Smrg        default:
6987ec681f3Smrg            assert(0);
6997ec681f3Smrg            break;
7007ec681f3Smrg        }
7017ec681f3Smrg
7027ec681f3Smrg        /* Apply the transformation */
7037ec681f3Smrg        /* dim_output == 0 => do not transform the components.
7047ec681f3Smrg         * XYZRHW also disables transformation */
7057ec681f3Smrg        if (!dim_output || key->position_t) {
7067ec681f3Smrg            ureg_release_temporary(ureg, transformed);
7077ec681f3Smrg            transformed = input_coord;
7087ec681f3Smrg            writemask = TGSI_WRITEMASK_XYZW;
7097ec681f3Smrg        } else {
7107ec681f3Smrg            for (c = 0; c < dim_output; c++) {
7117ec681f3Smrg                t = ureg_writemask(transformed, 1 << c);
7127ec681f3Smrg                switch (dim_input) {
7137ec681f3Smrg                /* dim_input = 1 2 3: -> we add trailing 1 to input*/
7147ec681f3Smrg                case 1: ureg_MAD(ureg, t, _X(input_coord), _XXXX(_CONST(128 + i * 4 + c)), _YYYY(_CONST(128 + i * 4 + c)));
7157ec681f3Smrg                        break;
7167ec681f3Smrg                case 2: ureg_DP2(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
7177ec681f3Smrg                        ureg_ADD(ureg, t, ureg_src(transformed), _ZZZZ(_CONST(128 + i * 4 + c)));
7187ec681f3Smrg                        break;
7197ec681f3Smrg                case 3: ureg_DP3(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
7207ec681f3Smrg                        ureg_ADD(ureg, t, ureg_src(transformed), _WWWW(_CONST(128 + i * 4 + c)));
7217ec681f3Smrg                        break;
7227ec681f3Smrg                case 4: ureg_DP4(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c)); break;
7237ec681f3Smrg                default:
7247ec681f3Smrg                    assert(0);
7257ec681f3Smrg                }
7267ec681f3Smrg            }
7277ec681f3Smrg            writemask = (1 << dim_output) - 1;
7287ec681f3Smrg            ureg_release_temporary(ureg, input_coord);
7297ec681f3Smrg        }
7307ec681f3Smrg
7317ec681f3Smrg        ureg_MOV(ureg, ureg_writemask(oTex, writemask), ureg_src(transformed));
7327ec681f3Smrg        ureg_release_temporary(ureg, transformed);
7337ec681f3Smrg        ureg_release_temporary(ureg, tmp);
7347ec681f3Smrg    }
7357ec681f3Smrg
7367ec681f3Smrg    /* === Lighting:
7377ec681f3Smrg     *
7387ec681f3Smrg     * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
7397ec681f3Smrg     * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
7407ec681f3Smrg     * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
7417ec681f3Smrg     *
7427ec681f3Smrg     * vec3 normal = normalize(in.Normal * NormalMatrix);
7437ec681f3Smrg     * vec3 hitDir = light.direction;
7447ec681f3Smrg     * float atten = 1.0;
7457ec681f3Smrg     *
7467ec681f3Smrg     * if (light.type != DIRECTIONAL)
7477ec681f3Smrg     * {
7487ec681f3Smrg     *     vec3 hitVec = light.position - eyeVertex;
7497ec681f3Smrg     *     float d = length(hitVec);
7507ec681f3Smrg     *     hitDir = hitVec / d;
7517ec681f3Smrg     *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
7527ec681f3Smrg     * }
7537ec681f3Smrg     *
     * if (light.type == SPOTLIGHT)
     * {
     *     float rho = dp3(-hitDir, light.direction);
     *     if (rho < cos(light.phi / 2))
     *         atten = 0;
     *     if (rho < cos(light.theta / 2))
     *         atten *= pow(some_func(rho), light.falloff);
     * }
     *
     * float nDotHit = dp3_sat(normal, hitDir);
     * float powFact = 0.0;
     *
     * if (nDotHit > 0.0)
     * {
     *     vec3 midVec = normalize(hitDir + eye);
     *     float nDotMid = dp3_sat(normal, midVec);
     *     powFact = pow(nDotMid, material.power);
     * }
7727ec681f3Smrg     *
7737ec681f3Smrg     * ambient += light.ambient * atten;
7747ec681f3Smrg     * diffuse += light.diffuse * atten * nDotHit;
7757ec681f3Smrg     * specular += light.specular * atten * powFact;
7767ec681f3Smrg     */
7777ec681f3Smrg    if (key->lighting) {
7787ec681f3Smrg        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
7797ec681f3Smrg        struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
7807ec681f3Smrg        struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
7817ec681f3Smrg        struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
7827ec681f3Smrg        struct ureg_dst rAtt = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
7837ec681f3Smrg        struct ureg_dst rHit = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
7847ec681f3Smrg        struct ureg_dst rMid = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
7857ec681f3Smrg
7867ec681f3Smrg        struct ureg_dst rCtr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
7877ec681f3Smrg
7887ec681f3Smrg        struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);
7897ec681f3Smrg
7907ec681f3Smrg        /* Light.*.Alpha is not used. */
7917ec681f3Smrg        struct ureg_dst rD = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
7927ec681f3Smrg        struct ureg_dst rA = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
7937ec681f3Smrg        struct ureg_dst rS = ureg_DECL_temporary(ureg);
7947ec681f3Smrg
7957ec681f3Smrg        struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));
7967ec681f3Smrg
7977ec681f3Smrg        struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
7987ec681f3Smrg        struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
7997ec681f3Smrg        struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
8007ec681f3Smrg        struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
8017ec681f3Smrg        struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
8027ec681f3Smrg        struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
8037ec681f3Smrg        struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
8047ec681f3Smrg        struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
8057ec681f3Smrg        struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
8067ec681f3Smrg        struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
8077ec681f3Smrg        struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
8087ec681f3Smrg        struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
8097ec681f3Smrg        struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
8107ec681f3Smrg        struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
8117ec681f3Smrg        struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));
8127ec681f3Smrg
8137ec681f3Smrg        const unsigned loop_label = l++;
8147ec681f3Smrg
        /* Declare all light constants to allow indirect addressing */
8167ec681f3Smrg        for (i = 32; i < 96; i++)
8177ec681f3Smrg            ureg_DECL_constant(ureg, i);
8187ec681f3Smrg
8197ec681f3Smrg        ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
8207ec681f3Smrg        ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
8217ec681f3Smrg        ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
8227ec681f3Smrg        ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
8237ec681f3Smrg
8247ec681f3Smrg        /* loop management */
8257ec681f3Smrg        ureg_BGNLOOP(ureg, &label[loop_label]);
8267ec681f3Smrg        ureg_ARL(ureg, AL, _W(rCtr));
8277ec681f3Smrg
8287ec681f3Smrg        /* if (not DIRECTIONAL light): */
8297ec681f3Smrg        ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
8307ec681f3Smrg        ureg_MOV(ureg, rHit, ureg_negate(cLDir));
8317ec681f3Smrg        ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
8327ec681f3Smrg        ureg_IF(ureg, _X(tmp), &label[l++]);
8337ec681f3Smrg        {
8347ec681f3Smrg            /* hitDir = light.position - eyeVtx
8357ec681f3Smrg             * d = length(hitDir)
8367ec681f3Smrg             */
8377ec681f3Smrg            ureg_ADD(ureg, rHit, cLPos, ureg_negate(vs->aVtx));
8387ec681f3Smrg            ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
8397ec681f3Smrg            ureg_RSQ(ureg, tmp_y, _X(tmp));
8407ec681f3Smrg            ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
8417ec681f3Smrg
8427ec681f3Smrg            /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
8437ec681f3Smrg            ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
8447ec681f3Smrg            ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
8457ec681f3Smrg            ureg_RCP(ureg, rAtt, _W(rAtt));
8467ec681f3Smrg            /* cut-off if distance exceeds Light.Range */
8477ec681f3Smrg            ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
8487ec681f3Smrg            ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
8497ec681f3Smrg        }
8507ec681f3Smrg        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
8517ec681f3Smrg        ureg_ENDIF(ureg);
8527ec681f3Smrg
8537ec681f3Smrg        /* normalize hitDir */
8547ec681f3Smrg        ureg_normalize3(ureg, rHit, ureg_src(rHit));
8557ec681f3Smrg
8567ec681f3Smrg        /* if (SPOT light) */
8577ec681f3Smrg        ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
8587ec681f3Smrg        ureg_IF(ureg, _X(tmp), &label[l++]);
8597ec681f3Smrg        {
8607ec681f3Smrg            /* rho = dp3(-hitDir, light.spotDir)
8617ec681f3Smrg             *
8627ec681f3Smrg             * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
8637ec681f3Smrg             *     spotAtt = 1
8647ec681f3Smrg             * else
8657ec681f3Smrg             * if (rho <= light.cphi2)
8667ec681f3Smrg             *     spotAtt = 0
8677ec681f3Smrg             * else
8687ec681f3Smrg             *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
8697ec681f3Smrg             */
8707ec681f3Smrg            ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
8717ec681f3Smrg            ureg_ADD(ureg, tmp_x, _Y(tmp), ureg_negate(cLPhi));
8727ec681f3Smrg            ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
8737ec681f3Smrg            ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
8747ec681f3Smrg            ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
8757ec681f3Smrg            ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
8767ec681f3Smrg            ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
8777ec681f3Smrg            ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
8787ec681f3Smrg        }
8797ec681f3Smrg        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
8807ec681f3Smrg        ureg_ENDIF(ureg);
8817ec681f3Smrg
8827ec681f3Smrg        /* directional factors, let's not use LIT because of clarity */
8837ec681f3Smrg
8847ec681f3Smrg        if (has_aNrm) {
8857ec681f3Smrg            if (key->localviewer) {
8867ec681f3Smrg                ureg_normalize3(ureg, rMid, vs->aVtx);
8877ec681f3Smrg                ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
8887ec681f3Smrg            } else {
8897ec681f3Smrg                ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, -1.0f));
8907ec681f3Smrg            }
8917ec681f3Smrg            ureg_normalize3(ureg, rMid, ureg_src(rMid));
8927ec681f3Smrg            ureg_DP3(ureg, ureg_saturate(tmp_x), vs->aNrm, ureg_src(rHit));
8937ec681f3Smrg            ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
8947ec681f3Smrg            ureg_MUL(ureg, tmp_z, _X(tmp), _Y(tmp));
8957ec681f3Smrg            /* Tests show that specular is computed only if (dp3(normal,hitDir) > 0).
8967ec681f3Smrg             * For front facing, it is more restrictive than test (dp3(normal,mid) > 0).
8977ec681f3Smrg             * No tests were made for backfacing, so add the two conditions */
8987ec681f3Smrg            ureg_IF(ureg, _Z(tmp), &label[l++]);
8997ec681f3Smrg            {
9007ec681f3Smrg                ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
9017ec681f3Smrg                ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);
9027ec681f3Smrg                ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
9037ec681f3Smrg                ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
9047ec681f3Smrg            }
9057ec681f3Smrg            ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
9067ec681f3Smrg            ureg_ENDIF(ureg);
9077ec681f3Smrg
9087ec681f3Smrg            ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
9097ec681f3Smrg            ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
9107ec681f3Smrg        }
9117ec681f3Smrg
9127ec681f3Smrg        ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */
9137ec681f3Smrg
9147ec681f3Smrg        /* break if this was the last light */
9157ec681f3Smrg        ureg_IF(ureg, cLLast, &label[l++]);
9167ec681f3Smrg        ureg_BRK(ureg);
9177ec681f3Smrg        ureg_ENDIF(ureg);
9187ec681f3Smrg        ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
9197ec681f3Smrg
9207ec681f3Smrg        ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
9217ec681f3Smrg        ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
9227ec681f3Smrg        ureg_ENDLOOP(ureg, &label[loop_label]);
9237ec681f3Smrg
9247ec681f3Smrg        /* Apply to material:
9257ec681f3Smrg         *
9267ec681f3Smrg         * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
9277ec681f3Smrg         *           material.ambient * ambient +
9287ec681f3Smrg         *           material.diffuse * diffuse +
9297ec681f3Smrg         * oCol[1] = material.specular * specular;
9307ec681f3Smrg         */
9317ec681f3Smrg        if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
9327ec681f3Smrg            ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), vs->mtlA, _CONST(19));
9337ec681f3Smrg        else {
9347ec681f3Smrg            ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
9357ec681f3Smrg            ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
9367ec681f3Smrg        }
9377ec681f3Smrg
9387ec681f3Smrg        ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), ureg_src(rD), vs->mtlD, ureg_src(tmp));
9397ec681f3Smrg        ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
9407ec681f3Smrg        ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
9417ec681f3Smrg        ureg_release_temporary(ureg, rAtt);
9427ec681f3Smrg        ureg_release_temporary(ureg, rHit);
9437ec681f3Smrg        ureg_release_temporary(ureg, rMid);
9447ec681f3Smrg        ureg_release_temporary(ureg, rCtr);
9457ec681f3Smrg        ureg_release_temporary(ureg, rD);
9467ec681f3Smrg        ureg_release_temporary(ureg, rA);
9477ec681f3Smrg        ureg_release_temporary(ureg, rS);
9487ec681f3Smrg        ureg_release_temporary(ureg, rAtt);
9497ec681f3Smrg        ureg_release_temporary(ureg, tmp);
9507ec681f3Smrg    } else
9517ec681f3Smrg    /* COLOR */
9527ec681f3Smrg    if (key->darkness) {
9537ec681f3Smrg        if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
9547ec681f3Smrg            ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _CONST(19));
9557ec681f3Smrg        else
9567ec681f3Smrg            ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
9577ec681f3Smrg        ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
9587ec681f3Smrg        ureg_MOV(ureg, oCol[1], ureg_imm1f(ureg, 0.0f));
9597ec681f3Smrg    } else {
9607ec681f3Smrg        ureg_MOV(ureg, oCol[0], vs->aCol[0]);
9617ec681f3Smrg        ureg_MOV(ureg, oCol[1], vs->aCol[1]);
9627ec681f3Smrg    }
9637ec681f3Smrg
9647ec681f3Smrg    /* === Process fog.
9657ec681f3Smrg     *
9667ec681f3Smrg     * exp(x) = ex2(log2(e) * x)
9677ec681f3Smrg     */
9687ec681f3Smrg    if (key->fog_mode) {
9697ec681f3Smrg        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
9707ec681f3Smrg        struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
9717ec681f3Smrg        struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
9727ec681f3Smrg        if (key->fog_range) {
9737ec681f3Smrg            ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
9747ec681f3Smrg            ureg_RSQ(ureg, tmp_z, _X(tmp));
9757ec681f3Smrg            ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
9767ec681f3Smrg        } else {
9777ec681f3Smrg            ureg_MOV(ureg, tmp_z, ureg_abs(_ZZZZ(vs->aVtx)));
9787ec681f3Smrg        }
9797ec681f3Smrg
9807ec681f3Smrg        if (key->fog_mode == D3DFOG_EXP) {
9817ec681f3Smrg            ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
9827ec681f3Smrg            ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
9837ec681f3Smrg            ureg_EX2(ureg, tmp_x, _X(tmp));
9847ec681f3Smrg        } else
9857ec681f3Smrg        if (key->fog_mode == D3DFOG_EXP2) {
9867ec681f3Smrg            ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
9877ec681f3Smrg            ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
9887ec681f3Smrg            ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
9897ec681f3Smrg            ureg_EX2(ureg, tmp_x, _X(tmp));
9907ec681f3Smrg        } else
9917ec681f3Smrg        if (key->fog_mode == D3DFOG_LINEAR) {
9927ec681f3Smrg            ureg_ADD(ureg, tmp_x, _XXXX(_CONST(28)), ureg_negate(_Z(tmp)));
9937ec681f3Smrg            ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
9947ec681f3Smrg        }
9957ec681f3Smrg        ureg_MOV(ureg, oFog, _X(tmp));
9967ec681f3Smrg        ureg_release_temporary(ureg, tmp);
9977ec681f3Smrg    } else if (key->fog && !(key->passthrough & (1 << NINE_DECLUSAGE_FOG))) {
9987ec681f3Smrg        ureg_MOV(ureg, oFog, ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
9997ec681f3Smrg    }
10007ec681f3Smrg
10017ec681f3Smrg    if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT)) {
10027ec681f3Smrg        struct ureg_src input;
10037ec681f3Smrg        struct ureg_dst output;
10047ec681f3Smrg        input = vs->aWgt;
10057ec681f3Smrg        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 19);
10067ec681f3Smrg        ureg_MOV(ureg, output, input);
10077ec681f3Smrg    }
10087ec681f3Smrg    if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES)) {
10097ec681f3Smrg        struct ureg_src input;
10107ec681f3Smrg        struct ureg_dst output;
10117ec681f3Smrg        input = vs->aInd;
10127ec681f3Smrg        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 20);
10137ec681f3Smrg        ureg_MOV(ureg, output, input);
10147ec681f3Smrg    }
10157ec681f3Smrg    if (key->passthrough & (1 << NINE_DECLUSAGE_NORMAL)) {
10167ec681f3Smrg        struct ureg_src input;
10177ec681f3Smrg        struct ureg_dst output;
10187ec681f3Smrg        input = vs->aNrm;
10197ec681f3Smrg        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 21);
10207ec681f3Smrg        ureg_MOV(ureg, output, input);
10217ec681f3Smrg    }
10227ec681f3Smrg    if (key->passthrough & (1 << NINE_DECLUSAGE_TANGENT)) {
10237ec681f3Smrg        struct ureg_src input;
10247ec681f3Smrg        struct ureg_dst output;
10257ec681f3Smrg        input = build_vs_add_input(vs, NINE_DECLUSAGE_TANGENT);
10267ec681f3Smrg        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 22);
10277ec681f3Smrg        ureg_MOV(ureg, output, input);
10287ec681f3Smrg    }
10297ec681f3Smrg    if (key->passthrough & (1 << NINE_DECLUSAGE_BINORMAL)) {
10307ec681f3Smrg        struct ureg_src input;
10317ec681f3Smrg        struct ureg_dst output;
10327ec681f3Smrg        input = build_vs_add_input(vs, NINE_DECLUSAGE_BINORMAL);
10337ec681f3Smrg        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 23);
10347ec681f3Smrg        ureg_MOV(ureg, output, input);
10357ec681f3Smrg    }
10367ec681f3Smrg    if (key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
10377ec681f3Smrg        struct ureg_src input;
10387ec681f3Smrg        struct ureg_dst output;
10397ec681f3Smrg        input = build_vs_add_input(vs, NINE_DECLUSAGE_FOG);
10407ec681f3Smrg        input = ureg_scalar(input, TGSI_SWIZZLE_X);
10417ec681f3Smrg        output = oFog;
10427ec681f3Smrg        ureg_MOV(ureg, output, input);
10437ec681f3Smrg    }
10447ec681f3Smrg    if (key->passthrough & (1 << NINE_DECLUSAGE_DEPTH)) {
10457ec681f3Smrg        (void) 0; /* TODO: replace z of position output ? */
10467ec681f3Smrg    }
10477ec681f3Smrg
10487ec681f3Smrg    /* ucp for ff applies on world coordinates.
10497ec681f3Smrg     * aVtx is in worldview coordinates. */
10507ec681f3Smrg    if (key->ucp) {
10517ec681f3Smrg        struct ureg_dst clipVect = ureg_DECL_output(ureg, TGSI_SEMANTIC_CLIPVERTEX, 0);
10527ec681f3Smrg        struct ureg_dst tmp = ureg_DECL_temporary(ureg);
10537ec681f3Smrg        ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(12));
10547ec681f3Smrg        ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(13),  ureg_src(tmp));
10557ec681f3Smrg        ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(14), ureg_src(tmp));
10567ec681f3Smrg        ureg_ADD(ureg, clipVect, _CONST(15), ureg_src(tmp));
10577ec681f3Smrg        ureg_release_temporary(ureg, tmp);
10587ec681f3Smrg    }
10597ec681f3Smrg
10607ec681f3Smrg    if (key->position_t && device->driver_caps.window_space_position_support)
10617ec681f3Smrg        ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
10627ec681f3Smrg
10637ec681f3Smrg    ureg_END(ureg);
10647ec681f3Smrg    nine_ureg_tgsi_dump(ureg, FALSE);
10657ec681f3Smrg    return nine_create_shader_with_so_and_destroy(ureg, device->context.pipe, NULL);
10667ec681f3Smrg}
10677ec681f3Smrg
/* PS FF constants layout:
 *
 * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
 * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
 * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
 * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
 * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
 * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
 * CONST[16..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
 *
 * CONST[20] D3DRS_TEXTUREFACTOR
 * CONST[21] D3DRS_FOGCOLOR
 * CONST[22].x___ RS.FogEnd
 * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
 * CONST[22].__z_ RS.FogDensity
 */
10847ec681f3Smrgstruct ps_build_ctx
10857ec681f3Smrg{
10867ec681f3Smrg    struct ureg_program *ureg;
10877ec681f3Smrg
10887ec681f3Smrg    struct ureg_src vC[2]; /* DIFFUSE, SPECULAR */
10897ec681f3Smrg    struct ureg_src vT[8]; /* TEXCOORD[i] */
10907ec681f3Smrg    struct ureg_dst rCur; /* D3DTA_CURRENT */
10917ec681f3Smrg    struct ureg_dst rMod;
10927ec681f3Smrg    struct ureg_src rCurSrc;
10937ec681f3Smrg    struct ureg_dst rTmp; /* D3DTA_TEMP */
10947ec681f3Smrg    struct ureg_src rTmpSrc;
10957ec681f3Smrg    struct ureg_dst rTex;
10967ec681f3Smrg    struct ureg_src rTexSrc;
10977ec681f3Smrg    struct ureg_src cBEM[8];
10987ec681f3Smrg    struct ureg_src s[8];
10997ec681f3Smrg
11007ec681f3Smrg    struct {
11017ec681f3Smrg        unsigned index;
11027ec681f3Smrg        unsigned index_pre_mod;
11037ec681f3Smrg    } stage;
11047ec681f3Smrg};
11057ec681f3Smrg
11067ec681f3Smrgstatic struct ureg_src
11077ec681f3Smrgps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
11087ec681f3Smrg{
11097ec681f3Smrg    struct ureg_src reg;
11107ec681f3Smrg
11117ec681f3Smrg    switch (ta & D3DTA_SELECTMASK) {
11127ec681f3Smrg    case D3DTA_CONSTANT:
11137ec681f3Smrg        reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
11147ec681f3Smrg        break;
11157ec681f3Smrg    case D3DTA_CURRENT:
11167ec681f3Smrg        reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
11177ec681f3Smrg        break;
11187ec681f3Smrg    case D3DTA_DIFFUSE:
11197ec681f3Smrg        reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
11207ec681f3Smrg        break;
11217ec681f3Smrg    case D3DTA_SPECULAR:
11227ec681f3Smrg        reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
11237ec681f3Smrg        break;
11247ec681f3Smrg    case D3DTA_TEMP:
11257ec681f3Smrg        reg = ps->rTmpSrc;
11267ec681f3Smrg        break;
11277ec681f3Smrg    case D3DTA_TEXTURE:
11287ec681f3Smrg        reg = ps->rTexSrc;
11297ec681f3Smrg        break;
11307ec681f3Smrg    case D3DTA_TFACTOR:
11317ec681f3Smrg        reg = ureg_DECL_constant(ps->ureg, 20);
11327ec681f3Smrg        break;
11337ec681f3Smrg    default:
11347ec681f3Smrg        assert(0);
11357ec681f3Smrg        reg = ureg_src_undef();
11367ec681f3Smrg        break;
11377ec681f3Smrg    }
11387ec681f3Smrg    if (ta & D3DTA_COMPLEMENT) {
11397ec681f3Smrg        struct ureg_dst dst = ureg_DECL_temporary(ps->ureg);
11407ec681f3Smrg        ureg_ADD(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), ureg_negate(reg));
11417ec681f3Smrg        reg = ureg_src(dst);
11427ec681f3Smrg    }
11437ec681f3Smrg    if (ta & D3DTA_ALPHAREPLICATE)
11447ec681f3Smrg        reg = _WWWW(reg);
11457ec681f3Smrg    return reg;
11467ec681f3Smrg}
11477ec681f3Smrg
11487ec681f3Smrgstatic struct ureg_dst
11497ec681f3Smrgps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
11507ec681f3Smrg{
11517ec681f3Smrg    assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));
11527ec681f3Smrg
11537ec681f3Smrg    switch (ta & D3DTA_SELECTMASK) {
11547ec681f3Smrg    case D3DTA_CURRENT:
11557ec681f3Smrg        return ps->rCur;
11567ec681f3Smrg    case D3DTA_TEMP:
11577ec681f3Smrg        return ps->rTmp;
11587ec681f3Smrg    default:
11597ec681f3Smrg        assert(0);
11607ec681f3Smrg        return ureg_dst_undef();
11617ec681f3Smrg    }
11627ec681f3Smrg}
11637ec681f3Smrg
11647ec681f3Smrgstatic uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
11657ec681f3Smrg{
11667ec681f3Smrg    switch (top) {
11677ec681f3Smrg    case D3DTOP_DISABLE:
11687ec681f3Smrg        return 0x0;
11697ec681f3Smrg    case D3DTOP_SELECTARG1:
11707ec681f3Smrg    case D3DTOP_PREMODULATE:
11717ec681f3Smrg        return 0x2;
11727ec681f3Smrg    case D3DTOP_SELECTARG2:
11737ec681f3Smrg        return 0x4;
11747ec681f3Smrg    case D3DTOP_MULTIPLYADD:
11757ec681f3Smrg    case D3DTOP_LERP:
11767ec681f3Smrg        return 0x7;
11777ec681f3Smrg    default:
11787ec681f3Smrg        return 0x6;
11797ec681f3Smrg    }
11807ec681f3Smrg}
11817ec681f3Smrg
11827ec681f3Smrgstatic inline boolean
11837ec681f3Smrgis_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
11847ec681f3Smrg{
11857ec681f3Smrg    return !dst.WriteMask ||
11867ec681f3Smrg        (dst.File == src.File &&
11877ec681f3Smrg         dst.Index == src.Index &&
11887ec681f3Smrg         !dst.Indirect &&
11897ec681f3Smrg         !dst.Saturate &&
11907ec681f3Smrg         !src.Indirect &&
11917ec681f3Smrg         !src.Negate &&
11927ec681f3Smrg         !src.Absolute &&
11937ec681f3Smrg         (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
11947ec681f3Smrg         (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
11957ec681f3Smrg         (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
11967ec681f3Smrg         (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));
11977ec681f3Smrg
11987ec681f3Smrg}
11997ec681f3Smrg
12007ec681f3Smrgstatic void
12017ec681f3Smrgps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
12027ec681f3Smrg{
12037ec681f3Smrg    struct ureg_program *ureg = ps->ureg;
12047ec681f3Smrg    struct ureg_dst tmp = ureg_DECL_temporary(ureg);
12057ec681f3Smrg    struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
12067ec681f3Smrg    struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
12077ec681f3Smrg
12087ec681f3Smrg    tmp.WriteMask = dst.WriteMask;
12097ec681f3Smrg
12107ec681f3Smrg    if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
12117ec681f3Smrg        top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
12127ec681f3Smrg        top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
12137ec681f3Smrg        top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
12147ec681f3Smrg        top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
12157ec681f3Smrg        top != D3DTOP_LERP)
12167ec681f3Smrg        dst = ureg_saturate(dst);
12177ec681f3Smrg
12187ec681f3Smrg    switch (top) {
12197ec681f3Smrg    case D3DTOP_SELECTARG1:
12207ec681f3Smrg        if (!is_MOV_no_op(dst, arg[1]))
12217ec681f3Smrg            ureg_MOV(ureg, dst, arg[1]);
12227ec681f3Smrg        break;
12237ec681f3Smrg    case D3DTOP_SELECTARG2:
12247ec681f3Smrg        if (!is_MOV_no_op(dst, arg[2]))
12257ec681f3Smrg            ureg_MOV(ureg, dst, arg[2]);
12267ec681f3Smrg        break;
12277ec681f3Smrg    case D3DTOP_MODULATE:
12287ec681f3Smrg        ureg_MUL(ureg, dst, arg[1], arg[2]);
12297ec681f3Smrg        break;
12307ec681f3Smrg    case D3DTOP_MODULATE2X:
12317ec681f3Smrg        ureg_MUL(ureg, tmp, arg[1], arg[2]);
12327ec681f3Smrg        ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
12337ec681f3Smrg        break;
12347ec681f3Smrg    case D3DTOP_MODULATE4X:
12357ec681f3Smrg        ureg_MUL(ureg, tmp, arg[1], arg[2]);
12367ec681f3Smrg        ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
12377ec681f3Smrg        break;
12387ec681f3Smrg    case D3DTOP_ADD:
12397ec681f3Smrg        ureg_ADD(ureg, dst, arg[1], arg[2]);
12407ec681f3Smrg        break;
12417ec681f3Smrg    case D3DTOP_ADDSIGNED:
12427ec681f3Smrg        ureg_ADD(ureg, tmp, arg[1], arg[2]);
12437ec681f3Smrg        ureg_ADD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, -0.5f));
12447ec681f3Smrg        break;
12457ec681f3Smrg    case D3DTOP_ADDSIGNED2X:
12467ec681f3Smrg        ureg_ADD(ureg, tmp, arg[1], arg[2]);
12477ec681f3Smrg        ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
12487ec681f3Smrg        break;
12497ec681f3Smrg    case D3DTOP_SUBTRACT:
12507ec681f3Smrg        ureg_ADD(ureg, dst, arg[1], ureg_negate(arg[2]));
12517ec681f3Smrg        break;
12527ec681f3Smrg    case D3DTOP_ADDSMOOTH:
12537ec681f3Smrg        ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
12547ec681f3Smrg        ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
12557ec681f3Smrg        break;
12567ec681f3Smrg    case D3DTOP_BLENDDIFFUSEALPHA:
12577ec681f3Smrg        ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
12587ec681f3Smrg        break;
12597ec681f3Smrg    case D3DTOP_BLENDTEXTUREALPHA:
12607ec681f3Smrg        /* XXX: alpha taken from previous stage, texture or result ? */
12617ec681f3Smrg        ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
12627ec681f3Smrg        break;
12637ec681f3Smrg    case D3DTOP_BLENDFACTORALPHA:
12647ec681f3Smrg        ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
12657ec681f3Smrg        break;
12667ec681f3Smrg    case D3DTOP_BLENDTEXTUREALPHAPM:
12677ec681f3Smrg        ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_W(ps->rTex)));
12687ec681f3Smrg        ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
12697ec681f3Smrg        break;
12707ec681f3Smrg    case D3DTOP_BLENDCURRENTALPHA:
12717ec681f3Smrg        ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
12727ec681f3Smrg        break;
12737ec681f3Smrg    case D3DTOP_PREMODULATE:
12747ec681f3Smrg        ureg_MOV(ureg, dst, arg[1]);
12757ec681f3Smrg        ps->stage.index_pre_mod = ps->stage.index + 1;
12767ec681f3Smrg        break;
12777ec681f3Smrg    case D3DTOP_MODULATEALPHA_ADDCOLOR:
12787ec681f3Smrg        ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
12797ec681f3Smrg        break;
12807ec681f3Smrg    case D3DTOP_MODULATECOLOR_ADDALPHA:
12817ec681f3Smrg        ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
12827ec681f3Smrg        break;
12837ec681f3Smrg    case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
12847ec681f3Smrg        ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_WWWW(arg[1])));
12857ec681f3Smrg        ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
12867ec681f3Smrg        break;
12877ec681f3Smrg    case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
12887ec681f3Smrg        ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
12897ec681f3Smrg        ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
12907ec681f3Smrg        break;
12917ec681f3Smrg    case D3DTOP_BUMPENVMAP:
12927ec681f3Smrg        break;
12937ec681f3Smrg    case D3DTOP_BUMPENVMAPLUMINANCE:
12947ec681f3Smrg        break;
12957ec681f3Smrg    case D3DTOP_DOTPRODUCT3:
12967ec681f3Smrg        ureg_ADD(ureg, tmp, arg[1], ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
12977ec681f3Smrg        ureg_ADD(ureg, tmp2, arg[2] , ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
12987ec681f3Smrg        ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
12997ec681f3Smrg        ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
13007ec681f3Smrg        break;
13017ec681f3Smrg    case D3DTOP_MULTIPLYADD:
13027ec681f3Smrg        ureg_MAD(ureg, dst, arg[1], arg[2], arg[0]);
13037ec681f3Smrg        break;
13047ec681f3Smrg    case D3DTOP_LERP:
13057ec681f3Smrg        ureg_LRP(ureg, dst, arg[0], arg[1], arg[2]);
13067ec681f3Smrg        break;
13077ec681f3Smrg    case D3DTOP_DISABLE:
13087ec681f3Smrg        /* no-op ? */
13097ec681f3Smrg        break;
13107ec681f3Smrg    default:
13117ec681f3Smrg        assert(!"invalid D3DTOP");
13127ec681f3Smrg        break;
13137ec681f3Smrg    }
13147ec681f3Smrg    ureg_release_temporary(ureg, tmp);
13157ec681f3Smrg    ureg_release_temporary(ureg, tmp2);
13167ec681f3Smrg}
13177ec681f3Smrg
13187ec681f3Smrgstatic void *
13197ec681f3Smrgnine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
13207ec681f3Smrg{
13217ec681f3Smrg    struct ps_build_ctx ps;
13227ec681f3Smrg    struct ureg_program *ureg = ureg_create(PIPE_SHADER_FRAGMENT);
13237ec681f3Smrg    struct ureg_dst oCol;
13247ec681f3Smrg    unsigned s;
13257ec681f3Smrg    const unsigned texcoord_sn = get_texcoord_sn(device->screen);
13267ec681f3Smrg
13277ec681f3Smrg    memset(&ps, 0, sizeof(ps));
13287ec681f3Smrg    ps.ureg = ureg;
13297ec681f3Smrg    ps.stage.index_pre_mod = -1;
13307ec681f3Smrg
13317ec681f3Smrg    ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, TGSI_INTERPOLATE_COLOR);
13327ec681f3Smrg
13337ec681f3Smrg    ps.rCur = ureg_DECL_temporary(ureg);
13347ec681f3Smrg    ps.rTmp = ureg_DECL_temporary(ureg);
13357ec681f3Smrg    ps.rTex = ureg_DECL_temporary(ureg);
13367ec681f3Smrg    ps.rCurSrc = ureg_src(ps.rCur);
13377ec681f3Smrg    ps.rTmpSrc = ureg_src(ps.rTmp);
13387ec681f3Smrg    ps.rTexSrc = ureg_src(ps.rTex);
13397ec681f3Smrg
13407ec681f3Smrg    /* Initial values */
13417ec681f3Smrg    ureg_MOV(ureg, ps.rCur, ps.vC[0]);
13427ec681f3Smrg    ureg_MOV(ureg, ps.rTmp, ureg_imm1f(ureg, 0.0f));
13437ec681f3Smrg    ureg_MOV(ureg, ps.rTex, ureg_imm1f(ureg, 0.0f));
13447ec681f3Smrg
13457ec681f3Smrg    for (s = 0; s < 8; ++s) {
13467ec681f3Smrg        ps.s[s] = ureg_src_undef();
13477ec681f3Smrg
13487ec681f3Smrg        if (key->ts[s].colorop != D3DTOP_DISABLE) {
13497ec681f3Smrg            if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
13507ec681f3Smrg                key->ts[s].colorarg1 == D3DTA_SPECULAR ||
13517ec681f3Smrg                key->ts[s].colorarg2 == D3DTA_SPECULAR)
13527ec681f3Smrg                ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
13537ec681f3Smrg
13547ec681f3Smrg            if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
13557ec681f3Smrg                key->ts[s].colorarg1 == D3DTA_TEXTURE ||
13567ec681f3Smrg                key->ts[s].colorarg2 == D3DTA_TEXTURE ||
13577ec681f3Smrg                key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHA ||
13587ec681f3Smrg                key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHAPM) {
13597ec681f3Smrg                ps.s[s] = ureg_DECL_sampler(ureg, s);
13607ec681f3Smrg                ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
13617ec681f3Smrg            }
13627ec681f3Smrg            if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
13637ec681f3Smrg                      key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
13647ec681f3Smrg                ps.s[s] = ureg_DECL_sampler(ureg, s);
13657ec681f3Smrg        }
13667ec681f3Smrg
13677ec681f3Smrg        if (key->ts[s].alphaop != D3DTOP_DISABLE) {
13687ec681f3Smrg            if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
13697ec681f3Smrg                key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
13707ec681f3Smrg                key->ts[s].alphaarg2 == D3DTA_SPECULAR)
13717ec681f3Smrg                ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
13727ec681f3Smrg
13737ec681f3Smrg            if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
13747ec681f3Smrg                key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
13757ec681f3Smrg                key->ts[s].alphaarg2 == D3DTA_TEXTURE ||
13767ec681f3Smrg                key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHA ||
13777ec681f3Smrg                key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHAPM) {
13787ec681f3Smrg                ps.s[s] = ureg_DECL_sampler(ureg, s);
13797ec681f3Smrg                ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
13807ec681f3Smrg            }
13817ec681f3Smrg        }
13827ec681f3Smrg    }
13837ec681f3Smrg    if (key->specular)
13847ec681f3Smrg        ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, TGSI_INTERPOLATE_COLOR);
13857ec681f3Smrg
13867ec681f3Smrg    oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
13877ec681f3Smrg
13887ec681f3Smrg    /* Run stages.
13897ec681f3Smrg     */
13907ec681f3Smrg    for (s = 0; s < 8; ++s) {
13917ec681f3Smrg        unsigned colorarg[3];
13927ec681f3Smrg        unsigned alphaarg[3];
13937ec681f3Smrg        const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
13947ec681f3Smrg        const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
13957ec681f3Smrg        struct ureg_dst dst;
13967ec681f3Smrg        struct ureg_src arg[3];
13977ec681f3Smrg
13987ec681f3Smrg        if (key->ts[s].colorop == D3DTOP_DISABLE) {
13997ec681f3Smrg            assert (key->ts[s].alphaop == D3DTOP_DISABLE);
14007ec681f3Smrg            continue;
14017ec681f3Smrg        }
14027ec681f3Smrg        ps.stage.index = s;
14037ec681f3Smrg
14047ec681f3Smrg        DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
14057ec681f3Smrg            nine_D3DTOP_to_str(key->ts[s].colorop),
14067ec681f3Smrg            nine_D3DTOP_to_str(key->ts[s].alphaop));
14077ec681f3Smrg
14087ec681f3Smrg        if (!ureg_src_is_undef(ps.s[s])) {
14097ec681f3Smrg            unsigned target;
14107ec681f3Smrg            struct ureg_src texture_coord = ps.vT[s];
14117ec681f3Smrg            struct ureg_dst delta;
14127ec681f3Smrg            switch (key->ts[s].textarget) {
14137ec681f3Smrg            case 0: target = TGSI_TEXTURE_1D; break;
14147ec681f3Smrg            case 1: target = TGSI_TEXTURE_2D; break;
14157ec681f3Smrg            case 2: target = TGSI_TEXTURE_3D; break;
14167ec681f3Smrg            case 3: target = TGSI_TEXTURE_CUBE; break;
14177ec681f3Smrg            /* this is a 2 bit bitfield, do I really need a default case ? */
14187ec681f3Smrg            }
14197ec681f3Smrg
14207ec681f3Smrg            /* Modify coordinates */
14217ec681f3Smrg            if (s >= 1 &&
14227ec681f3Smrg                (key->ts[s-1].colorop == D3DTOP_BUMPENVMAP ||
14237ec681f3Smrg                 key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)) {
14247ec681f3Smrg                delta = ureg_DECL_temporary(ureg);
14257ec681f3Smrg                /* Du' = D3DTSS_BUMPENVMAT00(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT10(stage s-1)*t(s-1)G */
14267ec681f3Smrg                ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _X(ps.rTex), _XXXX(_CONST(8 + s - 1)));
14277ec681f3Smrg                ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _Y(ps.rTex), _ZZZZ(_CONST(8 + s - 1)), ureg_src(delta));
14287ec681f3Smrg                /* Dv' = D3DTSS_BUMPENVMAT01(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT11(stage s-1)*t(s-1)G */
14297ec681f3Smrg                ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _X(ps.rTex), _YYYY(_CONST(8 + s - 1)));
14307ec681f3Smrg                ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _Y(ps.rTex), _WWWW(_CONST(8 + s - 1)), ureg_src(delta));
14317ec681f3Smrg                texture_coord = ureg_src(ureg_DECL_temporary(ureg));
14327ec681f3Smrg                ureg_MOV(ureg, ureg_writemask(ureg_dst(texture_coord), ureg_dst(ps.vT[s]).WriteMask), ps.vT[s]);
14337ec681f3Smrg                ureg_ADD(ureg, ureg_writemask(ureg_dst(texture_coord), TGSI_WRITEMASK_XY), texture_coord, ureg_src(delta));
14347ec681f3Smrg                /* Prepare luminance multiplier
14357ec681f3Smrg                 * t(s)RGBA = t(s)RGBA * clamp[(t(s-1)B * D3DTSS_BUMPENVLSCALE(stage s-1)) + D3DTSS_BUMPENVLOFFSET(stage s-1)] */
14367ec681f3Smrg                if (key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
14377ec681f3Smrg                    struct ureg_src bumpenvlscale = ((s-1) & 1) ? _ZZZZ(_CONST(16 + (s-1) / 2)) : _XXXX(_CONST(16 + (s-1) / 2));
14387ec681f3Smrg                    struct ureg_src bumpenvloffset = ((s-1) & 1) ? _WWWW(_CONST(16 + (s-1) / 2)) : _YYYY(_CONST(16 + (s-1) / 2));
14397ec681f3Smrg
14407ec681f3Smrg                    ureg_MAD(ureg, ureg_saturate(ureg_writemask(delta, TGSI_WRITEMASK_X)), _Z(ps.rTex), bumpenvlscale, bumpenvloffset);
14417ec681f3Smrg                }
14427ec681f3Smrg            }
14437ec681f3Smrg            if (key->projected & (3 << (s *2))) {
14447ec681f3Smrg                unsigned dim = 1 + ((key->projected >> (2 * s)) & 3);
14457ec681f3Smrg                if (dim == 4)
14467ec681f3Smrg                    ureg_TXP(ureg, ps.rTex, target, texture_coord, ps.s[s]);
14477ec681f3Smrg                else {
14487ec681f3Smrg                    struct ureg_dst tmp = ureg_DECL_temporary(ureg);
14497ec681f3Smrg                    ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(texture_coord, dim-1));
14507ec681f3Smrg                    ureg_MUL(ureg, ps.rTmp, _X(tmp), texture_coord);
14517ec681f3Smrg                    ureg_TEX(ureg, ps.rTex, target, ps.rTmpSrc, ps.s[s]);
14527ec681f3Smrg                    ureg_release_temporary(ureg, tmp);
14537ec681f3Smrg                }
14547ec681f3Smrg            } else {
14557ec681f3Smrg                ureg_TEX(ureg, ps.rTex, target, texture_coord, ps.s[s]);
14567ec681f3Smrg            }
14577ec681f3Smrg            if (s >= 1 && key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
14587ec681f3Smrg                ureg_MUL(ureg, ps.rTex, ureg_src(ps.rTex), _X(delta));
14597ec681f3Smrg        }
14607ec681f3Smrg
14617ec681f3Smrg        if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
14627ec681f3Smrg            key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
14637ec681f3Smrg            continue;
14647ec681f3Smrg
14657ec681f3Smrg        dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);
14667ec681f3Smrg
14677ec681f3Smrg        if (ps.stage.index_pre_mod == ps.stage.index) {
14687ec681f3Smrg            ps.rMod = ureg_DECL_temporary(ureg);
14697ec681f3Smrg            ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
14707ec681f3Smrg        }
14717ec681f3Smrg
14727ec681f3Smrg        colorarg[0] = (key->ts[s].colorarg0 | (((key->colorarg_b4[0] >> s) & 0x1) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
14737ec681f3Smrg        colorarg[1] = (key->ts[s].colorarg1 | (((key->colorarg_b4[1] >> s) & 0x1) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
14747ec681f3Smrg        colorarg[2] = (key->ts[s].colorarg2 | (((key->colorarg_b4[2] >> s) & 0x1) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
14757ec681f3Smrg        alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
14767ec681f3Smrg        alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
14777ec681f3Smrg        alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;
14787ec681f3Smrg
14797ec681f3Smrg        if (key->ts[s].colorop != key->ts[s].alphaop ||
14807ec681f3Smrg            colorarg[0] != alphaarg[0] ||
14817ec681f3Smrg            colorarg[1] != alphaarg[1] ||
14827ec681f3Smrg            colorarg[2] != alphaarg[2])
14837ec681f3Smrg            dst.WriteMask = TGSI_WRITEMASK_XYZ;
14847ec681f3Smrg
14857ec681f3Smrg        /* Special DOTPRODUCT behaviour (see wine tests) */
14867ec681f3Smrg        if (key->ts[s].colorop == D3DTOP_DOTPRODUCT3)
14877ec681f3Smrg            dst.WriteMask = TGSI_WRITEMASK_XYZW;
14887ec681f3Smrg
14897ec681f3Smrg        if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
14907ec681f3Smrg        if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
14917ec681f3Smrg        if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
14927ec681f3Smrg        ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);
14937ec681f3Smrg
14947ec681f3Smrg        if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
14957ec681f3Smrg            dst.WriteMask = TGSI_WRITEMASK_W;
14967ec681f3Smrg
14977ec681f3Smrg            if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
14987ec681f3Smrg            if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
14997ec681f3Smrg            if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
15007ec681f3Smrg            ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
15017ec681f3Smrg        }
15027ec681f3Smrg    }
15037ec681f3Smrg
15047ec681f3Smrg    if (key->specular)
15057ec681f3Smrg        ureg_ADD(ureg, ureg_writemask(ps.rCur, TGSI_WRITEMASK_XYZ), ps.rCurSrc, ps.vC[1]);
15067ec681f3Smrg
15077ec681f3Smrg    /* Fog.
15087ec681f3Smrg     */
15097ec681f3Smrg    if (key->fog_mode) {
15107ec681f3Smrg        struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
15117ec681f3Smrg        struct ureg_src vPos;
15127ec681f3Smrg        if (device->screen->get_param(device->screen,
15137ec681f3Smrg                                      PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL)) {
15147ec681f3Smrg            vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
15157ec681f3Smrg        } else {
15167ec681f3Smrg            vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
15177ec681f3Smrg                                      TGSI_INTERPOLATE_LINEAR);
15187ec681f3Smrg        }
15197ec681f3Smrg
15207ec681f3Smrg        /* Source is either W or Z.
15217ec681f3Smrg         * When we use vs ff,
15227ec681f3Smrg         * Z is when an orthogonal projection matrix is detected,
15237ec681f3Smrg         * W (WFOG) else.
15247ec681f3Smrg         * Z is used for programmable vs.
15257ec681f3Smrg         * Note: Tests indicate that the projection matrix coefficients do
15267ec681f3Smrg         * actually affect pixel fog (and not vertex fog) when vs ff is used,
15277ec681f3Smrg         * which justifies taking the position's w instead of taking the z coordinate
15287ec681f3Smrg         * before the projection in the vs shader.
15297ec681f3Smrg         */
15307ec681f3Smrg        if (!key->fog_source)
15317ec681f3Smrg            ureg_MOV(ureg, rFog, _ZZZZ(vPos));
15327ec681f3Smrg        else
15337ec681f3Smrg            /* Position's w is 1/w */
15347ec681f3Smrg            ureg_RCP(ureg, rFog, _WWWW(vPos));
15357ec681f3Smrg
15367ec681f3Smrg        if (key->fog_mode == D3DFOG_EXP) {
15377ec681f3Smrg            ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
15387ec681f3Smrg            ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
15397ec681f3Smrg            ureg_EX2(ureg, rFog, _X(rFog));
15407ec681f3Smrg        } else
15417ec681f3Smrg        if (key->fog_mode == D3DFOG_EXP2) {
15427ec681f3Smrg            ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
15437ec681f3Smrg            ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
15447ec681f3Smrg            ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
15457ec681f3Smrg            ureg_EX2(ureg, rFog, _X(rFog));
15467ec681f3Smrg        } else
15477ec681f3Smrg        if (key->fog_mode == D3DFOG_LINEAR) {
15487ec681f3Smrg            ureg_ADD(ureg, rFog, _XXXX(_CONST(22)), ureg_negate(_X(rFog)));
15497ec681f3Smrg            ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
15507ec681f3Smrg        }
15517ec681f3Smrg        ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
15527ec681f3Smrg        ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
15537ec681f3Smrg    } else
15547ec681f3Smrg    if (key->fog) {
15557ec681f3Smrg        struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 16, TGSI_INTERPOLATE_PERSPECTIVE);
15567ec681f3Smrg        ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
15577ec681f3Smrg        ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
15587ec681f3Smrg    } else {
15597ec681f3Smrg        ureg_MOV(ureg, oCol, ps.rCurSrc);
15607ec681f3Smrg    }
15617ec681f3Smrg
15627ec681f3Smrg    ureg_END(ureg);
15637ec681f3Smrg    nine_ureg_tgsi_dump(ureg, FALSE);
15647ec681f3Smrg    return nine_create_shader_with_so_and_destroy(ureg, device->context.pipe, NULL);
15657ec681f3Smrg}
15667ec681f3Smrg
15677ec681f3Smrgstatic struct NineVertexShader9 *
15687ec681f3Smrgnine_ff_get_vs(struct NineDevice9 *device)
15697ec681f3Smrg{
15707ec681f3Smrg    const struct nine_context *context = &device->context;
15717ec681f3Smrg    struct NineVertexShader9 *vs;
15727ec681f3Smrg    struct vs_build_ctx bld;
15737ec681f3Smrg    struct nine_ff_vs_key key;
15747ec681f3Smrg    unsigned s, i;
15757ec681f3Smrg    boolean has_indexes = false;
15767ec681f3Smrg    boolean has_weights = false;
15777ec681f3Smrg    char input_texture_coord[8];
15787ec681f3Smrg
15797ec681f3Smrg    assert(sizeof(key) <= sizeof(key.value32));
15807ec681f3Smrg
15817ec681f3Smrg    memset(&key, 0, sizeof(key));
15827ec681f3Smrg    memset(&bld, 0, sizeof(bld));
15837ec681f3Smrg    memset(&input_texture_coord, 0, sizeof(input_texture_coord));
15847ec681f3Smrg
15857ec681f3Smrg    bld.key = &key;
15867ec681f3Smrg
15877ec681f3Smrg    /* FIXME: this shouldn't be NULL, but it is on init */
15887ec681f3Smrg    if (context->vdecl) {
15897ec681f3Smrg        key.color0in_one = 1;
15907ec681f3Smrg        key.color1in_zero = 1;
15917ec681f3Smrg        for (i = 0; i < context->vdecl->nelems; i++) {
15927ec681f3Smrg            uint16_t usage = context->vdecl->usage_map[i];
15937ec681f3Smrg            if (usage == NINE_DECLUSAGE_POSITIONT)
15947ec681f3Smrg                key.position_t = 1;
15957ec681f3Smrg            else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
15967ec681f3Smrg                key.color0in_one = 0;
15977ec681f3Smrg            else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
15987ec681f3Smrg                key.color1in_zero = 0;
15997ec681f3Smrg            else if (usage == NINE_DECLUSAGE_i(BLENDINDICES, 0)) {
16007ec681f3Smrg                has_indexes = true;
16017ec681f3Smrg                key.passthrough |= 1 << usage;
16027ec681f3Smrg            } else if (usage == NINE_DECLUSAGE_i(BLENDWEIGHT, 0)) {
16037ec681f3Smrg                has_weights = true;
16047ec681f3Smrg                key.passthrough |= 1 << usage;
16057ec681f3Smrg            } else if (usage == NINE_DECLUSAGE_i(NORMAL, 0)) {
16067ec681f3Smrg                key.has_normal = 1;
16077ec681f3Smrg                key.passthrough |= 1 << usage;
16087ec681f3Smrg            } else if (usage == NINE_DECLUSAGE_PSIZE)
16097ec681f3Smrg                key.vertexpointsize = 1;
16107ec681f3Smrg            else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
16117ec681f3Smrg                s = usage / NINE_DECLUSAGE_COUNT;
16127ec681f3Smrg                if (s < 8)
16137ec681f3Smrg                    input_texture_coord[s] = nine_decltype_get_dim(context->vdecl->decls[i].Type);
16147ec681f3Smrg                else
16157ec681f3Smrg                    DBG("FF given texture coordinate >= 8. Ignoring\n");
16167ec681f3Smrg            } else if (usage < NINE_DECLUSAGE_NONE)
16177ec681f3Smrg                key.passthrough |= 1 << usage;
16187ec681f3Smrg        }
16197ec681f3Smrg    }
16207ec681f3Smrg    /* ff vs + ps 3.0: some elements are passed to the ps (wine test).
16217ec681f3Smrg     * We do restrict to indices 0 */
16227ec681f3Smrg    key.passthrough &= ~((1 << NINE_DECLUSAGE_POSITION) | (1 << NINE_DECLUSAGE_PSIZE) |
16237ec681f3Smrg                         (1 << NINE_DECLUSAGE_TEXCOORD) | (1 << NINE_DECLUSAGE_POSITIONT) |
16247ec681f3Smrg                         (1 << NINE_DECLUSAGE_TESSFACTOR) | (1 << NINE_DECLUSAGE_SAMPLE));
16257ec681f3Smrg    if (!key.position_t)
16267ec681f3Smrg        key.passthrough = 0;
16277ec681f3Smrg    key.pointscale = !!context->rs[D3DRS_POINTSCALEENABLE];
16287ec681f3Smrg
16297ec681f3Smrg    key.lighting = !!context->rs[D3DRS_LIGHTING] &&  context->ff.num_lights_active;
16307ec681f3Smrg    key.darkness = !!context->rs[D3DRS_LIGHTING] && !context->ff.num_lights_active;
16317ec681f3Smrg    if (key.position_t) {
16327ec681f3Smrg        key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
16337ec681f3Smrg        key.lighting = 0;
16347ec681f3Smrg    }
16357ec681f3Smrg    if ((key.lighting | key.darkness) && context->rs[D3DRS_COLORVERTEX]) {
16367ec681f3Smrg        uint32_t mask = (key.color0in_one ? 0 : 1) | (key.color1in_zero ? 0 : 2);
16377ec681f3Smrg        key.mtl_diffuse = context->rs[D3DRS_DIFFUSEMATERIALSOURCE] & mask;
16387ec681f3Smrg        key.mtl_ambient = context->rs[D3DRS_AMBIENTMATERIALSOURCE] & mask;
16397ec681f3Smrg        key.mtl_specular = context->rs[D3DRS_SPECULARMATERIALSOURCE] & mask;
16407ec681f3Smrg        key.mtl_emissive = context->rs[D3DRS_EMISSIVEMATERIALSOURCE] & mask;
16417ec681f3Smrg    }
16427ec681f3Smrg    key.fog = !!context->rs[D3DRS_FOGENABLE];
16437ec681f3Smrg    key.fog_mode = (!key.position_t && context->rs[D3DRS_FOGENABLE]) ? context->rs[D3DRS_FOGVERTEXMODE] : 0;
16447ec681f3Smrg    if (key.fog_mode)
16457ec681f3Smrg        key.fog_range = context->rs[D3DRS_RANGEFOGENABLE];
16467ec681f3Smrg
16477ec681f3Smrg    key.localviewer = !!context->rs[D3DRS_LOCALVIEWER];
16487ec681f3Smrg    key.normalizenormals = !!context->rs[D3DRS_NORMALIZENORMALS];
16497ec681f3Smrg    key.ucp = !!context->rs[D3DRS_CLIPPLANEENABLE];
16507ec681f3Smrg
16517ec681f3Smrg    if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
16527ec681f3Smrg        key.vertexblend_indexed = !!context->rs[D3DRS_INDEXEDVERTEXBLENDENABLE] && has_indexes;
16537ec681f3Smrg
16547ec681f3Smrg        switch (context->rs[D3DRS_VERTEXBLEND]) {
16557ec681f3Smrg        case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
16567ec681f3Smrg        case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
16577ec681f3Smrg        case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
16587ec681f3Smrg        case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
16597ec681f3Smrg        case D3DVBF_TWEENING: key.vertextween = 1; break;
16607ec681f3Smrg        default:
16617ec681f3Smrg            assert(!"invalid D3DVBF");
16627ec681f3Smrg            break;
16637ec681f3Smrg        }
16647ec681f3Smrg        if (!has_weights && context->rs[D3DRS_VERTEXBLEND] != D3DVBF_0WEIGHTS)
16657ec681f3Smrg            key.vertexblend = 0; /* TODO: if key.vertexblend_indexed, perhaps it should use 1.0 as weight, or revert to D3DVBF_0WEIGHTS */
16667ec681f3Smrg    }
16677ec681f3Smrg
16687ec681f3Smrg    for (s = 0; s < 8; ++s) {
16697ec681f3Smrg        unsigned gen = (context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
16707ec681f3Smrg        unsigned idx = context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7;
16717ec681f3Smrg        unsigned dim;
16727ec681f3Smrg
16737ec681f3Smrg        if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
16747ec681f3Smrg            gen = NINED3DTSS_TCI_PASSTHRU;
16757ec681f3Smrg
16767ec681f3Smrg        if (!input_texture_coord[idx] && gen == NINED3DTSS_TCI_PASSTHRU)
16777ec681f3Smrg            gen = NINED3DTSS_TCI_DISABLE;
16787ec681f3Smrg
16797ec681f3Smrg        key.tc_gen |= gen << (s * 3);
16807ec681f3Smrg        key.tc_idx |= idx << (s * 3);
16817ec681f3Smrg        key.tc_dim_input |= ((input_texture_coord[idx]-1) & 0x3) << (s * 2);
16827ec681f3Smrg
16837ec681f3Smrg        dim = context->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
16847ec681f3Smrg        if (dim > 4)
16857ec681f3Smrg            dim = input_texture_coord[idx];
16867ec681f3Smrg        if (dim == 1) /* NV behaviour */
16877ec681f3Smrg            dim = 0;
16887ec681f3Smrg        key.tc_dim_output |= dim << (s * 3);
16897ec681f3Smrg    }
16907ec681f3Smrg
16917ec681f3Smrg    DBG("VS ff key hash: %x\n", nine_ff_vs_key_hash(&key));
16927ec681f3Smrg    vs = util_hash_table_get(device->ff.ht_vs, &key);
16937ec681f3Smrg    if (vs)
16947ec681f3Smrg        return vs;
16957ec681f3Smrg    NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));
16967ec681f3Smrg
16977ec681f3Smrg    nine_ff_prune_vs(device);
16987ec681f3Smrg    if (vs) {
16997ec681f3Smrg        unsigned n;
17007ec681f3Smrg
17017ec681f3Smrg        memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
17027ec681f3Smrg
17037ec681f3Smrg        _mesa_hash_table_insert(device->ff.ht_vs, &vs->ff_key, vs);
17047ec681f3Smrg        device->ff.num_vs++;
17057ec681f3Smrg
17067ec681f3Smrg        vs->num_inputs = bld.num_inputs;
17077ec681f3Smrg        for (n = 0; n < bld.num_inputs; ++n)
17087ec681f3Smrg            vs->input_map[n].ndecl = bld.input[n];
17097ec681f3Smrg
17107ec681f3Smrg        vs->position_t = key.position_t;
17117ec681f3Smrg        vs->point_size = key.vertexpointsize | key.pointscale;
17127ec681f3Smrg    }
17137ec681f3Smrg    return vs;
17147ec681f3Smrg}
17157ec681f3Smrg
/* Fetch the fixed-function transform matrix D3DTS_<n>. Expands to a call on
 * `&context->ff`, so a local named `context` must be in scope at the
 * expansion site. FALSE is passed straight to nine_state_access_transform(). */
#define GET_D3DTS(n) nine_state_access_transform(&context->ff, D3DTS_##n, FALSE)
/* Non-zero when the dirty bit of transform D3DTS_<n> is set in
 * (s)->ff.changed.transform[], which is indexed 32 bits per array element.
 * The mask is built from an unsigned 1 so that selecting bit 31 does not
 * overflow a signed int. */
#define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1u << ((D3DTS_##n) % 32)))
17187ec681f3Smrg
/* Look up, or build and cache, the fixed-function pixel shader matching the
 * current state.
 *
 * A nine_ff_ps_key is assembled from the texture stage states, the bound
 * texture types, D3DRS_SPECULARENABLE, the fog render states and the
 * projection matrix. The key is searched in device->ff.ht_ps; on a miss a
 * shader is created from nine_ff_build_ps(), nine_ff_prune_ps() is run and
 * the new shader is inserted under its own copy of the key.
 *
 * Returns the cached or newly created shader. If creation fails, the NULL
 * from the failed lookup is returned, assuming NinePixelShader9_new() leaves
 * `ps` untouched in that case -- TODO confirm. rt_mask and sampler_mask are
 * only written for newly created shaders, not for cache hits.
 */
17197ec681f3Smrgstatic struct NinePixelShader9 *
17207ec681f3Smrgnine_ff_get_ps(struct NineDevice9 *device)
17217ec681f3Smrg{
17227ec681f3Smrg    struct nine_context *context = &device->context;
17237ec681f3Smrg    D3DMATRIX *projection_matrix = GET_D3DTS(PROJECTION);
17247ec681f3Smrg    struct NinePixelShader9 *ps;
17257ec681f3Smrg    struct nine_ff_ps_key key;
17267ec681f3Smrg    unsigned s;
17277ec681f3Smrg    uint8_t sampler_mask = 0;
17287ec681f3Smrg
    /* key.value32 must span the entire key. */
17297ec681f3Smrg    assert(sizeof(key) <= sizeof(key.value32));
17307ec681f3Smrg
17317ec681f3Smrg    memset(&key, 0, sizeof(key));
    /* Walk the stages in order. The loop breaks at the first stage that is
     * (or becomes) disabled, leaving `s` at that stage's index; `s` is used
     * again after the loop. */
17327ec681f3Smrg    for (s = 0; s < 8; ++s) {
17337ec681f3Smrg        key.ts[s].colorop = context->ff.tex_stage[s][D3DTSS_COLOROP];
17347ec681f3Smrg        key.ts[s].alphaop = context->ff.tex_stage[s][D3DTSS_ALPHAOP];
        /* Bit 0/1/2 set <=> the op reads arg0/arg1/arg2 (as the masks are
         * tested below). */
17357ec681f3Smrg        const uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
17367ec681f3Smrg        const uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
17377ec681f3Smrg        /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages.
17387ec681f3Smrg         * ALPHAOP cannot be enabled if COLOROP is disabled.
17397ec681f3Smrg         * Verified on Windows. */
17407ec681f3Smrg        if (key.ts[s].colorop == D3DTOP_DISABLE) {
17417ec681f3Smrg            key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
17427ec681f3Smrg            break;
17437ec681f3Smrg        }
17447ec681f3Smrg
        /* NOTE(review): the D3DTA_TEXTURE tests here and in the sampler_mask
         * computation below compare the raw argument value, while the key
         * extracts bits 4 and 5 of the arguments separately; an argument
         * carrying those bits would not match -- confirm this is intended. */
17457ec681f3Smrg        if (!context->texture[s].enabled &&
17467ec681f3Smrg            ((context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE &&
17477ec681f3Smrg              used_c & 0x1) ||
17487ec681f3Smrg             (context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE &&
17497ec681f3Smrg              used_c & 0x2) ||
17507ec681f3Smrg             (context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE &&
17517ec681f3Smrg              used_c & 0x4))) {
17527ec681f3Smrg            /* Tested on Windows: Invalid texture read disables the stage
17537ec681f3Smrg             * and the subsequent ones, but only for colorop. For alpha,
17547ec681f3Smrg             * it's as if the texture had alpha of 1.0, which is what
17557ec681f3Smrg             * has our dummy texture in that case. Invalid color also
17567ec681f3Smrg             * disabled the following alpha stages. */
17577ec681f3Smrg            key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
17587ec681f3Smrg            break;
17597ec681f3Smrg        }
17607ec681f3Smrg
        /* The sampler bit is set when any argument names the texture, whether
         * or not the selected op actually reads that argument. */
17617ec681f3Smrg        if (context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE ||
17627ec681f3Smrg            context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE ||
17637ec681f3Smrg            context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE ||
17647ec681f3Smrg            context->ff.tex_stage[s][D3DTSS_ALPHAARG0] == D3DTA_TEXTURE ||
17657ec681f3Smrg            context->ff.tex_stage[s][D3DTSS_ALPHAARG1] == D3DTA_TEXTURE ||
17667ec681f3Smrg            context->ff.tex_stage[s][D3DTSS_ALPHAARG2] == D3DTA_TEXTURE)
17677ec681f3Smrg            sampler_mask |= (1 << s);
17687ec681f3Smrg
        /* Only arguments the op reads enter the key: the low 3 bits go into
         * the per-stage fields, bits 4 and 5 into per-argument bitmasks with
         * one bit per stage. Unused arguments stay zero from the memset. */
17697ec681f3Smrg        if (key.ts[s].colorop != D3DTOP_DISABLE) {
17707ec681f3Smrg            if (used_c & 0x1) key.ts[s].colorarg0 = context->ff.tex_stage[s][D3DTSS_COLORARG0] & 0x7;
17717ec681f3Smrg            if (used_c & 0x2) key.ts[s].colorarg1 = context->ff.tex_stage[s][D3DTSS_COLORARG1] & 0x7;
17727ec681f3Smrg            if (used_c & 0x4) key.ts[s].colorarg2 = context->ff.tex_stage[s][D3DTSS_COLORARG2] & 0x7;
17737ec681f3Smrg            if (used_c & 0x1) key.colorarg_b4[0] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) & 0x1) << s;
17747ec681f3Smrg            if (used_c & 0x1) key.colorarg_b5[0] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) & 0x1) << s;
17757ec681f3Smrg            if (used_c & 0x2) key.colorarg_b4[1] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) & 0x1) << s;
17767ec681f3Smrg            if (used_c & 0x2) key.colorarg_b5[1] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) & 0x1) << s;
17777ec681f3Smrg            if (used_c & 0x4) key.colorarg_b4[2] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) & 0x1) << s;
17787ec681f3Smrg            if (used_c & 0x4) key.colorarg_b5[2] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) & 0x1) << s;
17797ec681f3Smrg        }
        /* Alpha arguments: same scheme, but only bit 4 is recorded. */
17807ec681f3Smrg        if (key.ts[s].alphaop != D3DTOP_DISABLE) {
17817ec681f3Smrg            if (used_a & 0x1) key.ts[s].alphaarg0 = context->ff.tex_stage[s][D3DTSS_ALPHAARG0] & 0x7;
17827ec681f3Smrg            if (used_a & 0x2) key.ts[s].alphaarg1 = context->ff.tex_stage[s][D3DTSS_ALPHAARG1] & 0x7;
17837ec681f3Smrg            if (used_a & 0x4) key.ts[s].alphaarg2 = context->ff.tex_stage[s][D3DTSS_ALPHAARG2] & 0x7;
17847ec681f3Smrg            if (used_a & 0x1) key.alphaarg_b4[0] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) & 0x1) << s;
17857ec681f3Smrg            if (used_a & 0x2) key.alphaarg_b4[1] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) & 0x1) << s;
17867ec681f3Smrg            if (used_a & 0x4) key.alphaarg_b4[2] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) & 0x1) << s;
17877ec681f3Smrg        }
        /* resultarg: 1 = the stage writes to TEMP, 0 = anything else. */
17887ec681f3Smrg        key.ts[s].resultarg = context->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
17897ec681f3Smrg
        /* textarget: 1 = 2D, 2 = volume, 3 = cube; stages without an enabled
         * texture are keyed as 2D. */
17907ec681f3Smrg        if (context->texture[s].enabled) {
17917ec681f3Smrg            switch (context->texture[s].type) {
17927ec681f3Smrg            case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
17937ec681f3Smrg            case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
17947ec681f3Smrg            case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
17957ec681f3Smrg            default:
17967ec681f3Smrg                assert(!"unexpected texture type");
17977ec681f3Smrg                break;
17987ec681f3Smrg            }
17997ec681f3Smrg        } else {
18007ec681f3Smrg            key.ts[s].textarget = 1;
18017ec681f3Smrg        }
18027ec681f3Smrg    }
18037ec681f3Smrg
18047ec681f3Smrg    /* Note: If colorop is D3DTOP_DISABLE for the first stage
18057ec681f3Smrg     * (which implies alphaop is too), nothing particular happens,
18067ec681f3Smrg     * that is, current is equal to diffuse (which is the case anyway,
18077ec681f3Smrg     * because it is how it is initialized).
18087ec681f3Smrg     * Special case seems if alphaop is D3DTOP_DISABLE and not colorop,
18097ec681f3Smrg     * because then if the resultarg is TEMP, then diffuse alpha is written
18107ec681f3Smrg     * to it. */
18117ec681f3Smrg    if (key.ts[0].colorop != D3DTOP_DISABLE &&
18127ec681f3Smrg        key.ts[0].alphaop == D3DTOP_DISABLE &&
18137ec681f3Smrg        key.ts[0].resultarg != 0) {
18147ec681f3Smrg        key.ts[0].alphaop = D3DTOP_SELECTARG1;
18157ec681f3Smrg        key.ts[0].alphaarg1 = D3DTA_DIFFUSE;
18167ec681f3Smrg    }
18177ec681f3Smrg    /* When no alpha stage writes to current, diffuse alpha is taken.
18187ec681f3Smrg     * Since we initialize current to diffuse, we have the behaviour. */
18197ec681f3Smrg
18207ec681f3Smrg    /* Last stage always writes to Current */
    /* s is the index of the stage that ended the loop (8 if none did), so
     * s-1 is the last active stage. */
18217ec681f3Smrg    if (s >= 1)
18227ec681f3Smrg        key.ts[s-1].resultarg = 0;
18237ec681f3Smrg
18247ec681f3Smrg    key.projected = nine_ff_get_projected_key_ff(context);
18257ec681f3Smrg    key.specular = !!context->rs[D3DRS_SPECULARENABLE];
18267ec681f3Smrg
    /* Mark the stage that ended the loop and all later ones as disabled. */
18277ec681f3Smrg    for (; s < 8; ++s)
18287ec681f3Smrg        key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
18297ec681f3Smrg    if (context->rs[D3DRS_FOGENABLE])
18307ec681f3Smrg        key.fog_mode = context->rs[D3DRS_FOGTABLEMODE];
18317ec681f3Smrg    key.fog = !!context->rs[D3DRS_FOGENABLE];
18327ec681f3Smrg    /* Pixel fog (with WFOG advertised): source is either Z or W.
18337ec681f3Smrg     * W is the source if vs ff is used, and the
18347ec681f3Smrg     * projection matrix is not orthogonal.
18357ec681f3Smrg     * Tests on Win 10 seem to indicate _34
18367ec681f3Smrg     * and _33 are checked against 0, 1. */
    /* NOTE(review): the code below tests _34 and _44, whereas the comment
     * above names _34 and _33 -- confirm which element is intended. */
18377ec681f3Smrg    if (key.fog_mode && key.fog)
18387ec681f3Smrg        key.fog_source = !context->programmable_vs &&
18397ec681f3Smrg            !(projection_matrix->_34 == 0.0f &&
18407ec681f3Smrg              projection_matrix->_44 == 1.0f);
18417ec681f3Smrg
18427ec681f3Smrg    DBG("PS ff key hash: %x\n", nine_ff_ps_key_hash(&key));
18437ec681f3Smrg    ps = util_hash_table_get(device->ff.ht_ps, &key);
18447ec681f3Smrg    if (ps)
18457ec681f3Smrg        return ps;
18467ec681f3Smrg    NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));
18477ec681f3Smrg
    /* Prune before inserting, so the new shader is never among the entries
     * that get dropped. */
18487ec681f3Smrg    nine_ff_prune_ps(device);
18497ec681f3Smrg    if (ps) {
        /* The table key points at the copy stored inside the shader, not at
         * the local `key`. */
18507ec681f3Smrg        memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
18517ec681f3Smrg
18527ec681f3Smrg        _mesa_hash_table_insert(device->ff.ht_ps, &ps->ff_key, ps);
18537ec681f3Smrg        device->ff.num_ps++;
18547ec681f3Smrg
18557ec681f3Smrg        ps->rt_mask = 0x1;
18567ec681f3Smrg        ps->sampler_mask = sampler_mask;
18577ec681f3Smrg    }
18587ec681f3Smrg    return ps;
18597ec681f3Smrg}
18607ec681f3Smrg
18617ec681f3Smrgstatic void
18627ec681f3Smrgnine_ff_load_vs_transforms(struct NineDevice9 *device)
18637ec681f3Smrg{
18647ec681f3Smrg    struct nine_context *context = &device->context;
18657ec681f3Smrg    D3DMATRIX T;
18667ec681f3Smrg    D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
18677ec681f3Smrg    unsigned i;
18687ec681f3Smrg
18697ec681f3Smrg    /* TODO: make this nicer, and only upload the ones we need */
18707ec681f3Smrg    /* TODO: use ff.vs_const as storage of W, V, P matrices */
18717ec681f3Smrg
18727ec681f3Smrg    if (IS_D3DTS_DIRTY(context, WORLD) ||
18737ec681f3Smrg        IS_D3DTS_DIRTY(context, VIEW) ||
18747ec681f3Smrg        IS_D3DTS_DIRTY(context, PROJECTION)) {
18757ec681f3Smrg        /* WVP, WV matrices */
18767ec681f3Smrg        nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
18777ec681f3Smrg        nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));
18787ec681f3Smrg
18797ec681f3Smrg        /* normal matrix == transpose(inverse(WV)) */
18807ec681f3Smrg        nine_d3d_matrix_inverse(&T, &M[1]);
18817ec681f3Smrg        nine_d3d_matrix_transpose(&M[4], &T);
18827ec681f3Smrg
18837ec681f3Smrg        /* P matrix */
18847ec681f3Smrg        M[2] = *GET_D3DTS(PROJECTION);
18857ec681f3Smrg
18867ec681f3Smrg        /* V and W matrix */
18877ec681f3Smrg        nine_d3d_matrix_inverse(&M[3], GET_D3DTS(VIEW));
18887ec681f3Smrg        M[40] = M[1];
18897ec681f3Smrg    }
18907ec681f3Smrg
18917ec681f3Smrg    if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
18927ec681f3Smrg        /* load other world matrices */
18937ec681f3Smrg        for (i = 1; i <= 8; ++i) {
18947ec681f3Smrg            nine_d3d_matrix_matrix_mul(&M[40 + i], GET_D3DTS(WORLDMATRIX(i)), GET_D3DTS(VIEW));
18957ec681f3Smrg        }
18967ec681f3Smrg    }
18977ec681f3Smrg
18987ec681f3Smrg    device->ff.vs_const[30 * 4] = asfloat(context->rs[D3DRS_TWEENFACTOR]);
18997ec681f3Smrg}
19007ec681f3Smrg
19017ec681f3Smrgstatic void
19027ec681f3Smrgnine_ff_load_lights(struct NineDevice9 *device)
19037ec681f3Smrg{
19047ec681f3Smrg    struct nine_context *context = &device->context;
19057ec681f3Smrg    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
19067ec681f3Smrg    unsigned l;
19077ec681f3Smrg
19087ec681f3Smrg    if (context->changed.group & NINE_STATE_FF_MATERIAL) {
19097ec681f3Smrg        const D3DMATERIAL9 *mtl = &context->ff.material;
19107ec681f3Smrg
19117ec681f3Smrg        memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
19127ec681f3Smrg        memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
19137ec681f3Smrg        memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
19147ec681f3Smrg        dst[23].x = mtl->Power;
19157ec681f3Smrg        memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
19167ec681f3Smrg        d3dcolor_to_rgba(&dst[25].x, context->rs[D3DRS_AMBIENT]);
19177ec681f3Smrg        dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
19187ec681f3Smrg        dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
19197ec681f3Smrg        dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
19207ec681f3Smrg    }
19217ec681f3Smrg
19227ec681f3Smrg    if (!(context->changed.group & NINE_STATE_FF_LIGHTING))
19237ec681f3Smrg        return;
19247ec681f3Smrg
19257ec681f3Smrg    for (l = 0; l < context->ff.num_lights_active; ++l) {
19267ec681f3Smrg        const D3DLIGHT9 *light = &context->ff.light[context->ff.active_light[l]];
19277ec681f3Smrg
19287ec681f3Smrg        dst[32 + l * 8].x = light->Type;
19297ec681f3Smrg        dst[32 + l * 8].y = light->Attenuation0;
19307ec681f3Smrg        dst[32 + l * 8].z = light->Attenuation1;
19317ec681f3Smrg        dst[32 + l * 8].w = light->Attenuation2;
19327ec681f3Smrg        memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
19337ec681f3Smrg        memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
19347ec681f3Smrg        memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
19357ec681f3Smrg        nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
19367ec681f3Smrg        nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
19377ec681f3Smrg        dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
19387ec681f3Smrg        dst[37 + l * 8].w = light->Falloff;
19397ec681f3Smrg        dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
19407ec681f3Smrg        dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
19417ec681f3Smrg        dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
19427ec681f3Smrg        dst[39 + l * 8].w = (float)((l + 1) == context->ff.num_lights_active);
19437ec681f3Smrg    }
19447ec681f3Smrg}
19457ec681f3Smrg
19467ec681f3Smrgstatic void
19477ec681f3Smrgnine_ff_load_point_and_fog_params(struct NineDevice9 *device)
19487ec681f3Smrg{
19497ec681f3Smrg    struct nine_context *context = &device->context;
19507ec681f3Smrg    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
19517ec681f3Smrg
19527ec681f3Smrg    if (!(context->changed.group & NINE_STATE_FF_VS_OTHER))
19537ec681f3Smrg        return;
19547ec681f3Smrg    dst[26].x = asfloat(context->rs[D3DRS_POINTSIZE_MIN]);
19557ec681f3Smrg    dst[26].y = asfloat(context->rs[D3DRS_POINTSIZE_MAX]);
19567ec681f3Smrg    dst[26].z = asfloat(context->rs[D3DRS_POINTSIZE]);
19577ec681f3Smrg    dst[26].w = asfloat(context->rs[D3DRS_POINTSCALE_A]);
19587ec681f3Smrg    dst[27].x = asfloat(context->rs[D3DRS_POINTSCALE_B]);
19597ec681f3Smrg    dst[27].y = asfloat(context->rs[D3DRS_POINTSCALE_C]);
19607ec681f3Smrg    dst[28].x = asfloat(context->rs[D3DRS_FOGEND]);
19617ec681f3Smrg    dst[28].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
19627ec681f3Smrg    if (isinf(dst[28].y))
19637ec681f3Smrg        dst[28].y = 0.0f;
19647ec681f3Smrg    dst[28].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
19657ec681f3Smrg}
19667ec681f3Smrg
19677ec681f3Smrgstatic void
19687ec681f3Smrgnine_ff_load_tex_matrices(struct NineDevice9 *device)
19697ec681f3Smrg{
19707ec681f3Smrg    struct nine_context *context = &device->context;
19717ec681f3Smrg    D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
19727ec681f3Smrg    unsigned s;
19737ec681f3Smrg
19747ec681f3Smrg    if (!(context->ff.changed.transform[0] & 0xff0000))
19757ec681f3Smrg        return;
19767ec681f3Smrg    for (s = 0; s < 8; ++s) {
19777ec681f3Smrg        if (IS_D3DTS_DIRTY(context, TEXTURE0 + s))
19787ec681f3Smrg            nine_d3d_matrix_transpose(&M[32 + s], nine_state_access_transform(&context->ff, D3DTS_TEXTURE0 + s, FALSE));
19797ec681f3Smrg    }
19807ec681f3Smrg}
19817ec681f3Smrg
19827ec681f3Smrgstatic void
19837ec681f3Smrgnine_ff_load_ps_params(struct NineDevice9 *device)
19847ec681f3Smrg{
19857ec681f3Smrg    struct nine_context *context = &device->context;
19867ec681f3Smrg    struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
19877ec681f3Smrg    unsigned s;
19887ec681f3Smrg
19897ec681f3Smrg    if (!(context->changed.group & NINE_STATE_FF_PS_CONSTS))
19907ec681f3Smrg        return;
19917ec681f3Smrg
19927ec681f3Smrg    for (s = 0; s < 8; ++s)
19937ec681f3Smrg        d3dcolor_to_rgba(&dst[s].x, context->ff.tex_stage[s][D3DTSS_CONSTANT]);
19947ec681f3Smrg
19957ec681f3Smrg    for (s = 0; s < 8; ++s) {
19967ec681f3Smrg        dst[8 + s].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
19977ec681f3Smrg        dst[8 + s].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
19987ec681f3Smrg        dst[8 + s].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
19997ec681f3Smrg        dst[8 + s].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
20007ec681f3Smrg        if (s & 1) {
20017ec681f3Smrg            dst[16 + s / 2].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
20027ec681f3Smrg            dst[16 + s / 2].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
20037ec681f3Smrg        } else {
20047ec681f3Smrg            dst[16 + s / 2].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
20057ec681f3Smrg            dst[16 + s / 2].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
20067ec681f3Smrg        }
20077ec681f3Smrg    }
20087ec681f3Smrg
20097ec681f3Smrg    d3dcolor_to_rgba(&dst[20].x, context->rs[D3DRS_TEXTUREFACTOR]);
20107ec681f3Smrg    d3dcolor_to_rgba(&dst[21].x, context->rs[D3DRS_FOGCOLOR]);
20117ec681f3Smrg    dst[22].x = asfloat(context->rs[D3DRS_FOGEND]);
20127ec681f3Smrg    dst[22].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
20137ec681f3Smrg    dst[22].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
20147ec681f3Smrg}
20157ec681f3Smrg
20167ec681f3Smrgstatic void
20177ec681f3Smrgnine_ff_load_viewport_info(struct NineDevice9 *device)
20187ec681f3Smrg{
20197ec681f3Smrg    D3DVIEWPORT9 *viewport = &device->context.viewport;
20207ec681f3Smrg    struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
20217ec681f3Smrg    float diffZ = viewport->MaxZ - viewport->MinZ;
20227ec681f3Smrg
20237ec681f3Smrg    /* Note: the other functions avoids to fill the const again if nothing changed.
20247ec681f3Smrg     * But we don't have much to fill, and adding code to allow that may be complex
20257ec681f3Smrg     * so just fill it always */
20267ec681f3Smrg    dst[100].x = 2.0f / (float)(viewport->Width);
20277ec681f3Smrg    dst[100].y = 2.0f / (float)(viewport->Height);
20287ec681f3Smrg    dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
20297ec681f3Smrg    dst[100].w = (float)(viewport->Width);
20307ec681f3Smrg    dst[101].x = (float)(viewport->X);
20317ec681f3Smrg    dst[101].y = (float)(viewport->Y);
20327ec681f3Smrg    dst[101].z = (float)(viewport->MinZ);
20337ec681f3Smrg}
20347ec681f3Smrg
/* Bring the fixed-function shaders and their constants up to date.
 *
 * For each stage without a programmable shader (context->programmable_vs is
 * false / context->ps is NULL) this:
 *   - fetches the matching fixed-function shader into device->ff.vs / ff.ps
 *     and flags NINE_STATE_VS / NINE_STATE_PS as changed,
 *   - refills device->ff.vs_const / ff.ps_const,
 *   - publishes the array as a user-pointer constant buffer in
 *     context->pipe_data.cb_vs_ff / cb_ps_ff and sets the matching
 *     NINE_STATE_COMMIT_CONST_* bit,
 *   - clears the NINE_STATE_FF_VS / NINE_STATE_FF_PS bits in changed.group.
 * The VS path also clears all transform dirty bits.
 */
20357ec681f3Smrgvoid
20367ec681f3Smrgnine_ff_update(struct NineDevice9 *device)
20377ec681f3Smrg{
20387ec681f3Smrg    struct nine_context *context = &device->context;
20397ec681f3Smrg    struct pipe_constant_buffer cb;
20407ec681f3Smrg
20417ec681f3Smrg    DBG("vs=%p ps=%p\n", context->vs, context->ps);
20427ec681f3Smrg
20437ec681f3Smrg    /* NOTE: the only reference belongs to the hash table */
20447ec681f3Smrg    if (!context->programmable_vs) {
20457ec681f3Smrg        device->ff.vs = nine_ff_get_vs(device);
20467ec681f3Smrg        context->changed.group |= NINE_STATE_VS;
20477ec681f3Smrg    }
20487ec681f3Smrg    if (!context->ps) {
20497ec681f3Smrg        device->ff.ps = nine_ff_get_ps(device);
20507ec681f3Smrg        context->changed.group |= NINE_STATE_PS;
20517ec681f3Smrg    }
20527ec681f3Smrg
    /* Both shader lookups run before any dirty flag is cleared below. */
20537ec681f3Smrg    if (!context->programmable_vs) {
20547ec681f3Smrg        nine_ff_load_vs_transforms(device);
20557ec681f3Smrg        nine_ff_load_tex_matrices(device);
20567ec681f3Smrg        nine_ff_load_lights(device);
20577ec681f3Smrg        nine_ff_load_point_and_fog_params(device);
20587ec681f3Smrg        nine_ff_load_viewport_info(device);
20597ec681f3Smrg
        /* All transform-dependent constants are now current. */
20607ec681f3Smrg        memset(context->ff.changed.transform, 0, sizeof(context->ff.changed.transform));
20617ec681f3Smrg
        /* User-pointer constant buffer covering the whole vs_const array. */
20627ec681f3Smrg        cb.buffer_offset = 0;
20637ec681f3Smrg        cb.buffer = NULL;
20647ec681f3Smrg        cb.user_buffer = device->ff.vs_const;
20657ec681f3Smrg        cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
20667ec681f3Smrg
20677ec681f3Smrg        context->pipe_data.cb_vs_ff = cb;
20687ec681f3Smrg        context->commit |= NINE_STATE_COMMIT_CONST_VS;
20697ec681f3Smrg
20707ec681f3Smrg        context->changed.group &= ~NINE_STATE_FF_VS;
20717ec681f3Smrg    }
20727ec681f3Smrg
20737ec681f3Smrg    if (!context->ps) {
20747ec681f3Smrg        nine_ff_load_ps_params(device);
20757ec681f3Smrg
        /* User-pointer constant buffer covering the whole ps_const array. */
20767ec681f3Smrg        cb.buffer_offset = 0;
20777ec681f3Smrg        cb.buffer = NULL;
20787ec681f3Smrg        cb.user_buffer = device->ff.ps_const;
20797ec681f3Smrg        cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
20807ec681f3Smrg
20817ec681f3Smrg        context->pipe_data.cb_ps_ff = cb;
20827ec681f3Smrg        context->commit |= NINE_STATE_COMMIT_CONST_PS;
20837ec681f3Smrg
20847ec681f3Smrg        context->changed.group &= ~NINE_STATE_FF_PS;
20857ec681f3Smrg    }
20867ec681f3Smrg}
20877ec681f3Smrg
20887ec681f3Smrg
20897ec681f3Smrgboolean
20907ec681f3Smrgnine_ff_init(struct NineDevice9 *device)
20917ec681f3Smrg{
20927ec681f3Smrg    device->ff.ht_vs = _mesa_hash_table_create(NULL, nine_ff_vs_key_hash,
20937ec681f3Smrg                                               nine_ff_vs_key_comp);
20947ec681f3Smrg    device->ff.ht_ps = _mesa_hash_table_create(NULL, nine_ff_ps_key_hash,
20957ec681f3Smrg                                               nine_ff_ps_key_comp);
20967ec681f3Smrg
20977ec681f3Smrg    device->ff.ht_fvf = _mesa_hash_table_create(NULL, nine_ff_fvf_key_hash,
20987ec681f3Smrg                                                nine_ff_fvf_key_comp);
20997ec681f3Smrg
21007ec681f3Smrg    device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
21017ec681f3Smrg    device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));
21027ec681f3Smrg
21037ec681f3Smrg    return device->ff.ht_vs && device->ff.ht_ps &&
21047ec681f3Smrg        device->ff.ht_fvf &&
21057ec681f3Smrg        device->ff.vs_const && device->ff.ps_const;
21067ec681f3Smrg}
21077ec681f3Smrg
21087ec681f3Smrgstatic enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
21097ec681f3Smrg{
21107ec681f3Smrg    NineUnknown_Unbind(NineUnknown(value));
21117ec681f3Smrg    return PIPE_OK;
21127ec681f3Smrg}
21137ec681f3Smrg
21147ec681f3Smrgvoid
21157ec681f3Smrgnine_ff_fini(struct NineDevice9 *device)
21167ec681f3Smrg{
21177ec681f3Smrg    if (device->ff.ht_vs) {
21187ec681f3Smrg        util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
21197ec681f3Smrg        _mesa_hash_table_destroy(device->ff.ht_vs, NULL);
21207ec681f3Smrg    }
21217ec681f3Smrg    if (device->ff.ht_ps) {
21227ec681f3Smrg        util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
21237ec681f3Smrg        _mesa_hash_table_destroy(device->ff.ht_ps, NULL);
21247ec681f3Smrg    }
21257ec681f3Smrg    if (device->ff.ht_fvf) {
21267ec681f3Smrg        util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
21277ec681f3Smrg        _mesa_hash_table_destroy(device->ff.ht_fvf, NULL);
21287ec681f3Smrg    }
21297ec681f3Smrg    device->ff.vs = NULL; /* destroyed by unbinding from hash table */
21307ec681f3Smrg    device->ff.ps = NULL;
21317ec681f3Smrg
21327ec681f3Smrg    FREE(device->ff.vs_const);
21337ec681f3Smrg    FREE(device->ff.ps_const);
21347ec681f3Smrg}
21357ec681f3Smrg
21367ec681f3Smrgstatic void
21377ec681f3Smrgnine_ff_prune_vs(struct NineDevice9 *device)
21387ec681f3Smrg{
21397ec681f3Smrg    struct nine_context *context = &device->context;
21407ec681f3Smrg
21417ec681f3Smrg    if (device->ff.num_vs > 1024) {
21427ec681f3Smrg        /* could destroy the bound one here, so unbind */
21437ec681f3Smrg        context->pipe->bind_vs_state(context->pipe, NULL);
21447ec681f3Smrg        util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
21457ec681f3Smrg        _mesa_hash_table_clear(device->ff.ht_vs, NULL);
21467ec681f3Smrg        device->ff.num_vs = 0;
21477ec681f3Smrg        context->changed.group |= NINE_STATE_VS;
21487ec681f3Smrg    }
21497ec681f3Smrg}
21507ec681f3Smrgstatic void
21517ec681f3Smrgnine_ff_prune_ps(struct NineDevice9 *device)
21527ec681f3Smrg{
21537ec681f3Smrg    struct nine_context *context = &device->context;
21547ec681f3Smrg
21557ec681f3Smrg    if (device->ff.num_ps > 1024) {
21567ec681f3Smrg        /* could destroy the bound one here, so unbind */
21577ec681f3Smrg        context->pipe->bind_fs_state(context->pipe, NULL);
21587ec681f3Smrg        util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
21597ec681f3Smrg        _mesa_hash_table_clear(device->ff.ht_ps, NULL);
21607ec681f3Smrg        device->ff.num_ps = 0;
21617ec681f3Smrg        context->changed.group |= NINE_STATE_PS;
21627ec681f3Smrg    }
21637ec681f3Smrg}
21647ec681f3Smrg
21657ec681f3Smrg/* ========================================================================== */
21667ec681f3Smrg
21677ec681f3Smrg/* Matrix multiplication:
21687ec681f3Smrg *
21697ec681f3Smrg * in memory: 0 1 2 3 (row major)
21707ec681f3Smrg *            4 5 6 7
21717ec681f3Smrg *            8 9 a b
21727ec681f3Smrg *            c d e f
21737ec681f3Smrg *
21747ec681f3Smrg *    cA cB cC cD
21757ec681f3Smrg * r0             = (r0 * cA) (r0 * cB) . .
21767ec681f3Smrg * r1             = (r1 * cA) (r1 * cB)
21777ec681f3Smrg * r2             = (r2 * cA) .
21787ec681f3Smrg * r3             = (r3 * cA) .
21797ec681f3Smrg *
21807ec681f3Smrg *               r: (11) (12) (13) (14)
21817ec681f3Smrg *                  (21) (22) (23) (24)
21827ec681f3Smrg *                  (31) (32) (33) (34)
21837ec681f3Smrg *                  (41) (42) (43) (44)
21847ec681f3Smrg * l: (11 12 13 14)
21857ec681f3Smrg *    (21 22 23 24)
21867ec681f3Smrg *    (31 32 33 34)
21877ec681f3Smrg *    (41 42 43 44)
21887ec681f3Smrg *
21897ec681f3Smrg * v: (x  y  z  1 )
21907ec681f3Smrg *
21917ec681f3Smrg * t.xyzw = MUL(v.xxxx, r[0]);
21927ec681f3Smrg * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
21937ec681f3Smrg * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
21947ec681f3Smrg * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
21957ec681f3Smrg *
21967ec681f3Smrg * v.x = DP4(v, c[0]);
21977ec681f3Smrg * v.y = DP4(v, c[1]);
21987ec681f3Smrg * v.z = DP4(v, c[2]);
21997ec681f3Smrg * v.w = DP4(v, c[3]) = 1
22007ec681f3Smrg */
22017ec681f3Smrg
22027ec681f3Smrg/*
22037ec681f3Smrgstatic void
22047ec681f3Smrgnine_D3DMATRIX_print(const D3DMATRIX *M)
22057ec681f3Smrg{
22067ec681f3Smrg    DBG("\n(%f %f %f %f)\n"
22077ec681f3Smrg        "(%f %f %f %f)\n"
22087ec681f3Smrg        "(%f %f %f %f)\n"
22097ec681f3Smrg        "(%f %f %f %f)\n",
22107ec681f3Smrg        M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
22117ec681f3Smrg        M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
22127ec681f3Smrg        M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
22137ec681f3Smrg        M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
22147ec681f3Smrg}
22157ec681f3Smrg*/
22167ec681f3Smrg
22177ec681f3Smrgstatic inline float
22187ec681f3Smrgnine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
22197ec681f3Smrg{
22207ec681f3Smrg    return A->m[r][0] * B->m[0][c] +
22217ec681f3Smrg           A->m[r][1] * B->m[1][c] +
22227ec681f3Smrg           A->m[r][2] * B->m[2][c] +
22237ec681f3Smrg           A->m[r][3] * B->m[3][c];
22247ec681f3Smrg}
22257ec681f3Smrg
22267ec681f3Smrgstatic inline float
22277ec681f3Smrgnine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
22287ec681f3Smrg{
22297ec681f3Smrg    return v->x * M->m[0][c] +
22307ec681f3Smrg           v->y * M->m[1][c] +
22317ec681f3Smrg           v->z * M->m[2][c] +
22327ec681f3Smrg           1.0f * M->m[3][c];
22337ec681f3Smrg}
22347ec681f3Smrg
22357ec681f3Smrgstatic inline float
22367ec681f3Smrgnine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
22377ec681f3Smrg{
22387ec681f3Smrg    return v->x * M->m[0][c] +
22397ec681f3Smrg           v->y * M->m[1][c] +
22407ec681f3Smrg           v->z * M->m[2][c];
22417ec681f3Smrg}
22427ec681f3Smrg
22437ec681f3Smrgvoid
22447ec681f3Smrgnine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
22457ec681f3Smrg{
22467ec681f3Smrg    D->_11 = nine_DP4_row_col(L, 0, R, 0);
22477ec681f3Smrg    D->_12 = nine_DP4_row_col(L, 0, R, 1);
22487ec681f3Smrg    D->_13 = nine_DP4_row_col(L, 0, R, 2);
22497ec681f3Smrg    D->_14 = nine_DP4_row_col(L, 0, R, 3);
22507ec681f3Smrg
22517ec681f3Smrg    D->_21 = nine_DP4_row_col(L, 1, R, 0);
22527ec681f3Smrg    D->_22 = nine_DP4_row_col(L, 1, R, 1);
22537ec681f3Smrg    D->_23 = nine_DP4_row_col(L, 1, R, 2);
22547ec681f3Smrg    D->_24 = nine_DP4_row_col(L, 1, R, 3);
22557ec681f3Smrg
22567ec681f3Smrg    D->_31 = nine_DP4_row_col(L, 2, R, 0);
22577ec681f3Smrg    D->_32 = nine_DP4_row_col(L, 2, R, 1);
22587ec681f3Smrg    D->_33 = nine_DP4_row_col(L, 2, R, 2);
22597ec681f3Smrg    D->_34 = nine_DP4_row_col(L, 2, R, 3);
22607ec681f3Smrg
22617ec681f3Smrg    D->_41 = nine_DP4_row_col(L, 3, R, 0);
22627ec681f3Smrg    D->_42 = nine_DP4_row_col(L, 3, R, 1);
22637ec681f3Smrg    D->_43 = nine_DP4_row_col(L, 3, R, 2);
22647ec681f3Smrg    D->_44 = nine_DP4_row_col(L, 3, R, 3);
22657ec681f3Smrg}
22667ec681f3Smrg
22677ec681f3Smrgvoid
22687ec681f3Smrgnine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
22697ec681f3Smrg{
22707ec681f3Smrg    d->x = nine_DP4_vec_col(v, M, 0);
22717ec681f3Smrg    d->y = nine_DP4_vec_col(v, M, 1);
22727ec681f3Smrg    d->z = nine_DP4_vec_col(v, M, 2);
22737ec681f3Smrg}
22747ec681f3Smrg
22757ec681f3Smrgvoid
22767ec681f3Smrgnine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
22777ec681f3Smrg{
22787ec681f3Smrg    d->x = nine_DP3_vec_col(v, M, 0);
22797ec681f3Smrg    d->y = nine_DP3_vec_col(v, M, 1);
22807ec681f3Smrg    d->z = nine_DP3_vec_col(v, M, 2);
22817ec681f3Smrg}
22827ec681f3Smrg
22837ec681f3Smrgvoid
22847ec681f3Smrgnine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
22857ec681f3Smrg{
22867ec681f3Smrg    unsigned i, j;
22877ec681f3Smrg    for (i = 0; i < 4; ++i)
22887ec681f3Smrg    for (j = 0; j < 4; ++j)
22897ec681f3Smrg        D->m[i][j] = M->m[j][i];
22907ec681f3Smrg}
22917ec681f3Smrg
/* Helpers for nine_d3d_matrix_det(). Each one forms the product
 * M->_1i * M->_2j * M->_3k * M->_4l (one element from each row, columns
 * i, j, k, l) and adds it to (_M_ADD) or subtracts it from (_M_SUB) the
 * running determinant. Positive contributions are accumulated in `pos` and
 * negative ones in `neg`; both must be float locals at the expansion site,
 * together with a matrix pointer named `M`. */
22927ec681f3Smrg#define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
22937ec681f3Smrg    float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
22947ec681f3Smrg    if (t > 0.0f) pos += t; else neg += t; } while(0)
22957ec681f3Smrg
22967ec681f3Smrg#define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
22977ec681f3Smrg    float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
22987ec681f3Smrg    if (t > 0.0f) neg -= t; else pos -= t; } while(0)
/* Determinant of the 4x4 matrix M.
 *
 * Expands the determinant as the signed sum over all 24 column permutations:
 * the 12 even permutations are added, the 12 odd ones subtracted. Positive
 * and negative contributions are summed separately and only combined in the
 * final `pos + neg`.
 */
22997ec681f3Smrgfloat
23007ec681f3Smrgnine_d3d_matrix_det(const D3DMATRIX *M)
23017ec681f3Smrg{
23027ec681f3Smrg    float pos = 0.0f;
23037ec681f3Smrg    float neg = 0.0f;
23047ec681f3Smrg
    /* Even permutations of the column indices. */
23057ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
23067ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
23077ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);
23087ec681f3Smrg
23097ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
23107ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
23117ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);
23127ec681f3Smrg
23137ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
23147ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
23157ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);
23167ec681f3Smrg
23177ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
23187ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
23197ec681f3Smrg    _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);
23207ec681f3Smrg
    /* Odd permutations of the column indices. */
23217ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
23227ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
23237ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);
23247ec681f3Smrg
23257ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
23267ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
23277ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);
23287ec681f3Smrg
23297ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
23307ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
23317ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);
23327ec681f3Smrg
23337ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
23347ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
23357ec681f3Smrg    _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);
23367ec681f3Smrg
23377ec681f3Smrg    return pos + neg;
23387ec681f3Smrg}
23397ec681f3Smrg
23407ec681f3Smrg/* XXX: Probably better to just use src/mesa/math/m_matrix.c because
23417ec681f3Smrg * I have no idea where this code came from.
23427ec681f3Smrg */
23437ec681f3Smrgvoid
23447ec681f3Smrgnine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
23457ec681f3Smrg{
23467ec681f3Smrg    int i, k;
23477ec681f3Smrg    float det;
23487ec681f3Smrg
23497ec681f3Smrg    D->m[0][0] =
23507ec681f3Smrg        M->m[1][1] * M->m[2][2] * M->m[3][3] -
23517ec681f3Smrg        M->m[1][1] * M->m[3][2] * M->m[2][3] -
23527ec681f3Smrg        M->m[1][2] * M->m[2][1] * M->m[3][3] +
23537ec681f3Smrg        M->m[1][2] * M->m[3][1] * M->m[2][3] +
23547ec681f3Smrg        M->m[1][3] * M->m[2][1] * M->m[3][2] -
23557ec681f3Smrg        M->m[1][3] * M->m[3][1] * M->m[2][2];
23567ec681f3Smrg
23577ec681f3Smrg    D->m[0][1] =
23587ec681f3Smrg       -M->m[0][1] * M->m[2][2] * M->m[3][3] +
23597ec681f3Smrg        M->m[0][1] * M->m[3][2] * M->m[2][3] +
23607ec681f3Smrg        M->m[0][2] * M->m[2][1] * M->m[3][3] -
23617ec681f3Smrg        M->m[0][2] * M->m[3][1] * M->m[2][3] -
23627ec681f3Smrg        M->m[0][3] * M->m[2][1] * M->m[3][2] +
23637ec681f3Smrg        M->m[0][3] * M->m[3][1] * M->m[2][2];
23647ec681f3Smrg
23657ec681f3Smrg    D->m[0][2] =
23667ec681f3Smrg        M->m[0][1] * M->m[1][2] * M->m[3][3] -
23677ec681f3Smrg        M->m[0][1] * M->m[3][2] * M->m[1][3] -
23687ec681f3Smrg        M->m[0][2] * M->m[1][1] * M->m[3][3] +
23697ec681f3Smrg        M->m[0][2] * M->m[3][1] * M->m[1][3] +
23707ec681f3Smrg        M->m[0][3] * M->m[1][1] * M->m[3][2] -
23717ec681f3Smrg        M->m[0][3] * M->m[3][1] * M->m[1][2];
23727ec681f3Smrg
23737ec681f3Smrg    D->m[0][3] =
23747ec681f3Smrg       -M->m[0][1] * M->m[1][2] * M->m[2][3] +
23757ec681f3Smrg        M->m[0][1] * M->m[2][2] * M->m[1][3] +
23767ec681f3Smrg        M->m[0][2] * M->m[1][1] * M->m[2][3] -
23777ec681f3Smrg        M->m[0][2] * M->m[2][1] * M->m[1][3] -
23787ec681f3Smrg        M->m[0][3] * M->m[1][1] * M->m[2][2] +
23797ec681f3Smrg        M->m[0][3] * M->m[2][1] * M->m[1][2];
23807ec681f3Smrg
23817ec681f3Smrg    D->m[1][0] =
23827ec681f3Smrg       -M->m[1][0] * M->m[2][2] * M->m[3][3] +
23837ec681f3Smrg        M->m[1][0] * M->m[3][2] * M->m[2][3] +
23847ec681f3Smrg        M->m[1][2] * M->m[2][0] * M->m[3][3] -
23857ec681f3Smrg        M->m[1][2] * M->m[3][0] * M->m[2][3] -
23867ec681f3Smrg        M->m[1][3] * M->m[2][0] * M->m[3][2] +
23877ec681f3Smrg        M->m[1][3] * M->m[3][0] * M->m[2][2];
23887ec681f3Smrg
23897ec681f3Smrg    D->m[1][1] =
23907ec681f3Smrg        M->m[0][0] * M->m[2][2] * M->m[3][3] -
23917ec681f3Smrg        M->m[0][0] * M->m[3][2] * M->m[2][3] -
23927ec681f3Smrg        M->m[0][2] * M->m[2][0] * M->m[3][3] +
23937ec681f3Smrg        M->m[0][2] * M->m[3][0] * M->m[2][3] +
23947ec681f3Smrg        M->m[0][3] * M->m[2][0] * M->m[3][2] -
23957ec681f3Smrg        M->m[0][3] * M->m[3][0] * M->m[2][2];
23967ec681f3Smrg
23977ec681f3Smrg    D->m[1][2] =
23987ec681f3Smrg       -M->m[0][0] * M->m[1][2] * M->m[3][3] +
23997ec681f3Smrg        M->m[0][0] * M->m[3][2] * M->m[1][3] +
24007ec681f3Smrg        M->m[0][2] * M->m[1][0] * M->m[3][3] -
24017ec681f3Smrg        M->m[0][2] * M->m[3][0] * M->m[1][3] -
24027ec681f3Smrg        M->m[0][3] * M->m[1][0] * M->m[3][2] +
24037ec681f3Smrg        M->m[0][3] * M->m[3][0] * M->m[1][2];
24047ec681f3Smrg
24057ec681f3Smrg    D->m[1][3] =
24067ec681f3Smrg        M->m[0][0] * M->m[1][2] * M->m[2][3] -
24077ec681f3Smrg        M->m[0][0] * M->m[2][2] * M->m[1][3] -
24087ec681f3Smrg        M->m[0][2] * M->m[1][0] * M->m[2][3] +
24097ec681f3Smrg        M->m[0][2] * M->m[2][0] * M->m[1][3] +
24107ec681f3Smrg        M->m[0][3] * M->m[1][0] * M->m[2][2] -
24117ec681f3Smrg        M->m[0][3] * M->m[2][0] * M->m[1][2];
24127ec681f3Smrg
24137ec681f3Smrg    D->m[2][0] =
24147ec681f3Smrg        M->m[1][0] * M->m[2][1] * M->m[3][3] -
24157ec681f3Smrg        M->m[1][0] * M->m[3][1] * M->m[2][3] -
24167ec681f3Smrg        M->m[1][1] * M->m[2][0] * M->m[3][3] +
24177ec681f3Smrg        M->m[1][1] * M->m[3][0] * M->m[2][3] +
24187ec681f3Smrg        M->m[1][3] * M->m[2][0] * M->m[3][1] -
24197ec681f3Smrg        M->m[1][3] * M->m[3][0] * M->m[2][1];
24207ec681f3Smrg
24217ec681f3Smrg    D->m[2][1] =
24227ec681f3Smrg       -M->m[0][0] * M->m[2][1] * M->m[3][3] +
24237ec681f3Smrg        M->m[0][0] * M->m[3][1] * M->m[2][3] +
24247ec681f3Smrg        M->m[0][1] * M->m[2][0] * M->m[3][3] -
24257ec681f3Smrg        M->m[0][1] * M->m[3][0] * M->m[2][3] -
24267ec681f3Smrg        M->m[0][3] * M->m[2][0] * M->m[3][1] +
24277ec681f3Smrg        M->m[0][3] * M->m[3][0] * M->m[2][1];
24287ec681f3Smrg
24297ec681f3Smrg    D->m[2][2] =
24307ec681f3Smrg        M->m[0][0] * M->m[1][1] * M->m[3][3] -
24317ec681f3Smrg        M->m[0][0] * M->m[3][1] * M->m[1][3] -
24327ec681f3Smrg        M->m[0][1] * M->m[1][0] * M->m[3][3] +
24337ec681f3Smrg        M->m[0][1] * M->m[3][0] * M->m[1][3] +
24347ec681f3Smrg        M->m[0][3] * M->m[1][0] * M->m[3][1] -
24357ec681f3Smrg        M->m[0][3] * M->m[3][0] * M->m[1][1];
24367ec681f3Smrg
24377ec681f3Smrg    D->m[2][3] =
24387ec681f3Smrg       -M->m[0][0] * M->m[1][1] * M->m[2][3] +
24397ec681f3Smrg        M->m[0][0] * M->m[2][1] * M->m[1][3] +
24407ec681f3Smrg        M->m[0][1] * M->m[1][0] * M->m[2][3] -
24417ec681f3Smrg        M->m[0][1] * M->m[2][0] * M->m[1][3] -
24427ec681f3Smrg        M->m[0][3] * M->m[1][0] * M->m[2][1] +
24437ec681f3Smrg        M->m[0][3] * M->m[2][0] * M->m[1][1];
24447ec681f3Smrg
24457ec681f3Smrg    D->m[3][0] =
24467ec681f3Smrg       -M->m[1][0] * M->m[2][1] * M->m[3][2] +
24477ec681f3Smrg        M->m[1][0] * M->m[3][1] * M->m[2][2] +
24487ec681f3Smrg        M->m[1][1] * M->m[2][0] * M->m[3][2] -
24497ec681f3Smrg        M->m[1][1] * M->m[3][0] * M->m[2][2] -
24507ec681f3Smrg        M->m[1][2] * M->m[2][0] * M->m[3][1] +
24517ec681f3Smrg        M->m[1][2] * M->m[3][0] * M->m[2][1];
24527ec681f3Smrg
24537ec681f3Smrg    D->m[3][1] =
24547ec681f3Smrg        M->m[0][0] * M->m[2][1] * M->m[3][2] -
24557ec681f3Smrg        M->m[0][0] * M->m[3][1] * M->m[2][2] -
24567ec681f3Smrg        M->m[0][1] * M->m[2][0] * M->m[3][2] +
24577ec681f3Smrg        M->m[0][1] * M->m[3][0] * M->m[2][2] +
24587ec681f3Smrg        M->m[0][2] * M->m[2][0] * M->m[3][1] -
24597ec681f3Smrg        M->m[0][2] * M->m[3][0] * M->m[2][1];
24607ec681f3Smrg
24617ec681f3Smrg    D->m[3][2] =
24627ec681f3Smrg       -M->m[0][0] * M->m[1][1] * M->m[3][2] +
24637ec681f3Smrg        M->m[0][0] * M->m[3][1] * M->m[1][2] +
24647ec681f3Smrg        M->m[0][1] * M->m[1][0] * M->m[3][2] -
24657ec681f3Smrg        M->m[0][1] * M->m[3][0] * M->m[1][2] -
24667ec681f3Smrg        M->m[0][2] * M->m[1][0] * M->m[3][1] +
24677ec681f3Smrg        M->m[0][2] * M->m[3][0] * M->m[1][1];
24687ec681f3Smrg
24697ec681f3Smrg    D->m[3][3] =
24707ec681f3Smrg        M->m[0][0] * M->m[1][1] * M->m[2][2] -
24717ec681f3Smrg        M->m[0][0] * M->m[2][1] * M->m[1][2] -
24727ec681f3Smrg        M->m[0][1] * M->m[1][0] * M->m[2][2] +
24737ec681f3Smrg        M->m[0][1] * M->m[2][0] * M->m[1][2] +
24747ec681f3Smrg        M->m[0][2] * M->m[1][0] * M->m[2][1] -
24757ec681f3Smrg        M->m[0][2] * M->m[2][0] * M->m[1][1];
24767ec681f3Smrg
24777ec681f3Smrg    det =
24787ec681f3Smrg        M->m[0][0] * D->m[0][0] +
24797ec681f3Smrg        M->m[1][0] * D->m[0][1] +
24807ec681f3Smrg        M->m[2][0] * D->m[0][2] +
24817ec681f3Smrg        M->m[3][0] * D->m[0][3];
24827ec681f3Smrg
24837ec681f3Smrg    if (fabsf(det) < 1e-30) {/* non inversible */
24847ec681f3Smrg        *D = *M; /* wine tests */
24857ec681f3Smrg        return;
24867ec681f3Smrg    }
24877ec681f3Smrg
24887ec681f3Smrg    det = 1.0 / det;
24897ec681f3Smrg
24907ec681f3Smrg    for (i = 0; i < 4; i++)
24917ec681f3Smrg    for (k = 0; k < 4; k++)
24927ec681f3Smrg        D->m[i][k] *= det;
24937ec681f3Smrg
24947ec681f3Smrg#if defined(DEBUG) || !defined(NDEBUG)
24957ec681f3Smrg    {
24967ec681f3Smrg        D3DMATRIX I;
24977ec681f3Smrg
24987ec681f3Smrg        nine_d3d_matrix_matrix_mul(&I, D, M);
24997ec681f3Smrg
25007ec681f3Smrg        for (i = 0; i < 4; ++i)
25017ec681f3Smrg        for (k = 0; k < 4; ++k)
25027ec681f3Smrg            if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
25037ec681f3Smrg                DBG("Matrix inversion check FAILED !\n");
25047ec681f3Smrg    }
25057ec681f3Smrg#endif
25067ec681f3Smrg}
2507