14a49301eSmrg/* 2af69d88dSmrg * Copyright 2003 VMware, Inc. 34a49301eSmrg * All Rights Reserved. 44a49301eSmrg * 54a49301eSmrg * Permission is hereby granted, free of charge, to any person obtaining a 64a49301eSmrg * copy of this software and associated documentation files (the "Software"), 74a49301eSmrg * to deal in the Software without restriction, including without limitation 84a49301eSmrg * on the rights to use, copy, modify, merge, publish, distribute, sub 94a49301eSmrg * license, and/or sell copies of the Software, and to permit persons to whom 104a49301eSmrg * the Software is furnished to do so, subject to the following conditions: 114a49301eSmrg * 124a49301eSmrg * The above copyright notice and this permission notice (including the next 134a49301eSmrg * paragraph) shall be included in all copies or substantial portions of the 144a49301eSmrg * Software. 154a49301eSmrg * 164a49301eSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 174a49301eSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 184a49301eSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19af69d88dSmrg * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 204a49301eSmrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 214a49301eSmrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 224a49301eSmrg * USE OR OTHER DEALINGS IN THE SOFTWARE. 234a49301eSmrg * 244a49301eSmrg * Authors: 25af69d88dSmrg * Keith Whitwell <keithw@vmware.com> 264a49301eSmrg */ 274a49301eSmrg 284a49301eSmrg 294a49301eSmrg#include "pipe/p_config.h" 304a49301eSmrg#include "pipe/p_compiler.h" 314a49301eSmrg#include "util/u_memory.h" 324a49301eSmrg#include "util/u_math.h" 337ec681f3Smrg#include "util/format/u_format.h" 344a49301eSmrg 354a49301eSmrg#include "translate.h" 364a49301eSmrg 374a49301eSmrg 387ec681f3Smrg#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE) 394a49301eSmrg 404a49301eSmrg#include "rtasm/rtasm_cpu.h" 414a49301eSmrg#include "rtasm/rtasm_x86sse.h" 424a49301eSmrg 434a49301eSmrg 444a49301eSmrg#define X 0 454a49301eSmrg#define Y 1 464a49301eSmrg#define Z 2 474a49301eSmrg#define W 3 484a49301eSmrg 494a49301eSmrg 50af69d88dSmrgstruct translate_buffer 51af69d88dSmrg{ 524a49301eSmrg const void *base_ptr; 533464ebd5Sriastradh uintptr_t stride; 543464ebd5Sriastradh unsigned max_index; 554a49301eSmrg}; 564a49301eSmrg 57af69d88dSmrgstruct translate_buffer_variant 58af69d88dSmrg{ 59cdc920a0Smrg unsigned buffer_index; 60cdc920a0Smrg unsigned instance_divisor; 61af69d88dSmrg void *ptr; /* updated either per vertex or per instance */ 62cdc920a0Smrg}; 63cdc920a0Smrg 64cdc920a0Smrg 65cdc920a0Smrg#define ELEMENT_BUFFER_INSTANCE_ID 1001 66cdc920a0Smrg 673464ebd5Sriastradh#define NUM_CONSTS 7 683464ebd5Sriastradh 693464ebd5Sriastradhenum 703464ebd5Sriastradh{ 713464ebd5Sriastradh CONST_IDENTITY, 723464ebd5Sriastradh CONST_INV_127, 733464ebd5Sriastradh CONST_INV_255, 743464ebd5Sriastradh CONST_INV_32767, 753464ebd5Sriastradh CONST_INV_65535, 763464ebd5Sriastradh CONST_INV_2147483647, 773464ebd5Sriastradh CONST_255 783464ebd5Sriastradh}; 793464ebd5Sriastradh 803464ebd5Sriastradh#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} 813464ebd5Sriastradhstatic float consts[NUM_CONSTS][4] = { 82af69d88dSmrg {0, 0, 0, 1}, 83af69d88dSmrg C(1.0 / 127.0), 84af69d88dSmrg C(1.0 / 255.0), 85af69d88dSmrg C(1.0 / 32767.0), 86af69d88dSmrg C(1.0 / 65535.0), 87af69d88dSmrg C(1.0 / 2147483647.0), 88af69d88dSmrg C(255.0) 893464ebd5Sriastradh}; 90af69d88dSmrg 913464ebd5Sriastradh#undef C 924a49301eSmrg 93af69d88dSmrgstruct translate_sse 94af69d88dSmrg{ 954a49301eSmrg struct translate translate; 964a49301eSmrg 974a49301eSmrg struct x86_function linear_func; 984a49301eSmrg struct x86_function elt_func; 993464ebd5Sriastradh struct x86_function elt16_func; 1003464ebd5Sriastradh struct x86_function elt8_func; 1014a49301eSmrg struct x86_function *func; 1024a49301eSmrg 103af69d88dSmrg PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; 1043464ebd5Sriastradh int8_t reg_to_const[16]; 1053464ebd5Sriastradh int8_t const_to_reg[NUM_CONSTS]; 1064a49301eSmrg 107af69d88dSmrg struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS]; 1084a49301eSmrg unsigned nr_buffers; 1094a49301eSmrg 1103464ebd5Sriastradh /* Multiple buffer variants can map to a single buffer. */ 111af69d88dSmrg struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS]; 1123464ebd5Sriastradh unsigned nr_buffer_variants; 113cdc920a0Smrg 1143464ebd5Sriastradh /* Multiple elements can map to a single buffer variant. */ 115af69d88dSmrg unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS]; 116cdc920a0Smrg 117cdc920a0Smrg boolean use_instancing; 118cdc920a0Smrg unsigned instance_id; 119af69d88dSmrg unsigned start_instance; 120cdc920a0Smrg 1214a49301eSmrg /* these are actually known values, but putting them in a struct 1224a49301eSmrg * like this is helpful to keep them in sync across the file. 1234a49301eSmrg */ 1244a49301eSmrg struct x86_reg tmp_EAX; 1253464ebd5Sriastradh struct x86_reg tmp2_EDX; 1263464ebd5Sriastradh struct x86_reg src_ECX; 127af69d88dSmrg struct x86_reg idx_ESI; /* either start+i or &elt[i] */ 1283464ebd5Sriastradh struct x86_reg machine_EDI; 1293464ebd5Sriastradh struct x86_reg outbuf_EBX; 1303464ebd5Sriastradh struct x86_reg count_EBP; /* decrements to zero */ 1314a49301eSmrg}; 1324a49301eSmrg 133af69d88dSmrg 134af69d88dSmrgstatic int 135af69d88dSmrgget_offset(const void *a, const void *b) 1364a49301eSmrg{ 137af69d88dSmrg return (const char *) b - (const char *) a; 1384a49301eSmrg} 1394a49301eSmrg 140af69d88dSmrg 141af69d88dSmrgstatic struct x86_reg 142af69d88dSmrgget_const(struct translate_sse *p, unsigned id) 1433464ebd5Sriastradh{ 1443464ebd5Sriastradh struct x86_reg reg; 1453464ebd5Sriastradh unsigned i; 1464a49301eSmrg 147af69d88dSmrg if (p->const_to_reg[id] >= 0) 1483464ebd5Sriastradh return x86_make_reg(file_XMM, p->const_to_reg[id]); 1494a49301eSmrg 150af69d88dSmrg for (i = 2; i < 8; ++i) { 151af69d88dSmrg if (p->reg_to_const[i] < 0) 1523464ebd5Sriastradh break; 1534a49301eSmrg } 1544a49301eSmrg 1553464ebd5Sriastradh /* TODO: be smarter here */ 156af69d88dSmrg if (i == 8) 1573464ebd5Sriastradh --i; 1583464ebd5Sriastradh 1593464ebd5Sriastradh reg = x86_make_reg(file_XMM, i); 1603464ebd5Sriastradh 161af69d88dSmrg if (p->reg_to_const[i] >= 0) 1623464ebd5Sriastradh p->const_to_reg[p->reg_to_const[i]] = -1; 1633464ebd5Sriastradh 1643464ebd5Sriastradh p->reg_to_const[i] = id; 1653464ebd5Sriastradh p->const_to_reg[id] = i; 1663464ebd5Sriastradh 1673464ebd5Sriastradh /* TODO: this should happen outside the loop, if possible */ 1683464ebd5Sriastradh sse_movaps(p->func, reg, 169af69d88dSmrg x86_make_disp(p->machine_EDI, 170af69d88dSmrg get_offset(p, &p->consts[id][0]))); 1713464ebd5Sriastradh 1724a49301eSmrg return reg; 1734a49301eSmrg} 1744a49301eSmrg 175af69d88dSmrg 1763464ebd5Sriastradh/* load the data in a SSE2 register, padding with zeros */ 177af69d88dSmrgstatic boolean 178af69d88dSmrgemit_load_sse2(struct translate_sse *p, 179af69d88dSmrg struct x86_reg data, struct x86_reg src, unsigned size) 1804a49301eSmrg{ 1813464ebd5Sriastradh struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 1823464ebd5Sriastradh struct x86_reg tmp = p->tmp_EAX; 183af69d88dSmrg switch (size) { 1843464ebd5Sriastradh case 1: 1853464ebd5Sriastradh x86_movzx8(p->func, tmp, src); 1863464ebd5Sriastradh sse2_movd(p->func, data, tmp); 1873464ebd5Sriastradh break; 1883464ebd5Sriastradh case 2: 1893464ebd5Sriastradh x86_movzx16(p->func, tmp, src); 1903464ebd5Sriastradh sse2_movd(p->func, data, tmp); 1913464ebd5Sriastradh break; 1923464ebd5Sriastradh case 3: 1933464ebd5Sriastradh x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); 1943464ebd5Sriastradh x86_shl_imm(p->func, tmp, 16); 1953464ebd5Sriastradh x86_mov16(p->func, tmp, src); 1963464ebd5Sriastradh sse2_movd(p->func, data, tmp); 1973464ebd5Sriastradh break; 1983464ebd5Sriastradh case 4: 1993464ebd5Sriastradh sse2_movd(p->func, data, src); 2003464ebd5Sriastradh break; 2013464ebd5Sriastradh case 6: 2023464ebd5Sriastradh sse2_movd(p->func, data, src); 2033464ebd5Sriastradh x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); 2043464ebd5Sriastradh sse2_movd(p->func, tmpXMM, tmp); 2053464ebd5Sriastradh sse2_punpckldq(p->func, data, tmpXMM); 2063464ebd5Sriastradh break; 2073464ebd5Sriastradh case 8: 2083464ebd5Sriastradh sse2_movq(p->func, data, src); 2093464ebd5Sriastradh break; 2103464ebd5Sriastradh case 12: 2113464ebd5Sriastradh sse2_movq(p->func, data, src); 2123464ebd5Sriastradh sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); 2133464ebd5Sriastradh sse2_punpcklqdq(p->func, data, tmpXMM); 2143464ebd5Sriastradh break; 2153464ebd5Sriastradh case 16: 2163464ebd5Sriastradh sse2_movdqu(p->func, data, src); 2173464ebd5Sriastradh break; 2183464ebd5Sriastradh default: 2193464ebd5Sriastradh return FALSE; 2204a49301eSmrg } 2213464ebd5Sriastradh return TRUE; 2224a49301eSmrg} 2234a49301eSmrg 224af69d88dSmrg 2253464ebd5Sriastradh/* this value can be passed for the out_chans argument */ 2263464ebd5Sriastradh#define CHANNELS_0001 5 2273464ebd5Sriastradh 228af69d88dSmrg 2293464ebd5Sriastradh/* this function will load #chans float values, and will 2303464ebd5Sriastradh * pad the register with zeroes at least up to out_chans. 2313464ebd5Sriastradh * 2323464ebd5Sriastradh * If out_chans is set to CHANNELS_0001, then the fourth 2333464ebd5Sriastradh * value will be padded with 1. Only pass this value if 2343464ebd5Sriastradh * chans < 4 or results are undefined. 2353464ebd5Sriastradh */ 236af69d88dSmrgstatic void 237af69d88dSmrgemit_load_float32(struct translate_sse *p, struct x86_reg data, 238af69d88dSmrg struct x86_reg arg0, unsigned out_chans, unsigned chans) 2394a49301eSmrg{ 240af69d88dSmrg switch (chans) { 2413464ebd5Sriastradh case 1: 2423464ebd5Sriastradh /* a 0 0 0 2433464ebd5Sriastradh * a 0 0 1 2443464ebd5Sriastradh */ 2453464ebd5Sriastradh sse_movss(p->func, data, arg0); 246af69d88dSmrg if (out_chans == CHANNELS_0001) 247af69d88dSmrg sse_orps(p->func, data, get_const(p, CONST_IDENTITY)); 2483464ebd5Sriastradh break; 2493464ebd5Sriastradh case 2: 2503464ebd5Sriastradh /* 0 0 0 1 2513464ebd5Sriastradh * a b 0 1 2523464ebd5Sriastradh */ 253af69d88dSmrg if (out_chans == CHANNELS_0001) 254af69d88dSmrg sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 255af69d88dSmrg SHUF(X, Y, Z, W)); 256af69d88dSmrg else if (out_chans > 2) 257af69d88dSmrg sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY)); 2583464ebd5Sriastradh sse_movlps(p->func, data, arg0); 2593464ebd5Sriastradh break; 2603464ebd5Sriastradh case 3: 2613464ebd5Sriastradh /* Have to jump through some hoops: 2623464ebd5Sriastradh * 2633464ebd5Sriastradh * c 0 0 0 2643464ebd5Sriastradh * c 0 0 1 if out_chans == CHANNELS_0001 2653464ebd5Sriastradh * 0 0 c 0/1 2663464ebd5Sriastradh * a b c 0/1 2673464ebd5Sriastradh */ 2683464ebd5Sriastradh sse_movss(p->func, data, x86_make_disp(arg0, 8)); 269af69d88dSmrg if (out_chans == CHANNELS_0001) 270af69d88dSmrg sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 271af69d88dSmrg SHUF(X, Y, Z, W)); 272af69d88dSmrg sse_shufps(p->func, data, data, SHUF(Y, Z, X, W)); 2733464ebd5Sriastradh sse_movlps(p->func, data, arg0); 2743464ebd5Sriastradh break; 2753464ebd5Sriastradh case 4: 2763464ebd5Sriastradh sse_movups(p->func, data, arg0); 2773464ebd5Sriastradh break; 2784a49301eSmrg } 2794a49301eSmrg} 2804a49301eSmrg 2813464ebd5Sriastradh/* this function behaves like emit_load_float32, but loads 2823464ebd5Sriastradh 64-bit floating point numbers, converting them to 32-bit 2833464ebd5Sriastradh ones */ 284af69d88dSmrgstatic void 285af69d88dSmrgemit_load_float64to32(struct translate_sse *p, struct x86_reg data, 286af69d88dSmrg struct x86_reg arg0, unsigned out_chans, unsigned chans) 2873464ebd5Sriastradh{ 2883464ebd5Sriastradh struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 289af69d88dSmrg switch (chans) { 2903464ebd5Sriastradh case 1: 2913464ebd5Sriastradh sse2_movsd(p->func, data, arg0); 292af69d88dSmrg if (out_chans > 1) 2933464ebd5Sriastradh sse2_cvtpd2ps(p->func, data, data); 2943464ebd5Sriastradh else 2953464ebd5Sriastradh sse2_cvtsd2ss(p->func, data, data); 296af69d88dSmrg if (out_chans == CHANNELS_0001) 297af69d88dSmrg sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 298af69d88dSmrg SHUF(X, Y, Z, W)); 2993464ebd5Sriastradh break; 3003464ebd5Sriastradh case 2: 3013464ebd5Sriastradh sse2_movupd(p->func, data, arg0); 3023464ebd5Sriastradh sse2_cvtpd2ps(p->func, data, data); 303af69d88dSmrg if (out_chans == CHANNELS_0001) 304af69d88dSmrg sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 305af69d88dSmrg SHUF(X, Y, Z, W)); 306af69d88dSmrg else if (out_chans > 2) 307af69d88dSmrg sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY)); 308af69d88dSmrg break; 3093464ebd5Sriastradh case 3: 3103464ebd5Sriastradh sse2_movupd(p->func, data, arg0); 3113464ebd5Sriastradh sse2_cvtpd2ps(p->func, data, data); 3123464ebd5Sriastradh sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); 313af69d88dSmrg if (out_chans > 3) 3143464ebd5Sriastradh sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); 3153464ebd5Sriastradh else 3163464ebd5Sriastradh sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); 3173464ebd5Sriastradh sse_movlhps(p->func, data, tmpXMM); 318af69d88dSmrg if (out_chans == CHANNELS_0001) 319af69d88dSmrg sse_orps(p->func, data, get_const(p, CONST_IDENTITY)); 3203464ebd5Sriastradh break; 3213464ebd5Sriastradh case 4: 3223464ebd5Sriastradh sse2_movupd(p->func, data, arg0); 3233464ebd5Sriastradh sse2_cvtpd2ps(p->func, data, data); 3243464ebd5Sriastradh sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); 3253464ebd5Sriastradh sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); 3263464ebd5Sriastradh sse_movlhps(p->func, data, tmpXMM); 3273464ebd5Sriastradh break; 3283464ebd5Sriastradh } 3293464ebd5Sriastradh} 3304a49301eSmrg 331af69d88dSmrg 332af69d88dSmrgstatic void 333af69d88dSmrgemit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, 334af69d88dSmrg struct x86_reg dst_xmm, struct x86_reg src_gpr, 335af69d88dSmrg struct x86_reg src_xmm) 3364a49301eSmrg{ 337af69d88dSmrg if (x86_target(p->func) != X86_32) 3383464ebd5Sriastradh x64_mov64(p->func, dst_gpr, src_gpr); 339af69d88dSmrg else { 3403464ebd5Sriastradh /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ 341af69d88dSmrg if (x86_target_caps(p->func) & X86_SSE2) 3423464ebd5Sriastradh sse2_movq(p->func, dst_xmm, src_xmm); 3433464ebd5Sriastradh else 3443464ebd5Sriastradh sse_movlps(p->func, dst_xmm, src_xmm); 3453464ebd5Sriastradh } 3464a49301eSmrg} 3474a49301eSmrg 348af69d88dSmrg 349af69d88dSmrgstatic void 350af69d88dSmrgemit_load64(struct translate_sse *p, struct x86_reg dst_gpr, 351af69d88dSmrg struct x86_reg dst_xmm, struct x86_reg src) 3524a49301eSmrg{ 3533464ebd5Sriastradh emit_mov64(p, dst_gpr, dst_xmm, src, src); 3544a49301eSmrg} 3554a49301eSmrg 356af69d88dSmrg 357af69d88dSmrgstatic void 358af69d88dSmrgemit_store64(struct translate_sse *p, struct x86_reg dst, 359af69d88dSmrg struct x86_reg src_gpr, struct x86_reg src_xmm) 3604a49301eSmrg{ 3613464ebd5Sriastradh emit_mov64(p, dst, dst, src_gpr, src_xmm); 3624a49301eSmrg} 3634a49301eSmrg 364af69d88dSmrg 365af69d88dSmrgstatic void 366af69d88dSmrgemit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) 3673464ebd5Sriastradh{ 368af69d88dSmrg if (x86_target_caps(p->func) & X86_SSE2) 3693464ebd5Sriastradh sse2_movdqu(p->func, dst, src); 3703464ebd5Sriastradh else 3713464ebd5Sriastradh sse_movups(p->func, dst, src); 3723464ebd5Sriastradh} 3734a49301eSmrg 374af69d88dSmrg 3753464ebd5Sriastradh/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, 3763464ebd5Sriastradh * but may or may not be good on older processors 3773464ebd5Sriastradh * TODO: may perhaps want to use non-temporal stores here if possible 3783464ebd5Sriastradh */ 379af69d88dSmrgstatic void 380af69d88dSmrgemit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, 381af69d88dSmrg unsigned size) 3824a49301eSmrg{ 3833464ebd5Sriastradh struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 3843464ebd5Sriastradh struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); 3853464ebd5Sriastradh struct x86_reg dataGPR = p->tmp_EAX; 3863464ebd5Sriastradh struct x86_reg dataGPR2 = p->tmp2_EDX; 3873464ebd5Sriastradh 388af69d88dSmrg if (size < 8) { 389af69d88dSmrg switch (size) { 3903464ebd5Sriastradh case 1: 3913464ebd5Sriastradh x86_mov8(p->func, dataGPR, src); 3923464ebd5Sriastradh x86_mov8(p->func, dst, dataGPR); 3933464ebd5Sriastradh break; 3943464ebd5Sriastradh case 2: 3953464ebd5Sriastradh x86_mov16(p->func, dataGPR, src); 3963464ebd5Sriastradh x86_mov16(p->func, dst, dataGPR); 3973464ebd5Sriastradh break; 3983464ebd5Sriastradh case 3: 3993464ebd5Sriastradh x86_mov16(p->func, dataGPR, src); 4003464ebd5Sriastradh x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); 4013464ebd5Sriastradh x86_mov16(p->func, dst, dataGPR); 4023464ebd5Sriastradh x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); 4033464ebd5Sriastradh break; 4043464ebd5Sriastradh case 4: 4053464ebd5Sriastradh x86_mov(p->func, dataGPR, src); 4063464ebd5Sriastradh x86_mov(p->func, dst, dataGPR); 4073464ebd5Sriastradh break; 4083464ebd5Sriastradh case 6: 4093464ebd5Sriastradh x86_mov(p->func, dataGPR, src); 4103464ebd5Sriastradh x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); 4113464ebd5Sriastradh x86_mov(p->func, dst, dataGPR); 4123464ebd5Sriastradh x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); 4133464ebd5Sriastradh break; 4143464ebd5Sriastradh } 4153464ebd5Sriastradh } 416af69d88dSmrg else if (!(x86_target_caps(p->func) & X86_SSE)) { 4173464ebd5Sriastradh unsigned i = 0; 4183464ebd5Sriastradh assert((size & 3) == 0); 419af69d88dSmrg for (i = 0; i < size; i += 4) { 4203464ebd5Sriastradh x86_mov(p->func, dataGPR, x86_make_disp(src, i)); 4213464ebd5Sriastradh x86_mov(p->func, x86_make_disp(dst, i), dataGPR); 4223464ebd5Sriastradh } 4233464ebd5Sriastradh } 424af69d88dSmrg else { 425af69d88dSmrg switch (size) { 4263464ebd5Sriastradh case 8: 4273464ebd5Sriastradh emit_load64(p, dataGPR, dataXMM, src); 4283464ebd5Sriastradh emit_store64(p, dst, dataGPR, dataXMM); 4293464ebd5Sriastradh break; 4303464ebd5Sriastradh case 12: 4313464ebd5Sriastradh emit_load64(p, dataGPR2, dataXMM, src); 4323464ebd5Sriastradh x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); 4333464ebd5Sriastradh emit_store64(p, dst, dataGPR2, dataXMM); 4343464ebd5Sriastradh x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); 4353464ebd5Sriastradh break; 4363464ebd5Sriastradh case 16: 4373464ebd5Sriastradh emit_mov128(p, dataXMM, src); 4383464ebd5Sriastradh emit_mov128(p, dst, dataXMM); 4393464ebd5Sriastradh break; 4403464ebd5Sriastradh case 24: 4413464ebd5Sriastradh emit_mov128(p, dataXMM, src); 4423464ebd5Sriastradh emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); 4433464ebd5Sriastradh emit_mov128(p, dst, dataXMM); 4443464ebd5Sriastradh emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); 4453464ebd5Sriastradh break; 4463464ebd5Sriastradh case 32: 4473464ebd5Sriastradh emit_mov128(p, dataXMM, src); 4483464ebd5Sriastradh emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); 4493464ebd5Sriastradh emit_mov128(p, dst, dataXMM); 4503464ebd5Sriastradh emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); 4513464ebd5Sriastradh break; 4523464ebd5Sriastradh default: 4533464ebd5Sriastradh assert(0); 4543464ebd5Sriastradh } 4553464ebd5Sriastradh } 4564a49301eSmrg} 4574a49301eSmrg 458af69d88dSmrgstatic boolean 459af69d88dSmrgtranslate_attr_convert(struct translate_sse *p, 460af69d88dSmrg const struct translate_element *a, 461af69d88dSmrg struct x86_reg src, struct x86_reg dst) 4624a49301eSmrg{ 463af69d88dSmrg const struct util_format_description *input_desc = 464af69d88dSmrg util_format_description(a->input_format); 465af69d88dSmrg const struct util_format_description *output_desc = 466af69d88dSmrg util_format_description(a->output_format); 4673464ebd5Sriastradh unsigned i; 4683464ebd5Sriastradh boolean id_swizzle = TRUE; 469af69d88dSmrg unsigned swizzle[4] = 47001e04c3fSmrg { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE, 47101e04c3fSmrg PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE }; 4723464ebd5Sriastradh unsigned needed_chans = 0; 473af69d88dSmrg unsigned imms[2] = { 0, 0x3f800000 }; 4744a49301eSmrg 475af69d88dSmrg if (a->output_format == PIPE_FORMAT_NONE 476af69d88dSmrg || a->input_format == PIPE_FORMAT_NONE) 4773464ebd5Sriastradh return FALSE; 4784a49301eSmrg 479af69d88dSmrg if (input_desc->channel[0].size & 7) 4803464ebd5Sriastradh return FALSE; 4814a49301eSmrg 482af69d88dSmrg if (input_desc->colorspace != output_desc->colorspace) 4833464ebd5Sriastradh return FALSE; 4844a49301eSmrg 485af69d88dSmrg for (i = 1; i < input_desc->nr_channels; ++i) { 486af69d88dSmrg if (memcmp 487af69d88dSmrg (&input_desc->channel[i], &input_desc->channel[0], 488af69d88dSmrg sizeof(input_desc->channel[0]))) 4893464ebd5Sriastradh return FALSE; 4903464ebd5Sriastradh } 4914a49301eSmrg 492af69d88dSmrg for (i = 1; i < output_desc->nr_channels; ++i) { 493af69d88dSmrg if (memcmp 494af69d88dSmrg (&output_desc->channel[i], &output_desc->channel[0], 495af69d88dSmrg sizeof(output_desc->channel[0]))) { 4963464ebd5Sriastradh return FALSE; 497af69d88dSmrg } 4983464ebd5Sriastradh } 4994a49301eSmrg 500af69d88dSmrg for (i = 0; i < output_desc->nr_channels; ++i) { 501af69d88dSmrg if (output_desc->swizzle[i] < 4) 5023464ebd5Sriastradh swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; 5033464ebd5Sriastradh } 5044a49301eSmrg 505af69d88dSmrg if ((x86_target_caps(p->func) & X86_SSE) && 506af69d88dSmrg (0 || a->output_format == PIPE_FORMAT_R32_FLOAT 507af69d88dSmrg || a->output_format == PIPE_FORMAT_R32G32_FLOAT 508af69d88dSmrg || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT 509af69d88dSmrg || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) { 5103464ebd5Sriastradh struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 5114a49301eSmrg 512af69d88dSmrg for (i = 0; i < output_desc->nr_channels; ++i) { 51301e04c3fSmrg if (swizzle[i] == PIPE_SWIZZLE_0 514af69d88dSmrg && i >= input_desc->nr_channels) 5153464ebd5Sriastradh swizzle[i] = i; 5163464ebd5Sriastradh } 5174a49301eSmrg 518af69d88dSmrg for (i = 0; i < output_desc->nr_channels; ++i) { 519af69d88dSmrg if (swizzle[i] < 4) 5203464ebd5Sriastradh needed_chans = MAX2(needed_chans, swizzle[i] + 1); 52101e04c3fSmrg if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i) 5223464ebd5Sriastradh id_swizzle = FALSE; 5233464ebd5Sriastradh } 5244a49301eSmrg 525af69d88dSmrg if (needed_chans > 0) { 526af69d88dSmrg switch (input_desc->channel[0].type) { 5273464ebd5Sriastradh case UTIL_FORMAT_TYPE_UNSIGNED: 528af69d88dSmrg if (!(x86_target_caps(p->func) & X86_SSE2)) 5293464ebd5Sriastradh return FALSE; 530af69d88dSmrg emit_load_sse2(p, dataXMM, src, 531af69d88dSmrg input_desc->channel[0].size * 532af69d88dSmrg input_desc->nr_channels >> 3); 5333464ebd5Sriastradh 5343464ebd5Sriastradh /* TODO: add support for SSE4.1 pmovzx */ 535af69d88dSmrg switch (input_desc->channel[0].size) { 5363464ebd5Sriastradh case 8: 537af69d88dSmrg /* TODO: this may be inefficient due to get_identity() being 538af69d88dSmrg * used both as a float and integer register. 539af69d88dSmrg */ 5403464ebd5Sriastradh sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 5413464ebd5Sriastradh sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 5423464ebd5Sriastradh break; 5433464ebd5Sriastradh case 16: 5443464ebd5Sriastradh sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 5453464ebd5Sriastradh break; 546af69d88dSmrg case 32: /* we lose precision here */ 5473464ebd5Sriastradh sse2_psrld_imm(p->func, dataXMM, 1); 5483464ebd5Sriastradh break; 5493464ebd5Sriastradh default: 5503464ebd5Sriastradh return FALSE; 5513464ebd5Sriastradh } 5523464ebd5Sriastradh sse2_cvtdq2ps(p->func, dataXMM, dataXMM); 553af69d88dSmrg if (input_desc->channel[0].normalized) { 5543464ebd5Sriastradh struct x86_reg factor; 555af69d88dSmrg switch (input_desc->channel[0].size) { 5563464ebd5Sriastradh case 8: 5573464ebd5Sriastradh factor = get_const(p, CONST_INV_255); 5583464ebd5Sriastradh break; 5593464ebd5Sriastradh case 16: 5603464ebd5Sriastradh factor = get_const(p, CONST_INV_65535); 5613464ebd5Sriastradh break; 5623464ebd5Sriastradh case 32: 5633464ebd5Sriastradh factor = get_const(p, CONST_INV_2147483647); 5643464ebd5Sriastradh break; 5653464ebd5Sriastradh default: 5663464ebd5Sriastradh assert(0); 5673464ebd5Sriastradh factor.disp = 0; 5683464ebd5Sriastradh factor.file = 0; 5693464ebd5Sriastradh factor.idx = 0; 5703464ebd5Sriastradh factor.mod = 0; 5713464ebd5Sriastradh break; 5723464ebd5Sriastradh } 5733464ebd5Sriastradh sse_mulps(p->func, dataXMM, factor); 5743464ebd5Sriastradh } 575af69d88dSmrg else if (input_desc->channel[0].size == 32) 576af69d88dSmrg /* compensate for the bit we threw away to fit u32 into s32 */ 577af69d88dSmrg sse_addps(p->func, dataXMM, dataXMM); 5783464ebd5Sriastradh break; 5793464ebd5Sriastradh case UTIL_FORMAT_TYPE_SIGNED: 580af69d88dSmrg if (!(x86_target_caps(p->func) & X86_SSE2)) 5813464ebd5Sriastradh return FALSE; 582af69d88dSmrg emit_load_sse2(p, dataXMM, src, 583af69d88dSmrg input_desc->channel[0].size * 584af69d88dSmrg input_desc->nr_channels >> 3); 5853464ebd5Sriastradh 5863464ebd5Sriastradh /* TODO: add support for SSE4.1 pmovsx */ 587af69d88dSmrg switch (input_desc->channel[0].size) { 5883464ebd5Sriastradh case 8: 5893464ebd5Sriastradh sse2_punpcklbw(p->func, dataXMM, dataXMM); 5903464ebd5Sriastradh sse2_punpcklbw(p->func, dataXMM, dataXMM); 5913464ebd5Sriastradh sse2_psrad_imm(p->func, dataXMM, 24); 5923464ebd5Sriastradh break; 5933464ebd5Sriastradh case 16: 5943464ebd5Sriastradh sse2_punpcklwd(p->func, dataXMM, dataXMM); 5953464ebd5Sriastradh sse2_psrad_imm(p->func, dataXMM, 16); 5963464ebd5Sriastradh break; 597af69d88dSmrg case 32: /* we lose precision here */ 5983464ebd5Sriastradh break; 5993464ebd5Sriastradh default: 6003464ebd5Sriastradh return FALSE; 6013464ebd5Sriastradh } 6023464ebd5Sriastradh sse2_cvtdq2ps(p->func, dataXMM, dataXMM); 603af69d88dSmrg if (input_desc->channel[0].normalized) { 6043464ebd5Sriastradh struct x86_reg factor; 605af69d88dSmrg switch (input_desc->channel[0].size) { 6063464ebd5Sriastradh case 8: 6073464ebd5Sriastradh factor = get_const(p, CONST_INV_127); 6083464ebd5Sriastradh break; 6093464ebd5Sriastradh case 16: 6103464ebd5Sriastradh factor = get_const(p, CONST_INV_32767); 6113464ebd5Sriastradh break; 6123464ebd5Sriastradh case 32: 6133464ebd5Sriastradh factor = get_const(p, CONST_INV_2147483647); 6143464ebd5Sriastradh break; 6153464ebd5Sriastradh default: 6163464ebd5Sriastradh assert(0); 6173464ebd5Sriastradh factor.disp = 0; 6183464ebd5Sriastradh factor.file = 0; 6193464ebd5Sriastradh factor.idx = 0; 6203464ebd5Sriastradh factor.mod = 0; 6213464ebd5Sriastradh break; 6223464ebd5Sriastradh } 6233464ebd5Sriastradh sse_mulps(p->func, dataXMM, factor); 6243464ebd5Sriastradh } 6253464ebd5Sriastradh break; 6263464ebd5Sriastradh 6273464ebd5Sriastradh break; 6283464ebd5Sriastradh case UTIL_FORMAT_TYPE_FLOAT: 629af69d88dSmrg if (input_desc->channel[0].size != 32 630af69d88dSmrg && input_desc->channel[0].size != 64) { 6313464ebd5Sriastradh return FALSE; 632af69d88dSmrg } 63301e04c3fSmrg if (swizzle[3] == PIPE_SWIZZLE_1 634af69d88dSmrg && input_desc->nr_channels <= 3) { 63501e04c3fSmrg swizzle[3] = PIPE_SWIZZLE_W; 6363464ebd5Sriastradh needed_chans = CHANNELS_0001; 6373464ebd5Sriastradh } 638af69d88dSmrg switch (input_desc->channel[0].size) { 6393464ebd5Sriastradh case 32: 640af69d88dSmrg emit_load_float32(p, dataXMM, src, needed_chans, 641af69d88dSmrg input_desc->nr_channels); 6423464ebd5Sriastradh break; 643af69d88dSmrg case 64: /* we lose precision here */ 644af69d88dSmrg if (!(x86_target_caps(p->func) & X86_SSE2)) 6453464ebd5Sriastradh return FALSE; 646af69d88dSmrg emit_load_float64to32(p, dataXMM, src, needed_chans, 647af69d88dSmrg input_desc->nr_channels); 6483464ebd5Sriastradh break; 6493464ebd5Sriastradh default: 6503464ebd5Sriastradh return FALSE; 6513464ebd5Sriastradh } 6523464ebd5Sriastradh break; 6533464ebd5Sriastradh default: 6543464ebd5Sriastradh return FALSE; 6553464ebd5Sriastradh } 6564a49301eSmrg 657af69d88dSmrg if (!id_swizzle) { 658af69d88dSmrg sse_shufps(p->func, dataXMM, dataXMM, 659af69d88dSmrg SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3])); 660af69d88dSmrg } 6613464ebd5Sriastradh } 6623464ebd5Sriastradh 663af69d88dSmrg if (output_desc->nr_channels >= 4 66401e04c3fSmrg && swizzle[0] < PIPE_SWIZZLE_0 66501e04c3fSmrg && swizzle[1] < PIPE_SWIZZLE_0 66601e04c3fSmrg && swizzle[2] < PIPE_SWIZZLE_0 66701e04c3fSmrg && swizzle[3] < PIPE_SWIZZLE_0) { 6683464ebd5Sriastradh sse_movups(p->func, dst, dataXMM); 669af69d88dSmrg } 670af69d88dSmrg else { 671af69d88dSmrg if (output_desc->nr_channels >= 2 67201e04c3fSmrg && swizzle[0] < PIPE_SWIZZLE_0 67301e04c3fSmrg && swizzle[1] < PIPE_SWIZZLE_0) { 6743464ebd5Sriastradh sse_movlps(p->func, dst, dataXMM); 675af69d88dSmrg } 676af69d88dSmrg else { 67701e04c3fSmrg if (swizzle[0] < PIPE_SWIZZLE_0) { 6783464ebd5Sriastradh sse_movss(p->func, dst, dataXMM); 679af69d88dSmrg } 680af69d88dSmrg else { 681af69d88dSmrg x86_mov_imm(p->func, dst, 68201e04c3fSmrg imms[swizzle[0] - PIPE_SWIZZLE_0]); 683af69d88dSmrg } 6843464ebd5Sriastradh 685af69d88dSmrg if (output_desc->nr_channels >= 2) { 68601e04c3fSmrg if (swizzle[1] < PIPE_SWIZZLE_0) { 6873464ebd5Sriastradh sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); 6883464ebd5Sriastradh sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); 6893464ebd5Sriastradh } 690af69d88dSmrg else { 691af69d88dSmrg x86_mov_imm(p->func, x86_make_disp(dst, 4), 69201e04c3fSmrg imms[swizzle[1] - PIPE_SWIZZLE_0]); 693af69d88dSmrg } 6943464ebd5Sriastradh } 6953464ebd5Sriastradh } 6964a49301eSmrg 697af69d88dSmrg if (output_desc->nr_channels >= 3) { 698af69d88dSmrg if (output_desc->nr_channels >= 4 69901e04c3fSmrg && swizzle[2] < PIPE_SWIZZLE_0 70001e04c3fSmrg && swizzle[3] < PIPE_SWIZZLE_0) { 7013464ebd5Sriastradh sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); 702af69d88dSmrg } 703af69d88dSmrg else { 70401e04c3fSmrg if (swizzle[2] < PIPE_SWIZZLE_0) { 7053464ebd5Sriastradh sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); 7063464ebd5Sriastradh sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); 7073464ebd5Sriastradh } 708af69d88dSmrg else { 709af69d88dSmrg x86_mov_imm(p->func, x86_make_disp(dst, 8), 71001e04c3fSmrg imms[swizzle[2] - PIPE_SWIZZLE_0]); 711af69d88dSmrg } 7123464ebd5Sriastradh 713af69d88dSmrg if (output_desc->nr_channels >= 4) { 71401e04c3fSmrg if (swizzle[3] < PIPE_SWIZZLE_0) { 7153464ebd5Sriastradh sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); 7163464ebd5Sriastradh sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); 7173464ebd5Sriastradh } 718af69d88dSmrg else { 719af69d88dSmrg x86_mov_imm(p->func, x86_make_disp(dst, 12), 72001e04c3fSmrg imms[swizzle[3] - PIPE_SWIZZLE_0]); 721af69d88dSmrg } 7223464ebd5Sriastradh } 7233464ebd5Sriastradh } 7243464ebd5Sriastradh } 7253464ebd5Sriastradh } 7263464ebd5Sriastradh return TRUE; 7273464ebd5Sriastradh } 728af69d88dSmrg else if ((x86_target_caps(p->func) & X86_SSE2) 729af69d88dSmrg && input_desc->channel[0].size == 8 730af69d88dSmrg && output_desc->channel[0].size == 16 731af69d88dSmrg && output_desc->channel[0].normalized == 732af69d88dSmrg input_desc->channel[0].normalized && 733af69d88dSmrg (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED 734af69d88dSmrg && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) 735af69d88dSmrg || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED 736af69d88dSmrg && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 737af69d88dSmrg || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED 738af69d88dSmrg && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) { 7393464ebd5Sriastradh struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 7403464ebd5Sriastradh struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 7413464ebd5Sriastradh struct x86_reg tmp = p->tmp_EAX; 742af69d88dSmrg unsigned imms[2] = { 0, 1 }; 7433464ebd5Sriastradh 744af69d88dSmrg for (i = 0; i < output_desc->nr_channels; ++i) { 74501e04c3fSmrg if (swizzle[i] == PIPE_SWIZZLE_0 746af69d88dSmrg && i >= input_desc->nr_channels) { 7473464ebd5Sriastradh swizzle[i] = i; 748af69d88dSmrg } 7493464ebd5Sriastradh } 7504a49301eSmrg 751af69d88dSmrg for (i = 0; i < output_desc->nr_channels; ++i) { 752af69d88dSmrg if (swizzle[i] < 4) 7533464ebd5Sriastradh needed_chans = MAX2(needed_chans, swizzle[i] + 1); 75401e04c3fSmrg if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i) 7553464ebd5Sriastradh id_swizzle = FALSE; 7563464ebd5Sriastradh } 7574a49301eSmrg 758af69d88dSmrg if (needed_chans > 0) { 759af69d88dSmrg emit_load_sse2(p, dataXMM, src, 760af69d88dSmrg input_desc->channel[0].size * 761af69d88dSmrg input_desc->nr_channels >> 3); 7623464ebd5Sriastradh 763af69d88dSmrg switch (input_desc->channel[0].type) { 7643464ebd5Sriastradh case UTIL_FORMAT_TYPE_UNSIGNED: 765af69d88dSmrg if (input_desc->channel[0].normalized) { 7663464ebd5Sriastradh sse2_punpcklbw(p->func, dataXMM, dataXMM); 767af69d88dSmrg if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 768af69d88dSmrg sse2_psrlw_imm(p->func, dataXMM, 1); 7693464ebd5Sriastradh } 7703464ebd5Sriastradh else 7713464ebd5Sriastradh sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 7723464ebd5Sriastradh break; 7733464ebd5Sriastradh case UTIL_FORMAT_TYPE_SIGNED: 774af69d88dSmrg if (input_desc->channel[0].normalized) { 7753464ebd5Sriastradh sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); 7763464ebd5Sriastradh sse2_punpcklbw(p->func, tmpXMM, dataXMM); 7773464ebd5Sriastradh sse2_psllw_imm(p->func, dataXMM, 9); 7783464ebd5Sriastradh sse2_psrlw_imm(p->func, dataXMM, 8); 7793464ebd5Sriastradh sse2_por(p->func, tmpXMM, dataXMM); 7803464ebd5Sriastradh sse2_psrlw_imm(p->func, dataXMM, 7); 7813464ebd5Sriastradh sse2_por(p->func, tmpXMM, dataXMM); 7823464ebd5Sriastradh { 7833464ebd5Sriastradh struct x86_reg t = dataXMM; 7843464ebd5Sriastradh dataXMM = tmpXMM; 7853464ebd5Sriastradh tmpXMM = t; 7863464ebd5Sriastradh } 7873464ebd5Sriastradh } 788af69d88dSmrg else { 7893464ebd5Sriastradh sse2_punpcklbw(p->func, dataXMM, dataXMM); 7903464ebd5Sriastradh sse2_psraw_imm(p->func, dataXMM, 8); 7913464ebd5Sriastradh } 7923464ebd5Sriastradh break; 7933464ebd5Sriastradh default: 7943464ebd5Sriastradh assert(0); 7953464ebd5Sriastradh } 7964a49301eSmrg 797af69d88dSmrg if (output_desc->channel[0].normalized) 798af69d88dSmrg imms[1] = 799af69d88dSmrg (output_desc->channel[0].type == 800af69d88dSmrg UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; 8014a49301eSmrg 802af69d88dSmrg if (!id_swizzle) 803af69d88dSmrg sse2_pshuflw(p->func, dataXMM, dataXMM, 804af69d88dSmrg (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | 805af69d88dSmrg ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); 8063464ebd5Sriastradh } 8074a49301eSmrg 808af69d88dSmrg if (output_desc->nr_channels >= 4 80901e04c3fSmrg && swizzle[0] < PIPE_SWIZZLE_0 81001e04c3fSmrg && swizzle[1] < PIPE_SWIZZLE_0 81101e04c3fSmrg && swizzle[2] < PIPE_SWIZZLE_0 81201e04c3fSmrg && swizzle[3] < PIPE_SWIZZLE_0) { 8133464ebd5Sriastradh sse2_movq(p->func, dst, dataXMM); 814af69d88dSmrg } 815af69d88dSmrg else { 81601e04c3fSmrg if (swizzle[0] < PIPE_SWIZZLE_0) { 817af69d88dSmrg if (output_desc->nr_channels >= 2 81801e04c3fSmrg && swizzle[1] < PIPE_SWIZZLE_0) { 8193464ebd5Sriastradh sse2_movd(p->func, dst, dataXMM); 820af69d88dSmrg } 821af69d88dSmrg else { 8223464ebd5Sriastradh sse2_movd(p->func, tmp, dataXMM); 8233464ebd5Sriastradh x86_mov16(p->func, dst, tmp); 824af69d88dSmrg if (output_desc->nr_channels >= 2) 825af69d88dSmrg x86_mov16_imm(p->func, x86_make_disp(dst, 2), 82601e04c3fSmrg imms[swizzle[1] - PIPE_SWIZZLE_0]); 8273464ebd5Sriastradh } 8283464ebd5Sriastradh } 829af69d88dSmrg else { 830af69d88dSmrg if (output_desc->nr_channels >= 2 83101e04c3fSmrg && swizzle[1] >= PIPE_SWIZZLE_0) { 832af69d88dSmrg x86_mov_imm(p->func, dst, 83301e04c3fSmrg (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) | 83401e04c3fSmrg imms[swizzle[0] - PIPE_SWIZZLE_0]); 835af69d88dSmrg } 836af69d88dSmrg else { 837af69d88dSmrg x86_mov16_imm(p->func, dst, 83801e04c3fSmrg imms[swizzle[0] - PIPE_SWIZZLE_0]); 839af69d88dSmrg if (output_desc->nr_channels >= 2) { 8403464ebd5Sriastradh sse2_movd(p->func, tmp, dataXMM); 8413464ebd5Sriastradh x86_shr_imm(p->func, tmp, 16); 8423464ebd5Sriastradh x86_mov16(p->func, x86_make_disp(dst, 2), tmp); 8433464ebd5Sriastradh } 8443464ebd5Sriastradh } 8453464ebd5Sriastradh } 8464a49301eSmrg 847af69d88dSmrg if (output_desc->nr_channels >= 3) { 84801e04c3fSmrg if (swizzle[2] < PIPE_SWIZZLE_0) { 849af69d88dSmrg if (output_desc->nr_channels >= 4 85001e04c3fSmrg && swizzle[3] < PIPE_SWIZZLE_0) { 8513464ebd5Sriastradh sse2_psrlq_imm(p->func, dataXMM, 32); 8523464ebd5Sriastradh sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); 8533464ebd5Sriastradh } 854af69d88dSmrg else { 8553464ebd5Sriastradh sse2_psrlq_imm(p->func, dataXMM, 32); 8563464ebd5Sriastradh sse2_movd(p->func, tmp, dataXMM); 8573464ebd5Sriastradh x86_mov16(p->func, x86_make_disp(dst, 4), tmp); 858af69d88dSmrg if (output_desc->nr_channels >= 4) { 859af69d88dSmrg x86_mov16_imm(p->func, x86_make_disp(dst, 6), 86001e04c3fSmrg imms[swizzle[3] - PIPE_SWIZZLE_0]); 8613464ebd5Sriastradh } 8623464ebd5Sriastradh } 8633464ebd5Sriastradh } 864af69d88dSmrg else { 865af69d88dSmrg if (output_desc->nr_channels >= 4 86601e04c3fSmrg && swizzle[3] >= PIPE_SWIZZLE_0) { 867af69d88dSmrg x86_mov_imm(p->func, x86_make_disp(dst, 4), 86801e04c3fSmrg (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16) 86901e04c3fSmrg | imms[swizzle[2] - PIPE_SWIZZLE_0]); 870af69d88dSmrg } 871af69d88dSmrg else { 872af69d88dSmrg x86_mov16_imm(p->func, x86_make_disp(dst, 4), 87301e04c3fSmrg imms[swizzle[2] - PIPE_SWIZZLE_0]); 8743464ebd5Sriastradh 875af69d88dSmrg if (output_desc->nr_channels >= 4) { 8763464ebd5Sriastradh sse2_psrlq_imm(p->func, dataXMM, 48); 8773464ebd5Sriastradh sse2_movd(p->func, tmp, dataXMM); 8783464ebd5Sriastradh x86_mov16(p->func, x86_make_disp(dst, 6), tmp); 8793464ebd5Sriastradh } 8803464ebd5Sriastradh } 8813464ebd5Sriastradh } 8823464ebd5Sriastradh } 8833464ebd5Sriastradh } 8843464ebd5Sriastradh return TRUE; 8853464ebd5Sriastradh } 886af69d88dSmrg else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0], 887af69d88dSmrg sizeof(output_desc->channel[0]))) { 8883464ebd5Sriastradh struct x86_reg tmp = p->tmp_EAX; 8893464ebd5Sriastradh unsigned i; 890af69d88dSmrg 891af69d88dSmrg if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 892af69d88dSmrg && output_desc->nr_channels == 4 89301e04c3fSmrg && swizzle[0] == PIPE_SWIZZLE_W 89401e04c3fSmrg && swizzle[1] == PIPE_SWIZZLE_Z 89501e04c3fSmrg && swizzle[2] == PIPE_SWIZZLE_Y 89601e04c3fSmrg && swizzle[3] == PIPE_SWIZZLE_X) { 8973464ebd5Sriastradh /* TODO: support movbe */ 8983464ebd5Sriastradh x86_mov(p->func, tmp, src); 8993464ebd5Sriastradh x86_bswap(p->func, tmp); 9003464ebd5Sriastradh x86_mov(p->func, dst, tmp); 9013464ebd5Sriastradh return TRUE; 9023464ebd5Sriastradh } 9034a49301eSmrg 904af69d88dSmrg for (i = 0; i < output_desc->nr_channels; ++i) { 905af69d88dSmrg switch (output_desc->channel[0].size) { 9063464ebd5Sriastradh case 8: 90701e04c3fSmrg if (swizzle[i] >= PIPE_SWIZZLE_0) { 9083464ebd5Sriastradh unsigned v = 0; 90901e04c3fSmrg if (swizzle[i] == PIPE_SWIZZLE_1) { 910af69d88dSmrg switch (output_desc->channel[0].type) { 9113464ebd5Sriastradh case UTIL_FORMAT_TYPE_UNSIGNED: 9123464ebd5Sriastradh v = output_desc->channel[0].normalized ? 0xff : 1; 9133464ebd5Sriastradh break; 9143464ebd5Sriastradh case UTIL_FORMAT_TYPE_SIGNED: 9153464ebd5Sriastradh v = output_desc->channel[0].normalized ? 0x7f : 1; 9163464ebd5Sriastradh break; 9173464ebd5Sriastradh default: 9183464ebd5Sriastradh return FALSE; 9193464ebd5Sriastradh } 9203464ebd5Sriastradh } 9213464ebd5Sriastradh x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); 9223464ebd5Sriastradh } 923af69d88dSmrg else { 9243464ebd5Sriastradh x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); 9253464ebd5Sriastradh x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); 9263464ebd5Sriastradh } 9273464ebd5Sriastradh break; 9283464ebd5Sriastradh case 16: 92901e04c3fSmrg if (swizzle[i] >= PIPE_SWIZZLE_0) { 9303464ebd5Sriastradh unsigned v = 0; 93101e04c3fSmrg if (swizzle[i] == PIPE_SWIZZLE_1) { 932af69d88dSmrg switch (output_desc->channel[1].type) { 9333464ebd5Sriastradh case UTIL_FORMAT_TYPE_UNSIGNED: 9343464ebd5Sriastradh v = output_desc->channel[1].normalized ? 0xffff : 1; 9353464ebd5Sriastradh break; 9363464ebd5Sriastradh case UTIL_FORMAT_TYPE_SIGNED: 9373464ebd5Sriastradh v = output_desc->channel[1].normalized ? 0x7fff : 1; 9383464ebd5Sriastradh break; 9393464ebd5Sriastradh case UTIL_FORMAT_TYPE_FLOAT: 9403464ebd5Sriastradh v = 0x3c00; 9413464ebd5Sriastradh break; 9423464ebd5Sriastradh default: 9433464ebd5Sriastradh return FALSE; 9443464ebd5Sriastradh } 9453464ebd5Sriastradh } 9463464ebd5Sriastradh x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); 9473464ebd5Sriastradh } 94801e04c3fSmrg else if (swizzle[i] == PIPE_SWIZZLE_0) { 9493464ebd5Sriastradh x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); 950af69d88dSmrg } 951af69d88dSmrg else { 9523464ebd5Sriastradh x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); 9533464ebd5Sriastradh x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); 9543464ebd5Sriastradh } 9553464ebd5Sriastradh break; 9563464ebd5Sriastradh case 32: 95701e04c3fSmrg if (swizzle[i] >= PIPE_SWIZZLE_0) { 9583464ebd5Sriastradh unsigned v = 0; 95901e04c3fSmrg if (swizzle[i] == PIPE_SWIZZLE_1) { 960af69d88dSmrg switch (output_desc->channel[1].type) { 9613464ebd5Sriastradh case UTIL_FORMAT_TYPE_UNSIGNED: 9623464ebd5Sriastradh v = output_desc->channel[1].normalized ? 0xffffffff : 1; 9633464ebd5Sriastradh break; 9643464ebd5Sriastradh case UTIL_FORMAT_TYPE_SIGNED: 9653464ebd5Sriastradh v = output_desc->channel[1].normalized ? 0x7fffffff : 1; 9663464ebd5Sriastradh break; 9673464ebd5Sriastradh case UTIL_FORMAT_TYPE_FLOAT: 9683464ebd5Sriastradh v = 0x3f800000; 9693464ebd5Sriastradh break; 9703464ebd5Sriastradh default: 9713464ebd5Sriastradh return FALSE; 9723464ebd5Sriastradh } 9733464ebd5Sriastradh } 9743464ebd5Sriastradh x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); 9753464ebd5Sriastradh } 976af69d88dSmrg else { 9773464ebd5Sriastradh x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); 9783464ebd5Sriastradh x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); 9793464ebd5Sriastradh } 9803464ebd5Sriastradh break; 9813464ebd5Sriastradh case 64: 98201e04c3fSmrg if (swizzle[i] >= PIPE_SWIZZLE_0) { 9833464ebd5Sriastradh unsigned l = 0; 9843464ebd5Sriastradh unsigned h = 0; 98501e04c3fSmrg if (swizzle[i] == PIPE_SWIZZLE_1) { 986af69d88dSmrg switch (output_desc->channel[1].type) { 9873464ebd5Sriastradh case UTIL_FORMAT_TYPE_UNSIGNED: 9883464ebd5Sriastradh h = output_desc->channel[1].normalized ? 0xffffffff : 0; 9893464ebd5Sriastradh l = output_desc->channel[1].normalized ? 0xffffffff : 1; 9903464ebd5Sriastradh break; 9913464ebd5Sriastradh case UTIL_FORMAT_TYPE_SIGNED: 9923464ebd5Sriastradh h = output_desc->channel[1].normalized ? 0x7fffffff : 0; 9933464ebd5Sriastradh l = output_desc->channel[1].normalized ? 0xffffffff : 1; 9943464ebd5Sriastradh break; 9953464ebd5Sriastradh case UTIL_FORMAT_TYPE_FLOAT: 9963464ebd5Sriastradh h = 0x3ff00000; 9973464ebd5Sriastradh l = 0; 9983464ebd5Sriastradh break; 9993464ebd5Sriastradh default: 10003464ebd5Sriastradh return FALSE; 10013464ebd5Sriastradh } 10023464ebd5Sriastradh } 10033464ebd5Sriastradh x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); 10043464ebd5Sriastradh x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); 10053464ebd5Sriastradh } 1006af69d88dSmrg else { 1007af69d88dSmrg if (x86_target_caps(p->func) & X86_SSE) { 10083464ebd5Sriastradh struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); 1009af69d88dSmrg emit_load64(p, tmp, tmpXMM, 1010af69d88dSmrg x86_make_disp(src, swizzle[i] * 8)); 10113464ebd5Sriastradh emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); 10123464ebd5Sriastradh } 1013af69d88dSmrg else { 10143464ebd5Sriastradh x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); 10153464ebd5Sriastradh x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); 1016af69d88dSmrg x86_mov(p->func, tmp, 1017af69d88dSmrg x86_make_disp(src, swizzle[i] * 8 + 4)); 10183464ebd5Sriastradh x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); 10193464ebd5Sriastradh } 10203464ebd5Sriastradh } 10213464ebd5Sriastradh break; 10223464ebd5Sriastradh default: 10233464ebd5Sriastradh return FALSE; 10243464ebd5Sriastradh } 10253464ebd5Sriastradh } 10263464ebd5Sriastradh return TRUE; 10273464ebd5Sriastradh } 10283464ebd5Sriastradh /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ 1029af69d88dSmrg else if ((x86_target_caps(p->func) & X86_SSE2) && 1030af69d88dSmrg a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && 1031af69d88dSmrg (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM 1032af69d88dSmrg || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) { 10333464ebd5Sriastradh struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 10344a49301eSmrg 10353464ebd5Sriastradh /* load */ 10363464ebd5Sriastradh sse_movups(p->func, dataXMM, src); 10374a49301eSmrg 1038af69d88dSmrg if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) { 1039af69d88dSmrg sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3)); 1040af69d88dSmrg } 10414a49301eSmrg 10423464ebd5Sriastradh /* scale by 255.0 */ 10433464ebd5Sriastradh sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); 10444a49301eSmrg 10453464ebd5Sriastradh /* pack and emit */ 10463464ebd5Sriastradh sse2_cvtps2dq(p->func, dataXMM, dataXMM); 10473464ebd5Sriastradh sse2_packssdw(p->func, dataXMM, dataXMM); 10483464ebd5Sriastradh sse2_packuswb(p->func, dataXMM, dataXMM); 10493464ebd5Sriastradh sse2_movd(p->func, dst, dataXMM); 10504a49301eSmrg 10513464ebd5Sriastradh return TRUE; 10524a49301eSmrg } 10534a49301eSmrg 10543464ebd5Sriastradh return FALSE; 10554a49301eSmrg} 10564a49301eSmrg 1057af69d88dSmrg 1058af69d88dSmrgstatic boolean 1059af69d88dSmrgtranslate_attr(struct translate_sse *p, 1060af69d88dSmrg const struct translate_element *a, 1061af69d88dSmrg struct x86_reg src, struct x86_reg dst) 10623464ebd5Sriastradh{ 1063af69d88dSmrg if (a->input_format == a->output_format) { 10643464ebd5Sriastradh emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); 10653464ebd5Sriastradh return TRUE; 10663464ebd5Sriastradh } 10673464ebd5Sriastradh 10683464ebd5Sriastradh return translate_attr_convert(p, a, src, dst); 10693464ebd5Sriastradh} 10704a49301eSmrg 1071af69d88dSmrg 1072af69d88dSmrgstatic boolean 1073af69d88dSmrginit_inputs(struct translate_sse *p, unsigned index_size) 10744a49301eSmrg{ 10754a49301eSmrg unsigned i; 1076af69d88dSmrg struct x86_reg instance_id = 1077af69d88dSmrg x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); 1078af69d88dSmrg struct x86_reg start_instance = 1079af69d88dSmrg x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)); 1080cdc920a0Smrg 10813464ebd5Sriastradh for (i = 0; i < p->nr_buffer_variants; i++) { 10823464ebd5Sriastradh struct translate_buffer_variant *variant = &p->buffer_variant[i]; 10833464ebd5Sriastradh struct translate_buffer *buffer = &p->buffer[variant->buffer_index]; 1084cdc920a0Smrg 10853464ebd5Sriastradh if (!index_size || variant->instance_divisor) { 1086af69d88dSmrg struct x86_reg buf_max_index = 1087af69d88dSmrg x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index)); 1088af69d88dSmrg struct x86_reg buf_stride = 1089af69d88dSmrg x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride)); 1090af69d88dSmrg struct x86_reg buf_ptr = 1091af69d88dSmrg x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr)); 1092af69d88dSmrg struct x86_reg buf_base_ptr = 1093af69d88dSmrg x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr)); 10943464ebd5Sriastradh struct x86_reg elt = p->idx_ESI; 1095cdc920a0Smrg struct x86_reg tmp_EAX = p->tmp_EAX; 10964a49301eSmrg 10974a49301eSmrg /* Calculate pointer to first attrib: 1098cdc920a0Smrg * base_ptr + stride * index, where index depends on instance divisor 10994a49301eSmrg */ 11003464ebd5Sriastradh if (variant->instance_divisor) { 110101e04c3fSmrg struct x86_reg tmp_EDX = p->tmp2_EDX; 110201e04c3fSmrg 1103af69d88dSmrg /* Start with instance = instance_id 1104af69d88dSmrg * which is true if divisor is 1. 1105cdc920a0Smrg */ 1106cdc920a0Smrg x86_mov(p->func, tmp_EAX, instance_id); 1107cdc920a0Smrg 11083464ebd5Sriastradh if (variant->instance_divisor != 1) { 11093464ebd5Sriastradh struct x86_reg tmp_ECX = p->src_ECX; 1110cdc920a0Smrg 1111cdc920a0Smrg /* TODO: Add x86_shr() to rtasm and use it whenever 1112cdc920a0Smrg * instance divisor is power of two. 1113cdc920a0Smrg */ 1114cdc920a0Smrg x86_xor(p->func, tmp_EDX, tmp_EDX); 11153464ebd5Sriastradh x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor); 1116af69d88dSmrg x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ 1117cdc920a0Smrg } 11183464ebd5Sriastradh 111901e04c3fSmrg /* instance = (instance_id / divisor) + start_instance 112001e04c3fSmrg */ 112101e04c3fSmrg x86_mov(p->func, tmp_EDX, start_instance); 112201e04c3fSmrg x86_add(p->func, tmp_EAX, tmp_EDX); 112301e04c3fSmrg 11243464ebd5Sriastradh /* XXX we need to clamp the index here too, but to a 11253464ebd5Sriastradh * per-array max value, not the draw->pt.max_index value 11263464ebd5Sriastradh * that's being given to us via translate->set_buffer(). 11273464ebd5Sriastradh */ 1128af69d88dSmrg } 1129af69d88dSmrg else { 1130cdc920a0Smrg x86_mov(p->func, tmp_EAX, elt); 11313464ebd5Sriastradh 11323464ebd5Sriastradh /* Clamp to max_index 11333464ebd5Sriastradh */ 11343464ebd5Sriastradh x86_cmp(p->func, tmp_EAX, buf_max_index); 11353464ebd5Sriastradh x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE); 1136cdc920a0Smrg } 11373464ebd5Sriastradh 1138af69d88dSmrg x86_mov(p->func, p->tmp2_EDX, buf_stride); 1139af69d88dSmrg x64_rexw(p->func); 1140af69d88dSmrg x86_imul(p->func, tmp_EAX, p->tmp2_EDX); 11413464ebd5Sriastradh x64_rexw(p->func); 1142cdc920a0Smrg x86_add(p->func, tmp_EAX, buf_base_ptr); 11434a49301eSmrg 11443464ebd5Sriastradh x86_cmp(p->func, p->count_EBP, p->tmp_EAX); 11454a49301eSmrg 11464a49301eSmrg /* In the linear case, keep the buffer pointer instead of the 11474a49301eSmrg * index number. 11484a49301eSmrg */ 1149af69d88dSmrg if (!index_size && p->nr_buffer_variants == 1) { 11503464ebd5Sriastradh x64_rexw(p->func); 1151cdc920a0Smrg x86_mov(p->func, elt, tmp_EAX); 11523464ebd5Sriastradh } 1153af69d88dSmrg else { 11543464ebd5Sriastradh x64_rexw(p->func); 1155cdc920a0Smrg x86_mov(p->func, buf_ptr, tmp_EAX); 11563464ebd5Sriastradh } 11574a49301eSmrg } 11584a49301eSmrg } 11594a49301eSmrg 11604a49301eSmrg return TRUE; 11614a49301eSmrg} 11624a49301eSmrg 11634a49301eSmrg 1164af69d88dSmrgstatic struct x86_reg 1165af69d88dSmrgget_buffer_ptr(struct translate_sse *p, 1166af69d88dSmrg unsigned index_size, unsigned var_idx, struct x86_reg elt) 11674a49301eSmrg{ 1168cdc920a0Smrg if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { 1169af69d88dSmrg return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); 1170cdc920a0Smrg } 11713464ebd5Sriastradh if (!index_size && p->nr_buffer_variants == 1) { 11723464ebd5Sriastradh return p->idx_ESI; 11734a49301eSmrg } 11743464ebd5Sriastradh else if (!index_size || p->buffer_variant[var_idx].instance_divisor) { 11753464ebd5Sriastradh struct x86_reg ptr = p->src_ECX; 1176af69d88dSmrg struct x86_reg buf_ptr = 11773464ebd5Sriastradh x86_make_disp(p->machine_EDI, 11783464ebd5Sriastradh get_offset(p, &p->buffer_variant[var_idx].ptr)); 1179af69d88dSmrg 11803464ebd5Sriastradh x64_rexw(p->func); 11814a49301eSmrg x86_mov(p->func, ptr, buf_ptr); 11824a49301eSmrg return ptr; 11834a49301eSmrg } 11844a49301eSmrg else { 11853464ebd5Sriastradh struct x86_reg ptr = p->src_ECX; 1186af69d88dSmrg const struct translate_buffer_variant *variant = 1187af69d88dSmrg &p->buffer_variant[var_idx]; 1188af69d88dSmrg struct x86_reg buf_stride = 11893464ebd5Sriastradh x86_make_disp(p->machine_EDI, 11903464ebd5Sriastradh get_offset(p, &p->buffer[variant->buffer_index].stride)); 1191af69d88dSmrg struct x86_reg buf_base_ptr = 11923464ebd5Sriastradh x86_make_disp(p->machine_EDI, 1193af69d88dSmrg get_offset(p, &p->buffer[variant->buffer_index].base_ptr)); 11943464ebd5Sriastradh struct x86_reg buf_max_index = 11953464ebd5Sriastradh x86_make_disp(p->machine_EDI, 1196af69d88dSmrg get_offset(p, &p->buffer[variant->buffer_index].max_index)); 11974a49301eSmrg 11984a49301eSmrg /* Calculate pointer to current attrib: 11994a49301eSmrg */ 1200af69d88dSmrg switch (index_size) { 12013464ebd5Sriastradh case 1: 12023464ebd5Sriastradh x86_movzx8(p->func, ptr, elt); 12033464ebd5Sriastradh break; 12043464ebd5Sriastradh case 2: 12053464ebd5Sriastradh x86_movzx16(p->func, ptr, elt); 12063464ebd5Sriastradh break; 12073464ebd5Sriastradh case 4: 12083464ebd5Sriastradh x86_mov(p->func, ptr, elt); 12093464ebd5Sriastradh break; 12103464ebd5Sriastradh } 12113464ebd5Sriastradh 12123464ebd5Sriastradh /* Clamp to max_index 12133464ebd5Sriastradh */ 12143464ebd5Sriastradh x86_cmp(p->func, ptr, buf_max_index); 12153464ebd5Sriastradh x86_cmovcc(p->func, ptr, buf_max_index, cc_AE); 12163464ebd5Sriastradh 1217af69d88dSmrg x86_mov(p->func, p->tmp2_EDX, buf_stride); 1218af69d88dSmrg x64_rexw(p->func); 1219af69d88dSmrg x86_imul(p->func, ptr, p->tmp2_EDX); 12203464ebd5Sriastradh x64_rexw(p->func); 12214a49301eSmrg x86_add(p->func, ptr, buf_base_ptr); 12224a49301eSmrg return ptr; 12234a49301eSmrg } 12244a49301eSmrg} 12254a49301eSmrg 12264a49301eSmrg 1227af69d88dSmrgstatic boolean 1228af69d88dSmrgincr_inputs(struct translate_sse *p, unsigned index_size) 12294a49301eSmrg{ 12303464ebd5Sriastradh if (!index_size && p->nr_buffer_variants == 1) { 1231af69d88dSmrg const unsigned buffer_index = p->buffer_variant[0].buffer_index; 1232af69d88dSmrg struct x86_reg stride = 1233af69d88dSmrg x86_make_disp(p->machine_EDI, 1234af69d88dSmrg get_offset(p, &p->buffer[buffer_index].stride)); 12354a49301eSmrg 12363464ebd5Sriastradh if (p->buffer_variant[0].instance_divisor == 0) { 12373464ebd5Sriastradh x64_rexw(p->func); 12383464ebd5Sriastradh x86_add(p->func, p->idx_ESI, stride); 12393464ebd5Sriastradh sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192)); 1240cdc920a0Smrg } 12414a49301eSmrg } 12423464ebd5Sriastradh else if (!index_size) { 12434a49301eSmrg unsigned i; 12444a49301eSmrg 12454a49301eSmrg /* Is this worthwhile?? 12464a49301eSmrg */ 12473464ebd5Sriastradh for (i = 0; i < p->nr_buffer_variants; i++) { 12483464ebd5Sriastradh struct translate_buffer_variant *variant = &p->buffer_variant[i]; 12493464ebd5Sriastradh struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, 12503464ebd5Sriastradh get_offset(p, &variant->ptr)); 1251af69d88dSmrg struct x86_reg buf_stride = 1252af69d88dSmrg x86_make_disp(p->machine_EDI, 1253af69d88dSmrg get_offset(p, &p->buffer[variant->buffer_index].stride)); 12543464ebd5Sriastradh 12553464ebd5Sriastradh if (variant->instance_divisor == 0) { 12563464ebd5Sriastradh x86_mov(p->func, p->tmp_EAX, buf_stride); 12573464ebd5Sriastradh x64_rexw(p->func); 12583464ebd5Sriastradh x86_add(p->func, p->tmp_EAX, buf_ptr); 1259af69d88dSmrg if (i == 0) 1260af69d88dSmrg sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); 12613464ebd5Sriastradh x64_rexw(p->func); 1262cdc920a0Smrg x86_mov(p->func, buf_ptr, p->tmp_EAX); 1263cdc920a0Smrg } 12644a49301eSmrg } 1265af69d88dSmrg } 12664a49301eSmrg else { 12673464ebd5Sriastradh x64_rexw(p->func); 12683464ebd5Sriastradh x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); 12694a49301eSmrg } 1270af69d88dSmrg 12714a49301eSmrg return TRUE; 12724a49301eSmrg} 12734a49301eSmrg 12744a49301eSmrg 12754a49301eSmrg/* Build run( struct translate *machine, 12764a49301eSmrg * unsigned start, 12774a49301eSmrg * unsigned count, 12784a49301eSmrg * void *output_buffer ) 12794a49301eSmrg * or 12804a49301eSmrg * run_elts( struct translate *machine, 12814a49301eSmrg * unsigned *elts, 12824a49301eSmrg * unsigned count, 12834a49301eSmrg * void *output_buffer ) 12844a49301eSmrg * 12854a49301eSmrg * Lots of hardcoding 12864a49301eSmrg * 12874a49301eSmrg * EAX -- pointer to current output vertex 12884a49301eSmrg * ECX -- pointer to current attribute 12894a49301eSmrg * 12904a49301eSmrg */ 1291af69d88dSmrgstatic boolean 1292af69d88dSmrgbuild_vertex_emit(struct translate_sse *p, 1293af69d88dSmrg struct x86_function *func, unsigned index_size) 12944a49301eSmrg{ 12954a49301eSmrg int fixup, label; 12964a49301eSmrg unsigned j; 12974a49301eSmrg 12983464ebd5Sriastradh memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); 12993464ebd5Sriastradh memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); 13003464ebd5Sriastradh 1301af69d88dSmrg p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); 1302af69d88dSmrg p->idx_ESI = x86_make_reg(file_REG32, reg_SI); 1303af69d88dSmrg p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); 1304af69d88dSmrg p->machine_EDI = x86_make_reg(file_REG32, reg_DI); 1305af69d88dSmrg p->count_EBP = x86_make_reg(file_REG32, reg_BP); 1306af69d88dSmrg p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); 1307af69d88dSmrg p->src_ECX = x86_make_reg(file_REG32, reg_CX); 13084a49301eSmrg 13094a49301eSmrg p->func = func; 13104a49301eSmrg 13114a49301eSmrg x86_init_func(p->func); 13124a49301eSmrg 1313af69d88dSmrg if (x86_target(p->func) == X86_64_WIN64_ABI) { 1314af69d88dSmrg /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" 1315af69d88dSmrg * above the return address 1316af69d88dSmrg */ 1317af69d88dSmrg sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), 1318af69d88dSmrg x86_make_reg(file_XMM, 6)); 1319af69d88dSmrg sse2_movdqa(p->func, 1320af69d88dSmrg x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), 1321af69d88dSmrg x86_make_reg(file_XMM, 7)); 13223464ebd5Sriastradh } 13234a49301eSmrg 13243464ebd5Sriastradh x86_push(p->func, p->outbuf_EBX); 13253464ebd5Sriastradh x86_push(p->func, p->count_EBP); 13263464ebd5Sriastradh 1327af69d88dSmrg /* on non-Win64 x86-64, these are already in the right registers */ 1328af69d88dSmrg if (x86_target(p->func) != X86_64_STD_ABI) { 13293464ebd5Sriastradh x86_push(p->func, p->machine_EDI); 13303464ebd5Sriastradh x86_push(p->func, p->idx_ESI); 13313464ebd5Sriastradh 1332af69d88dSmrg if (x86_target(p->func) != X86_32) { 1333af69d88dSmrg x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); 1334af69d88dSmrg x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); 1335af69d88dSmrg } 1336af69d88dSmrg else { 1337af69d88dSmrg x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); 1338af69d88dSmrg x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); 1339af69d88dSmrg } 13403464ebd5Sriastradh } 13413464ebd5Sriastradh 13423464ebd5Sriastradh x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); 13433464ebd5Sriastradh 1344af69d88dSmrg if (x86_target(p->func) != X86_32) 1345af69d88dSmrg x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); 13463464ebd5Sriastradh else 1347af69d88dSmrg x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); 1348cdc920a0Smrg 1349cdc920a0Smrg /* Load instance ID. 1350cdc920a0Smrg */ 1351cdc920a0Smrg if (p->use_instancing) { 1352af69d88dSmrg x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4)); 1353cdc920a0Smrg x86_mov(p->func, 1354af69d88dSmrg x86_make_disp(p->machine_EDI, 1355af69d88dSmrg get_offset(p, &p->start_instance)), p->tmp2_EDX); 1356af69d88dSmrg 1357af69d88dSmrg x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5)); 1358cdc920a0Smrg x86_mov(p->func, 13593464ebd5Sriastradh x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), 1360cdc920a0Smrg p->tmp_EAX); 1361cdc920a0Smrg } 13624a49301eSmrg 13634a49301eSmrg /* Get vertex count, compare to zero 13644a49301eSmrg */ 13654a49301eSmrg x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); 13663464ebd5Sriastradh x86_cmp(p->func, p->count_EBP, p->tmp_EAX); 13674a49301eSmrg fixup = x86_jcc_forward(p->func, cc_E); 13684a49301eSmrg 13694a49301eSmrg /* always load, needed or not: 13704a49301eSmrg */ 13713464ebd5Sriastradh init_inputs(p, index_size); 13724a49301eSmrg 13734a49301eSmrg /* Note address for loop jump 13744a49301eSmrg */ 13754a49301eSmrg label = x86_get_label(p->func); 13764a49301eSmrg { 13773464ebd5Sriastradh struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI); 13783464ebd5Sriastradh int last_variant = -1; 13794a49301eSmrg struct x86_reg vb; 13804a49301eSmrg 13814a49301eSmrg for (j = 0; j < p->translate.key.nr_elements; j++) { 13824a49301eSmrg const struct translate_element *a = &p->translate.key.element[j]; 13833464ebd5Sriastradh unsigned variant = p->element_to_buffer_variant[j]; 13844a49301eSmrg 13854a49301eSmrg /* Figure out source pointer address: 13864a49301eSmrg */ 13873464ebd5Sriastradh if (variant != last_variant) { 13883464ebd5Sriastradh last_variant = variant; 13893464ebd5Sriastradh vb = get_buffer_ptr(p, index_size, variant, elt); 13904a49301eSmrg } 1391af69d88dSmrg 1392af69d88dSmrg if (!translate_attr(p, a, 1393af69d88dSmrg x86_make_disp(vb, a->input_offset), 1394af69d88dSmrg x86_make_disp(p->outbuf_EBX, a->output_offset))) 13954a49301eSmrg return FALSE; 13964a49301eSmrg } 13974a49301eSmrg 13984a49301eSmrg /* Next output vertex: 13994a49301eSmrg */ 14003464ebd5Sriastradh x64_rexw(p->func); 1401af69d88dSmrg x86_lea(p->func, p->outbuf_EBX, 1402af69d88dSmrg x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride)); 14034a49301eSmrg 14044a49301eSmrg /* Incr index 1405af69d88dSmrg */ 1406af69d88dSmrg incr_inputs(p, index_size); 14074a49301eSmrg } 14084a49301eSmrg 14094a49301eSmrg /* decr count, loop if not zero 14104a49301eSmrg */ 14113464ebd5Sriastradh x86_dec(p->func, p->count_EBP); 14124a49301eSmrg x86_jcc(p->func, cc_NZ, label); 14134a49301eSmrg 14144a49301eSmrg /* Exit mmx state? 14154a49301eSmrg */ 14164a49301eSmrg if (p->func->need_emms) 14174a49301eSmrg mmx_emms(p->func); 14184a49301eSmrg 14194a49301eSmrg /* Land forward jump here: 14204a49301eSmrg */ 14214a49301eSmrg x86_fixup_fwd_jump(p->func, fixup); 14224a49301eSmrg 14234a49301eSmrg /* Pop regs and return 14244a49301eSmrg */ 1425af69d88dSmrg if (x86_target(p->func) != X86_64_STD_ABI) { 14263464ebd5Sriastradh x86_pop(p->func, p->idx_ESI); 14273464ebd5Sriastradh x86_pop(p->func, p->machine_EDI); 14283464ebd5Sriastradh } 14293464ebd5Sriastradh 14303464ebd5Sriastradh x86_pop(p->func, p->count_EBP); 14313464ebd5Sriastradh x86_pop(p->func, p->outbuf_EBX); 14323464ebd5Sriastradh 1433af69d88dSmrg if (x86_target(p->func) == X86_64_WIN64_ABI) { 1434af69d88dSmrg sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), 1435af69d88dSmrg x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); 1436af69d88dSmrg sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), 1437af69d88dSmrg x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); 14383464ebd5Sriastradh } 14394a49301eSmrg x86_ret(p->func); 14404a49301eSmrg 14414a49301eSmrg return TRUE; 14424a49301eSmrg} 14434a49301eSmrg 14444a49301eSmrg 1445af69d88dSmrgstatic void 1446af69d88dSmrgtranslate_sse_set_buffer(struct translate *translate, 1447af69d88dSmrg unsigned buf, 1448af69d88dSmrg const void *ptr, unsigned stride, unsigned max_index) 14494a49301eSmrg{ 1450af69d88dSmrg struct translate_sse *p = (struct translate_sse *) translate; 14514a49301eSmrg 14524a49301eSmrg if (buf < p->nr_buffers) { 1453af69d88dSmrg p->buffer[buf].base_ptr = (char *) ptr; 14544a49301eSmrg p->buffer[buf].stride = stride; 14553464ebd5Sriastradh p->buffer[buf].max_index = max_index; 14564a49301eSmrg } 14574a49301eSmrg 1458af69d88dSmrg if (0) 1459af69d88dSmrg debug_printf("%s %d/%d: %p %d\n", 1460af69d88dSmrg __FUNCTION__, buf, p->nr_buffers, ptr, stride); 14614a49301eSmrg} 14624a49301eSmrg 14634a49301eSmrg 1464af69d88dSmrgstatic void 1465af69d88dSmrgtranslate_sse_release(struct translate *translate) 14664a49301eSmrg{ 1467af69d88dSmrg struct translate_sse *p = (struct translate_sse *) translate; 14684a49301eSmrg 1469af69d88dSmrg x86_release_func(&p->elt8_func); 1470af69d88dSmrg x86_release_func(&p->elt16_func); 1471af69d88dSmrg x86_release_func(&p->elt_func); 1472af69d88dSmrg x86_release_func(&p->linear_func); 14734a49301eSmrg 14743464ebd5Sriastradh os_free_aligned(p); 14754a49301eSmrg} 14764a49301eSmrg 14774a49301eSmrg 1478af69d88dSmrgstruct translate * 1479af69d88dSmrgtranslate_sse2_create(const struct translate_key *key) 14804a49301eSmrg{ 14814a49301eSmrg struct translate_sse *p = NULL; 14824a49301eSmrg unsigned i; 14834a49301eSmrg 14843464ebd5Sriastradh /* this is misnamed, it actually refers to whether rtasm is enabled or not */ 14853464ebd5Sriastradh if (!rtasm_cpu_has_sse()) 14864a49301eSmrg goto fail; 14874a49301eSmrg 14883464ebd5Sriastradh p = os_malloc_aligned(sizeof(struct translate_sse), 16); 148901e04c3fSmrg if (!p) 14904a49301eSmrg goto fail; 1491af69d88dSmrg 14923464ebd5Sriastradh memset(p, 0, sizeof(*p)); 14933464ebd5Sriastradh memcpy(p->consts, consts, sizeof(consts)); 14944a49301eSmrg 14954a49301eSmrg p->translate.key = *key; 14964a49301eSmrg p->translate.release = translate_sse_release; 14974a49301eSmrg p->translate.set_buffer = translate_sse_set_buffer; 14984a49301eSmrg 1499af69d88dSmrg assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS); 1500af69d88dSmrg 1501cdc920a0Smrg for (i = 0; i < key->nr_elements; i++) { 1502cdc920a0Smrg if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { 1503cdc920a0Smrg unsigned j; 1504cdc920a0Smrg 1505af69d88dSmrg p->nr_buffers = 1506af69d88dSmrg MAX2(p->nr_buffers, key->element[i].input_buffer + 1); 1507cdc920a0Smrg 1508cdc920a0Smrg if (key->element[i].instance_divisor) { 1509cdc920a0Smrg p->use_instancing = TRUE; 1510cdc920a0Smrg } 1511cdc920a0Smrg 1512cdc920a0Smrg /* 15133464ebd5Sriastradh * Map vertex element to vertex buffer variant. 1514cdc920a0Smrg */ 15153464ebd5Sriastradh for (j = 0; j < p->nr_buffer_variants; j++) { 1516af69d88dSmrg if (p->buffer_variant[j].buffer_index == 1517af69d88dSmrg key->element[i].input_buffer 1518af69d88dSmrg && p->buffer_variant[j].instance_divisor == 1519af69d88dSmrg key->element[i].instance_divisor) { 1520cdc920a0Smrg break; 1521cdc920a0Smrg } 1522cdc920a0Smrg } 15233464ebd5Sriastradh if (j == p->nr_buffer_variants) { 15243464ebd5Sriastradh p->buffer_variant[j].buffer_index = key->element[i].input_buffer; 1525af69d88dSmrg p->buffer_variant[j].instance_divisor = 1526af69d88dSmrg key->element[i].instance_divisor; 15273464ebd5Sriastradh p->nr_buffer_variants++; 1528cdc920a0Smrg } 15293464ebd5Sriastradh p->element_to_buffer_variant[i] = j; 1530af69d88dSmrg } 1531af69d88dSmrg else { 1532cdc920a0Smrg assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID); 1533cdc920a0Smrg 15343464ebd5Sriastradh p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID; 1535cdc920a0Smrg } 1536cdc920a0Smrg } 15374a49301eSmrg 1538af69d88dSmrg if (0) 1539af69d88dSmrg debug_printf("nr_buffers: %d\n", p->nr_buffers); 15404a49301eSmrg 15413464ebd5Sriastradh if (!build_vertex_emit(p, &p->linear_func, 0)) 15423464ebd5Sriastradh goto fail; 15433464ebd5Sriastradh 15443464ebd5Sriastradh if (!build_vertex_emit(p, &p->elt_func, 4)) 15453464ebd5Sriastradh goto fail; 15463464ebd5Sriastradh 15473464ebd5Sriastradh if (!build_vertex_emit(p, &p->elt16_func, 2)) 15483464ebd5Sriastradh goto fail; 15493464ebd5Sriastradh 15503464ebd5Sriastradh if (!build_vertex_emit(p, &p->elt8_func, 1)) 15513464ebd5Sriastradh goto fail; 15523464ebd5Sriastradh 15533464ebd5Sriastradh p->translate.run = (run_func) x86_get_func(&p->linear_func); 15543464ebd5Sriastradh if (p->translate.run == NULL) 15554a49301eSmrg goto fail; 15564a49301eSmrg 15573464ebd5Sriastradh p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func); 15583464ebd5Sriastradh if (p->translate.run_elts == NULL) 15594a49301eSmrg goto fail; 15604a49301eSmrg 15613464ebd5Sriastradh p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func); 15623464ebd5Sriastradh if (p->translate.run_elts16 == NULL) 15634a49301eSmrg goto fail; 15644a49301eSmrg 15653464ebd5Sriastradh p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func); 15663464ebd5Sriastradh if (p->translate.run_elts8 == NULL) 15674a49301eSmrg goto fail; 15684a49301eSmrg 15694a49301eSmrg return &p->translate; 15704a49301eSmrg 15714a49301eSmrg fail: 15724a49301eSmrg if (p) 1573af69d88dSmrg translate_sse_release(&p->translate); 15744a49301eSmrg 15754a49301eSmrg return NULL; 15764a49301eSmrg} 15774a49301eSmrg 15784a49301eSmrg 15794a49301eSmrg#else 15804a49301eSmrg 1581af69d88dSmrgstruct translate * 1582af69d88dSmrgtranslate_sse2_create(const struct translate_key *key) 15834a49301eSmrg{ 15844a49301eSmrg return NULL; 15854a49301eSmrg} 15864a49301eSmrg 15874a49301eSmrg#endif 1588