1848b8605Smrg/* 2848b8605Smrg * Copyright 2003 VMware, Inc. 3848b8605Smrg * All Rights Reserved. 4848b8605Smrg * 5848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a 6848b8605Smrg * copy of this software and associated documentation files (the "Software"), 7848b8605Smrg * to deal in the Software without restriction, including without limitation 8848b8605Smrg * on the rights to use, copy, modify, merge, publish, distribute, sub 9848b8605Smrg * license, and/or sell copies of the Software, and to permit persons to whom 10848b8605Smrg * the Software is furnished to do so, subject to the following conditions: 11848b8605Smrg * 12848b8605Smrg * The above copyright notice and this permission notice (including the next 13848b8605Smrg * paragraph) shall be included in all copies or substantial portions of the 14848b8605Smrg * Software. 15848b8605Smrg * 16848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17848b8605Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19848b8605Smrg * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20848b8605Smrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21848b8605Smrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22848b8605Smrg * USE OR OTHER DEALINGS IN THE SOFTWARE. 
23848b8605Smrg * 24848b8605Smrg * Authors: 25848b8605Smrg * Keith Whitwell <keithw@vmware.com> 26848b8605Smrg */ 27848b8605Smrg 28848b8605Smrg 29848b8605Smrg#include "pipe/p_config.h" 30848b8605Smrg#include "pipe/p_compiler.h" 31848b8605Smrg#include "util/u_memory.h" 32848b8605Smrg#include "util/u_math.h" 33848b8605Smrg#include "util/u_format.h" 34848b8605Smrg 35848b8605Smrg#include "translate.h" 36848b8605Smrg 37848b8605Smrg 38b8e80941Smrg#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(PIPE_SUBSYSTEM_EMBEDDED) 39848b8605Smrg 40848b8605Smrg#include "rtasm/rtasm_cpu.h" 41848b8605Smrg#include "rtasm/rtasm_x86sse.h" 42848b8605Smrg 43848b8605Smrg 44848b8605Smrg#define X 0 45848b8605Smrg#define Y 1 46848b8605Smrg#define Z 2 47848b8605Smrg#define W 3 48848b8605Smrg 49848b8605Smrg 50848b8605Smrgstruct translate_buffer 51848b8605Smrg{ 52848b8605Smrg const void *base_ptr; 53848b8605Smrg uintptr_t stride; 54848b8605Smrg unsigned max_index; 55848b8605Smrg}; 56848b8605Smrg 57848b8605Smrgstruct translate_buffer_variant 58848b8605Smrg{ 59848b8605Smrg unsigned buffer_index; 60848b8605Smrg unsigned instance_divisor; 61848b8605Smrg void *ptr; /* updated either per vertex or per instance */ 62848b8605Smrg}; 63848b8605Smrg 64848b8605Smrg 65848b8605Smrg#define ELEMENT_BUFFER_INSTANCE_ID 1001 66848b8605Smrg 67848b8605Smrg#define NUM_CONSTS 7 68848b8605Smrg 69848b8605Smrgenum 70848b8605Smrg{ 71848b8605Smrg CONST_IDENTITY, 72848b8605Smrg CONST_INV_127, 73848b8605Smrg CONST_INV_255, 74848b8605Smrg CONST_INV_32767, 75848b8605Smrg CONST_INV_65535, 76848b8605Smrg CONST_INV_2147483647, 77848b8605Smrg CONST_255 78848b8605Smrg}; 79848b8605Smrg 80848b8605Smrg#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} 81848b8605Smrgstatic float consts[NUM_CONSTS][4] = { 82848b8605Smrg {0, 0, 0, 1}, 83848b8605Smrg C(1.0 / 127.0), 84848b8605Smrg C(1.0 / 255.0), 85848b8605Smrg C(1.0 / 32767.0), 86848b8605Smrg C(1.0 / 65535.0), 87848b8605Smrg C(1.0 / 2147483647.0), 
88848b8605Smrg C(255.0) 89848b8605Smrg}; 90848b8605Smrg 91848b8605Smrg#undef C 92848b8605Smrg 93848b8605Smrgstruct translate_sse 94848b8605Smrg{ 95848b8605Smrg struct translate translate; 96848b8605Smrg 97848b8605Smrg struct x86_function linear_func; 98848b8605Smrg struct x86_function elt_func; 99848b8605Smrg struct x86_function elt16_func; 100848b8605Smrg struct x86_function elt8_func; 101848b8605Smrg struct x86_function *func; 102848b8605Smrg 103848b8605Smrg PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; 104848b8605Smrg int8_t reg_to_const[16]; 105848b8605Smrg int8_t const_to_reg[NUM_CONSTS]; 106848b8605Smrg 107848b8605Smrg struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS]; 108848b8605Smrg unsigned nr_buffers; 109848b8605Smrg 110848b8605Smrg /* Multiple buffer variants can map to a single buffer. */ 111848b8605Smrg struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS]; 112848b8605Smrg unsigned nr_buffer_variants; 113848b8605Smrg 114848b8605Smrg /* Multiple elements can map to a single buffer variant. */ 115848b8605Smrg unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS]; 116848b8605Smrg 117848b8605Smrg boolean use_instancing; 118848b8605Smrg unsigned instance_id; 119848b8605Smrg unsigned start_instance; 120848b8605Smrg 121848b8605Smrg /* these are actually known values, but putting them in a struct 122848b8605Smrg * like this is helpful to keep them in sync across the file. 
123848b8605Smrg */ 124848b8605Smrg struct x86_reg tmp_EAX; 125848b8605Smrg struct x86_reg tmp2_EDX; 126848b8605Smrg struct x86_reg src_ECX; 127848b8605Smrg struct x86_reg idx_ESI; /* either start+i or &elt[i] */ 128848b8605Smrg struct x86_reg machine_EDI; 129848b8605Smrg struct x86_reg outbuf_EBX; 130848b8605Smrg struct x86_reg count_EBP; /* decrements to zero */ 131848b8605Smrg}; 132848b8605Smrg 133848b8605Smrg 134848b8605Smrgstatic int 135848b8605Smrgget_offset(const void *a, const void *b) 136848b8605Smrg{ 137848b8605Smrg return (const char *) b - (const char *) a; 138848b8605Smrg} 139848b8605Smrg 140848b8605Smrg 141848b8605Smrgstatic struct x86_reg 142848b8605Smrgget_const(struct translate_sse *p, unsigned id) 143848b8605Smrg{ 144848b8605Smrg struct x86_reg reg; 145848b8605Smrg unsigned i; 146848b8605Smrg 147848b8605Smrg if (p->const_to_reg[id] >= 0) 148848b8605Smrg return x86_make_reg(file_XMM, p->const_to_reg[id]); 149848b8605Smrg 150848b8605Smrg for (i = 2; i < 8; ++i) { 151848b8605Smrg if (p->reg_to_const[i] < 0) 152848b8605Smrg break; 153848b8605Smrg } 154848b8605Smrg 155848b8605Smrg /* TODO: be smarter here */ 156848b8605Smrg if (i == 8) 157848b8605Smrg --i; 158848b8605Smrg 159848b8605Smrg reg = x86_make_reg(file_XMM, i); 160848b8605Smrg 161848b8605Smrg if (p->reg_to_const[i] >= 0) 162848b8605Smrg p->const_to_reg[p->reg_to_const[i]] = -1; 163848b8605Smrg 164848b8605Smrg p->reg_to_const[i] = id; 165848b8605Smrg p->const_to_reg[id] = i; 166848b8605Smrg 167848b8605Smrg /* TODO: this should happen outside the loop, if possible */ 168848b8605Smrg sse_movaps(p->func, reg, 169848b8605Smrg x86_make_disp(p->machine_EDI, 170848b8605Smrg get_offset(p, &p->consts[id][0]))); 171848b8605Smrg 172848b8605Smrg return reg; 173848b8605Smrg} 174848b8605Smrg 175848b8605Smrg 176848b8605Smrg/* load the data in a SSE2 register, padding with zeros */ 177848b8605Smrgstatic boolean 178848b8605Smrgemit_load_sse2(struct translate_sse *p, 179848b8605Smrg struct x86_reg data, struct 
x86_reg src, unsigned size) 180848b8605Smrg{ 181848b8605Smrg struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 182848b8605Smrg struct x86_reg tmp = p->tmp_EAX; 183848b8605Smrg switch (size) { 184848b8605Smrg case 1: 185848b8605Smrg x86_movzx8(p->func, tmp, src); 186848b8605Smrg sse2_movd(p->func, data, tmp); 187848b8605Smrg break; 188848b8605Smrg case 2: 189848b8605Smrg x86_movzx16(p->func, tmp, src); 190848b8605Smrg sse2_movd(p->func, data, tmp); 191848b8605Smrg break; 192848b8605Smrg case 3: 193848b8605Smrg x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); 194848b8605Smrg x86_shl_imm(p->func, tmp, 16); 195848b8605Smrg x86_mov16(p->func, tmp, src); 196848b8605Smrg sse2_movd(p->func, data, tmp); 197848b8605Smrg break; 198848b8605Smrg case 4: 199848b8605Smrg sse2_movd(p->func, data, src); 200848b8605Smrg break; 201848b8605Smrg case 6: 202848b8605Smrg sse2_movd(p->func, data, src); 203848b8605Smrg x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); 204848b8605Smrg sse2_movd(p->func, tmpXMM, tmp); 205848b8605Smrg sse2_punpckldq(p->func, data, tmpXMM); 206848b8605Smrg break; 207848b8605Smrg case 8: 208848b8605Smrg sse2_movq(p->func, data, src); 209848b8605Smrg break; 210848b8605Smrg case 12: 211848b8605Smrg sse2_movq(p->func, data, src); 212848b8605Smrg sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); 213848b8605Smrg sse2_punpcklqdq(p->func, data, tmpXMM); 214848b8605Smrg break; 215848b8605Smrg case 16: 216848b8605Smrg sse2_movdqu(p->func, data, src); 217848b8605Smrg break; 218848b8605Smrg default: 219848b8605Smrg return FALSE; 220848b8605Smrg } 221848b8605Smrg return TRUE; 222848b8605Smrg} 223848b8605Smrg 224848b8605Smrg 225848b8605Smrg/* this value can be passed for the out_chans argument */ 226848b8605Smrg#define CHANNELS_0001 5 227848b8605Smrg 228848b8605Smrg 229848b8605Smrg/* this function will load #chans float values, and will 230848b8605Smrg * pad the register with zeroes at least up to out_chans. 
231848b8605Smrg * 232848b8605Smrg * If out_chans is set to CHANNELS_0001, then the fourth 233848b8605Smrg * value will be padded with 1. Only pass this value if 234848b8605Smrg * chans < 4 or results are undefined. 235848b8605Smrg */ 236848b8605Smrgstatic void 237848b8605Smrgemit_load_float32(struct translate_sse *p, struct x86_reg data, 238848b8605Smrg struct x86_reg arg0, unsigned out_chans, unsigned chans) 239848b8605Smrg{ 240848b8605Smrg switch (chans) { 241848b8605Smrg case 1: 242848b8605Smrg /* a 0 0 0 243848b8605Smrg * a 0 0 1 244848b8605Smrg */ 245848b8605Smrg sse_movss(p->func, data, arg0); 246848b8605Smrg if (out_chans == CHANNELS_0001) 247848b8605Smrg sse_orps(p->func, data, get_const(p, CONST_IDENTITY)); 248848b8605Smrg break; 249848b8605Smrg case 2: 250848b8605Smrg /* 0 0 0 1 251848b8605Smrg * a b 0 1 252848b8605Smrg */ 253848b8605Smrg if (out_chans == CHANNELS_0001) 254848b8605Smrg sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 255848b8605Smrg SHUF(X, Y, Z, W)); 256848b8605Smrg else if (out_chans > 2) 257848b8605Smrg sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY)); 258848b8605Smrg sse_movlps(p->func, data, arg0); 259848b8605Smrg break; 260848b8605Smrg case 3: 261848b8605Smrg /* Have to jump through some hoops: 262848b8605Smrg * 263848b8605Smrg * c 0 0 0 264848b8605Smrg * c 0 0 1 if out_chans == CHANNELS_0001 265848b8605Smrg * 0 0 c 0/1 266848b8605Smrg * a b c 0/1 267848b8605Smrg */ 268848b8605Smrg sse_movss(p->func, data, x86_make_disp(arg0, 8)); 269848b8605Smrg if (out_chans == CHANNELS_0001) 270848b8605Smrg sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 271848b8605Smrg SHUF(X, Y, Z, W)); 272848b8605Smrg sse_shufps(p->func, data, data, SHUF(Y, Z, X, W)); 273848b8605Smrg sse_movlps(p->func, data, arg0); 274848b8605Smrg break; 275848b8605Smrg case 4: 276848b8605Smrg sse_movups(p->func, data, arg0); 277848b8605Smrg break; 278848b8605Smrg } 279848b8605Smrg} 280848b8605Smrg 281848b8605Smrg/* this function behaves like 
emit_load_float32, but loads 282848b8605Smrg 64-bit floating point numbers, converting them to 32-bit 283848b8605Smrg ones */ 284848b8605Smrgstatic void 285848b8605Smrgemit_load_float64to32(struct translate_sse *p, struct x86_reg data, 286848b8605Smrg struct x86_reg arg0, unsigned out_chans, unsigned chans) 287848b8605Smrg{ 288848b8605Smrg struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 289848b8605Smrg switch (chans) { 290848b8605Smrg case 1: 291848b8605Smrg sse2_movsd(p->func, data, arg0); 292848b8605Smrg if (out_chans > 1) 293848b8605Smrg sse2_cvtpd2ps(p->func, data, data); 294848b8605Smrg else 295848b8605Smrg sse2_cvtsd2ss(p->func, data, data); 296848b8605Smrg if (out_chans == CHANNELS_0001) 297848b8605Smrg sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 298848b8605Smrg SHUF(X, Y, Z, W)); 299848b8605Smrg break; 300848b8605Smrg case 2: 301848b8605Smrg sse2_movupd(p->func, data, arg0); 302848b8605Smrg sse2_cvtpd2ps(p->func, data, data); 303848b8605Smrg if (out_chans == CHANNELS_0001) 304848b8605Smrg sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), 305848b8605Smrg SHUF(X, Y, Z, W)); 306848b8605Smrg else if (out_chans > 2) 307848b8605Smrg sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY)); 308848b8605Smrg break; 309848b8605Smrg case 3: 310848b8605Smrg sse2_movupd(p->func, data, arg0); 311848b8605Smrg sse2_cvtpd2ps(p->func, data, data); 312848b8605Smrg sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); 313848b8605Smrg if (out_chans > 3) 314848b8605Smrg sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); 315848b8605Smrg else 316848b8605Smrg sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); 317848b8605Smrg sse_movlhps(p->func, data, tmpXMM); 318848b8605Smrg if (out_chans == CHANNELS_0001) 319848b8605Smrg sse_orps(p->func, data, get_const(p, CONST_IDENTITY)); 320848b8605Smrg break; 321848b8605Smrg case 4: 322848b8605Smrg sse2_movupd(p->func, data, arg0); 323848b8605Smrg sse2_cvtpd2ps(p->func, data, data); 324848b8605Smrg sse2_movupd(p->func, tmpXMM, 
x86_make_disp(arg0, 16)); 325848b8605Smrg sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); 326848b8605Smrg sse_movlhps(p->func, data, tmpXMM); 327848b8605Smrg break; 328848b8605Smrg } 329848b8605Smrg} 330848b8605Smrg 331848b8605Smrg 332848b8605Smrgstatic void 333848b8605Smrgemit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, 334848b8605Smrg struct x86_reg dst_xmm, struct x86_reg src_gpr, 335848b8605Smrg struct x86_reg src_xmm) 336848b8605Smrg{ 337848b8605Smrg if (x86_target(p->func) != X86_32) 338848b8605Smrg x64_mov64(p->func, dst_gpr, src_gpr); 339848b8605Smrg else { 340848b8605Smrg /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ 341848b8605Smrg if (x86_target_caps(p->func) & X86_SSE2) 342848b8605Smrg sse2_movq(p->func, dst_xmm, src_xmm); 343848b8605Smrg else 344848b8605Smrg sse_movlps(p->func, dst_xmm, src_xmm); 345848b8605Smrg } 346848b8605Smrg} 347848b8605Smrg 348848b8605Smrg 349848b8605Smrgstatic void 350848b8605Smrgemit_load64(struct translate_sse *p, struct x86_reg dst_gpr, 351848b8605Smrg struct x86_reg dst_xmm, struct x86_reg src) 352848b8605Smrg{ 353848b8605Smrg emit_mov64(p, dst_gpr, dst_xmm, src, src); 354848b8605Smrg} 355848b8605Smrg 356848b8605Smrg 357848b8605Smrgstatic void 358848b8605Smrgemit_store64(struct translate_sse *p, struct x86_reg dst, 359848b8605Smrg struct x86_reg src_gpr, struct x86_reg src_xmm) 360848b8605Smrg{ 361848b8605Smrg emit_mov64(p, dst, dst, src_gpr, src_xmm); 362848b8605Smrg} 363848b8605Smrg 364848b8605Smrg 365848b8605Smrgstatic void 366848b8605Smrgemit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) 367848b8605Smrg{ 368848b8605Smrg if (x86_target_caps(p->func) & X86_SSE2) 369848b8605Smrg sse2_movdqu(p->func, dst, src); 370848b8605Smrg else 371848b8605Smrg sse_movups(p->func, dst, src); 372848b8605Smrg} 373848b8605Smrg 374848b8605Smrg 375848b8605Smrg/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, 376848b8605Smrg * but may or may not be good on older 
processors 377848b8605Smrg * TODO: may perhaps want to use non-temporal stores here if possible 378848b8605Smrg */ 379848b8605Smrgstatic void 380848b8605Smrgemit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, 381848b8605Smrg unsigned size) 382848b8605Smrg{ 383848b8605Smrg struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 384848b8605Smrg struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); 385848b8605Smrg struct x86_reg dataGPR = p->tmp_EAX; 386848b8605Smrg struct x86_reg dataGPR2 = p->tmp2_EDX; 387848b8605Smrg 388848b8605Smrg if (size < 8) { 389848b8605Smrg switch (size) { 390848b8605Smrg case 1: 391848b8605Smrg x86_mov8(p->func, dataGPR, src); 392848b8605Smrg x86_mov8(p->func, dst, dataGPR); 393848b8605Smrg break; 394848b8605Smrg case 2: 395848b8605Smrg x86_mov16(p->func, dataGPR, src); 396848b8605Smrg x86_mov16(p->func, dst, dataGPR); 397848b8605Smrg break; 398848b8605Smrg case 3: 399848b8605Smrg x86_mov16(p->func, dataGPR, src); 400848b8605Smrg x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); 401848b8605Smrg x86_mov16(p->func, dst, dataGPR); 402848b8605Smrg x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); 403848b8605Smrg break; 404848b8605Smrg case 4: 405848b8605Smrg x86_mov(p->func, dataGPR, src); 406848b8605Smrg x86_mov(p->func, dst, dataGPR); 407848b8605Smrg break; 408848b8605Smrg case 6: 409848b8605Smrg x86_mov(p->func, dataGPR, src); 410848b8605Smrg x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); 411848b8605Smrg x86_mov(p->func, dst, dataGPR); 412848b8605Smrg x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); 413848b8605Smrg break; 414848b8605Smrg } 415848b8605Smrg } 416848b8605Smrg else if (!(x86_target_caps(p->func) & X86_SSE)) { 417848b8605Smrg unsigned i = 0; 418848b8605Smrg assert((size & 3) == 0); 419848b8605Smrg for (i = 0; i < size; i += 4) { 420848b8605Smrg x86_mov(p->func, dataGPR, x86_make_disp(src, i)); 421848b8605Smrg x86_mov(p->func, x86_make_disp(dst, i), dataGPR); 422848b8605Smrg } 423848b8605Smrg } 
424848b8605Smrg else { 425848b8605Smrg switch (size) { 426848b8605Smrg case 8: 427848b8605Smrg emit_load64(p, dataGPR, dataXMM, src); 428848b8605Smrg emit_store64(p, dst, dataGPR, dataXMM); 429848b8605Smrg break; 430848b8605Smrg case 12: 431848b8605Smrg emit_load64(p, dataGPR2, dataXMM, src); 432848b8605Smrg x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); 433848b8605Smrg emit_store64(p, dst, dataGPR2, dataXMM); 434848b8605Smrg x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); 435848b8605Smrg break; 436848b8605Smrg case 16: 437848b8605Smrg emit_mov128(p, dataXMM, src); 438848b8605Smrg emit_mov128(p, dst, dataXMM); 439848b8605Smrg break; 440848b8605Smrg case 24: 441848b8605Smrg emit_mov128(p, dataXMM, src); 442848b8605Smrg emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); 443848b8605Smrg emit_mov128(p, dst, dataXMM); 444848b8605Smrg emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); 445848b8605Smrg break; 446848b8605Smrg case 32: 447848b8605Smrg emit_mov128(p, dataXMM, src); 448848b8605Smrg emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); 449848b8605Smrg emit_mov128(p, dst, dataXMM); 450848b8605Smrg emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); 451848b8605Smrg break; 452848b8605Smrg default: 453848b8605Smrg assert(0); 454848b8605Smrg } 455848b8605Smrg } 456848b8605Smrg} 457848b8605Smrg 458848b8605Smrgstatic boolean 459848b8605Smrgtranslate_attr_convert(struct translate_sse *p, 460848b8605Smrg const struct translate_element *a, 461848b8605Smrg struct x86_reg src, struct x86_reg dst) 462848b8605Smrg{ 463848b8605Smrg const struct util_format_description *input_desc = 464848b8605Smrg util_format_description(a->input_format); 465848b8605Smrg const struct util_format_description *output_desc = 466848b8605Smrg util_format_description(a->output_format); 467848b8605Smrg unsigned i; 468848b8605Smrg boolean id_swizzle = TRUE; 469848b8605Smrg unsigned swizzle[4] = 470b8e80941Smrg { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE, 471b8e80941Smrg PIPE_SWIZZLE_NONE, 
PIPE_SWIZZLE_NONE }; 472848b8605Smrg unsigned needed_chans = 0; 473848b8605Smrg unsigned imms[2] = { 0, 0x3f800000 }; 474848b8605Smrg 475848b8605Smrg if (a->output_format == PIPE_FORMAT_NONE 476848b8605Smrg || a->input_format == PIPE_FORMAT_NONE) 477848b8605Smrg return FALSE; 478848b8605Smrg 479848b8605Smrg if (input_desc->channel[0].size & 7) 480848b8605Smrg return FALSE; 481848b8605Smrg 482848b8605Smrg if (input_desc->colorspace != output_desc->colorspace) 483848b8605Smrg return FALSE; 484848b8605Smrg 485848b8605Smrg for (i = 1; i < input_desc->nr_channels; ++i) { 486848b8605Smrg if (memcmp 487848b8605Smrg (&input_desc->channel[i], &input_desc->channel[0], 488848b8605Smrg sizeof(input_desc->channel[0]))) 489848b8605Smrg return FALSE; 490848b8605Smrg } 491848b8605Smrg 492848b8605Smrg for (i = 1; i < output_desc->nr_channels; ++i) { 493848b8605Smrg if (memcmp 494848b8605Smrg (&output_desc->channel[i], &output_desc->channel[0], 495848b8605Smrg sizeof(output_desc->channel[0]))) { 496848b8605Smrg return FALSE; 497848b8605Smrg } 498848b8605Smrg } 499848b8605Smrg 500848b8605Smrg for (i = 0; i < output_desc->nr_channels; ++i) { 501848b8605Smrg if (output_desc->swizzle[i] < 4) 502848b8605Smrg swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; 503848b8605Smrg } 504848b8605Smrg 505848b8605Smrg if ((x86_target_caps(p->func) & X86_SSE) && 506848b8605Smrg (0 || a->output_format == PIPE_FORMAT_R32_FLOAT 507848b8605Smrg || a->output_format == PIPE_FORMAT_R32G32_FLOAT 508848b8605Smrg || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT 509848b8605Smrg || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) { 510848b8605Smrg struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 511848b8605Smrg 512848b8605Smrg for (i = 0; i < output_desc->nr_channels; ++i) { 513b8e80941Smrg if (swizzle[i] == PIPE_SWIZZLE_0 514848b8605Smrg && i >= input_desc->nr_channels) 515848b8605Smrg swizzle[i] = i; 516848b8605Smrg } 517848b8605Smrg 518848b8605Smrg for (i = 0; i < output_desc->nr_channels; 
++i) { 519848b8605Smrg if (swizzle[i] < 4) 520848b8605Smrg needed_chans = MAX2(needed_chans, swizzle[i] + 1); 521b8e80941Smrg if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i) 522848b8605Smrg id_swizzle = FALSE; 523848b8605Smrg } 524848b8605Smrg 525848b8605Smrg if (needed_chans > 0) { 526848b8605Smrg switch (input_desc->channel[0].type) { 527848b8605Smrg case UTIL_FORMAT_TYPE_UNSIGNED: 528848b8605Smrg if (!(x86_target_caps(p->func) & X86_SSE2)) 529848b8605Smrg return FALSE; 530848b8605Smrg emit_load_sse2(p, dataXMM, src, 531848b8605Smrg input_desc->channel[0].size * 532848b8605Smrg input_desc->nr_channels >> 3); 533848b8605Smrg 534848b8605Smrg /* TODO: add support for SSE4.1 pmovzx */ 535848b8605Smrg switch (input_desc->channel[0].size) { 536848b8605Smrg case 8: 537848b8605Smrg /* TODO: this may be inefficient due to get_identity() being 538848b8605Smrg * used both as a float and integer register. 539848b8605Smrg */ 540848b8605Smrg sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 541848b8605Smrg sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 542848b8605Smrg break; 543848b8605Smrg case 16: 544848b8605Smrg sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 545848b8605Smrg break; 546848b8605Smrg case 32: /* we lose precision here */ 547848b8605Smrg sse2_psrld_imm(p->func, dataXMM, 1); 548848b8605Smrg break; 549848b8605Smrg default: 550848b8605Smrg return FALSE; 551848b8605Smrg } 552848b8605Smrg sse2_cvtdq2ps(p->func, dataXMM, dataXMM); 553848b8605Smrg if (input_desc->channel[0].normalized) { 554848b8605Smrg struct x86_reg factor; 555848b8605Smrg switch (input_desc->channel[0].size) { 556848b8605Smrg case 8: 557848b8605Smrg factor = get_const(p, CONST_INV_255); 558848b8605Smrg break; 559848b8605Smrg case 16: 560848b8605Smrg factor = get_const(p, CONST_INV_65535); 561848b8605Smrg break; 562848b8605Smrg case 32: 563848b8605Smrg factor = get_const(p, CONST_INV_2147483647); 564848b8605Smrg break; 565848b8605Smrg default: 
566848b8605Smrg assert(0); 567848b8605Smrg factor.disp = 0; 568848b8605Smrg factor.file = 0; 569848b8605Smrg factor.idx = 0; 570848b8605Smrg factor.mod = 0; 571848b8605Smrg break; 572848b8605Smrg } 573848b8605Smrg sse_mulps(p->func, dataXMM, factor); 574848b8605Smrg } 575848b8605Smrg else if (input_desc->channel[0].size == 32) 576848b8605Smrg /* compensate for the bit we threw away to fit u32 into s32 */ 577848b8605Smrg sse_addps(p->func, dataXMM, dataXMM); 578848b8605Smrg break; 579848b8605Smrg case UTIL_FORMAT_TYPE_SIGNED: 580848b8605Smrg if (!(x86_target_caps(p->func) & X86_SSE2)) 581848b8605Smrg return FALSE; 582848b8605Smrg emit_load_sse2(p, dataXMM, src, 583848b8605Smrg input_desc->channel[0].size * 584848b8605Smrg input_desc->nr_channels >> 3); 585848b8605Smrg 586848b8605Smrg /* TODO: add support for SSE4.1 pmovsx */ 587848b8605Smrg switch (input_desc->channel[0].size) { 588848b8605Smrg case 8: 589848b8605Smrg sse2_punpcklbw(p->func, dataXMM, dataXMM); 590848b8605Smrg sse2_punpcklbw(p->func, dataXMM, dataXMM); 591848b8605Smrg sse2_psrad_imm(p->func, dataXMM, 24); 592848b8605Smrg break; 593848b8605Smrg case 16: 594848b8605Smrg sse2_punpcklwd(p->func, dataXMM, dataXMM); 595848b8605Smrg sse2_psrad_imm(p->func, dataXMM, 16); 596848b8605Smrg break; 597848b8605Smrg case 32: /* we lose precision here */ 598848b8605Smrg break; 599848b8605Smrg default: 600848b8605Smrg return FALSE; 601848b8605Smrg } 602848b8605Smrg sse2_cvtdq2ps(p->func, dataXMM, dataXMM); 603848b8605Smrg if (input_desc->channel[0].normalized) { 604848b8605Smrg struct x86_reg factor; 605848b8605Smrg switch (input_desc->channel[0].size) { 606848b8605Smrg case 8: 607848b8605Smrg factor = get_const(p, CONST_INV_127); 608848b8605Smrg break; 609848b8605Smrg case 16: 610848b8605Smrg factor = get_const(p, CONST_INV_32767); 611848b8605Smrg break; 612848b8605Smrg case 32: 613848b8605Smrg factor = get_const(p, CONST_INV_2147483647); 614848b8605Smrg break; 615848b8605Smrg default: 616848b8605Smrg assert(0); 
617848b8605Smrg factor.disp = 0; 618848b8605Smrg factor.file = 0; 619848b8605Smrg factor.idx = 0; 620848b8605Smrg factor.mod = 0; 621848b8605Smrg break; 622848b8605Smrg } 623848b8605Smrg sse_mulps(p->func, dataXMM, factor); 624848b8605Smrg } 625848b8605Smrg break; 626848b8605Smrg 627848b8605Smrg break; 628848b8605Smrg case UTIL_FORMAT_TYPE_FLOAT: 629848b8605Smrg if (input_desc->channel[0].size != 32 630848b8605Smrg && input_desc->channel[0].size != 64) { 631848b8605Smrg return FALSE; 632848b8605Smrg } 633b8e80941Smrg if (swizzle[3] == PIPE_SWIZZLE_1 634848b8605Smrg && input_desc->nr_channels <= 3) { 635b8e80941Smrg swizzle[3] = PIPE_SWIZZLE_W; 636848b8605Smrg needed_chans = CHANNELS_0001; 637848b8605Smrg } 638848b8605Smrg switch (input_desc->channel[0].size) { 639848b8605Smrg case 32: 640848b8605Smrg emit_load_float32(p, dataXMM, src, needed_chans, 641848b8605Smrg input_desc->nr_channels); 642848b8605Smrg break; 643848b8605Smrg case 64: /* we lose precision here */ 644848b8605Smrg if (!(x86_target_caps(p->func) & X86_SSE2)) 645848b8605Smrg return FALSE; 646848b8605Smrg emit_load_float64to32(p, dataXMM, src, needed_chans, 647848b8605Smrg input_desc->nr_channels); 648848b8605Smrg break; 649848b8605Smrg default: 650848b8605Smrg return FALSE; 651848b8605Smrg } 652848b8605Smrg break; 653848b8605Smrg default: 654848b8605Smrg return FALSE; 655848b8605Smrg } 656848b8605Smrg 657848b8605Smrg if (!id_swizzle) { 658848b8605Smrg sse_shufps(p->func, dataXMM, dataXMM, 659848b8605Smrg SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3])); 660848b8605Smrg } 661848b8605Smrg } 662848b8605Smrg 663848b8605Smrg if (output_desc->nr_channels >= 4 664b8e80941Smrg && swizzle[0] < PIPE_SWIZZLE_0 665b8e80941Smrg && swizzle[1] < PIPE_SWIZZLE_0 666b8e80941Smrg && swizzle[2] < PIPE_SWIZZLE_0 667b8e80941Smrg && swizzle[3] < PIPE_SWIZZLE_0) { 668848b8605Smrg sse_movups(p->func, dst, dataXMM); 669848b8605Smrg } 670848b8605Smrg else { 671848b8605Smrg if (output_desc->nr_channels >= 2 672b8e80941Smrg 
&& swizzle[0] < PIPE_SWIZZLE_0 673b8e80941Smrg && swizzle[1] < PIPE_SWIZZLE_0) { 674848b8605Smrg sse_movlps(p->func, dst, dataXMM); 675848b8605Smrg } 676848b8605Smrg else { 677b8e80941Smrg if (swizzle[0] < PIPE_SWIZZLE_0) { 678848b8605Smrg sse_movss(p->func, dst, dataXMM); 679848b8605Smrg } 680848b8605Smrg else { 681848b8605Smrg x86_mov_imm(p->func, dst, 682b8e80941Smrg imms[swizzle[0] - PIPE_SWIZZLE_0]); 683848b8605Smrg } 684848b8605Smrg 685848b8605Smrg if (output_desc->nr_channels >= 2) { 686b8e80941Smrg if (swizzle[1] < PIPE_SWIZZLE_0) { 687848b8605Smrg sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); 688848b8605Smrg sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); 689848b8605Smrg } 690848b8605Smrg else { 691848b8605Smrg x86_mov_imm(p->func, x86_make_disp(dst, 4), 692b8e80941Smrg imms[swizzle[1] - PIPE_SWIZZLE_0]); 693848b8605Smrg } 694848b8605Smrg } 695848b8605Smrg } 696848b8605Smrg 697848b8605Smrg if (output_desc->nr_channels >= 3) { 698848b8605Smrg if (output_desc->nr_channels >= 4 699b8e80941Smrg && swizzle[2] < PIPE_SWIZZLE_0 700b8e80941Smrg && swizzle[3] < PIPE_SWIZZLE_0) { 701848b8605Smrg sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); 702848b8605Smrg } 703848b8605Smrg else { 704b8e80941Smrg if (swizzle[2] < PIPE_SWIZZLE_0) { 705848b8605Smrg sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); 706848b8605Smrg sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); 707848b8605Smrg } 708848b8605Smrg else { 709848b8605Smrg x86_mov_imm(p->func, x86_make_disp(dst, 8), 710b8e80941Smrg imms[swizzle[2] - PIPE_SWIZZLE_0]); 711848b8605Smrg } 712848b8605Smrg 713848b8605Smrg if (output_desc->nr_channels >= 4) { 714b8e80941Smrg if (swizzle[3] < PIPE_SWIZZLE_0) { 715848b8605Smrg sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); 716848b8605Smrg sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); 717848b8605Smrg } 718848b8605Smrg else { 719848b8605Smrg x86_mov_imm(p->func, x86_make_disp(dst, 12), 720b8e80941Smrg imms[swizzle[3] - PIPE_SWIZZLE_0]); 
721848b8605Smrg } 722848b8605Smrg } 723848b8605Smrg } 724848b8605Smrg } 725848b8605Smrg } 726848b8605Smrg return TRUE; 727848b8605Smrg } 728848b8605Smrg else if ((x86_target_caps(p->func) & X86_SSE2) 729848b8605Smrg && input_desc->channel[0].size == 8 730848b8605Smrg && output_desc->channel[0].size == 16 731848b8605Smrg && output_desc->channel[0].normalized == 732848b8605Smrg input_desc->channel[0].normalized && 733848b8605Smrg (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED 734848b8605Smrg && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) 735848b8605Smrg || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED 736848b8605Smrg && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 737848b8605Smrg || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED 738848b8605Smrg && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) { 739848b8605Smrg struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 740848b8605Smrg struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); 741848b8605Smrg struct x86_reg tmp = p->tmp_EAX; 742848b8605Smrg unsigned imms[2] = { 0, 1 }; 743848b8605Smrg 744848b8605Smrg for (i = 0; i < output_desc->nr_channels; ++i) { 745b8e80941Smrg if (swizzle[i] == PIPE_SWIZZLE_0 746848b8605Smrg && i >= input_desc->nr_channels) { 747848b8605Smrg swizzle[i] = i; 748848b8605Smrg } 749848b8605Smrg } 750848b8605Smrg 751848b8605Smrg for (i = 0; i < output_desc->nr_channels; ++i) { 752848b8605Smrg if (swizzle[i] < 4) 753848b8605Smrg needed_chans = MAX2(needed_chans, swizzle[i] + 1); 754b8e80941Smrg if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i) 755848b8605Smrg id_swizzle = FALSE; 756848b8605Smrg } 757848b8605Smrg 758848b8605Smrg if (needed_chans > 0) { 759848b8605Smrg emit_load_sse2(p, dataXMM, src, 760848b8605Smrg input_desc->channel[0].size * 761848b8605Smrg input_desc->nr_channels >> 3); 762848b8605Smrg 763848b8605Smrg switch (input_desc->channel[0].type) { 764848b8605Smrg case UTIL_FORMAT_TYPE_UNSIGNED: 765848b8605Smrg 
if (input_desc->channel[0].normalized) { 766848b8605Smrg sse2_punpcklbw(p->func, dataXMM, dataXMM); 767848b8605Smrg if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) 768848b8605Smrg sse2_psrlw_imm(p->func, dataXMM, 1); 769848b8605Smrg } 770848b8605Smrg else 771848b8605Smrg sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); 772848b8605Smrg break; 773848b8605Smrg case UTIL_FORMAT_TYPE_SIGNED: 774848b8605Smrg if (input_desc->channel[0].normalized) { 775848b8605Smrg sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); 776848b8605Smrg sse2_punpcklbw(p->func, tmpXMM, dataXMM); 777848b8605Smrg sse2_psllw_imm(p->func, dataXMM, 9); 778848b8605Smrg sse2_psrlw_imm(p->func, dataXMM, 8); 779848b8605Smrg sse2_por(p->func, tmpXMM, dataXMM); 780848b8605Smrg sse2_psrlw_imm(p->func, dataXMM, 7); 781848b8605Smrg sse2_por(p->func, tmpXMM, dataXMM); 782848b8605Smrg { 783848b8605Smrg struct x86_reg t = dataXMM; 784848b8605Smrg dataXMM = tmpXMM; 785848b8605Smrg tmpXMM = t; 786848b8605Smrg } 787848b8605Smrg } 788848b8605Smrg else { 789848b8605Smrg sse2_punpcklbw(p->func, dataXMM, dataXMM); 790848b8605Smrg sse2_psraw_imm(p->func, dataXMM, 8); 791848b8605Smrg } 792848b8605Smrg break; 793848b8605Smrg default: 794848b8605Smrg assert(0); 795848b8605Smrg } 796848b8605Smrg 797848b8605Smrg if (output_desc->channel[0].normalized) 798848b8605Smrg imms[1] = 799848b8605Smrg (output_desc->channel[0].type == 800848b8605Smrg UTIL_FORMAT_TYPE_UNSIGNED) ? 
0xffff : 0x7ffff; 801848b8605Smrg 802848b8605Smrg if (!id_swizzle) 803848b8605Smrg sse2_pshuflw(p->func, dataXMM, dataXMM, 804848b8605Smrg (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | 805848b8605Smrg ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); 806848b8605Smrg } 807848b8605Smrg 808848b8605Smrg if (output_desc->nr_channels >= 4 809b8e80941Smrg && swizzle[0] < PIPE_SWIZZLE_0 810b8e80941Smrg && swizzle[1] < PIPE_SWIZZLE_0 811b8e80941Smrg && swizzle[2] < PIPE_SWIZZLE_0 812b8e80941Smrg && swizzle[3] < PIPE_SWIZZLE_0) { 813848b8605Smrg sse2_movq(p->func, dst, dataXMM); 814848b8605Smrg } 815848b8605Smrg else { 816b8e80941Smrg if (swizzle[0] < PIPE_SWIZZLE_0) { 817848b8605Smrg if (output_desc->nr_channels >= 2 818b8e80941Smrg && swizzle[1] < PIPE_SWIZZLE_0) { 819848b8605Smrg sse2_movd(p->func, dst, dataXMM); 820848b8605Smrg } 821848b8605Smrg else { 822848b8605Smrg sse2_movd(p->func, tmp, dataXMM); 823848b8605Smrg x86_mov16(p->func, dst, tmp); 824848b8605Smrg if (output_desc->nr_channels >= 2) 825848b8605Smrg x86_mov16_imm(p->func, x86_make_disp(dst, 2), 826b8e80941Smrg imms[swizzle[1] - PIPE_SWIZZLE_0]); 827848b8605Smrg } 828848b8605Smrg } 829848b8605Smrg else { 830848b8605Smrg if (output_desc->nr_channels >= 2 831b8e80941Smrg && swizzle[1] >= PIPE_SWIZZLE_0) { 832848b8605Smrg x86_mov_imm(p->func, dst, 833b8e80941Smrg (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) | 834b8e80941Smrg imms[swizzle[0] - PIPE_SWIZZLE_0]); 835848b8605Smrg } 836848b8605Smrg else { 837848b8605Smrg x86_mov16_imm(p->func, dst, 838b8e80941Smrg imms[swizzle[0] - PIPE_SWIZZLE_0]); 839848b8605Smrg if (output_desc->nr_channels >= 2) { 840848b8605Smrg sse2_movd(p->func, tmp, dataXMM); 841848b8605Smrg x86_shr_imm(p->func, tmp, 16); 842848b8605Smrg x86_mov16(p->func, x86_make_disp(dst, 2), tmp); 843848b8605Smrg } 844848b8605Smrg } 845848b8605Smrg } 846848b8605Smrg 847848b8605Smrg if (output_desc->nr_channels >= 3) { 848b8e80941Smrg if (swizzle[2] < PIPE_SWIZZLE_0) { 849848b8605Smrg if 
(output_desc->nr_channels >= 4 850b8e80941Smrg && swizzle[3] < PIPE_SWIZZLE_0) { 851848b8605Smrg sse2_psrlq_imm(p->func, dataXMM, 32); 852848b8605Smrg sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); 853848b8605Smrg } 854848b8605Smrg else { 855848b8605Smrg sse2_psrlq_imm(p->func, dataXMM, 32); 856848b8605Smrg sse2_movd(p->func, tmp, dataXMM); 857848b8605Smrg x86_mov16(p->func, x86_make_disp(dst, 4), tmp); 858848b8605Smrg if (output_desc->nr_channels >= 4) { 859848b8605Smrg x86_mov16_imm(p->func, x86_make_disp(dst, 6), 860b8e80941Smrg imms[swizzle[3] - PIPE_SWIZZLE_0]); 861848b8605Smrg } 862848b8605Smrg } 863848b8605Smrg } 864848b8605Smrg else { 865848b8605Smrg if (output_desc->nr_channels >= 4 866b8e80941Smrg && swizzle[3] >= PIPE_SWIZZLE_0) { 867848b8605Smrg x86_mov_imm(p->func, x86_make_disp(dst, 4), 868b8e80941Smrg (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16) 869b8e80941Smrg | imms[swizzle[2] - PIPE_SWIZZLE_0]); 870848b8605Smrg } 871848b8605Smrg else { 872848b8605Smrg x86_mov16_imm(p->func, x86_make_disp(dst, 4), 873b8e80941Smrg imms[swizzle[2] - PIPE_SWIZZLE_0]); 874848b8605Smrg 875848b8605Smrg if (output_desc->nr_channels >= 4) { 876848b8605Smrg sse2_psrlq_imm(p->func, dataXMM, 48); 877848b8605Smrg sse2_movd(p->func, tmp, dataXMM); 878848b8605Smrg x86_mov16(p->func, x86_make_disp(dst, 6), tmp); 879848b8605Smrg } 880848b8605Smrg } 881848b8605Smrg } 882848b8605Smrg } 883848b8605Smrg } 884848b8605Smrg return TRUE; 885848b8605Smrg } 886848b8605Smrg else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0], 887848b8605Smrg sizeof(output_desc->channel[0]))) { 888848b8605Smrg struct x86_reg tmp = p->tmp_EAX; 889848b8605Smrg unsigned i; 890848b8605Smrg 891848b8605Smrg if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 892848b8605Smrg && output_desc->nr_channels == 4 893b8e80941Smrg && swizzle[0] == PIPE_SWIZZLE_W 894b8e80941Smrg && swizzle[1] == PIPE_SWIZZLE_Z 895b8e80941Smrg && swizzle[2] == PIPE_SWIZZLE_Y 896b8e80941Smrg && swizzle[3] == 
PIPE_SWIZZLE_X) { 897848b8605Smrg /* TODO: support movbe */ 898848b8605Smrg x86_mov(p->func, tmp, src); 899848b8605Smrg x86_bswap(p->func, tmp); 900848b8605Smrg x86_mov(p->func, dst, tmp); 901848b8605Smrg return TRUE; 902848b8605Smrg } 903848b8605Smrg 904848b8605Smrg for (i = 0; i < output_desc->nr_channels; ++i) { 905848b8605Smrg switch (output_desc->channel[0].size) { 906848b8605Smrg case 8: 907b8e80941Smrg if (swizzle[i] >= PIPE_SWIZZLE_0) { 908848b8605Smrg unsigned v = 0; 909b8e80941Smrg if (swizzle[i] == PIPE_SWIZZLE_1) { 910848b8605Smrg switch (output_desc->channel[0].type) { 911848b8605Smrg case UTIL_FORMAT_TYPE_UNSIGNED: 912848b8605Smrg v = output_desc->channel[0].normalized ? 0xff : 1; 913848b8605Smrg break; 914848b8605Smrg case UTIL_FORMAT_TYPE_SIGNED: 915848b8605Smrg v = output_desc->channel[0].normalized ? 0x7f : 1; 916848b8605Smrg break; 917848b8605Smrg default: 918848b8605Smrg return FALSE; 919848b8605Smrg } 920848b8605Smrg } 921848b8605Smrg x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); 922848b8605Smrg } 923848b8605Smrg else { 924848b8605Smrg x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); 925848b8605Smrg x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); 926848b8605Smrg } 927848b8605Smrg break; 928848b8605Smrg case 16: 929b8e80941Smrg if (swizzle[i] >= PIPE_SWIZZLE_0) { 930848b8605Smrg unsigned v = 0; 931b8e80941Smrg if (swizzle[i] == PIPE_SWIZZLE_1) { 932848b8605Smrg switch (output_desc->channel[1].type) { 933848b8605Smrg case UTIL_FORMAT_TYPE_UNSIGNED: 934848b8605Smrg v = output_desc->channel[1].normalized ? 0xffff : 1; 935848b8605Smrg break; 936848b8605Smrg case UTIL_FORMAT_TYPE_SIGNED: 937848b8605Smrg v = output_desc->channel[1].normalized ? 
0x7fff : 1; 938848b8605Smrg break; 939848b8605Smrg case UTIL_FORMAT_TYPE_FLOAT: 940848b8605Smrg v = 0x3c00; 941848b8605Smrg break; 942848b8605Smrg default: 943848b8605Smrg return FALSE; 944848b8605Smrg } 945848b8605Smrg } 946848b8605Smrg x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); 947848b8605Smrg } 948b8e80941Smrg else if (swizzle[i] == PIPE_SWIZZLE_0) { 949848b8605Smrg x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); 950848b8605Smrg } 951848b8605Smrg else { 952848b8605Smrg x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); 953848b8605Smrg x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); 954848b8605Smrg } 955848b8605Smrg break; 956848b8605Smrg case 32: 957b8e80941Smrg if (swizzle[i] >= PIPE_SWIZZLE_0) { 958848b8605Smrg unsigned v = 0; 959b8e80941Smrg if (swizzle[i] == PIPE_SWIZZLE_1) { 960848b8605Smrg switch (output_desc->channel[1].type) { 961848b8605Smrg case UTIL_FORMAT_TYPE_UNSIGNED: 962848b8605Smrg v = output_desc->channel[1].normalized ? 0xffffffff : 1; 963848b8605Smrg break; 964848b8605Smrg case UTIL_FORMAT_TYPE_SIGNED: 965848b8605Smrg v = output_desc->channel[1].normalized ? 
0x7fffffff : 1; 966848b8605Smrg break; 967848b8605Smrg case UTIL_FORMAT_TYPE_FLOAT: 968848b8605Smrg v = 0x3f800000; 969848b8605Smrg break; 970848b8605Smrg default: 971848b8605Smrg return FALSE; 972848b8605Smrg } 973848b8605Smrg } 974848b8605Smrg x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); 975848b8605Smrg } 976848b8605Smrg else { 977848b8605Smrg x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); 978848b8605Smrg x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); 979848b8605Smrg } 980848b8605Smrg break; 981848b8605Smrg case 64: 982b8e80941Smrg if (swizzle[i] >= PIPE_SWIZZLE_0) { 983848b8605Smrg unsigned l = 0; 984848b8605Smrg unsigned h = 0; 985b8e80941Smrg if (swizzle[i] == PIPE_SWIZZLE_1) { 986848b8605Smrg switch (output_desc->channel[1].type) { 987848b8605Smrg case UTIL_FORMAT_TYPE_UNSIGNED: 988848b8605Smrg h = output_desc->channel[1].normalized ? 0xffffffff : 0; 989848b8605Smrg l = output_desc->channel[1].normalized ? 0xffffffff : 1; 990848b8605Smrg break; 991848b8605Smrg case UTIL_FORMAT_TYPE_SIGNED: 992848b8605Smrg h = output_desc->channel[1].normalized ? 0x7fffffff : 0; 993848b8605Smrg l = output_desc->channel[1].normalized ? 
0xffffffff : 1; 994848b8605Smrg break; 995848b8605Smrg case UTIL_FORMAT_TYPE_FLOAT: 996848b8605Smrg h = 0x3ff00000; 997848b8605Smrg l = 0; 998848b8605Smrg break; 999848b8605Smrg default: 1000848b8605Smrg return FALSE; 1001848b8605Smrg } 1002848b8605Smrg } 1003848b8605Smrg x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); 1004848b8605Smrg x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); 1005848b8605Smrg } 1006848b8605Smrg else { 1007848b8605Smrg if (x86_target_caps(p->func) & X86_SSE) { 1008848b8605Smrg struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); 1009848b8605Smrg emit_load64(p, tmp, tmpXMM, 1010848b8605Smrg x86_make_disp(src, swizzle[i] * 8)); 1011848b8605Smrg emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); 1012848b8605Smrg } 1013848b8605Smrg else { 1014848b8605Smrg x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); 1015848b8605Smrg x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); 1016848b8605Smrg x86_mov(p->func, tmp, 1017848b8605Smrg x86_make_disp(src, swizzle[i] * 8 + 4)); 1018848b8605Smrg x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); 1019848b8605Smrg } 1020848b8605Smrg } 1021848b8605Smrg break; 1022848b8605Smrg default: 1023848b8605Smrg return FALSE; 1024848b8605Smrg } 1025848b8605Smrg } 1026848b8605Smrg return TRUE; 1027848b8605Smrg } 1028848b8605Smrg /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ 1029848b8605Smrg else if ((x86_target_caps(p->func) & X86_SSE2) && 1030848b8605Smrg a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && 1031848b8605Smrg (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM 1032848b8605Smrg || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) { 1033848b8605Smrg struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 1034848b8605Smrg 1035848b8605Smrg /* load */ 1036848b8605Smrg sse_movups(p->func, dataXMM, src); 1037848b8605Smrg 1038848b8605Smrg if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) { 1039848b8605Smrg sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3)); 
1040848b8605Smrg } 1041848b8605Smrg 1042848b8605Smrg /* scale by 255.0 */ 1043848b8605Smrg sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); 1044848b8605Smrg 1045848b8605Smrg /* pack and emit */ 1046848b8605Smrg sse2_cvtps2dq(p->func, dataXMM, dataXMM); 1047848b8605Smrg sse2_packssdw(p->func, dataXMM, dataXMM); 1048848b8605Smrg sse2_packuswb(p->func, dataXMM, dataXMM); 1049848b8605Smrg sse2_movd(p->func, dst, dataXMM); 1050848b8605Smrg 1051848b8605Smrg return TRUE; 1052848b8605Smrg } 1053848b8605Smrg 1054848b8605Smrg return FALSE; 1055848b8605Smrg} 1056848b8605Smrg 1057848b8605Smrg 1058848b8605Smrgstatic boolean 1059848b8605Smrgtranslate_attr(struct translate_sse *p, 1060848b8605Smrg const struct translate_element *a, 1061848b8605Smrg struct x86_reg src, struct x86_reg dst) 1062848b8605Smrg{ 1063848b8605Smrg if (a->input_format == a->output_format) { 1064848b8605Smrg emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); 1065848b8605Smrg return TRUE; 1066848b8605Smrg } 1067848b8605Smrg 1068848b8605Smrg return translate_attr_convert(p, a, src, dst); 1069848b8605Smrg} 1070848b8605Smrg 1071848b8605Smrg 1072848b8605Smrgstatic boolean 1073848b8605Smrginit_inputs(struct translate_sse *p, unsigned index_size) 1074848b8605Smrg{ 1075848b8605Smrg unsigned i; 1076848b8605Smrg struct x86_reg instance_id = 1077848b8605Smrg x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); 1078848b8605Smrg struct x86_reg start_instance = 1079848b8605Smrg x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)); 1080848b8605Smrg 1081848b8605Smrg for (i = 0; i < p->nr_buffer_variants; i++) { 1082848b8605Smrg struct translate_buffer_variant *variant = &p->buffer_variant[i]; 1083848b8605Smrg struct translate_buffer *buffer = &p->buffer[variant->buffer_index]; 1084848b8605Smrg 1085848b8605Smrg if (!index_size || variant->instance_divisor) { 1086848b8605Smrg struct x86_reg buf_max_index = 1087848b8605Smrg x86_make_disp(p->machine_EDI, get_offset(p, 
&buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          * base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            struct x86_reg tmp_EDX = p->tmp2_EDX;

            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                * instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */
            }

            /* instance = (instance_id / divisor) + start_instance
             */
            x86_mov(p->func, tmp_EDX, start_instance);
            x86_add(p->func, tmp_EAX, tmp_EDX);

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         /* ptr = base_ptr + index * stride (64-bit math on x86-64 via the
          * REX.W prefix helper).
          */
         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         /* NOTE(review): the flags set by this compare appear to go
          * unconsumed -- no conditional instruction reads them before the
          * loop's dec/jcc clobbers them; verify against upstream.
          */
         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}


/* Emit code that yields a register/memory operand addressing the current
 * vertex's data for buffer variant var_idx, given the current index/element
 * operand elt.
 */
static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      /* Pseudo-buffer: the attribute is the instance id itself. */
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      /* Linear single-buffer case: idx_ESI already holds the data pointer
       * (see init_inputs).
       */
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      /* Pointer was precomputed per vertex/instance by init_inputs. */
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      /* Indexed case: compute base_ptr + clamp(elt, max_index) * stride. */
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}


/* Emit code that advances the input index / per-variant buffer pointers to
 * the next vertex after one vertex has been emitted.
 */
static boolean
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if (!index_size && p->nr_buffer_variants == 1) {
      /* Linear single-buffer case: idx_ESI holds the data pointer, so just
       * step it by the stride and prefetch 192 bytes ahead.
       */
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p, &p->buffer[variant->buffer_index].stride));

         /* Per-instance pointers are advanced elsewhere; only step the
          * per-vertex ones.
          */
         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      /* Indexed case: idx_ESI walks the element array, not the data. */
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static boolean
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   /* Invalidate the constant-register cache for this new function. */
   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   /* Fixed register assignment used throughout the generated code. */
   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address; use it to save callee-saved XMM6/XMM7.
       */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   /* arg 6 is the output buffer pointer (pointer-sized). */
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID (arg 5) and start instance (arg 4) into the machine
    * struct so generated code can address them via machine_EDI.
    */
   if (p->use_instancing) {
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address (cached across consecutive
          * elements that share a buffer variant):
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* Restore callee-saved XMM6/XMM7 from the shadow space. */
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}


/* translate::set_buffer callback: record base pointer, stride and clamp
 * value for vertex buffer buf.
 */
static void
translate_sse_set_buffer(struct translate *translate,
                         unsigned buf,
                         const void *ptr, unsigned stride, unsigned max_index)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *) ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0)
      debug_printf("%s %d/%d: %p %d\n",
                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
}


/* translate::release callback: free the generated code and the machine. */
static void
translate_sse_release(struct translate *translate)
{
   struct
translate_sse *p = (struct translate_sse *) translate;

   x86_release_func(&p->elt8_func);
   x86_release_func(&p->elt16_func);
   x86_release_func(&p->elt_func);
   x86_release_func(&p->linear_func);

   os_free_aligned(p);
}


/* Create an SSE-backed translate object for the given key, generating the
 * four run variants (linear, 32/16/8-bit indexed) up front. Returns NULL on
 * failure (no rtasm support, allocation failure, or unsupported conversion).
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   /* 16-byte alignment so the embedded XMM constants can be loaded aligned. */
   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant: one variant per
          * unique (buffer_index, instance_divisor) pair.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   /* Generate all four entry points; any unsupported conversion fails the
    * whole create so the caller can fall back to the generic path.
    */
   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}


#else

/* Stub for platforms without x86 rtasm support. */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}

#endif