/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@vmware.com>
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/format/u_format.h"

#include "translate.h"


#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X 0
#define Y 1
#define Z 2
#define W 3


struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_variant
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID 1001

#define NUM_CONSTS 7

enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};

#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};

#undef C

struct translate_sse
{
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};


static int
get_offset(const void *a, const void *b)
{
   return (const char *) b - (const char *) a;
}


static struct x86_reg
get_const(struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   if (p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   for (i = 2; i < 8; ++i) {
      if (p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if (i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   if (p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->consts[id][0])));

   return reg;
}


/* load the data into an SSE2 register, padding with zeros */
static boolean
emit_load_sse2(struct translate_sse *p,
               struct x86_reg data, struct x86_reg src, unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch (size) {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}


/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5


/* this function will load #chans float values, and will
 * pad the register with zeroes at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth
 * value will be padded with 1. Only pass this value if
 * chans < 4 or results are undefined.
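 *
 * For example, chans == 2 with out_chans == CHANNELS_0001 leaves
 * the XMM register holding (a, b, 0, 1), the usual default fill
 * for a two-component float vertex attribute.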
 */
static void
emit_load_float32(struct translate_sse *p, struct x86_reg data,
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   switch (chans) {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}

/* this function behaves like emit_load_float32, but loads
 * 64-bit floating point numbers, converting them to 32-bit
 * ones */
static void
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch (chans) {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if (out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if (out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}


static void
emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
           struct x86_reg dst_xmm, struct x86_reg src_gpr,
           struct x86_reg src_xmm)
{
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE?
       */
      if (x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}


static void
emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
            struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}


static void
emit_store64(struct translate_sse *p, struct x86_reg dst,
             struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}


static void
emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if (x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}


/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void
emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
            unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if (size < 8) {
      switch (size) {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if (!(x86_target_caps(p->func) & X86_SSE)) {
      unsigned i = 0;
      assert((size & 3) == 0);
      for (i = 0; i < size; i += 4) {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else {
      switch (size) {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}

static boolean
translate_attr_convert(struct translate_sse *p,
                       const struct translate_element *a,
                       struct x86_reg src, struct x86_reg dst)
{
   const struct util_format_description *input_desc =
      util_format_description(a->input_format);
   const struct
      util_format_description *output_desc =
      util_format_description(a->output_format);
   unsigned i;
   boolean id_swizzle = TRUE;
   unsigned swizzle[4] =
      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
   unsigned needed_chans = 0;
   unsigned imms[2] = { 0, 0x3f800000 };

   if (a->output_format == PIPE_FORMAT_NONE
       || a->input_format == PIPE_FORMAT_NONE)
      return FALSE;

   if (input_desc->channel[0].size & 7)
      return FALSE;

   if (input_desc->colorspace != output_desc->colorspace)
      return FALSE;

   for (i = 1; i < input_desc->nr_channels; ++i) {
      if (memcmp
          (&input_desc->channel[i], &input_desc->channel[0],
           sizeof(input_desc->channel[0])))
         return FALSE;
   }

   for (i = 1; i < output_desc->nr_channels; ++i) {
      if (memcmp
          (&output_desc->channel[i], &output_desc->channel[0],
           sizeof(output_desc->channel[0]))) {
         return FALSE;
      }
   }

   for (i = 0; i < output_desc->nr_channels; ++i) {
      if (output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

   if ((x86_target_caps(p->func) & X86_SSE) &&
       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
             && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if (needed_chans > 0) {
         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch (input_desc->channel[0].size) {
            case 8:
               /* TODO: this may be inefficient due to get_identity() being
                * used both as a float and integer register.
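                *
                * Note: CONST_IDENTITY works as the zero source here
                * because its low 64 bits are all zero, so each
                * punpcklbw interleaves zero bytes; two passes
                * zero-extend every u8 up to 32 bits.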
                */
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32:           /* we lose precision here */
               sse2_psrld_imm(p->func, dataXMM, 1);
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_255);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            else if (input_desc->channel[0].size == 32)
               /* compensate for the bit we threw away to fit u32 into s32 */
               sse_addps(p->func, dataXMM, dataXMM);
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch (input_desc->channel[0].size) {
            case 8:
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32:           /* we lose precision here */
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_127);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_32767);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_FLOAT:
            if (input_desc->channel[0].size != 32
                && input_desc->channel[0].size != 64) {
               return FALSE;
            }
            if (swizzle[3] == PIPE_SWIZZLE_1
                && input_desc->nr_channels <= 3) {
               swizzle[3] = PIPE_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch (input_desc->channel[0].size) {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans,
                                 input_desc->nr_channels);
               break;
            case 64:           /* we lose precision here */
               if (!(x86_target_caps(p->func) & X86_SSE2))
                  return FALSE;
               emit_load_float64to32(p, dataXMM, src, needed_chans,
                                     input_desc->nr_channels);
               break;
            default:
               return FALSE;
            }
            break;
         default:
            return FALSE;
         }

         if (!id_swizzle) {
            sse_shufps(p->func, dataXMM, dataXMM,
                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
         }
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse_movups(p->func, dst, dataXMM);
      }
      else {
         if (output_desc->nr_channels >= 2
             && swizzle[0] < PIPE_SWIZZLE_0
             && swizzle[1] < PIPE_SWIZZLE_0) {
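            /* both low channels come straight from dataXMM's low
             * 64 bits, so a single movlps store writes x and y together
             */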
            sse_movlps(p->func, dst, dataXMM);
         }
         else {
            if (swizzle[0] < PIPE_SWIZZLE_0) {
               sse_movss(p->func, dst, dataXMM);
            }
            else {
               x86_mov_imm(p->func, dst,
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }

            if (output_desc->nr_channels >= 2) {
               if (swizzle[1] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (output_desc->nr_channels >= 4
                && swizzle[2] < PIPE_SWIZZLE_0
                && swizzle[3] < PIPE_SWIZZLE_0) {
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            }
            else {
               if (swizzle[2] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }

               if (output_desc->nr_channels >= 4) {
                  if (swizzle[3] < PIPE_SWIZZLE_0) {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else {
                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if ((x86_target_caps(p->func) & X86_SSE2)
            && input_desc->channel[0].size == 8
            && output_desc->channel[0].size == 16
            && output_desc->channel[0].normalized ==
            input_desc->channel[0].normalized &&
            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = { 0, 1 };

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
             && i >= input_desc->nr_channels) {
            swizzle[i] = i;
         }
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if (needed_chans > 0) {
         emit_load_sse2(p, dataXMM, src,
                        input_desc->channel[0].size *
                        input_desc->nr_channels >> 3);

         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if (output_desc->channel[0].normalized)
            imms[1] =
               (output_desc->channel[0].type ==
                UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if (!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM,
                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse2_movq(p->func, dst, dataXMM);
      }
      else {
         if (swizzle[0] < PIPE_SWIZZLE_0) {
            if (output_desc->nr_channels >= 2
                && swizzle[1] < PIPE_SWIZZLE_0) {
               sse2_movd(p->func, dst, dataXMM);
            }
            else {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if (output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
            }
         }
         else {
            if (output_desc->nr_channels >= 2
                && swizzle[1] >= PIPE_SWIZZLE_0) {
               x86_mov_imm(p->func, dst,
                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }
            else {
               x86_mov16_imm(p->func, dst,
                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
               if (output_desc->nr_channels >= 2) {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (swizzle[2] < PIPE_SWIZZLE_0) {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] < PIPE_SWIZZLE_0) {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if (output_desc->nr_channels >= 4) {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
            else {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] >= PIPE_SWIZZLE_0) {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }
               else {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
                                imms[swizzle[2] - PIPE_SWIZZLE_0]);

                  if (output_desc->nr_channels >= 4) {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
                    sizeof(output_desc->channel[0]))) {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;

      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
          && output_desc->nr_channels == 4
          && swizzle[0] == PIPE_SWIZZLE_W
          && swizzle[1] == PIPE_SWIZZLE_Z
          && swizzle[2] == PIPE_SWIZZLE_Y
          && swizzle[3] == PIPE_SWIZZLE_X) {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return TRUE;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         switch (output_desc->channel[0].size) {
         case 8:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[0].type) {
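                  /* choose the bit pattern that encodes "one" for the
                   * output channel type: all-ones / 0x7f when
                   * normalized, the integer 1 otherwise
                   */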
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if (swizzle[i] == PIPE_SWIZZLE_0) {
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            }
            else {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned l = 0;
               unsigned h = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ?
                        0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else {
               if (x86_target_caps(p->func) & X86_SSE) {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM,
                              x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp,
                          x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return FALSE;
         }
      }
      return TRUE;
   }
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   else if ((x86_target_caps(p->func) & X86_SSE2) &&
            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
             || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      /* load */
      sse_movups(p->func, dataXMM, src);

      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
      }

      /* scale by 255.0 */
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));

      /* pack and emit */
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
      sse2_packssdw(p->func, dataXMM, dataXMM);
      sse2_packuswb(p->func, dataXMM, dataXMM);
      sse2_movd(p->func, dst, dataXMM);

      return TRUE;
   }

   return FALSE;
}


static boolean
translate_attr(struct translate_sse *p,
               const struct translate_element *a,
               struct x86_reg src, struct x86_reg dst)
{
   if (a->input_format == a->output_format) {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return TRUE;
   }

   return translate_attr_convert(p, a, src, dst);
}


static boolean
init_inputs(struct translate_sse *p, unsigned index_size)
{
   unsigned i;
   struct x86_reg instance_id =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   struct x86_reg start_instance =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *    base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            struct x86_reg tmp_EDX = p->tmp2_EDX;

            /* Start with instance = instance_id
             * which is true if divisor is 1.
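             * For larger divisors the x86_div below computes
             * instance_id / divisor, so e.g. instance_id 7 with
             * divisor 4 yields index 1: instances 4..7 all fetch
             * element 1 of this buffer.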
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                * instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
            }

            /* instance = (instance_id / divisor) + start_instance
             */
            x86_mov(p->func, tmp_EDX, start_instance);
            x86_add(p->func, tmp_EAX, tmp_EDX);

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}


static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}


static boolean
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if
      (!index_size && p->nr_buffer_variants == 1) {
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EBX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static boolean
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address
       */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   x86_mov(p->func, p->count_EBP,
           x86_fn_arg(p->func, 3));

   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
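    * (mmx_emms resets the x87 tag word; rtasm sets need_emms whenever
    * MMX instructions were emitted, and subsequent x87 code would
    * misbehave without the reset.)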
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}


static void
translate_sse_set_buffer(struct translate *translate,
                         unsigned buf,
                         const void *ptr, unsigned stride, unsigned max_index)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *) ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0)
      debug_printf("%s %d/%d: %p %d\n",
                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
}


static void
translate_sse_release(struct translate *translate)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   x86_release_func(&p->elt8_func);
   x86_release_func(&p->elt16_func);
   x86_release_func(&p->elt_func);
   x86_release_func(&p->linear_func);

   os_free_aligned(p);
}


struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed; it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
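          *
          * A variant is a (buffer_index, instance_divisor) pair;
          * elements that share both reuse the same variant, so the
          * source-pointer setup in the generated loop runs once per
          * variant rather than once per element.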
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}


#else

struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}

#endif