14a49301eSmrg/*
2af69d88dSmrg * Copyright 2003 VMware, Inc.
34a49301eSmrg * All Rights Reserved.
44a49301eSmrg *
54a49301eSmrg * Permission is hereby granted, free of charge, to any person obtaining a
64a49301eSmrg * copy of this software and associated documentation files (the "Software"),
74a49301eSmrg * to deal in the Software without restriction, including without limitation
84a49301eSmrg * on the rights to use, copy, modify, merge, publish, distribute, sub
94a49301eSmrg * license, and/or sell copies of the Software, and to permit persons to whom
104a49301eSmrg * the Software is furnished to do so, subject to the following conditions:
114a49301eSmrg *
124a49301eSmrg * The above copyright notice and this permission notice (including the next
134a49301eSmrg * paragraph) shall be included in all copies or substantial portions of the
144a49301eSmrg * Software.
154a49301eSmrg *
164a49301eSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
174a49301eSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
184a49301eSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19af69d88dSmrg * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
204a49301eSmrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
214a49301eSmrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
224a49301eSmrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
234a49301eSmrg *
244a49301eSmrg * Authors:
25af69d88dSmrg *    Keith Whitwell <keithw@vmware.com>
264a49301eSmrg */
274a49301eSmrg
284a49301eSmrg
294a49301eSmrg#include "pipe/p_config.h"
304a49301eSmrg#include "pipe/p_compiler.h"
314a49301eSmrg#include "util/u_memory.h"
324a49301eSmrg#include "util/u_math.h"
337ec681f3Smrg#include "util/format/u_format.h"
344a49301eSmrg
354a49301eSmrg#include "translate.h"
364a49301eSmrg
374a49301eSmrg
387ec681f3Smrg#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE)
394a49301eSmrg
404a49301eSmrg#include "rtasm/rtasm_cpu.h"
414a49301eSmrg#include "rtasm/rtasm_x86sse.h"
424a49301eSmrg
434a49301eSmrg
/* Vector component indices; used in this file as the lane selectors passed
 * to SHUF(), e.g. SHUF(X, Y, Z, W).
 */
#define X    0
#define Y    1
#define Z    2
#define W    3
484a49301eSmrg
494a49301eSmrg
/* Description of one source buffer.
 *
 * NOTE(review): the code that fills and reads these fields is not in the
 * part of the file reviewed here.  Judging by the names, base_ptr is the
 * start of the buffer data, stride the distance between consecutive
 * vertices and max_index the highest index that may be fetched -- confirm
 * against the setter and the generated fetch code.
 */
struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};
564a49301eSmrg
/* A (buffer, instance divisor) combination.  struct translate_sse notes
 * that several variants can refer to the same buffer and that several
 * elements can share one variant.
 *
 * NOTE(review): buffer_index presumably indexes translate_sse::buffer[] --
 * confirm where the variants are built.
 */
struct translate_buffer_variant
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};
63cdc920a0Smrg
64cdc920a0Smrg
/* NOTE(review): presumably a sentinel stored in place of a buffer variant
 * index for elements that take their value from the instance id rather than
 * from a buffer -- its use is outside the part of the file reviewed here.
 */
#define ELEMENT_BUFFER_INSTANCE_ID  1001

/* Number of constant vectors: sizes the consts[] tables and the
 * const_to_reg[] map, and must equal the number of CONST_* enumerators.
 */
#define NUM_CONSTS 7
683464ebd5Sriastradh
/* Identifiers of the constant vectors.  get_const() uses them as the row
 * index into consts[][], so the order here must match the order of the rows
 * in the consts[] initializer and the count must match NUM_CONSTS.
 */
enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};
793464ebd5Sriastradh
/* C(v) expands to a 4-float initializer with v replicated in every lane. */
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
/* Values of the constant vectors, one row per CONST_* id and in the same
 * order as that enum.  Row 0 (CONST_IDENTITY) is (0, 0, 0, 1); every other
 * row is a single scalar broadcast to all four lanes.
 *
 * NOTE(review): the generated code reads translate_sse::consts, not this
 * table; presumably this table is copied there when the object is created
 * -- confirm in the constructor.
 */
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};

#undef C
924a49301eSmrg
/* State of the SSE code-generating translate implementation: the embedded
 * generic translate object, the x86 functions being generated, the constant
 * pool with its register cache, the buffer bookkeeping and the fixed
 * register assignment used by the emit helpers.
 */
struct translate_sse
{
   struct translate translate;

   /* NOTE(review): presumably one generated routine per fetch mode (linear,
    * and 32/16/8-bit element indices) -- confirm where they are built.
    */
   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   /* The function the emit_* helpers append instructions to. */
   struct x86_function *func;

   /* Constant vectors, 16-byte aligned; get_const() loads a row with movaps
    * at its byte offset within this struct, relative to machine_EDI.
    */
     PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   /* Register cache maintained by get_const(): reg_to_const[r] is the
    * CONST_* id held in XMM register r and const_to_reg[c] the XMM register
    * holding constant c; a negative entry means "none".
    */
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};
1324a49301eSmrg
133af69d88dSmrg
/**
 * Byte distance from \p a to \p b (positive when \p b lies after \p a).
 *
 * Used to turn the address of a struct member into a displacement relative
 * to the start of the struct.  The pointer difference is narrowed to int.
 */
static int
get_offset(const void *a, const void *b)
{
   const char *from = a;
   const char *to = b;

   return (int) (to - from);
}
1394a49301eSmrg
140af69d88dSmrg
141af69d88dSmrgstatic struct x86_reg
142af69d88dSmrgget_const(struct translate_sse *p, unsigned id)
1433464ebd5Sriastradh{
1443464ebd5Sriastradh   struct x86_reg reg;
1453464ebd5Sriastradh   unsigned i;
1464a49301eSmrg
147af69d88dSmrg   if (p->const_to_reg[id] >= 0)
1483464ebd5Sriastradh      return x86_make_reg(file_XMM, p->const_to_reg[id]);
1494a49301eSmrg
150af69d88dSmrg   for (i = 2; i < 8; ++i) {
151af69d88dSmrg      if (p->reg_to_const[i] < 0)
1523464ebd5Sriastradh         break;
1534a49301eSmrg   }
1544a49301eSmrg
1553464ebd5Sriastradh   /* TODO: be smarter here */
156af69d88dSmrg   if (i == 8)
1573464ebd5Sriastradh      --i;
1583464ebd5Sriastradh
1593464ebd5Sriastradh   reg = x86_make_reg(file_XMM, i);
1603464ebd5Sriastradh
161af69d88dSmrg   if (p->reg_to_const[i] >= 0)
1623464ebd5Sriastradh      p->const_to_reg[p->reg_to_const[i]] = -1;
1633464ebd5Sriastradh
1643464ebd5Sriastradh   p->reg_to_const[i] = id;
1653464ebd5Sriastradh   p->const_to_reg[id] = i;
1663464ebd5Sriastradh
1673464ebd5Sriastradh   /* TODO: this should happen outside the loop, if possible */
1683464ebd5Sriastradh   sse_movaps(p->func, reg,
169af69d88dSmrg              x86_make_disp(p->machine_EDI,
170af69d88dSmrg                            get_offset(p, &p->consts[id][0])));
1713464ebd5Sriastradh
1724a49301eSmrg   return reg;
1734a49301eSmrg}
1744a49301eSmrg
175af69d88dSmrg
1763464ebd5Sriastradh/* load the data in a SSE2 register, padding with zeros */
177af69d88dSmrgstatic boolean
178af69d88dSmrgemit_load_sse2(struct translate_sse *p,
179af69d88dSmrg               struct x86_reg data, struct x86_reg src, unsigned size)
1804a49301eSmrg{
1813464ebd5Sriastradh   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
1823464ebd5Sriastradh   struct x86_reg tmp = p->tmp_EAX;
183af69d88dSmrg   switch (size) {
1843464ebd5Sriastradh   case 1:
1853464ebd5Sriastradh      x86_movzx8(p->func, tmp, src);
1863464ebd5Sriastradh      sse2_movd(p->func, data, tmp);
1873464ebd5Sriastradh      break;
1883464ebd5Sriastradh   case 2:
1893464ebd5Sriastradh      x86_movzx16(p->func, tmp, src);
1903464ebd5Sriastradh      sse2_movd(p->func, data, tmp);
1913464ebd5Sriastradh      break;
1923464ebd5Sriastradh   case 3:
1933464ebd5Sriastradh      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
1943464ebd5Sriastradh      x86_shl_imm(p->func, tmp, 16);
1953464ebd5Sriastradh      x86_mov16(p->func, tmp, src);
1963464ebd5Sriastradh      sse2_movd(p->func, data, tmp);
1973464ebd5Sriastradh      break;
1983464ebd5Sriastradh   case 4:
1993464ebd5Sriastradh      sse2_movd(p->func, data, src);
2003464ebd5Sriastradh      break;
2013464ebd5Sriastradh   case 6:
2023464ebd5Sriastradh      sse2_movd(p->func, data, src);
2033464ebd5Sriastradh      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
2043464ebd5Sriastradh      sse2_movd(p->func, tmpXMM, tmp);
2053464ebd5Sriastradh      sse2_punpckldq(p->func, data, tmpXMM);
2063464ebd5Sriastradh      break;
2073464ebd5Sriastradh   case 8:
2083464ebd5Sriastradh      sse2_movq(p->func, data, src);
2093464ebd5Sriastradh      break;
2103464ebd5Sriastradh   case 12:
2113464ebd5Sriastradh      sse2_movq(p->func, data, src);
2123464ebd5Sriastradh      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
2133464ebd5Sriastradh      sse2_punpcklqdq(p->func, data, tmpXMM);
2143464ebd5Sriastradh      break;
2153464ebd5Sriastradh   case 16:
2163464ebd5Sriastradh      sse2_movdqu(p->func, data, src);
2173464ebd5Sriastradh      break;
2183464ebd5Sriastradh   default:
2193464ebd5Sriastradh      return FALSE;
2204a49301eSmrg   }
2213464ebd5Sriastradh   return TRUE;
2224a49301eSmrg}
2234a49301eSmrg
224af69d88dSmrg
/* Special value for the out_chans argument of emit_load_float32() and
 * emit_load_float64to32(): asks for the fourth component to be padded
 * with 1 instead of 0.  Being larger than any real channel count, it also
 * satisfies every "out_chans > N" test in those functions.
 */
#define CHANNELS_0001 5
2273464ebd5Sriastradh
228af69d88dSmrg
2293464ebd5Sriastradh/* this function will load #chans float values, and will
2303464ebd5Sriastradh * pad the register with zeroes at least up to out_chans.
2313464ebd5Sriastradh *
2323464ebd5Sriastradh * If out_chans is set to CHANNELS_0001, then the fourth
2333464ebd5Sriastradh * value will be padded with 1. Only pass this value if
2343464ebd5Sriastradh * chans < 4 or results are undefined.
2353464ebd5Sriastradh */
236af69d88dSmrgstatic void
237af69d88dSmrgemit_load_float32(struct translate_sse *p, struct x86_reg data,
238af69d88dSmrg                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
2394a49301eSmrg{
240af69d88dSmrg   switch (chans) {
2413464ebd5Sriastradh   case 1:
2423464ebd5Sriastradh      /* a 0 0 0
2433464ebd5Sriastradh       * a 0 0 1
2443464ebd5Sriastradh       */
2453464ebd5Sriastradh      sse_movss(p->func, data, arg0);
246af69d88dSmrg      if (out_chans == CHANNELS_0001)
247af69d88dSmrg         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
2483464ebd5Sriastradh      break;
2493464ebd5Sriastradh   case 2:
2503464ebd5Sriastradh      /* 0 0 0 1
2513464ebd5Sriastradh       * a b 0 1
2523464ebd5Sriastradh       */
253af69d88dSmrg      if (out_chans == CHANNELS_0001)
254af69d88dSmrg         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
255af69d88dSmrg                    SHUF(X, Y, Z, W));
256af69d88dSmrg      else if (out_chans > 2)
257af69d88dSmrg         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
2583464ebd5Sriastradh      sse_movlps(p->func, data, arg0);
2593464ebd5Sriastradh      break;
2603464ebd5Sriastradh   case 3:
2613464ebd5Sriastradh      /* Have to jump through some hoops:
2623464ebd5Sriastradh       *
2633464ebd5Sriastradh       * c 0 0 0
2643464ebd5Sriastradh       * c 0 0 1 if out_chans == CHANNELS_0001
2653464ebd5Sriastradh       * 0 0 c 0/1
2663464ebd5Sriastradh       * a b c 0/1
2673464ebd5Sriastradh       */
2683464ebd5Sriastradh      sse_movss(p->func, data, x86_make_disp(arg0, 8));
269af69d88dSmrg      if (out_chans == CHANNELS_0001)
270af69d88dSmrg         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
271af69d88dSmrg                    SHUF(X, Y, Z, W));
272af69d88dSmrg      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
2733464ebd5Sriastradh      sse_movlps(p->func, data, arg0);
2743464ebd5Sriastradh      break;
2753464ebd5Sriastradh   case 4:
2763464ebd5Sriastradh      sse_movups(p->func, data, arg0);
2773464ebd5Sriastradh      break;
2784a49301eSmrg   }
2794a49301eSmrg}
2804a49301eSmrg
2813464ebd5Sriastradh/* this function behaves like emit_load_float32, but loads
2823464ebd5Sriastradh   64-bit floating point numbers, converting them to 32-bit
2833464ebd5Sriastradh  ones */
284af69d88dSmrgstatic void
285af69d88dSmrgemit_load_float64to32(struct translate_sse *p, struct x86_reg data,
286af69d88dSmrg                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
2873464ebd5Sriastradh{
2883464ebd5Sriastradh   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
289af69d88dSmrg   switch (chans) {
2903464ebd5Sriastradh   case 1:
2913464ebd5Sriastradh      sse2_movsd(p->func, data, arg0);
292af69d88dSmrg      if (out_chans > 1)
2933464ebd5Sriastradh         sse2_cvtpd2ps(p->func, data, data);
2943464ebd5Sriastradh      else
2953464ebd5Sriastradh         sse2_cvtsd2ss(p->func, data, data);
296af69d88dSmrg      if (out_chans == CHANNELS_0001)
297af69d88dSmrg         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
298af69d88dSmrg                    SHUF(X, Y, Z, W));
2993464ebd5Sriastradh      break;
3003464ebd5Sriastradh   case 2:
3013464ebd5Sriastradh      sse2_movupd(p->func, data, arg0);
3023464ebd5Sriastradh      sse2_cvtpd2ps(p->func, data, data);
303af69d88dSmrg      if (out_chans == CHANNELS_0001)
304af69d88dSmrg         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
305af69d88dSmrg                    SHUF(X, Y, Z, W));
306af69d88dSmrg      else if (out_chans > 2)
307af69d88dSmrg         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
308af69d88dSmrg      break;
3093464ebd5Sriastradh   case 3:
3103464ebd5Sriastradh      sse2_movupd(p->func, data, arg0);
3113464ebd5Sriastradh      sse2_cvtpd2ps(p->func, data, data);
3123464ebd5Sriastradh      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
313af69d88dSmrg      if (out_chans > 3)
3143464ebd5Sriastradh         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
3153464ebd5Sriastradh      else
3163464ebd5Sriastradh         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
3173464ebd5Sriastradh      sse_movlhps(p->func, data, tmpXMM);
318af69d88dSmrg      if (out_chans == CHANNELS_0001)
319af69d88dSmrg         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
3203464ebd5Sriastradh      break;
3213464ebd5Sriastradh   case 4:
3223464ebd5Sriastradh      sse2_movupd(p->func, data, arg0);
3233464ebd5Sriastradh      sse2_cvtpd2ps(p->func, data, data);
3243464ebd5Sriastradh      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
3253464ebd5Sriastradh      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
3263464ebd5Sriastradh      sse_movlhps(p->func, data, tmpXMM);
3273464ebd5Sriastradh      break;
3283464ebd5Sriastradh   }
3293464ebd5Sriastradh}
3304a49301eSmrg
331af69d88dSmrg
332af69d88dSmrgstatic void
333af69d88dSmrgemit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
334af69d88dSmrg           struct x86_reg dst_xmm, struct x86_reg src_gpr,
335af69d88dSmrg           struct x86_reg src_xmm)
3364a49301eSmrg{
337af69d88dSmrg   if (x86_target(p->func) != X86_32)
3383464ebd5Sriastradh      x64_mov64(p->func, dst_gpr, src_gpr);
339af69d88dSmrg   else {
3403464ebd5Sriastradh      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
341af69d88dSmrg      if (x86_target_caps(p->func) & X86_SSE2)
3423464ebd5Sriastradh         sse2_movq(p->func, dst_xmm, src_xmm);
3433464ebd5Sriastradh      else
3443464ebd5Sriastradh         sse_movlps(p->func, dst_xmm, src_xmm);
3453464ebd5Sriastradh   }
3464a49301eSmrg}
3474a49301eSmrg
348af69d88dSmrg
349af69d88dSmrgstatic void
350af69d88dSmrgemit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
351af69d88dSmrg            struct x86_reg dst_xmm, struct x86_reg src)
3524a49301eSmrg{
3533464ebd5Sriastradh   emit_mov64(p, dst_gpr, dst_xmm, src, src);
3544a49301eSmrg}
3554a49301eSmrg
356af69d88dSmrg
357af69d88dSmrgstatic void
358af69d88dSmrgemit_store64(struct translate_sse *p, struct x86_reg dst,
359af69d88dSmrg             struct x86_reg src_gpr, struct x86_reg src_xmm)
3604a49301eSmrg{
3613464ebd5Sriastradh   emit_mov64(p, dst, dst, src_gpr, src_xmm);
3624a49301eSmrg}
3634a49301eSmrg
364af69d88dSmrg
365af69d88dSmrgstatic void
366af69d88dSmrgemit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
3673464ebd5Sriastradh{
368af69d88dSmrg   if (x86_target_caps(p->func) & X86_SSE2)
3693464ebd5Sriastradh      sse2_movdqu(p->func, dst, src);
3703464ebd5Sriastradh   else
3713464ebd5Sriastradh      sse_movups(p->func, dst, src);
3723464ebd5Sriastradh}
3734a49301eSmrg
374af69d88dSmrg
3753464ebd5Sriastradh/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
3763464ebd5Sriastradh * but may or may not be good on older processors
3773464ebd5Sriastradh * TODO: may perhaps want to use non-temporal stores here if possible
3783464ebd5Sriastradh */
379af69d88dSmrgstatic void
380af69d88dSmrgemit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
381af69d88dSmrg            unsigned size)
3824a49301eSmrg{
3833464ebd5Sriastradh   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
3843464ebd5Sriastradh   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
3853464ebd5Sriastradh   struct x86_reg dataGPR = p->tmp_EAX;
3863464ebd5Sriastradh   struct x86_reg dataGPR2 = p->tmp2_EDX;
3873464ebd5Sriastradh
388af69d88dSmrg   if (size < 8) {
389af69d88dSmrg      switch (size) {
3903464ebd5Sriastradh      case 1:
3913464ebd5Sriastradh         x86_mov8(p->func, dataGPR, src);
3923464ebd5Sriastradh         x86_mov8(p->func, dst, dataGPR);
3933464ebd5Sriastradh         break;
3943464ebd5Sriastradh      case 2:
3953464ebd5Sriastradh         x86_mov16(p->func, dataGPR, src);
3963464ebd5Sriastradh         x86_mov16(p->func, dst, dataGPR);
3973464ebd5Sriastradh         break;
3983464ebd5Sriastradh      case 3:
3993464ebd5Sriastradh         x86_mov16(p->func, dataGPR, src);
4003464ebd5Sriastradh         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
4013464ebd5Sriastradh         x86_mov16(p->func, dst, dataGPR);
4023464ebd5Sriastradh         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
4033464ebd5Sriastradh         break;
4043464ebd5Sriastradh      case 4:
4053464ebd5Sriastradh         x86_mov(p->func, dataGPR, src);
4063464ebd5Sriastradh         x86_mov(p->func, dst, dataGPR);
4073464ebd5Sriastradh         break;
4083464ebd5Sriastradh      case 6:
4093464ebd5Sriastradh         x86_mov(p->func, dataGPR, src);
4103464ebd5Sriastradh         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
4113464ebd5Sriastradh         x86_mov(p->func, dst, dataGPR);
4123464ebd5Sriastradh         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
4133464ebd5Sriastradh         break;
4143464ebd5Sriastradh      }
4153464ebd5Sriastradh   }
416af69d88dSmrg   else if (!(x86_target_caps(p->func) & X86_SSE)) {
4173464ebd5Sriastradh      unsigned i = 0;
4183464ebd5Sriastradh      assert((size & 3) == 0);
419af69d88dSmrg      for (i = 0; i < size; i += 4) {
4203464ebd5Sriastradh         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
4213464ebd5Sriastradh         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
4223464ebd5Sriastradh      }
4233464ebd5Sriastradh   }
424af69d88dSmrg   else {
425af69d88dSmrg      switch (size) {
4263464ebd5Sriastradh      case 8:
4273464ebd5Sriastradh         emit_load64(p, dataGPR, dataXMM, src);
4283464ebd5Sriastradh         emit_store64(p, dst, dataGPR, dataXMM);
4293464ebd5Sriastradh         break;
4303464ebd5Sriastradh      case 12:
4313464ebd5Sriastradh         emit_load64(p, dataGPR2, dataXMM, src);
4323464ebd5Sriastradh         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
4333464ebd5Sriastradh         emit_store64(p, dst, dataGPR2, dataXMM);
4343464ebd5Sriastradh         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
4353464ebd5Sriastradh         break;
4363464ebd5Sriastradh      case 16:
4373464ebd5Sriastradh         emit_mov128(p, dataXMM, src);
4383464ebd5Sriastradh         emit_mov128(p, dst, dataXMM);
4393464ebd5Sriastradh         break;
4403464ebd5Sriastradh      case 24:
4413464ebd5Sriastradh         emit_mov128(p, dataXMM, src);
4423464ebd5Sriastradh         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
4433464ebd5Sriastradh         emit_mov128(p, dst, dataXMM);
4443464ebd5Sriastradh         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
4453464ebd5Sriastradh         break;
4463464ebd5Sriastradh      case 32:
4473464ebd5Sriastradh         emit_mov128(p, dataXMM, src);
4483464ebd5Sriastradh         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
4493464ebd5Sriastradh         emit_mov128(p, dst, dataXMM);
4503464ebd5Sriastradh         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
4513464ebd5Sriastradh         break;
4523464ebd5Sriastradh      default:
4533464ebd5Sriastradh         assert(0);
4543464ebd5Sriastradh      }
4553464ebd5Sriastradh   }
4564a49301eSmrg}
4574a49301eSmrg
458af69d88dSmrgstatic boolean
459af69d88dSmrgtranslate_attr_convert(struct translate_sse *p,
460af69d88dSmrg                       const struct translate_element *a,
461af69d88dSmrg                       struct x86_reg src, struct x86_reg dst)
4624a49301eSmrg{
463af69d88dSmrg   const struct util_format_description *input_desc =
464af69d88dSmrg      util_format_description(a->input_format);
465af69d88dSmrg   const struct util_format_description *output_desc =
466af69d88dSmrg      util_format_description(a->output_format);
4673464ebd5Sriastradh   unsigned i;
4683464ebd5Sriastradh   boolean id_swizzle = TRUE;
469af69d88dSmrg   unsigned swizzle[4] =
47001e04c3fSmrg      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
47101e04c3fSmrg        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
4723464ebd5Sriastradh   unsigned needed_chans = 0;
473af69d88dSmrg   unsigned imms[2] = { 0, 0x3f800000 };
4744a49301eSmrg
475af69d88dSmrg   if (a->output_format == PIPE_FORMAT_NONE
476af69d88dSmrg       || a->input_format == PIPE_FORMAT_NONE)
4773464ebd5Sriastradh      return FALSE;
4784a49301eSmrg
479af69d88dSmrg   if (input_desc->channel[0].size & 7)
4803464ebd5Sriastradh      return FALSE;
4814a49301eSmrg
482af69d88dSmrg   if (input_desc->colorspace != output_desc->colorspace)
4833464ebd5Sriastradh      return FALSE;
4844a49301eSmrg
485af69d88dSmrg   for (i = 1; i < input_desc->nr_channels; ++i) {
486af69d88dSmrg      if (memcmp
487af69d88dSmrg          (&input_desc->channel[i], &input_desc->channel[0],
488af69d88dSmrg           sizeof(input_desc->channel[0])))
4893464ebd5Sriastradh         return FALSE;
4903464ebd5Sriastradh   }
4914a49301eSmrg
492af69d88dSmrg   for (i = 1; i < output_desc->nr_channels; ++i) {
493af69d88dSmrg      if (memcmp
494af69d88dSmrg          (&output_desc->channel[i], &output_desc->channel[0],
495af69d88dSmrg           sizeof(output_desc->channel[0]))) {
4963464ebd5Sriastradh         return FALSE;
497af69d88dSmrg      }
4983464ebd5Sriastradh   }
4994a49301eSmrg
500af69d88dSmrg   for (i = 0; i < output_desc->nr_channels; ++i) {
501af69d88dSmrg      if (output_desc->swizzle[i] < 4)
5023464ebd5Sriastradh         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
5033464ebd5Sriastradh   }
5044a49301eSmrg
505af69d88dSmrg   if ((x86_target_caps(p->func) & X86_SSE) &&
506af69d88dSmrg       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
507af69d88dSmrg        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
508af69d88dSmrg        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
509af69d88dSmrg        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
5103464ebd5Sriastradh      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
5114a49301eSmrg
512af69d88dSmrg      for (i = 0; i < output_desc->nr_channels; ++i) {
51301e04c3fSmrg         if (swizzle[i] == PIPE_SWIZZLE_0
514af69d88dSmrg             && i >= input_desc->nr_channels)
5153464ebd5Sriastradh            swizzle[i] = i;
5163464ebd5Sriastradh      }
5174a49301eSmrg
518af69d88dSmrg      for (i = 0; i < output_desc->nr_channels; ++i) {
519af69d88dSmrg         if (swizzle[i] < 4)
5203464ebd5Sriastradh            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
52101e04c3fSmrg         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
5223464ebd5Sriastradh            id_swizzle = FALSE;
5233464ebd5Sriastradh      }
5244a49301eSmrg
525af69d88dSmrg      if (needed_chans > 0) {
526af69d88dSmrg         switch (input_desc->channel[0].type) {
5273464ebd5Sriastradh         case UTIL_FORMAT_TYPE_UNSIGNED:
528af69d88dSmrg            if (!(x86_target_caps(p->func) & X86_SSE2))
5293464ebd5Sriastradh               return FALSE;
530af69d88dSmrg            emit_load_sse2(p, dataXMM, src,
531af69d88dSmrg                           input_desc->channel[0].size *
532af69d88dSmrg                           input_desc->nr_channels >> 3);
5333464ebd5Sriastradh
5343464ebd5Sriastradh            /* TODO: add support for SSE4.1 pmovzx */
535af69d88dSmrg            switch (input_desc->channel[0].size) {
5363464ebd5Sriastradh            case 8:
537af69d88dSmrg               /* TODO: this may be inefficient due to get_identity() being
538af69d88dSmrg                *  used both as a float and integer register.
539af69d88dSmrg                */
5403464ebd5Sriastradh               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
5413464ebd5Sriastradh               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
5423464ebd5Sriastradh               break;
5433464ebd5Sriastradh            case 16:
5443464ebd5Sriastradh               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
5453464ebd5Sriastradh               break;
546af69d88dSmrg            case 32:           /* we lose precision here */
5473464ebd5Sriastradh               sse2_psrld_imm(p->func, dataXMM, 1);
5483464ebd5Sriastradh               break;
5493464ebd5Sriastradh            default:
5503464ebd5Sriastradh               return FALSE;
5513464ebd5Sriastradh            }
5523464ebd5Sriastradh            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
553af69d88dSmrg            if (input_desc->channel[0].normalized) {
5543464ebd5Sriastradh               struct x86_reg factor;
555af69d88dSmrg               switch (input_desc->channel[0].size) {
5563464ebd5Sriastradh               case 8:
5573464ebd5Sriastradh                  factor = get_const(p, CONST_INV_255);
5583464ebd5Sriastradh                  break;
5593464ebd5Sriastradh               case 16:
5603464ebd5Sriastradh                  factor = get_const(p, CONST_INV_65535);
5613464ebd5Sriastradh                  break;
5623464ebd5Sriastradh               case 32:
5633464ebd5Sriastradh                  factor = get_const(p, CONST_INV_2147483647);
5643464ebd5Sriastradh                  break;
5653464ebd5Sriastradh               default:
5663464ebd5Sriastradh                  assert(0);
5673464ebd5Sriastradh                  factor.disp = 0;
5683464ebd5Sriastradh                  factor.file = 0;
5693464ebd5Sriastradh                  factor.idx = 0;
5703464ebd5Sriastradh                  factor.mod = 0;
5713464ebd5Sriastradh                  break;
5723464ebd5Sriastradh               }
5733464ebd5Sriastradh               sse_mulps(p->func, dataXMM, factor);
5743464ebd5Sriastradh            }
575af69d88dSmrg            else if (input_desc->channel[0].size == 32)
576af69d88dSmrg               /* compensate for the bit we threw away to fit u32 into s32 */
577af69d88dSmrg               sse_addps(p->func, dataXMM, dataXMM);
5783464ebd5Sriastradh            break;
5793464ebd5Sriastradh         case UTIL_FORMAT_TYPE_SIGNED:
580af69d88dSmrg            if (!(x86_target_caps(p->func) & X86_SSE2))
5813464ebd5Sriastradh               return FALSE;
582af69d88dSmrg            emit_load_sse2(p, dataXMM, src,
583af69d88dSmrg                           input_desc->channel[0].size *
584af69d88dSmrg                           input_desc->nr_channels >> 3);
5853464ebd5Sriastradh
5863464ebd5Sriastradh            /* TODO: add support for SSE4.1 pmovsx */
587af69d88dSmrg            switch (input_desc->channel[0].size) {
5883464ebd5Sriastradh            case 8:
5893464ebd5Sriastradh               sse2_punpcklbw(p->func, dataXMM, dataXMM);
5903464ebd5Sriastradh               sse2_punpcklbw(p->func, dataXMM, dataXMM);
5913464ebd5Sriastradh               sse2_psrad_imm(p->func, dataXMM, 24);
5923464ebd5Sriastradh               break;
5933464ebd5Sriastradh            case 16:
5943464ebd5Sriastradh               sse2_punpcklwd(p->func, dataXMM, dataXMM);
5953464ebd5Sriastradh               sse2_psrad_imm(p->func, dataXMM, 16);
5963464ebd5Sriastradh               break;
597af69d88dSmrg            case 32:           /* we lose precision here */
5983464ebd5Sriastradh               break;
5993464ebd5Sriastradh            default:
6003464ebd5Sriastradh               return FALSE;
6013464ebd5Sriastradh            }
6023464ebd5Sriastradh            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
603af69d88dSmrg            if (input_desc->channel[0].normalized) {
6043464ebd5Sriastradh               struct x86_reg factor;
605af69d88dSmrg               switch (input_desc->channel[0].size) {
6063464ebd5Sriastradh               case 8:
6073464ebd5Sriastradh                  factor = get_const(p, CONST_INV_127);
6083464ebd5Sriastradh                  break;
6093464ebd5Sriastradh               case 16:
6103464ebd5Sriastradh                  factor = get_const(p, CONST_INV_32767);
6113464ebd5Sriastradh                  break;
6123464ebd5Sriastradh               case 32:
6133464ebd5Sriastradh                  factor = get_const(p, CONST_INV_2147483647);
6143464ebd5Sriastradh                  break;
6153464ebd5Sriastradh               default:
6163464ebd5Sriastradh                  assert(0);
6173464ebd5Sriastradh                  factor.disp = 0;
6183464ebd5Sriastradh                  factor.file = 0;
6193464ebd5Sriastradh                  factor.idx = 0;
6203464ebd5Sriastradh                  factor.mod = 0;
6213464ebd5Sriastradh                  break;
6223464ebd5Sriastradh               }
6233464ebd5Sriastradh               sse_mulps(p->func, dataXMM, factor);
6243464ebd5Sriastradh            }
6253464ebd5Sriastradh            break;
6263464ebd5Sriastradh
6273464ebd5Sriastradh            break;
6283464ebd5Sriastradh         case UTIL_FORMAT_TYPE_FLOAT:
629af69d88dSmrg            if (input_desc->channel[0].size != 32
630af69d88dSmrg                && input_desc->channel[0].size != 64) {
6313464ebd5Sriastradh               return FALSE;
632af69d88dSmrg            }
63301e04c3fSmrg            if (swizzle[3] == PIPE_SWIZZLE_1
634af69d88dSmrg                && input_desc->nr_channels <= 3) {
63501e04c3fSmrg               swizzle[3] = PIPE_SWIZZLE_W;
6363464ebd5Sriastradh               needed_chans = CHANNELS_0001;
6373464ebd5Sriastradh            }
638af69d88dSmrg            switch (input_desc->channel[0].size) {
6393464ebd5Sriastradh            case 32:
640af69d88dSmrg               emit_load_float32(p, dataXMM, src, needed_chans,
641af69d88dSmrg                                 input_desc->nr_channels);
6423464ebd5Sriastradh               break;
643af69d88dSmrg            case 64:           /* we lose precision here */
644af69d88dSmrg               if (!(x86_target_caps(p->func) & X86_SSE2))
6453464ebd5Sriastradh                  return FALSE;
646af69d88dSmrg               emit_load_float64to32(p, dataXMM, src, needed_chans,
647af69d88dSmrg                                     input_desc->nr_channels);
6483464ebd5Sriastradh               break;
6493464ebd5Sriastradh            default:
6503464ebd5Sriastradh               return FALSE;
6513464ebd5Sriastradh            }
6523464ebd5Sriastradh            break;
6533464ebd5Sriastradh         default:
6543464ebd5Sriastradh            return FALSE;
6553464ebd5Sriastradh         }
6564a49301eSmrg
657af69d88dSmrg         if (!id_swizzle) {
658af69d88dSmrg            sse_shufps(p->func, dataXMM, dataXMM,
659af69d88dSmrg                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
660af69d88dSmrg         }
6613464ebd5Sriastradh      }
6623464ebd5Sriastradh
663af69d88dSmrg      if (output_desc->nr_channels >= 4
66401e04c3fSmrg          && swizzle[0] < PIPE_SWIZZLE_0
66501e04c3fSmrg          && swizzle[1] < PIPE_SWIZZLE_0
66601e04c3fSmrg          && swizzle[2] < PIPE_SWIZZLE_0
66701e04c3fSmrg          && swizzle[3] < PIPE_SWIZZLE_0) {
6683464ebd5Sriastradh         sse_movups(p->func, dst, dataXMM);
669af69d88dSmrg      }
670af69d88dSmrg      else {
671af69d88dSmrg         if (output_desc->nr_channels >= 2
67201e04c3fSmrg             && swizzle[0] < PIPE_SWIZZLE_0
67301e04c3fSmrg             && swizzle[1] < PIPE_SWIZZLE_0) {
6743464ebd5Sriastradh            sse_movlps(p->func, dst, dataXMM);
675af69d88dSmrg         }
676af69d88dSmrg         else {
67701e04c3fSmrg            if (swizzle[0] < PIPE_SWIZZLE_0) {
6783464ebd5Sriastradh               sse_movss(p->func, dst, dataXMM);
679af69d88dSmrg            }
680af69d88dSmrg            else {
681af69d88dSmrg               x86_mov_imm(p->func, dst,
68201e04c3fSmrg                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
683af69d88dSmrg            }
6843464ebd5Sriastradh
685af69d88dSmrg            if (output_desc->nr_channels >= 2) {
68601e04c3fSmrg               if (swizzle[1] < PIPE_SWIZZLE_0) {
6873464ebd5Sriastradh                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
6883464ebd5Sriastradh                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
6893464ebd5Sriastradh               }
690af69d88dSmrg               else {
691af69d88dSmrg                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
69201e04c3fSmrg                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
693af69d88dSmrg               }
6943464ebd5Sriastradh            }
6953464ebd5Sriastradh         }
6964a49301eSmrg
697af69d88dSmrg         if (output_desc->nr_channels >= 3) {
698af69d88dSmrg            if (output_desc->nr_channels >= 4
69901e04c3fSmrg                && swizzle[2] < PIPE_SWIZZLE_0
70001e04c3fSmrg                && swizzle[3] < PIPE_SWIZZLE_0) {
7013464ebd5Sriastradh               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
702af69d88dSmrg            }
703af69d88dSmrg            else {
70401e04c3fSmrg               if (swizzle[2] < PIPE_SWIZZLE_0) {
7053464ebd5Sriastradh                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
7063464ebd5Sriastradh                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
7073464ebd5Sriastradh               }
708af69d88dSmrg               else {
709af69d88dSmrg                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
71001e04c3fSmrg                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
711af69d88dSmrg               }
7123464ebd5Sriastradh
713af69d88dSmrg               if (output_desc->nr_channels >= 4) {
71401e04c3fSmrg                  if (swizzle[3] < PIPE_SWIZZLE_0) {
7153464ebd5Sriastradh                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
7163464ebd5Sriastradh                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
7173464ebd5Sriastradh                  }
718af69d88dSmrg                  else {
719af69d88dSmrg                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
72001e04c3fSmrg                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
721af69d88dSmrg                  }
7223464ebd5Sriastradh               }
7233464ebd5Sriastradh            }
7243464ebd5Sriastradh         }
7253464ebd5Sriastradh      }
7263464ebd5Sriastradh      return TRUE;
7273464ebd5Sriastradh   }
728af69d88dSmrg   else if ((x86_target_caps(p->func) & X86_SSE2)
729af69d88dSmrg            && input_desc->channel[0].size == 8
730af69d88dSmrg            && output_desc->channel[0].size == 16
731af69d88dSmrg            && output_desc->channel[0].normalized ==
732af69d88dSmrg            input_desc->channel[0].normalized &&
733af69d88dSmrg            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
734af69d88dSmrg                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
735af69d88dSmrg             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
736af69d88dSmrg                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
737af69d88dSmrg             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
738af69d88dSmrg                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
7393464ebd5Sriastradh      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
7403464ebd5Sriastradh      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
7413464ebd5Sriastradh      struct x86_reg tmp = p->tmp_EAX;
742af69d88dSmrg      unsigned imms[2] = { 0, 1 };
7433464ebd5Sriastradh
744af69d88dSmrg      for (i = 0; i < output_desc->nr_channels; ++i) {
74501e04c3fSmrg         if (swizzle[i] == PIPE_SWIZZLE_0
746af69d88dSmrg             && i >= input_desc->nr_channels) {
7473464ebd5Sriastradh            swizzle[i] = i;
748af69d88dSmrg         }
7493464ebd5Sriastradh      }
7504a49301eSmrg
751af69d88dSmrg      for (i = 0; i < output_desc->nr_channels; ++i) {
752af69d88dSmrg         if (swizzle[i] < 4)
7533464ebd5Sriastradh            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
75401e04c3fSmrg         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
7553464ebd5Sriastradh            id_swizzle = FALSE;
7563464ebd5Sriastradh      }
7574a49301eSmrg
758af69d88dSmrg      if (needed_chans > 0) {
759af69d88dSmrg         emit_load_sse2(p, dataXMM, src,
760af69d88dSmrg                        input_desc->channel[0].size *
761af69d88dSmrg                        input_desc->nr_channels >> 3);
7623464ebd5Sriastradh
763af69d88dSmrg         switch (input_desc->channel[0].type) {
7643464ebd5Sriastradh         case UTIL_FORMAT_TYPE_UNSIGNED:
765af69d88dSmrg            if (input_desc->channel[0].normalized) {
7663464ebd5Sriastradh               sse2_punpcklbw(p->func, dataXMM, dataXMM);
767af69d88dSmrg               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
768af69d88dSmrg                  sse2_psrlw_imm(p->func, dataXMM, 1);
7693464ebd5Sriastradh            }
7703464ebd5Sriastradh            else
7713464ebd5Sriastradh               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
7723464ebd5Sriastradh            break;
7733464ebd5Sriastradh         case UTIL_FORMAT_TYPE_SIGNED:
774af69d88dSmrg            if (input_desc->channel[0].normalized) {
7753464ebd5Sriastradh               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
7763464ebd5Sriastradh               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
7773464ebd5Sriastradh               sse2_psllw_imm(p->func, dataXMM, 9);
7783464ebd5Sriastradh               sse2_psrlw_imm(p->func, dataXMM, 8);
7793464ebd5Sriastradh               sse2_por(p->func, tmpXMM, dataXMM);
7803464ebd5Sriastradh               sse2_psrlw_imm(p->func, dataXMM, 7);
7813464ebd5Sriastradh               sse2_por(p->func, tmpXMM, dataXMM);
7823464ebd5Sriastradh               {
7833464ebd5Sriastradh                  struct x86_reg t = dataXMM;
7843464ebd5Sriastradh                  dataXMM = tmpXMM;
7853464ebd5Sriastradh                  tmpXMM = t;
7863464ebd5Sriastradh               }
7873464ebd5Sriastradh            }
788af69d88dSmrg            else {
7893464ebd5Sriastradh               sse2_punpcklbw(p->func, dataXMM, dataXMM);
7903464ebd5Sriastradh               sse2_psraw_imm(p->func, dataXMM, 8);
7913464ebd5Sriastradh            }
7923464ebd5Sriastradh            break;
7933464ebd5Sriastradh         default:
7943464ebd5Sriastradh            assert(0);
7953464ebd5Sriastradh         }
7964a49301eSmrg
797af69d88dSmrg         if (output_desc->channel[0].normalized)
798af69d88dSmrg            imms[1] =
799af69d88dSmrg               (output_desc->channel[0].type ==
800af69d88dSmrg                UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
8014a49301eSmrg
802af69d88dSmrg         if (!id_swizzle)
803af69d88dSmrg            sse2_pshuflw(p->func, dataXMM, dataXMM,
804af69d88dSmrg                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
805af69d88dSmrg                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
8063464ebd5Sriastradh      }
8074a49301eSmrg
808af69d88dSmrg      if (output_desc->nr_channels >= 4
80901e04c3fSmrg          && swizzle[0] < PIPE_SWIZZLE_0
81001e04c3fSmrg          && swizzle[1] < PIPE_SWIZZLE_0
81101e04c3fSmrg          && swizzle[2] < PIPE_SWIZZLE_0
81201e04c3fSmrg          && swizzle[3] < PIPE_SWIZZLE_0) {
8133464ebd5Sriastradh         sse2_movq(p->func, dst, dataXMM);
814af69d88dSmrg      }
815af69d88dSmrg      else {
81601e04c3fSmrg         if (swizzle[0] < PIPE_SWIZZLE_0) {
817af69d88dSmrg            if (output_desc->nr_channels >= 2
81801e04c3fSmrg                && swizzle[1] < PIPE_SWIZZLE_0) {
8193464ebd5Sriastradh               sse2_movd(p->func, dst, dataXMM);
820af69d88dSmrg            }
821af69d88dSmrg            else {
8223464ebd5Sriastradh               sse2_movd(p->func, tmp, dataXMM);
8233464ebd5Sriastradh               x86_mov16(p->func, dst, tmp);
824af69d88dSmrg               if (output_desc->nr_channels >= 2)
825af69d88dSmrg                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
82601e04c3fSmrg                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
8273464ebd5Sriastradh            }
8283464ebd5Sriastradh         }
829af69d88dSmrg         else {
830af69d88dSmrg            if (output_desc->nr_channels >= 2
83101e04c3fSmrg                && swizzle[1] >= PIPE_SWIZZLE_0) {
832af69d88dSmrg               x86_mov_imm(p->func, dst,
83301e04c3fSmrg                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
83401e04c3fSmrg                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
835af69d88dSmrg            }
836af69d88dSmrg            else {
837af69d88dSmrg               x86_mov16_imm(p->func, dst,
83801e04c3fSmrg                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
839af69d88dSmrg               if (output_desc->nr_channels >= 2) {
8403464ebd5Sriastradh                  sse2_movd(p->func, tmp, dataXMM);
8413464ebd5Sriastradh                  x86_shr_imm(p->func, tmp, 16);
8423464ebd5Sriastradh                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
8433464ebd5Sriastradh               }
8443464ebd5Sriastradh            }
8453464ebd5Sriastradh         }
8464a49301eSmrg
847af69d88dSmrg         if (output_desc->nr_channels >= 3) {
84801e04c3fSmrg            if (swizzle[2] < PIPE_SWIZZLE_0) {
849af69d88dSmrg               if (output_desc->nr_channels >= 4
85001e04c3fSmrg                   && swizzle[3] < PIPE_SWIZZLE_0) {
8513464ebd5Sriastradh                  sse2_psrlq_imm(p->func, dataXMM, 32);
8523464ebd5Sriastradh                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
8533464ebd5Sriastradh               }
854af69d88dSmrg               else {
8553464ebd5Sriastradh                  sse2_psrlq_imm(p->func, dataXMM, 32);
8563464ebd5Sriastradh                  sse2_movd(p->func, tmp, dataXMM);
8573464ebd5Sriastradh                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
858af69d88dSmrg                  if (output_desc->nr_channels >= 4) {
859af69d88dSmrg                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
86001e04c3fSmrg                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
8613464ebd5Sriastradh                  }
8623464ebd5Sriastradh               }
8633464ebd5Sriastradh            }
864af69d88dSmrg            else {
865af69d88dSmrg               if (output_desc->nr_channels >= 4
86601e04c3fSmrg                   && swizzle[3] >= PIPE_SWIZZLE_0) {
867af69d88dSmrg                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
86801e04c3fSmrg                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
86901e04c3fSmrg                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
870af69d88dSmrg               }
871af69d88dSmrg               else {
872af69d88dSmrg                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
87301e04c3fSmrg                                imms[swizzle[2] - PIPE_SWIZZLE_0]);
8743464ebd5Sriastradh
875af69d88dSmrg                  if (output_desc->nr_channels >= 4) {
8763464ebd5Sriastradh                     sse2_psrlq_imm(p->func, dataXMM, 48);
8773464ebd5Sriastradh                     sse2_movd(p->func, tmp, dataXMM);
8783464ebd5Sriastradh                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
8793464ebd5Sriastradh                  }
8803464ebd5Sriastradh               }
8813464ebd5Sriastradh            }
8823464ebd5Sriastradh         }
8833464ebd5Sriastradh      }
8843464ebd5Sriastradh      return TRUE;
8853464ebd5Sriastradh   }
886af69d88dSmrg   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
887af69d88dSmrg                    sizeof(output_desc->channel[0]))) {
8883464ebd5Sriastradh      struct x86_reg tmp = p->tmp_EAX;
8893464ebd5Sriastradh      unsigned i;
890af69d88dSmrg
891af69d88dSmrg      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
892af69d88dSmrg          && output_desc->nr_channels == 4
89301e04c3fSmrg          && swizzle[0] == PIPE_SWIZZLE_W
89401e04c3fSmrg          && swizzle[1] == PIPE_SWIZZLE_Z
89501e04c3fSmrg          && swizzle[2] == PIPE_SWIZZLE_Y
89601e04c3fSmrg          && swizzle[3] == PIPE_SWIZZLE_X) {
8973464ebd5Sriastradh         /* TODO: support movbe */
8983464ebd5Sriastradh         x86_mov(p->func, tmp, src);
8993464ebd5Sriastradh         x86_bswap(p->func, tmp);
9003464ebd5Sriastradh         x86_mov(p->func, dst, tmp);
9013464ebd5Sriastradh         return TRUE;
9023464ebd5Sriastradh      }
9034a49301eSmrg
904af69d88dSmrg      for (i = 0; i < output_desc->nr_channels; ++i) {
905af69d88dSmrg         switch (output_desc->channel[0].size) {
9063464ebd5Sriastradh         case 8:
90701e04c3fSmrg            if (swizzle[i] >= PIPE_SWIZZLE_0) {
9083464ebd5Sriastradh               unsigned v = 0;
90901e04c3fSmrg               if (swizzle[i] == PIPE_SWIZZLE_1) {
910af69d88dSmrg                  switch (output_desc->channel[0].type) {
9113464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_UNSIGNED:
9123464ebd5Sriastradh                     v = output_desc->channel[0].normalized ? 0xff : 1;
9133464ebd5Sriastradh                     break;
9143464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_SIGNED:
9153464ebd5Sriastradh                     v = output_desc->channel[0].normalized ? 0x7f : 1;
9163464ebd5Sriastradh                     break;
9173464ebd5Sriastradh                  default:
9183464ebd5Sriastradh                     return FALSE;
9193464ebd5Sriastradh                  }
9203464ebd5Sriastradh               }
9213464ebd5Sriastradh               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
9223464ebd5Sriastradh            }
923af69d88dSmrg            else {
9243464ebd5Sriastradh               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
9253464ebd5Sriastradh               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
9263464ebd5Sriastradh            }
9273464ebd5Sriastradh            break;
9283464ebd5Sriastradh         case 16:
92901e04c3fSmrg            if (swizzle[i] >= PIPE_SWIZZLE_0) {
9303464ebd5Sriastradh               unsigned v = 0;
93101e04c3fSmrg               if (swizzle[i] == PIPE_SWIZZLE_1) {
932af69d88dSmrg                  switch (output_desc->channel[1].type) {
9333464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_UNSIGNED:
9343464ebd5Sriastradh                     v = output_desc->channel[1].normalized ? 0xffff : 1;
9353464ebd5Sriastradh                     break;
9363464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_SIGNED:
9373464ebd5Sriastradh                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
9383464ebd5Sriastradh                     break;
9393464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_FLOAT:
9403464ebd5Sriastradh                     v = 0x3c00;
9413464ebd5Sriastradh                     break;
9423464ebd5Sriastradh                  default:
9433464ebd5Sriastradh                     return FALSE;
9443464ebd5Sriastradh                  }
9453464ebd5Sriastradh               }
9463464ebd5Sriastradh               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
9473464ebd5Sriastradh            }
94801e04c3fSmrg            else if (swizzle[i] == PIPE_SWIZZLE_0) {
9493464ebd5Sriastradh               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
950af69d88dSmrg            }
951af69d88dSmrg            else {
9523464ebd5Sriastradh               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
9533464ebd5Sriastradh               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
9543464ebd5Sriastradh            }
9553464ebd5Sriastradh            break;
9563464ebd5Sriastradh         case 32:
95701e04c3fSmrg            if (swizzle[i] >= PIPE_SWIZZLE_0) {
9583464ebd5Sriastradh               unsigned v = 0;
95901e04c3fSmrg               if (swizzle[i] == PIPE_SWIZZLE_1) {
960af69d88dSmrg                  switch (output_desc->channel[1].type) {
9613464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_UNSIGNED:
9623464ebd5Sriastradh                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
9633464ebd5Sriastradh                     break;
9643464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_SIGNED:
9653464ebd5Sriastradh                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
9663464ebd5Sriastradh                     break;
9673464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_FLOAT:
9683464ebd5Sriastradh                     v = 0x3f800000;
9693464ebd5Sriastradh                     break;
9703464ebd5Sriastradh                  default:
9713464ebd5Sriastradh                     return FALSE;
9723464ebd5Sriastradh                  }
9733464ebd5Sriastradh               }
9743464ebd5Sriastradh               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
9753464ebd5Sriastradh            }
976af69d88dSmrg            else {
9773464ebd5Sriastradh               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
9783464ebd5Sriastradh               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
9793464ebd5Sriastradh            }
9803464ebd5Sriastradh            break;
9813464ebd5Sriastradh         case 64:
98201e04c3fSmrg            if (swizzle[i] >= PIPE_SWIZZLE_0) {
9833464ebd5Sriastradh               unsigned l = 0;
9843464ebd5Sriastradh               unsigned h = 0;
98501e04c3fSmrg               if (swizzle[i] == PIPE_SWIZZLE_1) {
986af69d88dSmrg                  switch (output_desc->channel[1].type) {
9873464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_UNSIGNED:
9883464ebd5Sriastradh                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
9893464ebd5Sriastradh                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
9903464ebd5Sriastradh                     break;
9913464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_SIGNED:
9923464ebd5Sriastradh                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
9933464ebd5Sriastradh                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
9943464ebd5Sriastradh                     break;
9953464ebd5Sriastradh                  case UTIL_FORMAT_TYPE_FLOAT:
9963464ebd5Sriastradh                     h = 0x3ff00000;
9973464ebd5Sriastradh                     l = 0;
9983464ebd5Sriastradh                     break;
9993464ebd5Sriastradh                  default:
10003464ebd5Sriastradh                     return FALSE;
10013464ebd5Sriastradh                  }
10023464ebd5Sriastradh               }
10033464ebd5Sriastradh               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
10043464ebd5Sriastradh               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
10053464ebd5Sriastradh            }
1006af69d88dSmrg            else {
1007af69d88dSmrg               if (x86_target_caps(p->func) & X86_SSE) {
10083464ebd5Sriastradh                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
1009af69d88dSmrg                  emit_load64(p, tmp, tmpXMM,
1010af69d88dSmrg                              x86_make_disp(src, swizzle[i] * 8));
10113464ebd5Sriastradh                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
10123464ebd5Sriastradh               }
1013af69d88dSmrg               else {
10143464ebd5Sriastradh                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
10153464ebd5Sriastradh                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1016af69d88dSmrg                  x86_mov(p->func, tmp,
1017af69d88dSmrg                          x86_make_disp(src, swizzle[i] * 8 + 4));
10183464ebd5Sriastradh                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
10193464ebd5Sriastradh               }
10203464ebd5Sriastradh            }
10213464ebd5Sriastradh            break;
10223464ebd5Sriastradh         default:
10233464ebd5Sriastradh            return FALSE;
10243464ebd5Sriastradh         }
10253464ebd5Sriastradh      }
10263464ebd5Sriastradh      return TRUE;
10273464ebd5Sriastradh   }
10283464ebd5Sriastradh   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1029af69d88dSmrg   else if ((x86_target_caps(p->func) & X86_SSE2) &&
1030af69d88dSmrg            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
1031af69d88dSmrg            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1032af69d88dSmrg             || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
10333464ebd5Sriastradh      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
10344a49301eSmrg
10353464ebd5Sriastradh      /* load */
10363464ebd5Sriastradh      sse_movups(p->func, dataXMM, src);
10374a49301eSmrg
1038af69d88dSmrg      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1039af69d88dSmrg         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
1040af69d88dSmrg      }
10414a49301eSmrg
10423464ebd5Sriastradh      /* scale by 255.0 */
10433464ebd5Sriastradh      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
10444a49301eSmrg
10453464ebd5Sriastradh      /* pack and emit */
10463464ebd5Sriastradh      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
10473464ebd5Sriastradh      sse2_packssdw(p->func, dataXMM, dataXMM);
10483464ebd5Sriastradh      sse2_packuswb(p->func, dataXMM, dataXMM);
10493464ebd5Sriastradh      sse2_movd(p->func, dst, dataXMM);
10504a49301eSmrg
10513464ebd5Sriastradh      return TRUE;
10524a49301eSmrg   }
10534a49301eSmrg
10543464ebd5Sriastradh   return FALSE;
10554a49301eSmrg}
10564a49301eSmrg
1057af69d88dSmrg
1058af69d88dSmrgstatic boolean
1059af69d88dSmrgtranslate_attr(struct translate_sse *p,
1060af69d88dSmrg               const struct translate_element *a,
1061af69d88dSmrg               struct x86_reg src, struct x86_reg dst)
10623464ebd5Sriastradh{
1063af69d88dSmrg   if (a->input_format == a->output_format) {
10643464ebd5Sriastradh      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
10653464ebd5Sriastradh      return TRUE;
10663464ebd5Sriastradh   }
10673464ebd5Sriastradh
10683464ebd5Sriastradh   return translate_attr_convert(p, a, src, dst);
10693464ebd5Sriastradh}
10704a49301eSmrg
1071af69d88dSmrg
1072af69d88dSmrgstatic boolean
1073af69d88dSmrginit_inputs(struct translate_sse *p, unsigned index_size)
10744a49301eSmrg{
10754a49301eSmrg   unsigned i;
1076af69d88dSmrg   struct x86_reg instance_id =
1077af69d88dSmrg      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
1078af69d88dSmrg   struct x86_reg start_instance =
1079af69d88dSmrg      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));
1080cdc920a0Smrg
10813464ebd5Sriastradh   for (i = 0; i < p->nr_buffer_variants; i++) {
10823464ebd5Sriastradh      struct translate_buffer_variant *variant = &p->buffer_variant[i];
10833464ebd5Sriastradh      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];
1084cdc920a0Smrg
10853464ebd5Sriastradh      if (!index_size || variant->instance_divisor) {
1086af69d88dSmrg         struct x86_reg buf_max_index =
1087af69d88dSmrg            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
1088af69d88dSmrg         struct x86_reg buf_stride =
1089af69d88dSmrg            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
1090af69d88dSmrg         struct x86_reg buf_ptr =
1091af69d88dSmrg            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
1092af69d88dSmrg         struct x86_reg buf_base_ptr =
1093af69d88dSmrg            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
10943464ebd5Sriastradh         struct x86_reg elt = p->idx_ESI;
1095cdc920a0Smrg         struct x86_reg tmp_EAX = p->tmp_EAX;
10964a49301eSmrg
10974a49301eSmrg         /* Calculate pointer to first attrib:
1098cdc920a0Smrg          *   base_ptr + stride * index, where index depends on instance divisor
10994a49301eSmrg          */
11003464ebd5Sriastradh         if (variant->instance_divisor) {
110101e04c3fSmrg            struct x86_reg tmp_EDX = p->tmp2_EDX;
110201e04c3fSmrg
1103af69d88dSmrg            /* Start with instance = instance_id
1104af69d88dSmrg             * which is true if divisor is 1.
1105cdc920a0Smrg             */
1106cdc920a0Smrg            x86_mov(p->func, tmp_EAX, instance_id);
1107cdc920a0Smrg
11083464ebd5Sriastradh            if (variant->instance_divisor != 1) {
11093464ebd5Sriastradh               struct x86_reg tmp_ECX = p->src_ECX;
1110cdc920a0Smrg
1111cdc920a0Smrg               /* TODO: Add x86_shr() to rtasm and use it whenever
1112cdc920a0Smrg                *       instance divisor is power of two.
1113cdc920a0Smrg                */
1114cdc920a0Smrg               x86_xor(p->func, tmp_EDX, tmp_EDX);
11153464ebd5Sriastradh               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
1116af69d88dSmrg               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
1117cdc920a0Smrg            }
11183464ebd5Sriastradh
111901e04c3fSmrg            /* instance = (instance_id / divisor) + start_instance
112001e04c3fSmrg             */
112101e04c3fSmrg            x86_mov(p->func, tmp_EDX, start_instance);
112201e04c3fSmrg            x86_add(p->func, tmp_EAX, tmp_EDX);
112301e04c3fSmrg
11243464ebd5Sriastradh            /* XXX we need to clamp the index here too, but to a
11253464ebd5Sriastradh             * per-array max value, not the draw->pt.max_index value
11263464ebd5Sriastradh             * that's being given to us via translate->set_buffer().
11273464ebd5Sriastradh             */
1128af69d88dSmrg         }
1129af69d88dSmrg         else {
1130cdc920a0Smrg            x86_mov(p->func, tmp_EAX, elt);
11313464ebd5Sriastradh
11323464ebd5Sriastradh            /* Clamp to max_index
11333464ebd5Sriastradh             */
11343464ebd5Sriastradh            x86_cmp(p->func, tmp_EAX, buf_max_index);
11353464ebd5Sriastradh            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
1136cdc920a0Smrg         }
11373464ebd5Sriastradh
1138af69d88dSmrg         x86_mov(p->func, p->tmp2_EDX, buf_stride);
1139af69d88dSmrg         x64_rexw(p->func);
1140af69d88dSmrg         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
11413464ebd5Sriastradh         x64_rexw(p->func);
1142cdc920a0Smrg         x86_add(p->func, tmp_EAX, buf_base_ptr);
11434a49301eSmrg
11443464ebd5Sriastradh         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
11454a49301eSmrg
11464a49301eSmrg         /* In the linear case, keep the buffer pointer instead of the
11474a49301eSmrg          * index number.
11484a49301eSmrg          */
1149af69d88dSmrg         if (!index_size && p->nr_buffer_variants == 1) {
11503464ebd5Sriastradh            x64_rexw(p->func);
1151cdc920a0Smrg            x86_mov(p->func, elt, tmp_EAX);
11523464ebd5Sriastradh         }
1153af69d88dSmrg         else {
11543464ebd5Sriastradh            x64_rexw(p->func);
1155cdc920a0Smrg            x86_mov(p->func, buf_ptr, tmp_EAX);
11563464ebd5Sriastradh         }
11574a49301eSmrg      }
11584a49301eSmrg   }
11594a49301eSmrg
11604a49301eSmrg   return TRUE;
11614a49301eSmrg}
11624a49301eSmrg
11634a49301eSmrg
1164af69d88dSmrgstatic struct x86_reg
1165af69d88dSmrgget_buffer_ptr(struct translate_sse *p,
1166af69d88dSmrg               unsigned index_size, unsigned var_idx, struct x86_reg elt)
11674a49301eSmrg{
1168cdc920a0Smrg   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1169af69d88dSmrg      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
1170cdc920a0Smrg   }
11713464ebd5Sriastradh   if (!index_size && p->nr_buffer_variants == 1) {
11723464ebd5Sriastradh      return p->idx_ESI;
11734a49301eSmrg   }
11743464ebd5Sriastradh   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
11753464ebd5Sriastradh      struct x86_reg ptr = p->src_ECX;
1176af69d88dSmrg      struct x86_reg buf_ptr =
11773464ebd5Sriastradh         x86_make_disp(p->machine_EDI,
11783464ebd5Sriastradh                       get_offset(p, &p->buffer_variant[var_idx].ptr));
1179af69d88dSmrg
11803464ebd5Sriastradh      x64_rexw(p->func);
11814a49301eSmrg      x86_mov(p->func, ptr, buf_ptr);
11824a49301eSmrg      return ptr;
11834a49301eSmrg   }
11844a49301eSmrg   else {
11853464ebd5Sriastradh      struct x86_reg ptr = p->src_ECX;
1186af69d88dSmrg      const struct translate_buffer_variant *variant =
1187af69d88dSmrg         &p->buffer_variant[var_idx];
1188af69d88dSmrg      struct x86_reg buf_stride =
11893464ebd5Sriastradh         x86_make_disp(p->machine_EDI,
11903464ebd5Sriastradh                       get_offset(p, &p->buffer[variant->buffer_index].stride));
1191af69d88dSmrg      struct x86_reg buf_base_ptr =
11923464ebd5Sriastradh         x86_make_disp(p->machine_EDI,
1193af69d88dSmrg                  get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
11943464ebd5Sriastradh      struct x86_reg buf_max_index =
11953464ebd5Sriastradh         x86_make_disp(p->machine_EDI,
1196af69d88dSmrg                  get_offset(p, &p->buffer[variant->buffer_index].max_index));
11974a49301eSmrg
11984a49301eSmrg      /* Calculate pointer to current attrib:
11994a49301eSmrg       */
1200af69d88dSmrg      switch (index_size) {
12013464ebd5Sriastradh      case 1:
12023464ebd5Sriastradh         x86_movzx8(p->func, ptr, elt);
12033464ebd5Sriastradh         break;
12043464ebd5Sriastradh      case 2:
12053464ebd5Sriastradh         x86_movzx16(p->func, ptr, elt);
12063464ebd5Sriastradh         break;
12073464ebd5Sriastradh      case 4:
12083464ebd5Sriastradh         x86_mov(p->func, ptr, elt);
12093464ebd5Sriastradh         break;
12103464ebd5Sriastradh      }
12113464ebd5Sriastradh
12123464ebd5Sriastradh      /* Clamp to max_index
12133464ebd5Sriastradh       */
12143464ebd5Sriastradh      x86_cmp(p->func, ptr, buf_max_index);
12153464ebd5Sriastradh      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);
12163464ebd5Sriastradh
1217af69d88dSmrg      x86_mov(p->func, p->tmp2_EDX, buf_stride);
1218af69d88dSmrg      x64_rexw(p->func);
1219af69d88dSmrg      x86_imul(p->func, ptr, p->tmp2_EDX);
12203464ebd5Sriastradh      x64_rexw(p->func);
12214a49301eSmrg      x86_add(p->func, ptr, buf_base_ptr);
12224a49301eSmrg      return ptr;
12234a49301eSmrg   }
12244a49301eSmrg}
12254a49301eSmrg
12264a49301eSmrg
1227af69d88dSmrgstatic boolean
1228af69d88dSmrgincr_inputs(struct translate_sse *p, unsigned index_size)
12294a49301eSmrg{
12303464ebd5Sriastradh   if (!index_size && p->nr_buffer_variants == 1) {
1231af69d88dSmrg      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
1232af69d88dSmrg      struct x86_reg stride =
1233af69d88dSmrg         x86_make_disp(p->machine_EDI,
1234af69d88dSmrg                       get_offset(p, &p->buffer[buffer_index].stride));
12354a49301eSmrg
12363464ebd5Sriastradh      if (p->buffer_variant[0].instance_divisor == 0) {
12373464ebd5Sriastradh         x64_rexw(p->func);
12383464ebd5Sriastradh         x86_add(p->func, p->idx_ESI, stride);
12393464ebd5Sriastradh         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1240cdc920a0Smrg      }
12414a49301eSmrg   }
12423464ebd5Sriastradh   else if (!index_size) {
12434a49301eSmrg      unsigned i;
12444a49301eSmrg
12454a49301eSmrg      /* Is this worthwhile??
12464a49301eSmrg       */
12473464ebd5Sriastradh      for (i = 0; i < p->nr_buffer_variants; i++) {
12483464ebd5Sriastradh         struct translate_buffer_variant *variant = &p->buffer_variant[i];
12493464ebd5Sriastradh         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
12503464ebd5Sriastradh                                                get_offset(p, &variant->ptr));
1251af69d88dSmrg      struct x86_reg buf_stride =
1252af69d88dSmrg         x86_make_disp(p->machine_EDI,
1253af69d88dSmrg                       get_offset(p, &p->buffer[variant->buffer_index].stride));
12543464ebd5Sriastradh
12553464ebd5Sriastradh         if (variant->instance_divisor == 0) {
12563464ebd5Sriastradh            x86_mov(p->func, p->tmp_EAX, buf_stride);
12573464ebd5Sriastradh            x64_rexw(p->func);
12583464ebd5Sriastradh            x86_add(p->func, p->tmp_EAX, buf_ptr);
1259af69d88dSmrg            if (i == 0)
1260af69d88dSmrg               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
12613464ebd5Sriastradh            x64_rexw(p->func);
1262cdc920a0Smrg            x86_mov(p->func, buf_ptr, p->tmp_EAX);
1263cdc920a0Smrg         }
12644a49301eSmrg      }
1265af69d88dSmrg   }
12664a49301eSmrg   else {
12673464ebd5Sriastradh      x64_rexw(p->func);
12683464ebd5Sriastradh      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
12694a49301eSmrg   }
1270af69d88dSmrg
12714a49301eSmrg   return TRUE;
12724a49301eSmrg}
12734a49301eSmrg
12744a49301eSmrg
12754a49301eSmrg/* Build run( struct translate *machine,
12764a49301eSmrg *            unsigned start,
12774a49301eSmrg *            unsigned count,
12784a49301eSmrg *            void *output_buffer )
12794a49301eSmrg * or
12804a49301eSmrg *  run_elts( struct translate *machine,
12814a49301eSmrg *            unsigned *elts,
12824a49301eSmrg *            unsigned count,
12834a49301eSmrg *            void *output_buffer )
12844a49301eSmrg *
12854a49301eSmrg *  Lots of hardcoding
12864a49301eSmrg *
12874a49301eSmrg * EAX -- pointer to current output vertex
12884a49301eSmrg * ECX -- pointer to current attribute
12894a49301eSmrg *
12904a49301eSmrg */
1291af69d88dSmrgstatic boolean
1292af69d88dSmrgbuild_vertex_emit(struct translate_sse *p,
1293af69d88dSmrg                  struct x86_function *func, unsigned index_size)
12944a49301eSmrg{
12954a49301eSmrg   int fixup, label;
12964a49301eSmrg   unsigned j;
12974a49301eSmrg
12983464ebd5Sriastradh   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
12993464ebd5Sriastradh   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
13003464ebd5Sriastradh
1301af69d88dSmrg   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
1302af69d88dSmrg   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
1303af69d88dSmrg   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
1304af69d88dSmrg   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
1305af69d88dSmrg   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
1306af69d88dSmrg   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
1307af69d88dSmrg   p->src_ECX = x86_make_reg(file_REG32, reg_CX);
13084a49301eSmrg
13094a49301eSmrg   p->func = func;
13104a49301eSmrg
13114a49301eSmrg   x86_init_func(p->func);
13124a49301eSmrg
1313af69d88dSmrg   if (x86_target(p->func) == X86_64_WIN64_ABI) {
1314af69d88dSmrg      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
1315af69d88dSmrg       * above the return address
1316af69d88dSmrg       */
1317af69d88dSmrg      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
1318af69d88dSmrg                  x86_make_reg(file_XMM, 6));
1319af69d88dSmrg      sse2_movdqa(p->func,
1320af69d88dSmrg                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
1321af69d88dSmrg                  x86_make_reg(file_XMM, 7));
13223464ebd5Sriastradh   }
13234a49301eSmrg
13243464ebd5Sriastradh   x86_push(p->func, p->outbuf_EBX);
13253464ebd5Sriastradh   x86_push(p->func, p->count_EBP);
13263464ebd5Sriastradh
1327af69d88dSmrg   /* on non-Win64 x86-64, these are already in the right registers */
1328af69d88dSmrg   if (x86_target(p->func) != X86_64_STD_ABI) {
13293464ebd5Sriastradh      x86_push(p->func, p->machine_EDI);
13303464ebd5Sriastradh      x86_push(p->func, p->idx_ESI);
13313464ebd5Sriastradh
1332af69d88dSmrg      if (x86_target(p->func) != X86_32) {
1333af69d88dSmrg         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1334af69d88dSmrg         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1335af69d88dSmrg      }
1336af69d88dSmrg      else {
1337af69d88dSmrg         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1338af69d88dSmrg         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1339af69d88dSmrg      }
13403464ebd5Sriastradh   }
13413464ebd5Sriastradh
13423464ebd5Sriastradh   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
13433464ebd5Sriastradh
1344af69d88dSmrg   if (x86_target(p->func) != X86_32)
1345af69d88dSmrg      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
13463464ebd5Sriastradh   else
1347af69d88dSmrg      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
1348cdc920a0Smrg
1349cdc920a0Smrg   /* Load instance ID.
1350cdc920a0Smrg    */
1351cdc920a0Smrg   if (p->use_instancing) {
1352af69d88dSmrg      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
1353cdc920a0Smrg      x86_mov(p->func,
1354af69d88dSmrg              x86_make_disp(p->machine_EDI,
1355af69d88dSmrg                            get_offset(p, &p->start_instance)), p->tmp2_EDX);
1356af69d88dSmrg
1357af69d88dSmrg      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
1358cdc920a0Smrg      x86_mov(p->func,
13593464ebd5Sriastradh              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1360cdc920a0Smrg              p->tmp_EAX);
1361cdc920a0Smrg   }
13624a49301eSmrg
13634a49301eSmrg   /* Get vertex count, compare to zero
13644a49301eSmrg    */
13654a49301eSmrg   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
13663464ebd5Sriastradh   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
13674a49301eSmrg   fixup = x86_jcc_forward(p->func, cc_E);
13684a49301eSmrg
13694a49301eSmrg   /* always load, needed or not:
13704a49301eSmrg    */
13713464ebd5Sriastradh   init_inputs(p, index_size);
13724a49301eSmrg
13734a49301eSmrg   /* Note address for loop jump
13744a49301eSmrg    */
13754a49301eSmrg   label = x86_get_label(p->func);
13764a49301eSmrg   {
13773464ebd5Sriastradh      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
13783464ebd5Sriastradh      int last_variant = -1;
13794a49301eSmrg      struct x86_reg vb;
13804a49301eSmrg
13814a49301eSmrg      for (j = 0; j < p->translate.key.nr_elements; j++) {
13824a49301eSmrg         const struct translate_element *a = &p->translate.key.element[j];
13833464ebd5Sriastradh         unsigned variant = p->element_to_buffer_variant[j];
13844a49301eSmrg
13854a49301eSmrg         /* Figure out source pointer address:
13864a49301eSmrg          */
13873464ebd5Sriastradh         if (variant != last_variant) {
13883464ebd5Sriastradh            last_variant = variant;
13893464ebd5Sriastradh            vb = get_buffer_ptr(p, index_size, variant, elt);
13904a49301eSmrg         }
1391af69d88dSmrg
1392af69d88dSmrg         if (!translate_attr(p, a,
1393af69d88dSmrg                             x86_make_disp(vb, a->input_offset),
1394af69d88dSmrg                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
13954a49301eSmrg            return FALSE;
13964a49301eSmrg      }
13974a49301eSmrg
13984a49301eSmrg      /* Next output vertex:
13994a49301eSmrg       */
14003464ebd5Sriastradh      x64_rexw(p->func);
1401af69d88dSmrg      x86_lea(p->func, p->outbuf_EBX,
1402af69d88dSmrg              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));
14034a49301eSmrg
14044a49301eSmrg      /* Incr index
1405af69d88dSmrg       */
1406af69d88dSmrg      incr_inputs(p, index_size);
14074a49301eSmrg   }
14084a49301eSmrg
14094a49301eSmrg   /* decr count, loop if not zero
14104a49301eSmrg    */
14113464ebd5Sriastradh   x86_dec(p->func, p->count_EBP);
14124a49301eSmrg   x86_jcc(p->func, cc_NZ, label);
14134a49301eSmrg
14144a49301eSmrg   /* Exit mmx state?
14154a49301eSmrg    */
14164a49301eSmrg   if (p->func->need_emms)
14174a49301eSmrg      mmx_emms(p->func);
14184a49301eSmrg
14194a49301eSmrg   /* Land forward jump here:
14204a49301eSmrg    */
14214a49301eSmrg   x86_fixup_fwd_jump(p->func, fixup);
14224a49301eSmrg
14234a49301eSmrg   /* Pop regs and return
14244a49301eSmrg    */
1425af69d88dSmrg   if (x86_target(p->func) != X86_64_STD_ABI) {
14263464ebd5Sriastradh      x86_pop(p->func, p->idx_ESI);
14273464ebd5Sriastradh      x86_pop(p->func, p->machine_EDI);
14283464ebd5Sriastradh   }
14293464ebd5Sriastradh
14303464ebd5Sriastradh   x86_pop(p->func, p->count_EBP);
14313464ebd5Sriastradh   x86_pop(p->func, p->outbuf_EBX);
14323464ebd5Sriastradh
1433af69d88dSmrg   if (x86_target(p->func) == X86_64_WIN64_ABI) {
1434af69d88dSmrg      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
1435af69d88dSmrg                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1436af69d88dSmrg      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
1437af69d88dSmrg                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
14383464ebd5Sriastradh   }
14394a49301eSmrg   x86_ret(p->func);
14404a49301eSmrg
14414a49301eSmrg   return TRUE;
14424a49301eSmrg}
14434a49301eSmrg
14444a49301eSmrg
1445af69d88dSmrgstatic void
1446af69d88dSmrgtranslate_sse_set_buffer(struct translate *translate,
1447af69d88dSmrg                         unsigned buf,
1448af69d88dSmrg                         const void *ptr, unsigned stride, unsigned max_index)
14494a49301eSmrg{
1450af69d88dSmrg   struct translate_sse *p = (struct translate_sse *) translate;
14514a49301eSmrg
14524a49301eSmrg   if (buf < p->nr_buffers) {
1453af69d88dSmrg      p->buffer[buf].base_ptr = (char *) ptr;
14544a49301eSmrg      p->buffer[buf].stride = stride;
14553464ebd5Sriastradh      p->buffer[buf].max_index = max_index;
14564a49301eSmrg   }
14574a49301eSmrg
1458af69d88dSmrg   if (0)
1459af69d88dSmrg      debug_printf("%s %d/%d: %p %d\n",
1460af69d88dSmrg                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
14614a49301eSmrg}
14624a49301eSmrg
14634a49301eSmrg
1464af69d88dSmrgstatic void
1465af69d88dSmrgtranslate_sse_release(struct translate *translate)
14664a49301eSmrg{
1467af69d88dSmrg   struct translate_sse *p = (struct translate_sse *) translate;
14684a49301eSmrg
1469af69d88dSmrg   x86_release_func(&p->elt8_func);
1470af69d88dSmrg   x86_release_func(&p->elt16_func);
1471af69d88dSmrg   x86_release_func(&p->elt_func);
1472af69d88dSmrg   x86_release_func(&p->linear_func);
14734a49301eSmrg
14743464ebd5Sriastradh   os_free_aligned(p);
14754a49301eSmrg}
14764a49301eSmrg
14774a49301eSmrg
1478af69d88dSmrgstruct translate *
1479af69d88dSmrgtranslate_sse2_create(const struct translate_key *key)
14804a49301eSmrg{
14814a49301eSmrg   struct translate_sse *p = NULL;
14824a49301eSmrg   unsigned i;
14834a49301eSmrg
14843464ebd5Sriastradh   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
14853464ebd5Sriastradh   if (!rtasm_cpu_has_sse())
14864a49301eSmrg      goto fail;
14874a49301eSmrg
14883464ebd5Sriastradh   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
148901e04c3fSmrg   if (!p)
14904a49301eSmrg      goto fail;
1491af69d88dSmrg
14923464ebd5Sriastradh   memset(p, 0, sizeof(*p));
14933464ebd5Sriastradh   memcpy(p->consts, consts, sizeof(consts));
14944a49301eSmrg
14954a49301eSmrg   p->translate.key = *key;
14964a49301eSmrg   p->translate.release = translate_sse_release;
14974a49301eSmrg   p->translate.set_buffer = translate_sse_set_buffer;
14984a49301eSmrg
1499af69d88dSmrg   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);
1500af69d88dSmrg
1501cdc920a0Smrg   for (i = 0; i < key->nr_elements; i++) {
1502cdc920a0Smrg      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1503cdc920a0Smrg         unsigned j;
1504cdc920a0Smrg
1505af69d88dSmrg         p->nr_buffers =
1506af69d88dSmrg            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1507cdc920a0Smrg
1508cdc920a0Smrg         if (key->element[i].instance_divisor) {
1509cdc920a0Smrg            p->use_instancing = TRUE;
1510cdc920a0Smrg         }
1511cdc920a0Smrg
1512cdc920a0Smrg         /*
15133464ebd5Sriastradh          * Map vertex element to vertex buffer variant.
1514cdc920a0Smrg          */
15153464ebd5Sriastradh         for (j = 0; j < p->nr_buffer_variants; j++) {
1516af69d88dSmrg            if (p->buffer_variant[j].buffer_index ==
1517af69d88dSmrg                key->element[i].input_buffer
1518af69d88dSmrg                && p->buffer_variant[j].instance_divisor ==
1519af69d88dSmrg                key->element[i].instance_divisor) {
1520cdc920a0Smrg               break;
1521cdc920a0Smrg            }
1522cdc920a0Smrg         }
15233464ebd5Sriastradh         if (j == p->nr_buffer_variants) {
15243464ebd5Sriastradh            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
1525af69d88dSmrg            p->buffer_variant[j].instance_divisor =
1526af69d88dSmrg               key->element[i].instance_divisor;
15273464ebd5Sriastradh            p->nr_buffer_variants++;
1528cdc920a0Smrg         }
15293464ebd5Sriastradh         p->element_to_buffer_variant[i] = j;
1530af69d88dSmrg      }
1531af69d88dSmrg      else {
1532cdc920a0Smrg         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1533cdc920a0Smrg
15343464ebd5Sriastradh         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
1535cdc920a0Smrg      }
1536cdc920a0Smrg   }
15374a49301eSmrg
1538af69d88dSmrg   if (0)
1539af69d88dSmrg      debug_printf("nr_buffers: %d\n", p->nr_buffers);
15404a49301eSmrg
15413464ebd5Sriastradh   if (!build_vertex_emit(p, &p->linear_func, 0))
15423464ebd5Sriastradh      goto fail;
15433464ebd5Sriastradh
15443464ebd5Sriastradh   if (!build_vertex_emit(p, &p->elt_func, 4))
15453464ebd5Sriastradh      goto fail;
15463464ebd5Sriastradh
15473464ebd5Sriastradh   if (!build_vertex_emit(p, &p->elt16_func, 2))
15483464ebd5Sriastradh      goto fail;
15493464ebd5Sriastradh
15503464ebd5Sriastradh   if (!build_vertex_emit(p, &p->elt8_func, 1))
15513464ebd5Sriastradh      goto fail;
15523464ebd5Sriastradh
15533464ebd5Sriastradh   p->translate.run = (run_func) x86_get_func(&p->linear_func);
15543464ebd5Sriastradh   if (p->translate.run == NULL)
15554a49301eSmrg      goto fail;
15564a49301eSmrg
15573464ebd5Sriastradh   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
15583464ebd5Sriastradh   if (p->translate.run_elts == NULL)
15594a49301eSmrg      goto fail;
15604a49301eSmrg
15613464ebd5Sriastradh   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
15623464ebd5Sriastradh   if (p->translate.run_elts16 == NULL)
15634a49301eSmrg      goto fail;
15644a49301eSmrg
15653464ebd5Sriastradh   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
15663464ebd5Sriastradh   if (p->translate.run_elts8 == NULL)
15674a49301eSmrg      goto fail;
15684a49301eSmrg
15694a49301eSmrg   return &p->translate;
15704a49301eSmrg
15714a49301eSmrg fail:
15724a49301eSmrg   if (p)
1573af69d88dSmrg      translate_sse_release(&p->translate);
15744a49301eSmrg
15754a49301eSmrg   return NULL;
15764a49301eSmrg}
15774a49301eSmrg
15784a49301eSmrg
15794a49301eSmrg#else
15804a49301eSmrg
/* Stub used when the run-time assembler implementation is compiled
 * out: no translator is created, whatever the key.
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   (void) key;

   return NULL;
}
15864a49301eSmrg
15874a49301eSmrg#endif
1588