1848b8605Smrg/*
2848b8605Smrg * Copyright 2003 VMware, Inc.
3848b8605Smrg * All Rights Reserved.
4848b8605Smrg *
5848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
6848b8605Smrg * copy of this software and associated documentation files (the "Software"),
7848b8605Smrg * to deal in the Software without restriction, including without limitation
8848b8605Smrg * on the rights to use, copy, modify, merge, publish, distribute, sub
9848b8605Smrg * license, and/or sell copies of the Software, and to permit persons to whom
10848b8605Smrg * the Software is furnished to do so, subject to the following conditions:
11848b8605Smrg *
12848b8605Smrg * The above copyright notice and this permission notice (including the next
13848b8605Smrg * paragraph) shall be included in all copies or substantial portions of the
14848b8605Smrg * Software.
15848b8605Smrg *
16848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17848b8605Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19848b8605Smrg * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20848b8605Smrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21848b8605Smrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22848b8605Smrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
23848b8605Smrg *
24848b8605Smrg * Authors:
25848b8605Smrg *    Keith Whitwell <keithw@vmware.com>
26848b8605Smrg */
27848b8605Smrg
28848b8605Smrg
29848b8605Smrg#include "pipe/p_config.h"
30848b8605Smrg#include "pipe/p_compiler.h"
31848b8605Smrg#include "util/u_memory.h"
32848b8605Smrg#include "util/u_math.h"
33848b8605Smrg#include "util/u_format.h"
34848b8605Smrg
35848b8605Smrg#include "translate.h"
36848b8605Smrg
37848b8605Smrg
38b8e80941Smrg#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(PIPE_SUBSYSTEM_EMBEDDED)
39848b8605Smrg
40848b8605Smrg#include "rtasm/rtasm_cpu.h"
41848b8605Smrg#include "rtasm/rtasm_x86sse.h"
42848b8605Smrg
43848b8605Smrg
/* Component indices, used in this file as the selector arguments of
 * SHUF(...) when emitting shuffle instructions.
 */
#define X    0
#define Y    1
#define Z    2
#define W    3
48848b8605Smrg
49848b8605Smrg
/* Element type of translate_sse::buffer[]. */
struct translate_buffer
{
   const void *base_ptr;        /* NOTE(review): presumably the start of the buffer's data -- confirm */
   uintptr_t stride;            /* NOTE(review): presumably the byte distance between consecutive entries -- confirm */
   unsigned max_index;          /* NOTE(review): presumably the highest index that may be fetched -- confirm */
};
56848b8605Smrg
/* Element type of translate_sse::buffer_variant[].  Several variants may
 * refer to the same buffer (see the comment on that array).
 */
struct translate_buffer_variant
{
   unsigned buffer_index;       /* NOTE(review): presumably an index into translate_sse::buffer[] -- confirm */
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};
63848b8605Smrg
64848b8605Smrg
/* NOTE(review): looks like a reserved buffer index meaning "the instance id"
 * rather than a real vertex buffer -- confirm against its users.
 */
#define ELEMENT_BUFFER_INSTANCE_ID  1001

/* Number of rows in the constant table; must match the enum below. */
#define NUM_CONSTS 7

/* Row indices into the constant table.  The order here has to match the
 * order of the initializers of consts[] exactly.
 */
enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};

/* Replicate one scalar across all four lanes of a table row. */
#define SPLAT(value) \
   {(float)(value), (float)(value), (float)(value), (float)(value)}

/* Four-wide float constants, one row per CONST_* id. */
static float consts[NUM_CONSTS][4] = {
   {0.0f, 0.0f, 0.0f, 1.0f},    /* CONST_IDENTITY */
   SPLAT(1.0 / 127.0),          /* CONST_INV_127 */
   SPLAT(1.0 / 255.0),          /* CONST_INV_255 */
   SPLAT(1.0 / 32767.0),        /* CONST_INV_32767 */
   SPLAT(1.0 / 65535.0),        /* CONST_INV_65535 */
   SPLAT(1.0 / 2147483647.0),   /* CONST_INV_2147483647 */
   SPLAT(255.0)                 /* CONST_255 */
};

#undef SPLAT
92848b8605Smrg
/* State of one SSE code-generating translate object: the generic translate
 * interface, the x86 functions being generated, the constant/register
 * bookkeeping used by get_const(), and the buffer descriptions.
 */
struct translate_sse
{
   struct translate translate;

   /* NOTE(review): presumably one generated function per index flavour
    * (linear, 32-bit, 16-bit and 8-bit elements) -- confirm against the
    * code that builds them.
    */
   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;   /* function the emit helpers append instructions to */

   /* Constants that get_const() loads with movaps at an offset from
    * machine_EDI.
    */
     PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   int8_t reg_to_const[16];     /* constant id held by each XMM register, negative if none */
   int8_t const_to_reg[NUM_CONSTS];     /* XMM register holding each constant, negative if not loaded */

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};
132848b8605Smrg
133848b8605Smrg
/* Return the distance in bytes from address a to address b (i.e. b - a),
 * converted to int.
 */
static int
get_offset(const void *a, const void *b)
{
   const char *from = (const char *) a;
   const char *to = (const char *) b;

   return to - from;
}
139848b8605Smrg
140848b8605Smrg
141848b8605Smrgstatic struct x86_reg
142848b8605Smrgget_const(struct translate_sse *p, unsigned id)
143848b8605Smrg{
144848b8605Smrg   struct x86_reg reg;
145848b8605Smrg   unsigned i;
146848b8605Smrg
147848b8605Smrg   if (p->const_to_reg[id] >= 0)
148848b8605Smrg      return x86_make_reg(file_XMM, p->const_to_reg[id]);
149848b8605Smrg
150848b8605Smrg   for (i = 2; i < 8; ++i) {
151848b8605Smrg      if (p->reg_to_const[i] < 0)
152848b8605Smrg         break;
153848b8605Smrg   }
154848b8605Smrg
155848b8605Smrg   /* TODO: be smarter here */
156848b8605Smrg   if (i == 8)
157848b8605Smrg      --i;
158848b8605Smrg
159848b8605Smrg   reg = x86_make_reg(file_XMM, i);
160848b8605Smrg
161848b8605Smrg   if (p->reg_to_const[i] >= 0)
162848b8605Smrg      p->const_to_reg[p->reg_to_const[i]] = -1;
163848b8605Smrg
164848b8605Smrg   p->reg_to_const[i] = id;
165848b8605Smrg   p->const_to_reg[id] = i;
166848b8605Smrg
167848b8605Smrg   /* TODO: this should happen outside the loop, if possible */
168848b8605Smrg   sse_movaps(p->func, reg,
169848b8605Smrg              x86_make_disp(p->machine_EDI,
170848b8605Smrg                            get_offset(p, &p->consts[id][0])));
171848b8605Smrg
172848b8605Smrg   return reg;
173848b8605Smrg}
174848b8605Smrg
175848b8605Smrg
176848b8605Smrg/* load the data in a SSE2 register, padding with zeros */
177848b8605Smrgstatic boolean
178848b8605Smrgemit_load_sse2(struct translate_sse *p,
179848b8605Smrg               struct x86_reg data, struct x86_reg src, unsigned size)
180848b8605Smrg{
181848b8605Smrg   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
182848b8605Smrg   struct x86_reg tmp = p->tmp_EAX;
183848b8605Smrg   switch (size) {
184848b8605Smrg   case 1:
185848b8605Smrg      x86_movzx8(p->func, tmp, src);
186848b8605Smrg      sse2_movd(p->func, data, tmp);
187848b8605Smrg      break;
188848b8605Smrg   case 2:
189848b8605Smrg      x86_movzx16(p->func, tmp, src);
190848b8605Smrg      sse2_movd(p->func, data, tmp);
191848b8605Smrg      break;
192848b8605Smrg   case 3:
193848b8605Smrg      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
194848b8605Smrg      x86_shl_imm(p->func, tmp, 16);
195848b8605Smrg      x86_mov16(p->func, tmp, src);
196848b8605Smrg      sse2_movd(p->func, data, tmp);
197848b8605Smrg      break;
198848b8605Smrg   case 4:
199848b8605Smrg      sse2_movd(p->func, data, src);
200848b8605Smrg      break;
201848b8605Smrg   case 6:
202848b8605Smrg      sse2_movd(p->func, data, src);
203848b8605Smrg      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
204848b8605Smrg      sse2_movd(p->func, tmpXMM, tmp);
205848b8605Smrg      sse2_punpckldq(p->func, data, tmpXMM);
206848b8605Smrg      break;
207848b8605Smrg   case 8:
208848b8605Smrg      sse2_movq(p->func, data, src);
209848b8605Smrg      break;
210848b8605Smrg   case 12:
211848b8605Smrg      sse2_movq(p->func, data, src);
212848b8605Smrg      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
213848b8605Smrg      sse2_punpcklqdq(p->func, data, tmpXMM);
214848b8605Smrg      break;
215848b8605Smrg   case 16:
216848b8605Smrg      sse2_movdqu(p->func, data, src);
217848b8605Smrg      break;
218848b8605Smrg   default:
219848b8605Smrg      return FALSE;
220848b8605Smrg   }
221848b8605Smrg   return TRUE;
222848b8605Smrg}
223848b8605Smrg
224848b8605Smrg
/* Special value for the out_chans argument of emit_load_float32() and
 * emit_load_float64to32(): pad with zeros and put 1 in the fourth channel.
 * It is larger than any real channel count, so "out_chans > N" comparisons
 * in those functions are also true for it.
 */
#define CHANNELS_0001 5
227848b8605Smrg
228848b8605Smrg
229848b8605Smrg/* this function will load #chans float values, and will
230848b8605Smrg * pad the register with zeroes at least up to out_chans.
231848b8605Smrg *
232848b8605Smrg * If out_chans is set to CHANNELS_0001, then the fourth
233848b8605Smrg * value will be padded with 1. Only pass this value if
234848b8605Smrg * chans < 4 or results are undefined.
235848b8605Smrg */
236848b8605Smrgstatic void
237848b8605Smrgemit_load_float32(struct translate_sse *p, struct x86_reg data,
238848b8605Smrg                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
239848b8605Smrg{
240848b8605Smrg   switch (chans) {
241848b8605Smrg   case 1:
242848b8605Smrg      /* a 0 0 0
243848b8605Smrg       * a 0 0 1
244848b8605Smrg       */
245848b8605Smrg      sse_movss(p->func, data, arg0);
246848b8605Smrg      if (out_chans == CHANNELS_0001)
247848b8605Smrg         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
248848b8605Smrg      break;
249848b8605Smrg   case 2:
250848b8605Smrg      /* 0 0 0 1
251848b8605Smrg       * a b 0 1
252848b8605Smrg       */
253848b8605Smrg      if (out_chans == CHANNELS_0001)
254848b8605Smrg         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
255848b8605Smrg                    SHUF(X, Y, Z, W));
256848b8605Smrg      else if (out_chans > 2)
257848b8605Smrg         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
258848b8605Smrg      sse_movlps(p->func, data, arg0);
259848b8605Smrg      break;
260848b8605Smrg   case 3:
261848b8605Smrg      /* Have to jump through some hoops:
262848b8605Smrg       *
263848b8605Smrg       * c 0 0 0
264848b8605Smrg       * c 0 0 1 if out_chans == CHANNELS_0001
265848b8605Smrg       * 0 0 c 0/1
266848b8605Smrg       * a b c 0/1
267848b8605Smrg       */
268848b8605Smrg      sse_movss(p->func, data, x86_make_disp(arg0, 8));
269848b8605Smrg      if (out_chans == CHANNELS_0001)
270848b8605Smrg         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
271848b8605Smrg                    SHUF(X, Y, Z, W));
272848b8605Smrg      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
273848b8605Smrg      sse_movlps(p->func, data, arg0);
274848b8605Smrg      break;
275848b8605Smrg   case 4:
276848b8605Smrg      sse_movups(p->func, data, arg0);
277848b8605Smrg      break;
278848b8605Smrg   }
279848b8605Smrg}
280848b8605Smrg
281848b8605Smrg/* this function behaves like emit_load_float32, but loads
282848b8605Smrg   64-bit floating point numbers, converting them to 32-bit
283848b8605Smrg  ones */
284848b8605Smrgstatic void
285848b8605Smrgemit_load_float64to32(struct translate_sse *p, struct x86_reg data,
286848b8605Smrg                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
287848b8605Smrg{
288848b8605Smrg   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
289848b8605Smrg   switch (chans) {
290848b8605Smrg   case 1:
291848b8605Smrg      sse2_movsd(p->func, data, arg0);
292848b8605Smrg      if (out_chans > 1)
293848b8605Smrg         sse2_cvtpd2ps(p->func, data, data);
294848b8605Smrg      else
295848b8605Smrg         sse2_cvtsd2ss(p->func, data, data);
296848b8605Smrg      if (out_chans == CHANNELS_0001)
297848b8605Smrg         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
298848b8605Smrg                    SHUF(X, Y, Z, W));
299848b8605Smrg      break;
300848b8605Smrg   case 2:
301848b8605Smrg      sse2_movupd(p->func, data, arg0);
302848b8605Smrg      sse2_cvtpd2ps(p->func, data, data);
303848b8605Smrg      if (out_chans == CHANNELS_0001)
304848b8605Smrg         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
305848b8605Smrg                    SHUF(X, Y, Z, W));
306848b8605Smrg      else if (out_chans > 2)
307848b8605Smrg         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
308848b8605Smrg      break;
309848b8605Smrg   case 3:
310848b8605Smrg      sse2_movupd(p->func, data, arg0);
311848b8605Smrg      sse2_cvtpd2ps(p->func, data, data);
312848b8605Smrg      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
313848b8605Smrg      if (out_chans > 3)
314848b8605Smrg         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
315848b8605Smrg      else
316848b8605Smrg         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
317848b8605Smrg      sse_movlhps(p->func, data, tmpXMM);
318848b8605Smrg      if (out_chans == CHANNELS_0001)
319848b8605Smrg         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
320848b8605Smrg      break;
321848b8605Smrg   case 4:
322848b8605Smrg      sse2_movupd(p->func, data, arg0);
323848b8605Smrg      sse2_cvtpd2ps(p->func, data, data);
324848b8605Smrg      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
325848b8605Smrg      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
326848b8605Smrg      sse_movlhps(p->func, data, tmpXMM);
327848b8605Smrg      break;
328848b8605Smrg   }
329848b8605Smrg}
330848b8605Smrg
331848b8605Smrg
332848b8605Smrgstatic void
333848b8605Smrgemit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
334848b8605Smrg           struct x86_reg dst_xmm, struct x86_reg src_gpr,
335848b8605Smrg           struct x86_reg src_xmm)
336848b8605Smrg{
337848b8605Smrg   if (x86_target(p->func) != X86_32)
338848b8605Smrg      x64_mov64(p->func, dst_gpr, src_gpr);
339848b8605Smrg   else {
340848b8605Smrg      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
341848b8605Smrg      if (x86_target_caps(p->func) & X86_SSE2)
342848b8605Smrg         sse2_movq(p->func, dst_xmm, src_xmm);
343848b8605Smrg      else
344848b8605Smrg         sse_movlps(p->func, dst_xmm, src_xmm);
345848b8605Smrg   }
346848b8605Smrg}
347848b8605Smrg
348848b8605Smrg
349848b8605Smrgstatic void
350848b8605Smrgemit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
351848b8605Smrg            struct x86_reg dst_xmm, struct x86_reg src)
352848b8605Smrg{
353848b8605Smrg   emit_mov64(p, dst_gpr, dst_xmm, src, src);
354848b8605Smrg}
355848b8605Smrg
356848b8605Smrg
357848b8605Smrgstatic void
358848b8605Smrgemit_store64(struct translate_sse *p, struct x86_reg dst,
359848b8605Smrg             struct x86_reg src_gpr, struct x86_reg src_xmm)
360848b8605Smrg{
361848b8605Smrg   emit_mov64(p, dst, dst, src_gpr, src_xmm);
362848b8605Smrg}
363848b8605Smrg
364848b8605Smrg
365848b8605Smrgstatic void
366848b8605Smrgemit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
367848b8605Smrg{
368848b8605Smrg   if (x86_target_caps(p->func) & X86_SSE2)
369848b8605Smrg      sse2_movdqu(p->func, dst, src);
370848b8605Smrg   else
371848b8605Smrg      sse_movups(p->func, dst, src);
372848b8605Smrg}
373848b8605Smrg
374848b8605Smrg
375848b8605Smrg/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
376848b8605Smrg * but may or may not be good on older processors
377848b8605Smrg * TODO: may perhaps want to use non-temporal stores here if possible
378848b8605Smrg */
379848b8605Smrgstatic void
380848b8605Smrgemit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
381848b8605Smrg            unsigned size)
382848b8605Smrg{
383848b8605Smrg   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
384848b8605Smrg   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
385848b8605Smrg   struct x86_reg dataGPR = p->tmp_EAX;
386848b8605Smrg   struct x86_reg dataGPR2 = p->tmp2_EDX;
387848b8605Smrg
388848b8605Smrg   if (size < 8) {
389848b8605Smrg      switch (size) {
390848b8605Smrg      case 1:
391848b8605Smrg         x86_mov8(p->func, dataGPR, src);
392848b8605Smrg         x86_mov8(p->func, dst, dataGPR);
393848b8605Smrg         break;
394848b8605Smrg      case 2:
395848b8605Smrg         x86_mov16(p->func, dataGPR, src);
396848b8605Smrg         x86_mov16(p->func, dst, dataGPR);
397848b8605Smrg         break;
398848b8605Smrg      case 3:
399848b8605Smrg         x86_mov16(p->func, dataGPR, src);
400848b8605Smrg         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
401848b8605Smrg         x86_mov16(p->func, dst, dataGPR);
402848b8605Smrg         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
403848b8605Smrg         break;
404848b8605Smrg      case 4:
405848b8605Smrg         x86_mov(p->func, dataGPR, src);
406848b8605Smrg         x86_mov(p->func, dst, dataGPR);
407848b8605Smrg         break;
408848b8605Smrg      case 6:
409848b8605Smrg         x86_mov(p->func, dataGPR, src);
410848b8605Smrg         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
411848b8605Smrg         x86_mov(p->func, dst, dataGPR);
412848b8605Smrg         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
413848b8605Smrg         break;
414848b8605Smrg      }
415848b8605Smrg   }
416848b8605Smrg   else if (!(x86_target_caps(p->func) & X86_SSE)) {
417848b8605Smrg      unsigned i = 0;
418848b8605Smrg      assert((size & 3) == 0);
419848b8605Smrg      for (i = 0; i < size; i += 4) {
420848b8605Smrg         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
421848b8605Smrg         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
422848b8605Smrg      }
423848b8605Smrg   }
424848b8605Smrg   else {
425848b8605Smrg      switch (size) {
426848b8605Smrg      case 8:
427848b8605Smrg         emit_load64(p, dataGPR, dataXMM, src);
428848b8605Smrg         emit_store64(p, dst, dataGPR, dataXMM);
429848b8605Smrg         break;
430848b8605Smrg      case 12:
431848b8605Smrg         emit_load64(p, dataGPR2, dataXMM, src);
432848b8605Smrg         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
433848b8605Smrg         emit_store64(p, dst, dataGPR2, dataXMM);
434848b8605Smrg         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
435848b8605Smrg         break;
436848b8605Smrg      case 16:
437848b8605Smrg         emit_mov128(p, dataXMM, src);
438848b8605Smrg         emit_mov128(p, dst, dataXMM);
439848b8605Smrg         break;
440848b8605Smrg      case 24:
441848b8605Smrg         emit_mov128(p, dataXMM, src);
442848b8605Smrg         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
443848b8605Smrg         emit_mov128(p, dst, dataXMM);
444848b8605Smrg         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
445848b8605Smrg         break;
446848b8605Smrg      case 32:
447848b8605Smrg         emit_mov128(p, dataXMM, src);
448848b8605Smrg         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
449848b8605Smrg         emit_mov128(p, dst, dataXMM);
450848b8605Smrg         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
451848b8605Smrg         break;
452848b8605Smrg      default:
453848b8605Smrg         assert(0);
454848b8605Smrg      }
455848b8605Smrg   }
456848b8605Smrg}
457848b8605Smrg
458848b8605Smrgstatic boolean
459848b8605Smrgtranslate_attr_convert(struct translate_sse *p,
460848b8605Smrg                       const struct translate_element *a,
461848b8605Smrg                       struct x86_reg src, struct x86_reg dst)
462848b8605Smrg{
463848b8605Smrg   const struct util_format_description *input_desc =
464848b8605Smrg      util_format_description(a->input_format);
465848b8605Smrg   const struct util_format_description *output_desc =
466848b8605Smrg      util_format_description(a->output_format);
467848b8605Smrg   unsigned i;
468848b8605Smrg   boolean id_swizzle = TRUE;
469848b8605Smrg   unsigned swizzle[4] =
470b8e80941Smrg      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
471b8e80941Smrg        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
472848b8605Smrg   unsigned needed_chans = 0;
473848b8605Smrg   unsigned imms[2] = { 0, 0x3f800000 };
474848b8605Smrg
475848b8605Smrg   if (a->output_format == PIPE_FORMAT_NONE
476848b8605Smrg       || a->input_format == PIPE_FORMAT_NONE)
477848b8605Smrg      return FALSE;
478848b8605Smrg
479848b8605Smrg   if (input_desc->channel[0].size & 7)
480848b8605Smrg      return FALSE;
481848b8605Smrg
482848b8605Smrg   if (input_desc->colorspace != output_desc->colorspace)
483848b8605Smrg      return FALSE;
484848b8605Smrg
485848b8605Smrg   for (i = 1; i < input_desc->nr_channels; ++i) {
486848b8605Smrg      if (memcmp
487848b8605Smrg          (&input_desc->channel[i], &input_desc->channel[0],
488848b8605Smrg           sizeof(input_desc->channel[0])))
489848b8605Smrg         return FALSE;
490848b8605Smrg   }
491848b8605Smrg
492848b8605Smrg   for (i = 1; i < output_desc->nr_channels; ++i) {
493848b8605Smrg      if (memcmp
494848b8605Smrg          (&output_desc->channel[i], &output_desc->channel[0],
495848b8605Smrg           sizeof(output_desc->channel[0]))) {
496848b8605Smrg         return FALSE;
497848b8605Smrg      }
498848b8605Smrg   }
499848b8605Smrg
500848b8605Smrg   for (i = 0; i < output_desc->nr_channels; ++i) {
501848b8605Smrg      if (output_desc->swizzle[i] < 4)
502848b8605Smrg         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
503848b8605Smrg   }
504848b8605Smrg
505848b8605Smrg   if ((x86_target_caps(p->func) & X86_SSE) &&
506848b8605Smrg       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
507848b8605Smrg        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
508848b8605Smrg        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
509848b8605Smrg        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
510848b8605Smrg      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
511848b8605Smrg
512848b8605Smrg      for (i = 0; i < output_desc->nr_channels; ++i) {
513b8e80941Smrg         if (swizzle[i] == PIPE_SWIZZLE_0
514848b8605Smrg             && i >= input_desc->nr_channels)
515848b8605Smrg            swizzle[i] = i;
516848b8605Smrg      }
517848b8605Smrg
518848b8605Smrg      for (i = 0; i < output_desc->nr_channels; ++i) {
519848b8605Smrg         if (swizzle[i] < 4)
520848b8605Smrg            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
521b8e80941Smrg         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
522848b8605Smrg            id_swizzle = FALSE;
523848b8605Smrg      }
524848b8605Smrg
525848b8605Smrg      if (needed_chans > 0) {
526848b8605Smrg         switch (input_desc->channel[0].type) {
527848b8605Smrg         case UTIL_FORMAT_TYPE_UNSIGNED:
528848b8605Smrg            if (!(x86_target_caps(p->func) & X86_SSE2))
529848b8605Smrg               return FALSE;
530848b8605Smrg            emit_load_sse2(p, dataXMM, src,
531848b8605Smrg                           input_desc->channel[0].size *
532848b8605Smrg                           input_desc->nr_channels >> 3);
533848b8605Smrg
534848b8605Smrg            /* TODO: add support for SSE4.1 pmovzx */
535848b8605Smrg            switch (input_desc->channel[0].size) {
536848b8605Smrg            case 8:
537848b8605Smrg               /* TODO: this may be inefficient due to get_identity() being
538848b8605Smrg                *  used both as a float and integer register.
539848b8605Smrg                */
540848b8605Smrg               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
541848b8605Smrg               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
542848b8605Smrg               break;
543848b8605Smrg            case 16:
544848b8605Smrg               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
545848b8605Smrg               break;
546848b8605Smrg            case 32:           /* we lose precision here */
547848b8605Smrg               sse2_psrld_imm(p->func, dataXMM, 1);
548848b8605Smrg               break;
549848b8605Smrg            default:
550848b8605Smrg               return FALSE;
551848b8605Smrg            }
552848b8605Smrg            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
553848b8605Smrg            if (input_desc->channel[0].normalized) {
554848b8605Smrg               struct x86_reg factor;
555848b8605Smrg               switch (input_desc->channel[0].size) {
556848b8605Smrg               case 8:
557848b8605Smrg                  factor = get_const(p, CONST_INV_255);
558848b8605Smrg                  break;
559848b8605Smrg               case 16:
560848b8605Smrg                  factor = get_const(p, CONST_INV_65535);
561848b8605Smrg                  break;
562848b8605Smrg               case 32:
563848b8605Smrg                  factor = get_const(p, CONST_INV_2147483647);
564848b8605Smrg                  break;
565848b8605Smrg               default:
566848b8605Smrg                  assert(0);
567848b8605Smrg                  factor.disp = 0;
568848b8605Smrg                  factor.file = 0;
569848b8605Smrg                  factor.idx = 0;
570848b8605Smrg                  factor.mod = 0;
571848b8605Smrg                  break;
572848b8605Smrg               }
573848b8605Smrg               sse_mulps(p->func, dataXMM, factor);
574848b8605Smrg            }
575848b8605Smrg            else if (input_desc->channel[0].size == 32)
576848b8605Smrg               /* compensate for the bit we threw away to fit u32 into s32 */
577848b8605Smrg               sse_addps(p->func, dataXMM, dataXMM);
578848b8605Smrg            break;
579848b8605Smrg         case UTIL_FORMAT_TYPE_SIGNED:
580848b8605Smrg            if (!(x86_target_caps(p->func) & X86_SSE2))
581848b8605Smrg               return FALSE;
582848b8605Smrg            emit_load_sse2(p, dataXMM, src,
583848b8605Smrg                           input_desc->channel[0].size *
584848b8605Smrg                           input_desc->nr_channels >> 3);
585848b8605Smrg
586848b8605Smrg            /* TODO: add support for SSE4.1 pmovsx */
587848b8605Smrg            switch (input_desc->channel[0].size) {
588848b8605Smrg            case 8:
589848b8605Smrg               sse2_punpcklbw(p->func, dataXMM, dataXMM);
590848b8605Smrg               sse2_punpcklbw(p->func, dataXMM, dataXMM);
591848b8605Smrg               sse2_psrad_imm(p->func, dataXMM, 24);
592848b8605Smrg               break;
593848b8605Smrg            case 16:
594848b8605Smrg               sse2_punpcklwd(p->func, dataXMM, dataXMM);
595848b8605Smrg               sse2_psrad_imm(p->func, dataXMM, 16);
596848b8605Smrg               break;
597848b8605Smrg            case 32:           /* we lose precision here */
598848b8605Smrg               break;
599848b8605Smrg            default:
600848b8605Smrg               return FALSE;
601848b8605Smrg            }
602848b8605Smrg            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
603848b8605Smrg            if (input_desc->channel[0].normalized) {
604848b8605Smrg               struct x86_reg factor;
605848b8605Smrg               switch (input_desc->channel[0].size) {
606848b8605Smrg               case 8:
607848b8605Smrg                  factor = get_const(p, CONST_INV_127);
608848b8605Smrg                  break;
609848b8605Smrg               case 16:
610848b8605Smrg                  factor = get_const(p, CONST_INV_32767);
611848b8605Smrg                  break;
612848b8605Smrg               case 32:
613848b8605Smrg                  factor = get_const(p, CONST_INV_2147483647);
614848b8605Smrg                  break;
615848b8605Smrg               default:
616848b8605Smrg                  assert(0);
617848b8605Smrg                  factor.disp = 0;
618848b8605Smrg                  factor.file = 0;
619848b8605Smrg                  factor.idx = 0;
620848b8605Smrg                  factor.mod = 0;
621848b8605Smrg                  break;
622848b8605Smrg               }
623848b8605Smrg               sse_mulps(p->func, dataXMM, factor);
624848b8605Smrg            }
625848b8605Smrg            break;
626848b8605Smrg
627848b8605Smrg            break;
628848b8605Smrg         case UTIL_FORMAT_TYPE_FLOAT:
629848b8605Smrg            if (input_desc->channel[0].size != 32
630848b8605Smrg                && input_desc->channel[0].size != 64) {
631848b8605Smrg               return FALSE;
632848b8605Smrg            }
633b8e80941Smrg            if (swizzle[3] == PIPE_SWIZZLE_1
634848b8605Smrg                && input_desc->nr_channels <= 3) {
635b8e80941Smrg               swizzle[3] = PIPE_SWIZZLE_W;
636848b8605Smrg               needed_chans = CHANNELS_0001;
637848b8605Smrg            }
638848b8605Smrg            switch (input_desc->channel[0].size) {
639848b8605Smrg            case 32:
640848b8605Smrg               emit_load_float32(p, dataXMM, src, needed_chans,
641848b8605Smrg                                 input_desc->nr_channels);
642848b8605Smrg               break;
643848b8605Smrg            case 64:           /* we lose precision here */
644848b8605Smrg               if (!(x86_target_caps(p->func) & X86_SSE2))
645848b8605Smrg                  return FALSE;
646848b8605Smrg               emit_load_float64to32(p, dataXMM, src, needed_chans,
647848b8605Smrg                                     input_desc->nr_channels);
648848b8605Smrg               break;
649848b8605Smrg            default:
650848b8605Smrg               return FALSE;
651848b8605Smrg            }
652848b8605Smrg            break;
653848b8605Smrg         default:
654848b8605Smrg            return FALSE;
655848b8605Smrg         }
656848b8605Smrg
657848b8605Smrg         if (!id_swizzle) {
658848b8605Smrg            sse_shufps(p->func, dataXMM, dataXMM,
659848b8605Smrg                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
660848b8605Smrg         }
661848b8605Smrg      }
662848b8605Smrg
663848b8605Smrg      if (output_desc->nr_channels >= 4
664b8e80941Smrg          && swizzle[0] < PIPE_SWIZZLE_0
665b8e80941Smrg          && swizzle[1] < PIPE_SWIZZLE_0
666b8e80941Smrg          && swizzle[2] < PIPE_SWIZZLE_0
667b8e80941Smrg          && swizzle[3] < PIPE_SWIZZLE_0) {
668848b8605Smrg         sse_movups(p->func, dst, dataXMM);
669848b8605Smrg      }
670848b8605Smrg      else {
671848b8605Smrg         if (output_desc->nr_channels >= 2
672b8e80941Smrg             && swizzle[0] < PIPE_SWIZZLE_0
673b8e80941Smrg             && swizzle[1] < PIPE_SWIZZLE_0) {
674848b8605Smrg            sse_movlps(p->func, dst, dataXMM);
675848b8605Smrg         }
676848b8605Smrg         else {
677b8e80941Smrg            if (swizzle[0] < PIPE_SWIZZLE_0) {
678848b8605Smrg               sse_movss(p->func, dst, dataXMM);
679848b8605Smrg            }
680848b8605Smrg            else {
681848b8605Smrg               x86_mov_imm(p->func, dst,
682b8e80941Smrg                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
683848b8605Smrg            }
684848b8605Smrg
685848b8605Smrg            if (output_desc->nr_channels >= 2) {
686b8e80941Smrg               if (swizzle[1] < PIPE_SWIZZLE_0) {
687848b8605Smrg                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
688848b8605Smrg                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
689848b8605Smrg               }
690848b8605Smrg               else {
691848b8605Smrg                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
692b8e80941Smrg                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
693848b8605Smrg               }
694848b8605Smrg            }
695848b8605Smrg         }
696848b8605Smrg
697848b8605Smrg         if (output_desc->nr_channels >= 3) {
698848b8605Smrg            if (output_desc->nr_channels >= 4
699b8e80941Smrg                && swizzle[2] < PIPE_SWIZZLE_0
700b8e80941Smrg                && swizzle[3] < PIPE_SWIZZLE_0) {
701848b8605Smrg               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
702848b8605Smrg            }
703848b8605Smrg            else {
704b8e80941Smrg               if (swizzle[2] < PIPE_SWIZZLE_0) {
705848b8605Smrg                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
706848b8605Smrg                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
707848b8605Smrg               }
708848b8605Smrg               else {
709848b8605Smrg                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
710b8e80941Smrg                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
711848b8605Smrg               }
712848b8605Smrg
713848b8605Smrg               if (output_desc->nr_channels >= 4) {
714b8e80941Smrg                  if (swizzle[3] < PIPE_SWIZZLE_0) {
715848b8605Smrg                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
716848b8605Smrg                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
717848b8605Smrg                  }
718848b8605Smrg                  else {
719848b8605Smrg                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
720b8e80941Smrg                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
721848b8605Smrg                  }
722848b8605Smrg               }
723848b8605Smrg            }
724848b8605Smrg         }
725848b8605Smrg      }
726848b8605Smrg      return TRUE;
727848b8605Smrg   }
728848b8605Smrg   else if ((x86_target_caps(p->func) & X86_SSE2)
729848b8605Smrg            && input_desc->channel[0].size == 8
730848b8605Smrg            && output_desc->channel[0].size == 16
731848b8605Smrg            && output_desc->channel[0].normalized ==
732848b8605Smrg            input_desc->channel[0].normalized &&
733848b8605Smrg            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
734848b8605Smrg                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
735848b8605Smrg             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
736848b8605Smrg                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
737848b8605Smrg             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
738848b8605Smrg                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
739848b8605Smrg      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
740848b8605Smrg      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
741848b8605Smrg      struct x86_reg tmp = p->tmp_EAX;
742848b8605Smrg      unsigned imms[2] = { 0, 1 };
743848b8605Smrg
744848b8605Smrg      for (i = 0; i < output_desc->nr_channels; ++i) {
745b8e80941Smrg         if (swizzle[i] == PIPE_SWIZZLE_0
746848b8605Smrg             && i >= input_desc->nr_channels) {
747848b8605Smrg            swizzle[i] = i;
748848b8605Smrg         }
749848b8605Smrg      }
750848b8605Smrg
751848b8605Smrg      for (i = 0; i < output_desc->nr_channels; ++i) {
752848b8605Smrg         if (swizzle[i] < 4)
753848b8605Smrg            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
754b8e80941Smrg         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
755848b8605Smrg            id_swizzle = FALSE;
756848b8605Smrg      }
757848b8605Smrg
758848b8605Smrg      if (needed_chans > 0) {
759848b8605Smrg         emit_load_sse2(p, dataXMM, src,
760848b8605Smrg                        input_desc->channel[0].size *
761848b8605Smrg                        input_desc->nr_channels >> 3);
762848b8605Smrg
763848b8605Smrg         switch (input_desc->channel[0].type) {
764848b8605Smrg         case UTIL_FORMAT_TYPE_UNSIGNED:
765848b8605Smrg            if (input_desc->channel[0].normalized) {
766848b8605Smrg               sse2_punpcklbw(p->func, dataXMM, dataXMM);
767848b8605Smrg               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
768848b8605Smrg                  sse2_psrlw_imm(p->func, dataXMM, 1);
769848b8605Smrg            }
770848b8605Smrg            else
771848b8605Smrg               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
772848b8605Smrg            break;
773848b8605Smrg         case UTIL_FORMAT_TYPE_SIGNED:
774848b8605Smrg            if (input_desc->channel[0].normalized) {
775848b8605Smrg               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
776848b8605Smrg               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
777848b8605Smrg               sse2_psllw_imm(p->func, dataXMM, 9);
778848b8605Smrg               sse2_psrlw_imm(p->func, dataXMM, 8);
779848b8605Smrg               sse2_por(p->func, tmpXMM, dataXMM);
780848b8605Smrg               sse2_psrlw_imm(p->func, dataXMM, 7);
781848b8605Smrg               sse2_por(p->func, tmpXMM, dataXMM);
782848b8605Smrg               {
783848b8605Smrg                  struct x86_reg t = dataXMM;
784848b8605Smrg                  dataXMM = tmpXMM;
785848b8605Smrg                  tmpXMM = t;
786848b8605Smrg               }
787848b8605Smrg            }
788848b8605Smrg            else {
789848b8605Smrg               sse2_punpcklbw(p->func, dataXMM, dataXMM);
790848b8605Smrg               sse2_psraw_imm(p->func, dataXMM, 8);
791848b8605Smrg            }
792848b8605Smrg            break;
793848b8605Smrg         default:
794848b8605Smrg            assert(0);
795848b8605Smrg         }
796848b8605Smrg
797848b8605Smrg         if (output_desc->channel[0].normalized)
798848b8605Smrg            imms[1] =
799848b8605Smrg               (output_desc->channel[0].type ==
800848b8605Smrg                UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
801848b8605Smrg
802848b8605Smrg         if (!id_swizzle)
803848b8605Smrg            sse2_pshuflw(p->func, dataXMM, dataXMM,
804848b8605Smrg                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
805848b8605Smrg                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
806848b8605Smrg      }
807848b8605Smrg
808848b8605Smrg      if (output_desc->nr_channels >= 4
809b8e80941Smrg          && swizzle[0] < PIPE_SWIZZLE_0
810b8e80941Smrg          && swizzle[1] < PIPE_SWIZZLE_0
811b8e80941Smrg          && swizzle[2] < PIPE_SWIZZLE_0
812b8e80941Smrg          && swizzle[3] < PIPE_SWIZZLE_0) {
813848b8605Smrg         sse2_movq(p->func, dst, dataXMM);
814848b8605Smrg      }
815848b8605Smrg      else {
816b8e80941Smrg         if (swizzle[0] < PIPE_SWIZZLE_0) {
817848b8605Smrg            if (output_desc->nr_channels >= 2
818b8e80941Smrg                && swizzle[1] < PIPE_SWIZZLE_0) {
819848b8605Smrg               sse2_movd(p->func, dst, dataXMM);
820848b8605Smrg            }
821848b8605Smrg            else {
822848b8605Smrg               sse2_movd(p->func, tmp, dataXMM);
823848b8605Smrg               x86_mov16(p->func, dst, tmp);
824848b8605Smrg               if (output_desc->nr_channels >= 2)
825848b8605Smrg                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
826b8e80941Smrg                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
827848b8605Smrg            }
828848b8605Smrg         }
829848b8605Smrg         else {
830848b8605Smrg            if (output_desc->nr_channels >= 2
831b8e80941Smrg                && swizzle[1] >= PIPE_SWIZZLE_0) {
832848b8605Smrg               x86_mov_imm(p->func, dst,
833b8e80941Smrg                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
834b8e80941Smrg                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
835848b8605Smrg            }
836848b8605Smrg            else {
837848b8605Smrg               x86_mov16_imm(p->func, dst,
838b8e80941Smrg                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
839848b8605Smrg               if (output_desc->nr_channels >= 2) {
840848b8605Smrg                  sse2_movd(p->func, tmp, dataXMM);
841848b8605Smrg                  x86_shr_imm(p->func, tmp, 16);
842848b8605Smrg                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
843848b8605Smrg               }
844848b8605Smrg            }
845848b8605Smrg         }
846848b8605Smrg
847848b8605Smrg         if (output_desc->nr_channels >= 3) {
848b8e80941Smrg            if (swizzle[2] < PIPE_SWIZZLE_0) {
849848b8605Smrg               if (output_desc->nr_channels >= 4
850b8e80941Smrg                   && swizzle[3] < PIPE_SWIZZLE_0) {
851848b8605Smrg                  sse2_psrlq_imm(p->func, dataXMM, 32);
852848b8605Smrg                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
853848b8605Smrg               }
854848b8605Smrg               else {
855848b8605Smrg                  sse2_psrlq_imm(p->func, dataXMM, 32);
856848b8605Smrg                  sse2_movd(p->func, tmp, dataXMM);
857848b8605Smrg                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
858848b8605Smrg                  if (output_desc->nr_channels >= 4) {
859848b8605Smrg                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
860b8e80941Smrg                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
861848b8605Smrg                  }
862848b8605Smrg               }
863848b8605Smrg            }
864848b8605Smrg            else {
865848b8605Smrg               if (output_desc->nr_channels >= 4
866b8e80941Smrg                   && swizzle[3] >= PIPE_SWIZZLE_0) {
867848b8605Smrg                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
868b8e80941Smrg                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
869b8e80941Smrg                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
870848b8605Smrg               }
871848b8605Smrg               else {
872848b8605Smrg                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
873b8e80941Smrg                                imms[swizzle[2] - PIPE_SWIZZLE_0]);
874848b8605Smrg
875848b8605Smrg                  if (output_desc->nr_channels >= 4) {
876848b8605Smrg                     sse2_psrlq_imm(p->func, dataXMM, 48);
877848b8605Smrg                     sse2_movd(p->func, tmp, dataXMM);
878848b8605Smrg                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
879848b8605Smrg                  }
880848b8605Smrg               }
881848b8605Smrg            }
882848b8605Smrg         }
883848b8605Smrg      }
884848b8605Smrg      return TRUE;
885848b8605Smrg   }
886848b8605Smrg   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
887848b8605Smrg                    sizeof(output_desc->channel[0]))) {
888848b8605Smrg      struct x86_reg tmp = p->tmp_EAX;
889848b8605Smrg      unsigned i;
890848b8605Smrg
891848b8605Smrg      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
892848b8605Smrg          && output_desc->nr_channels == 4
893b8e80941Smrg          && swizzle[0] == PIPE_SWIZZLE_W
894b8e80941Smrg          && swizzle[1] == PIPE_SWIZZLE_Z
895b8e80941Smrg          && swizzle[2] == PIPE_SWIZZLE_Y
896b8e80941Smrg          && swizzle[3] == PIPE_SWIZZLE_X) {
897848b8605Smrg         /* TODO: support movbe */
898848b8605Smrg         x86_mov(p->func, tmp, src);
899848b8605Smrg         x86_bswap(p->func, tmp);
900848b8605Smrg         x86_mov(p->func, dst, tmp);
901848b8605Smrg         return TRUE;
902848b8605Smrg      }
903848b8605Smrg
904848b8605Smrg      for (i = 0; i < output_desc->nr_channels; ++i) {
905848b8605Smrg         switch (output_desc->channel[0].size) {
906848b8605Smrg         case 8:
907b8e80941Smrg            if (swizzle[i] >= PIPE_SWIZZLE_0) {
908848b8605Smrg               unsigned v = 0;
909b8e80941Smrg               if (swizzle[i] == PIPE_SWIZZLE_1) {
910848b8605Smrg                  switch (output_desc->channel[0].type) {
911848b8605Smrg                  case UTIL_FORMAT_TYPE_UNSIGNED:
912848b8605Smrg                     v = output_desc->channel[0].normalized ? 0xff : 1;
913848b8605Smrg                     break;
914848b8605Smrg                  case UTIL_FORMAT_TYPE_SIGNED:
915848b8605Smrg                     v = output_desc->channel[0].normalized ? 0x7f : 1;
916848b8605Smrg                     break;
917848b8605Smrg                  default:
918848b8605Smrg                     return FALSE;
919848b8605Smrg                  }
920848b8605Smrg               }
921848b8605Smrg               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
922848b8605Smrg            }
923848b8605Smrg            else {
924848b8605Smrg               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
925848b8605Smrg               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
926848b8605Smrg            }
927848b8605Smrg            break;
928848b8605Smrg         case 16:
929b8e80941Smrg            if (swizzle[i] >= PIPE_SWIZZLE_0) {
930848b8605Smrg               unsigned v = 0;
931b8e80941Smrg               if (swizzle[i] == PIPE_SWIZZLE_1) {
932848b8605Smrg                  switch (output_desc->channel[1].type) {
933848b8605Smrg                  case UTIL_FORMAT_TYPE_UNSIGNED:
934848b8605Smrg                     v = output_desc->channel[1].normalized ? 0xffff : 1;
935848b8605Smrg                     break;
936848b8605Smrg                  case UTIL_FORMAT_TYPE_SIGNED:
937848b8605Smrg                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
938848b8605Smrg                     break;
939848b8605Smrg                  case UTIL_FORMAT_TYPE_FLOAT:
940848b8605Smrg                     v = 0x3c00;
941848b8605Smrg                     break;
942848b8605Smrg                  default:
943848b8605Smrg                     return FALSE;
944848b8605Smrg                  }
945848b8605Smrg               }
946848b8605Smrg               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
947848b8605Smrg            }
948b8e80941Smrg            else if (swizzle[i] == PIPE_SWIZZLE_0) {
949848b8605Smrg               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
950848b8605Smrg            }
951848b8605Smrg            else {
952848b8605Smrg               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
953848b8605Smrg               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
954848b8605Smrg            }
955848b8605Smrg            break;
956848b8605Smrg         case 32:
957b8e80941Smrg            if (swizzle[i] >= PIPE_SWIZZLE_0) {
958848b8605Smrg               unsigned v = 0;
959b8e80941Smrg               if (swizzle[i] == PIPE_SWIZZLE_1) {
960848b8605Smrg                  switch (output_desc->channel[1].type) {
961848b8605Smrg                  case UTIL_FORMAT_TYPE_UNSIGNED:
962848b8605Smrg                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
963848b8605Smrg                     break;
964848b8605Smrg                  case UTIL_FORMAT_TYPE_SIGNED:
965848b8605Smrg                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
966848b8605Smrg                     break;
967848b8605Smrg                  case UTIL_FORMAT_TYPE_FLOAT:
968848b8605Smrg                     v = 0x3f800000;
969848b8605Smrg                     break;
970848b8605Smrg                  default:
971848b8605Smrg                     return FALSE;
972848b8605Smrg                  }
973848b8605Smrg               }
974848b8605Smrg               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
975848b8605Smrg            }
976848b8605Smrg            else {
977848b8605Smrg               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
978848b8605Smrg               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
979848b8605Smrg            }
980848b8605Smrg            break;
981848b8605Smrg         case 64:
982b8e80941Smrg            if (swizzle[i] >= PIPE_SWIZZLE_0) {
983848b8605Smrg               unsigned l = 0;
984848b8605Smrg               unsigned h = 0;
985b8e80941Smrg               if (swizzle[i] == PIPE_SWIZZLE_1) {
986848b8605Smrg                  switch (output_desc->channel[1].type) {
987848b8605Smrg                  case UTIL_FORMAT_TYPE_UNSIGNED:
988848b8605Smrg                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
989848b8605Smrg                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
990848b8605Smrg                     break;
991848b8605Smrg                  case UTIL_FORMAT_TYPE_SIGNED:
992848b8605Smrg                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
993848b8605Smrg                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
994848b8605Smrg                     break;
995848b8605Smrg                  case UTIL_FORMAT_TYPE_FLOAT:
996848b8605Smrg                     h = 0x3ff00000;
997848b8605Smrg                     l = 0;
998848b8605Smrg                     break;
999848b8605Smrg                  default:
1000848b8605Smrg                     return FALSE;
1001848b8605Smrg                  }
1002848b8605Smrg               }
1003848b8605Smrg               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
1004848b8605Smrg               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
1005848b8605Smrg            }
1006848b8605Smrg            else {
1007848b8605Smrg               if (x86_target_caps(p->func) & X86_SSE) {
1008848b8605Smrg                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
1009848b8605Smrg                  emit_load64(p, tmp, tmpXMM,
1010848b8605Smrg                              x86_make_disp(src, swizzle[i] * 8));
1011848b8605Smrg                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
1012848b8605Smrg               }
1013848b8605Smrg               else {
1014848b8605Smrg                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
1015848b8605Smrg                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1016848b8605Smrg                  x86_mov(p->func, tmp,
1017848b8605Smrg                          x86_make_disp(src, swizzle[i] * 8 + 4));
1018848b8605Smrg                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
1019848b8605Smrg               }
1020848b8605Smrg            }
1021848b8605Smrg            break;
1022848b8605Smrg         default:
1023848b8605Smrg            return FALSE;
1024848b8605Smrg         }
1025848b8605Smrg      }
1026848b8605Smrg      return TRUE;
1027848b8605Smrg   }
1028848b8605Smrg   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1029848b8605Smrg   else if ((x86_target_caps(p->func) & X86_SSE2) &&
1030848b8605Smrg            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
1031848b8605Smrg            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1032848b8605Smrg             || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
1033848b8605Smrg      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
1034848b8605Smrg
1035848b8605Smrg      /* load */
1036848b8605Smrg      sse_movups(p->func, dataXMM, src);
1037848b8605Smrg
1038848b8605Smrg      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
1039848b8605Smrg         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
1040848b8605Smrg      }
1041848b8605Smrg
1042848b8605Smrg      /* scale by 255.0 */
1043848b8605Smrg      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
1044848b8605Smrg
1045848b8605Smrg      /* pack and emit */
1046848b8605Smrg      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
1047848b8605Smrg      sse2_packssdw(p->func, dataXMM, dataXMM);
1048848b8605Smrg      sse2_packuswb(p->func, dataXMM, dataXMM);
1049848b8605Smrg      sse2_movd(p->func, dst, dataXMM);
1050848b8605Smrg
1051848b8605Smrg      return TRUE;
1052848b8605Smrg   }
1053848b8605Smrg
1054848b8605Smrg   return FALSE;
1055848b8605Smrg}
1056848b8605Smrg
1057848b8605Smrg
1058848b8605Smrgstatic boolean
1059848b8605Smrgtranslate_attr(struct translate_sse *p,
1060848b8605Smrg               const struct translate_element *a,
1061848b8605Smrg               struct x86_reg src, struct x86_reg dst)
1062848b8605Smrg{
1063848b8605Smrg   if (a->input_format == a->output_format) {
1064848b8605Smrg      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1065848b8605Smrg      return TRUE;
1066848b8605Smrg   }
1067848b8605Smrg
1068848b8605Smrg   return translate_attr_convert(p, a, src, dst);
1069848b8605Smrg}
1070848b8605Smrg
1071848b8605Smrg
1072848b8605Smrgstatic boolean
1073848b8605Smrginit_inputs(struct translate_sse *p, unsigned index_size)
1074848b8605Smrg{
1075848b8605Smrg   unsigned i;
1076848b8605Smrg   struct x86_reg instance_id =
1077848b8605Smrg      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
1078848b8605Smrg   struct x86_reg start_instance =
1079848b8605Smrg      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));
1080848b8605Smrg
1081848b8605Smrg   for (i = 0; i < p->nr_buffer_variants; i++) {
1082848b8605Smrg      struct translate_buffer_variant *variant = &p->buffer_variant[i];
1083848b8605Smrg      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];
1084848b8605Smrg
1085848b8605Smrg      if (!index_size || variant->instance_divisor) {
1086848b8605Smrg         struct x86_reg buf_max_index =
1087848b8605Smrg            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
1088848b8605Smrg         struct x86_reg buf_stride =
1089848b8605Smrg            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
1090848b8605Smrg         struct x86_reg buf_ptr =
1091848b8605Smrg            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
1092848b8605Smrg         struct x86_reg buf_base_ptr =
1093848b8605Smrg            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
1094848b8605Smrg         struct x86_reg elt = p->idx_ESI;
1095848b8605Smrg         struct x86_reg tmp_EAX = p->tmp_EAX;
1096848b8605Smrg
1097848b8605Smrg         /* Calculate pointer to first attrib:
1098848b8605Smrg          *   base_ptr + stride * index, where index depends on instance divisor
1099848b8605Smrg          */
1100848b8605Smrg         if (variant->instance_divisor) {
1101b8e80941Smrg            struct x86_reg tmp_EDX = p->tmp2_EDX;
1102b8e80941Smrg
1103848b8605Smrg            /* Start with instance = instance_id
1104848b8605Smrg             * which is true if divisor is 1.
1105848b8605Smrg             */
1106848b8605Smrg            x86_mov(p->func, tmp_EAX, instance_id);
1107848b8605Smrg
1108848b8605Smrg            if (variant->instance_divisor != 1) {
1109848b8605Smrg               struct x86_reg tmp_ECX = p->src_ECX;
1110848b8605Smrg
1111848b8605Smrg               /* TODO: Add x86_shr() to rtasm and use it whenever
1112848b8605Smrg                *       instance divisor is power of two.
1113848b8605Smrg                */
1114848b8605Smrg               x86_xor(p->func, tmp_EDX, tmp_EDX);
1115848b8605Smrg               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
1116848b8605Smrg               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
1117848b8605Smrg            }
1118848b8605Smrg
1119b8e80941Smrg            /* instance = (instance_id / divisor) + start_instance
1120b8e80941Smrg             */
1121b8e80941Smrg            x86_mov(p->func, tmp_EDX, start_instance);
1122b8e80941Smrg            x86_add(p->func, tmp_EAX, tmp_EDX);
1123b8e80941Smrg
1124848b8605Smrg            /* XXX we need to clamp the index here too, but to a
1125848b8605Smrg             * per-array max value, not the draw->pt.max_index value
1126848b8605Smrg             * that's being given to us via translate->set_buffer().
1127848b8605Smrg             */
1128848b8605Smrg         }
1129848b8605Smrg         else {
1130848b8605Smrg            x86_mov(p->func, tmp_EAX, elt);
1131848b8605Smrg
1132848b8605Smrg            /* Clamp to max_index
1133848b8605Smrg             */
1134848b8605Smrg            x86_cmp(p->func, tmp_EAX, buf_max_index);
1135848b8605Smrg            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
1136848b8605Smrg         }
1137848b8605Smrg
1138848b8605Smrg         x86_mov(p->func, p->tmp2_EDX, buf_stride);
1139848b8605Smrg         x64_rexw(p->func);
1140848b8605Smrg         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
1141848b8605Smrg         x64_rexw(p->func);
1142848b8605Smrg         x86_add(p->func, tmp_EAX, buf_base_ptr);
1143848b8605Smrg
1144848b8605Smrg         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1145848b8605Smrg
1146848b8605Smrg         /* In the linear case, keep the buffer pointer instead of the
1147848b8605Smrg          * index number.
1148848b8605Smrg          */
1149848b8605Smrg         if (!index_size && p->nr_buffer_variants == 1) {
1150848b8605Smrg            x64_rexw(p->func);
1151848b8605Smrg            x86_mov(p->func, elt, tmp_EAX);
1152848b8605Smrg         }
1153848b8605Smrg         else {
1154848b8605Smrg            x64_rexw(p->func);
1155848b8605Smrg            x86_mov(p->func, buf_ptr, tmp_EAX);
1156848b8605Smrg         }
1157848b8605Smrg      }
1158848b8605Smrg   }
1159848b8605Smrg
1160848b8605Smrg   return TRUE;
1161848b8605Smrg}
1162848b8605Smrg
1163848b8605Smrg
1164848b8605Smrgstatic struct x86_reg
1165848b8605Smrgget_buffer_ptr(struct translate_sse *p,
1166848b8605Smrg               unsigned index_size, unsigned var_idx, struct x86_reg elt)
1167848b8605Smrg{
1168848b8605Smrg   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1169848b8605Smrg      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
1170848b8605Smrg   }
1171848b8605Smrg   if (!index_size && p->nr_buffer_variants == 1) {
1172848b8605Smrg      return p->idx_ESI;
1173848b8605Smrg   }
1174848b8605Smrg   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
1175848b8605Smrg      struct x86_reg ptr = p->src_ECX;
1176848b8605Smrg      struct x86_reg buf_ptr =
1177848b8605Smrg         x86_make_disp(p->machine_EDI,
1178848b8605Smrg                       get_offset(p, &p->buffer_variant[var_idx].ptr));
1179848b8605Smrg
1180848b8605Smrg      x64_rexw(p->func);
1181848b8605Smrg      x86_mov(p->func, ptr, buf_ptr);
1182848b8605Smrg      return ptr;
1183848b8605Smrg   }
1184848b8605Smrg   else {
1185848b8605Smrg      struct x86_reg ptr = p->src_ECX;
1186848b8605Smrg      const struct translate_buffer_variant *variant =
1187848b8605Smrg         &p->buffer_variant[var_idx];
1188848b8605Smrg      struct x86_reg buf_stride =
1189848b8605Smrg         x86_make_disp(p->machine_EDI,
1190848b8605Smrg                       get_offset(p, &p->buffer[variant->buffer_index].stride));
1191848b8605Smrg      struct x86_reg buf_base_ptr =
1192848b8605Smrg         x86_make_disp(p->machine_EDI,
1193848b8605Smrg                  get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
1194848b8605Smrg      struct x86_reg buf_max_index =
1195848b8605Smrg         x86_make_disp(p->machine_EDI,
1196848b8605Smrg                  get_offset(p, &p->buffer[variant->buffer_index].max_index));
1197848b8605Smrg
1198848b8605Smrg      /* Calculate pointer to current attrib:
1199848b8605Smrg       */
1200848b8605Smrg      switch (index_size) {
1201848b8605Smrg      case 1:
1202848b8605Smrg         x86_movzx8(p->func, ptr, elt);
1203848b8605Smrg         break;
1204848b8605Smrg      case 2:
1205848b8605Smrg         x86_movzx16(p->func, ptr, elt);
1206848b8605Smrg         break;
1207848b8605Smrg      case 4:
1208848b8605Smrg         x86_mov(p->func, ptr, elt);
1209848b8605Smrg         break;
1210848b8605Smrg      }
1211848b8605Smrg
1212848b8605Smrg      /* Clamp to max_index
1213848b8605Smrg       */
1214848b8605Smrg      x86_cmp(p->func, ptr, buf_max_index);
1215848b8605Smrg      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);
1216848b8605Smrg
1217848b8605Smrg      x86_mov(p->func, p->tmp2_EDX, buf_stride);
1218848b8605Smrg      x64_rexw(p->func);
1219848b8605Smrg      x86_imul(p->func, ptr, p->tmp2_EDX);
1220848b8605Smrg      x64_rexw(p->func);
1221848b8605Smrg      x86_add(p->func, ptr, buf_base_ptr);
1222848b8605Smrg      return ptr;
1223848b8605Smrg   }
1224848b8605Smrg}
1225848b8605Smrg
1226848b8605Smrg
1227848b8605Smrgstatic boolean
1228848b8605Smrgincr_inputs(struct translate_sse *p, unsigned index_size)
1229848b8605Smrg{
1230848b8605Smrg   if (!index_size && p->nr_buffer_variants == 1) {
1231848b8605Smrg      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
1232848b8605Smrg      struct x86_reg stride =
1233848b8605Smrg         x86_make_disp(p->machine_EDI,
1234848b8605Smrg                       get_offset(p, &p->buffer[buffer_index].stride));
1235848b8605Smrg
1236848b8605Smrg      if (p->buffer_variant[0].instance_divisor == 0) {
1237848b8605Smrg         x64_rexw(p->func);
1238848b8605Smrg         x86_add(p->func, p->idx_ESI, stride);
1239848b8605Smrg         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1240848b8605Smrg      }
1241848b8605Smrg   }
1242848b8605Smrg   else if (!index_size) {
1243848b8605Smrg      unsigned i;
1244848b8605Smrg
1245848b8605Smrg      /* Is this worthwhile??
1246848b8605Smrg       */
1247848b8605Smrg      for (i = 0; i < p->nr_buffer_variants; i++) {
1248848b8605Smrg         struct translate_buffer_variant *variant = &p->buffer_variant[i];
1249848b8605Smrg         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1250848b8605Smrg                                                get_offset(p, &variant->ptr));
1251848b8605Smrg      struct x86_reg buf_stride =
1252848b8605Smrg         x86_make_disp(p->machine_EDI,
1253848b8605Smrg                       get_offset(p, &p->buffer[variant->buffer_index].stride));
1254848b8605Smrg
1255848b8605Smrg         if (variant->instance_divisor == 0) {
1256848b8605Smrg            x86_mov(p->func, p->tmp_EAX, buf_stride);
1257848b8605Smrg            x64_rexw(p->func);
1258848b8605Smrg            x86_add(p->func, p->tmp_EAX, buf_ptr);
1259848b8605Smrg            if (i == 0)
1260848b8605Smrg               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1261848b8605Smrg            x64_rexw(p->func);
1262848b8605Smrg            x86_mov(p->func, buf_ptr, p->tmp_EAX);
1263848b8605Smrg         }
1264848b8605Smrg      }
1265848b8605Smrg   }
1266848b8605Smrg   else {
1267848b8605Smrg      x64_rexw(p->func);
1268848b8605Smrg      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1269848b8605Smrg   }
1270848b8605Smrg
1271848b8605Smrg   return TRUE;
1272848b8605Smrg}
1273848b8605Smrg
1274848b8605Smrg
1275848b8605Smrg/* Build run( struct translate *machine,
1276848b8605Smrg *            unsigned start,
1277848b8605Smrg *            unsigned count,
1278848b8605Smrg *            void *output_buffer )
1279848b8605Smrg * or
1280848b8605Smrg *  run_elts( struct translate *machine,
1281848b8605Smrg *            unsigned *elts,
1282848b8605Smrg *            unsigned count,
1283848b8605Smrg *            void *output_buffer )
1284848b8605Smrg *
1285848b8605Smrg *  Lots of hardcoding
1286848b8605Smrg *
1287848b8605Smrg * EAX -- pointer to current output vertex
1288848b8605Smrg * ECX -- pointer to current attribute
1289848b8605Smrg *
1290848b8605Smrg */
1291848b8605Smrgstatic boolean
1292848b8605Smrgbuild_vertex_emit(struct translate_sse *p,
1293848b8605Smrg                  struct x86_function *func, unsigned index_size)
1294848b8605Smrg{
1295848b8605Smrg   int fixup, label;
1296848b8605Smrg   unsigned j;
1297848b8605Smrg
1298848b8605Smrg   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
1299848b8605Smrg   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
1300848b8605Smrg
1301848b8605Smrg   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
1302848b8605Smrg   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
1303848b8605Smrg   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
1304848b8605Smrg   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
1305848b8605Smrg   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
1306848b8605Smrg   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
1307848b8605Smrg   p->src_ECX = x86_make_reg(file_REG32, reg_CX);
1308848b8605Smrg
1309848b8605Smrg   p->func = func;
1310848b8605Smrg
1311848b8605Smrg   x86_init_func(p->func);
1312848b8605Smrg
1313848b8605Smrg   if (x86_target(p->func) == X86_64_WIN64_ABI) {
1314848b8605Smrg      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
1315848b8605Smrg       * above the return address
1316848b8605Smrg       */
1317848b8605Smrg      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
1318848b8605Smrg                  x86_make_reg(file_XMM, 6));
1319848b8605Smrg      sse2_movdqa(p->func,
1320848b8605Smrg                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
1321848b8605Smrg                  x86_make_reg(file_XMM, 7));
1322848b8605Smrg   }
1323848b8605Smrg
1324848b8605Smrg   x86_push(p->func, p->outbuf_EBX);
1325848b8605Smrg   x86_push(p->func, p->count_EBP);
1326848b8605Smrg
1327848b8605Smrg   /* on non-Win64 x86-64, these are already in the right registers */
1328848b8605Smrg   if (x86_target(p->func) != X86_64_STD_ABI) {
1329848b8605Smrg      x86_push(p->func, p->machine_EDI);
1330848b8605Smrg      x86_push(p->func, p->idx_ESI);
1331848b8605Smrg
1332848b8605Smrg      if (x86_target(p->func) != X86_32) {
1333848b8605Smrg         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1334848b8605Smrg         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1335848b8605Smrg      }
1336848b8605Smrg      else {
1337848b8605Smrg         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1338848b8605Smrg         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1339848b8605Smrg      }
1340848b8605Smrg   }
1341848b8605Smrg
1342848b8605Smrg   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
1343848b8605Smrg
1344848b8605Smrg   if (x86_target(p->func) != X86_32)
1345848b8605Smrg      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
1346848b8605Smrg   else
1347848b8605Smrg      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
1348848b8605Smrg
1349848b8605Smrg   /* Load instance ID.
1350848b8605Smrg    */
1351848b8605Smrg   if (p->use_instancing) {
1352848b8605Smrg      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
1353848b8605Smrg      x86_mov(p->func,
1354848b8605Smrg              x86_make_disp(p->machine_EDI,
1355848b8605Smrg                            get_offset(p, &p->start_instance)), p->tmp2_EDX);
1356848b8605Smrg
1357848b8605Smrg      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
1358848b8605Smrg      x86_mov(p->func,
1359848b8605Smrg              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1360848b8605Smrg              p->tmp_EAX);
1361848b8605Smrg   }
1362848b8605Smrg
1363848b8605Smrg   /* Get vertex count, compare to zero
1364848b8605Smrg    */
1365848b8605Smrg   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
1366848b8605Smrg   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1367848b8605Smrg   fixup = x86_jcc_forward(p->func, cc_E);
1368848b8605Smrg
1369848b8605Smrg   /* always load, needed or not:
1370848b8605Smrg    */
1371848b8605Smrg   init_inputs(p, index_size);
1372848b8605Smrg
1373848b8605Smrg   /* Note address for loop jump
1374848b8605Smrg    */
1375848b8605Smrg   label = x86_get_label(p->func);
1376848b8605Smrg   {
1377848b8605Smrg      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
1378848b8605Smrg      int last_variant = -1;
1379848b8605Smrg      struct x86_reg vb;
1380848b8605Smrg
1381848b8605Smrg      for (j = 0; j < p->translate.key.nr_elements; j++) {
1382848b8605Smrg         const struct translate_element *a = &p->translate.key.element[j];
1383848b8605Smrg         unsigned variant = p->element_to_buffer_variant[j];
1384848b8605Smrg
1385848b8605Smrg         /* Figure out source pointer address:
1386848b8605Smrg          */
1387848b8605Smrg         if (variant != last_variant) {
1388848b8605Smrg            last_variant = variant;
1389848b8605Smrg            vb = get_buffer_ptr(p, index_size, variant, elt);
1390848b8605Smrg         }
1391848b8605Smrg
1392848b8605Smrg         if (!translate_attr(p, a,
1393848b8605Smrg                             x86_make_disp(vb, a->input_offset),
1394848b8605Smrg                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
1395848b8605Smrg            return FALSE;
1396848b8605Smrg      }
1397848b8605Smrg
1398848b8605Smrg      /* Next output vertex:
1399848b8605Smrg       */
1400848b8605Smrg      x64_rexw(p->func);
1401848b8605Smrg      x86_lea(p->func, p->outbuf_EBX,
1402848b8605Smrg              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));
1403848b8605Smrg
1404848b8605Smrg      /* Incr index
1405848b8605Smrg       */
1406848b8605Smrg      incr_inputs(p, index_size);
1407848b8605Smrg   }
1408848b8605Smrg
1409848b8605Smrg   /* decr count, loop if not zero
1410848b8605Smrg    */
1411848b8605Smrg   x86_dec(p->func, p->count_EBP);
1412848b8605Smrg   x86_jcc(p->func, cc_NZ, label);
1413848b8605Smrg
1414848b8605Smrg   /* Exit mmx state?
1415848b8605Smrg    */
1416848b8605Smrg   if (p->func->need_emms)
1417848b8605Smrg      mmx_emms(p->func);
1418848b8605Smrg
1419848b8605Smrg   /* Land forward jump here:
1420848b8605Smrg    */
1421848b8605Smrg   x86_fixup_fwd_jump(p->func, fixup);
1422848b8605Smrg
1423848b8605Smrg   /* Pop regs and return
1424848b8605Smrg    */
1425848b8605Smrg   if (x86_target(p->func) != X86_64_STD_ABI) {
1426848b8605Smrg      x86_pop(p->func, p->idx_ESI);
1427848b8605Smrg      x86_pop(p->func, p->machine_EDI);
1428848b8605Smrg   }
1429848b8605Smrg
1430848b8605Smrg   x86_pop(p->func, p->count_EBP);
1431848b8605Smrg   x86_pop(p->func, p->outbuf_EBX);
1432848b8605Smrg
1433848b8605Smrg   if (x86_target(p->func) == X86_64_WIN64_ABI) {
1434848b8605Smrg      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
1435848b8605Smrg                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1436848b8605Smrg      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
1437848b8605Smrg                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
1438848b8605Smrg   }
1439848b8605Smrg   x86_ret(p->func);
1440848b8605Smrg
1441848b8605Smrg   return TRUE;
1442848b8605Smrg}
1443848b8605Smrg
1444848b8605Smrg
1445848b8605Smrgstatic void
1446848b8605Smrgtranslate_sse_set_buffer(struct translate *translate,
1447848b8605Smrg                         unsigned buf,
1448848b8605Smrg                         const void *ptr, unsigned stride, unsigned max_index)
1449848b8605Smrg{
1450848b8605Smrg   struct translate_sse *p = (struct translate_sse *) translate;
1451848b8605Smrg
1452848b8605Smrg   if (buf < p->nr_buffers) {
1453848b8605Smrg      p->buffer[buf].base_ptr = (char *) ptr;
1454848b8605Smrg      p->buffer[buf].stride = stride;
1455848b8605Smrg      p->buffer[buf].max_index = max_index;
1456848b8605Smrg   }
1457848b8605Smrg
1458848b8605Smrg   if (0)
1459848b8605Smrg      debug_printf("%s %d/%d: %p %d\n",
1460848b8605Smrg                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
1461848b8605Smrg}
1462848b8605Smrg
1463848b8605Smrg
1464848b8605Smrgstatic void
1465848b8605Smrgtranslate_sse_release(struct translate *translate)
1466848b8605Smrg{
1467848b8605Smrg   struct translate_sse *p = (struct translate_sse *) translate;
1468848b8605Smrg
1469848b8605Smrg   x86_release_func(&p->elt8_func);
1470848b8605Smrg   x86_release_func(&p->elt16_func);
1471848b8605Smrg   x86_release_func(&p->elt_func);
1472848b8605Smrg   x86_release_func(&p->linear_func);
1473848b8605Smrg
1474848b8605Smrg   os_free_aligned(p);
1475848b8605Smrg}
1476848b8605Smrg
1477848b8605Smrg
1478848b8605Smrgstruct translate *
1479848b8605Smrgtranslate_sse2_create(const struct translate_key *key)
1480848b8605Smrg{
1481848b8605Smrg   struct translate_sse *p = NULL;
1482848b8605Smrg   unsigned i;
1483848b8605Smrg
1484848b8605Smrg   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1485848b8605Smrg   if (!rtasm_cpu_has_sse())
1486848b8605Smrg      goto fail;
1487848b8605Smrg
1488848b8605Smrg   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
1489b8e80941Smrg   if (!p)
1490848b8605Smrg      goto fail;
1491848b8605Smrg
1492848b8605Smrg   memset(p, 0, sizeof(*p));
1493848b8605Smrg   memcpy(p->consts, consts, sizeof(consts));
1494848b8605Smrg
1495848b8605Smrg   p->translate.key = *key;
1496848b8605Smrg   p->translate.release = translate_sse_release;
1497848b8605Smrg   p->translate.set_buffer = translate_sse_set_buffer;
1498848b8605Smrg
1499848b8605Smrg   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);
1500848b8605Smrg
1501848b8605Smrg   for (i = 0; i < key->nr_elements; i++) {
1502848b8605Smrg      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1503848b8605Smrg         unsigned j;
1504848b8605Smrg
1505848b8605Smrg         p->nr_buffers =
1506848b8605Smrg            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1507848b8605Smrg
1508848b8605Smrg         if (key->element[i].instance_divisor) {
1509848b8605Smrg            p->use_instancing = TRUE;
1510848b8605Smrg         }
1511848b8605Smrg
1512848b8605Smrg         /*
1513848b8605Smrg          * Map vertex element to vertex buffer variant.
1514848b8605Smrg          */
1515848b8605Smrg         for (j = 0; j < p->nr_buffer_variants; j++) {
1516848b8605Smrg            if (p->buffer_variant[j].buffer_index ==
1517848b8605Smrg                key->element[i].input_buffer
1518848b8605Smrg                && p->buffer_variant[j].instance_divisor ==
1519848b8605Smrg                key->element[i].instance_divisor) {
1520848b8605Smrg               break;
1521848b8605Smrg            }
1522848b8605Smrg         }
1523848b8605Smrg         if (j == p->nr_buffer_variants) {
1524848b8605Smrg            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
1525848b8605Smrg            p->buffer_variant[j].instance_divisor =
1526848b8605Smrg               key->element[i].instance_divisor;
1527848b8605Smrg            p->nr_buffer_variants++;
1528848b8605Smrg         }
1529848b8605Smrg         p->element_to_buffer_variant[i] = j;
1530848b8605Smrg      }
1531848b8605Smrg      else {
1532848b8605Smrg         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1533848b8605Smrg
1534848b8605Smrg         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
1535848b8605Smrg      }
1536848b8605Smrg   }
1537848b8605Smrg
1538848b8605Smrg   if (0)
1539848b8605Smrg      debug_printf("nr_buffers: %d\n", p->nr_buffers);
1540848b8605Smrg
1541848b8605Smrg   if (!build_vertex_emit(p, &p->linear_func, 0))
1542848b8605Smrg      goto fail;
1543848b8605Smrg
1544848b8605Smrg   if (!build_vertex_emit(p, &p->elt_func, 4))
1545848b8605Smrg      goto fail;
1546848b8605Smrg
1547848b8605Smrg   if (!build_vertex_emit(p, &p->elt16_func, 2))
1548848b8605Smrg      goto fail;
1549848b8605Smrg
1550848b8605Smrg   if (!build_vertex_emit(p, &p->elt8_func, 1))
1551848b8605Smrg      goto fail;
1552848b8605Smrg
1553848b8605Smrg   p->translate.run = (run_func) x86_get_func(&p->linear_func);
1554848b8605Smrg   if (p->translate.run == NULL)
1555848b8605Smrg      goto fail;
1556848b8605Smrg
1557848b8605Smrg   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
1558848b8605Smrg   if (p->translate.run_elts == NULL)
1559848b8605Smrg      goto fail;
1560848b8605Smrg
1561848b8605Smrg   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
1562848b8605Smrg   if (p->translate.run_elts16 == NULL)
1563848b8605Smrg      goto fail;
1564848b8605Smrg
1565848b8605Smrg   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
1566848b8605Smrg   if (p->translate.run_elts8 == NULL)
1567848b8605Smrg      goto fail;
1568848b8605Smrg
1569848b8605Smrg   return &p->translate;
1570848b8605Smrg
1571848b8605Smrg fail:
1572848b8605Smrg   if (p)
1573848b8605Smrg      translate_sse_release(&p->translate);
1574848b8605Smrg
1575848b8605Smrg   return NULL;
1576848b8605Smrg}
1577848b8605Smrg
1578848b8605Smrg
1579848b8605Smrg#else
1580848b8605Smrg
/* Fallback used when the x86 code generator is not compiled in:
 * always returns NULL.
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   (void) key;   /* intentionally unused */

   return NULL;
}
1586848b8605Smrg
1587848b8605Smrg#endif
1588