/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@vmware.com>
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/format/u_format.h"

#include "translate.h"


#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(EMBEDDED_DEVICE)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X    0
#define Y    1
#define Z    2
#define W    3

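/* These channel indices are packed into shufps immediates via the SHUF()
 * macro from rtasm_x86sse.h, two bits per output channel:
 *
 *    SHUF(x, y, z, w) == (x << 0) | (y << 2) | (z << 4) | (w << 6)
 *
 * so SHUF(X, Y, Z, W) == 0xE4 is the identity shuffle.
 */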

struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_variant
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID  1001

#define NUM_CONSTS 7

enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};

#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};

#undef C
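
/* The CONST_INV_* entries are the scale factors for normalized integer
 * formats: an SNORM8 value v in [-127, 127] maps to v * (1 / 127.0) in
 * [-1.0, 1.0].  CONST_IDENTITY doubles as the (0, 0, 0, 1) padding vector
 * for missing W channels and, via its all-zero low quadword, as a source
 * of zero bits for the integer unpack sequences below.
 */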

struct translate_sse
{
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};


static int
get_offset(const void *a, const void *b)
{
   return (const char *) b - (const char *) a;
}


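/* Return an XMM register holding the requested constant, reusing an
 * already-loaded register when possible.  XMM0 and XMM1 are reserved as
 * scratch by the load/convert helpers, so constants are cached in
 * XMM2-XMM7 only; reg_to_const[] and const_to_reg[] track the mapping in
 * both directions, with -1 meaning unassigned.
 */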
static struct x86_reg
get_const(struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   if (p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   for (i = 2; i < 8; ++i) {
      if (p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if (i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   if (p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->consts[id][0])));

   return reg;
}


/* load the data in a SSE2 register, padding with zeros */
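/* The odd sizes (3, 6 and 12 bytes) are assembled from narrower loads,
 * which avoids reading past the end of a tightly sized vertex buffer.
 */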
static boolean
emit_load_sse2(struct translate_sse *p,
               struct x86_reg data, struct x86_reg src, unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch (size) {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}


/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5


/* This function loads `chans` float values and pads the register
 * with zeroes at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, the fourth value is padded
 * with 1.  Only pass this value if chans < 4; otherwise results are
 * undefined.
 */
static void
emit_load_float32(struct translate_sse *p, struct x86_reg data,
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   switch (chans) {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}

/* This function behaves like emit_load_float32, but loads 64-bit
 * floating point numbers, converting them to 32-bit ones.
 */
static void
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch (chans) {
   case 1:
      sse2_movsd(p->func, data, arg0);
      if (out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 3:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if (out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 4:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}


static void
emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr,
           struct x86_reg dst_xmm, struct x86_reg src_gpr,
           struct x86_reg src_xmm)
{
   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, dst_gpr, src_gpr);
   else {
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
      if (x86_target_caps(p->func) & X86_SSE2)
         sse2_movq(p->func, dst_xmm, src_xmm);
      else
         sse_movlps(p->func, dst_xmm, src_xmm);
   }
}


static void
emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
            struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}


static void
emit_store64(struct translate_sse *p, struct x86_reg dst,
             struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}


static void
emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
{
   if (x86_target_caps(p->func) & X86_SSE2)
      sse2_movdqu(p->func, dst, src);
   else
      sse_movups(p->func, dst, src);
}


/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void
emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src,
            unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if (size < 8) {
      switch (size) {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if (!(x86_target_caps(p->func) & X86_SSE)) {
      unsigned i = 0;
      assert((size & 3) == 0);
      for (i = 0; i < size; i += 4) {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else {
      switch (size) {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}
static boolean
translate_attr_convert(struct translate_sse *p,
                       const struct translate_element *a,
                       struct x86_reg src, struct x86_reg dst)
{
   const struct util_format_description *input_desc =
      util_format_description(a->input_format);
   const struct util_format_description *output_desc =
      util_format_description(a->output_format);
   unsigned i;
   boolean id_swizzle = TRUE;
   unsigned swizzle[4] =
      { PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE,
        PIPE_SWIZZLE_NONE, PIPE_SWIZZLE_NONE };
   unsigned needed_chans = 0;
   unsigned imms[2] = { 0, 0x3f800000 };

   if (a->output_format == PIPE_FORMAT_NONE
       || a->input_format == PIPE_FORMAT_NONE)
      return FALSE;

   if (input_desc->channel[0].size & 7)
      return FALSE;

   if (input_desc->colorspace != output_desc->colorspace)
      return FALSE;

   for (i = 1; i < input_desc->nr_channels; ++i) {
      if (memcmp
          (&input_desc->channel[i], &input_desc->channel[0],
           sizeof(input_desc->channel[0])))
         return FALSE;
   }

   for (i = 1; i < output_desc->nr_channels; ++i) {
      if (memcmp
          (&output_desc->channel[i], &output_desc->channel[0],
           sizeof(output_desc->channel[0]))) {
         return FALSE;
      }
   }

   for (i = 0; i < output_desc->nr_channels; ++i) {
      if (output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

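   /* From here on, swizzle[i] names the input channel feeding output
    * channel i, with PIPE_SWIZZLE_0/PIPE_SWIZZLE_1 standing for constant
    * 0 and 1; imms[] holds their bit patterns for the float path
    * (0x3f800000 == 1.0f).
    */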
   if ((x86_target_caps(p->func) & X86_SSE) &&
       (0 || a->output_format == PIPE_FORMAT_R32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
        || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
             && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if (needed_chans > 0) {
         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch (input_desc->channel[0].size) {
            case 8:
               /* TODO: this may be inefficient due to the identity constant
                * being used both as a float and an integer register.
                */
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32:           /* we lose precision here */
               sse2_psrld_imm(p->func, dataXMM, 1);
               break;
            default:
               return FALSE;
            }
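            /* cvtdq2ps treats its input as signed, so the psrld above
             * halved unsigned 32-bit values to fit in 31 bits; the addps
             * in the non-normalized path below doubles the result back.
             */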
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_255);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            else if (input_desc->channel[0].size == 32)
               /* compensate for the bit we threw away to fit u32 into s32 */
               sse_addps(p->func, dataXMM, dataXMM);
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src,
                           input_desc->channel[0].size *
                           input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch (input_desc->channel[0].size) {
            case 8:
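               /* Replicate each byte into the top byte of its dword, then
                * sign-extend with an arithmetic shift right by 24.
                */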
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32:           /* we lose precision here */
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if (input_desc->channel[0].normalized) {
               struct x86_reg factor;
               switch (input_desc->channel[0].size) {
               case 8:
                  factor = get_const(p, CONST_INV_127);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_32767);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            break;
         case UTIL_FORMAT_TYPE_FLOAT:
            if (input_desc->channel[0].size != 32
                && input_desc->channel[0].size != 64) {
               return FALSE;
            }
            if (swizzle[3] == PIPE_SWIZZLE_1
                && input_desc->nr_channels <= 3) {
               swizzle[3] = PIPE_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch (input_desc->channel[0].size) {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans,
                                 input_desc->nr_channels);
               break;
            case 64:           /* we lose precision here */
               if (!(x86_target_caps(p->func) & X86_SSE2))
                  return FALSE;
               emit_load_float64to32(p, dataXMM, src, needed_chans,
                                     input_desc->nr_channels);
               break;
            default:
               return FALSE;
            }
            break;
         default:
            return FALSE;
         }

         if (!id_swizzle) {
            sse_shufps(p->func, dataXMM, dataXMM,
                       SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]));
         }
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse_movups(p->func, dst, dataXMM);
      }
      else {
         if (output_desc->nr_channels >= 2
             && swizzle[0] < PIPE_SWIZZLE_0
             && swizzle[1] < PIPE_SWIZZLE_0) {
            sse_movlps(p->func, dst, dataXMM);
         }
         else {
            if (swizzle[0] < PIPE_SWIZZLE_0) {
               sse_movss(p->func, dst, dataXMM);
            }
            else {
               x86_mov_imm(p->func, dst,
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }

            if (output_desc->nr_channels >= 2) {
               if (swizzle[1] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              imms[swizzle[1] - PIPE_SWIZZLE_0]);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (output_desc->nr_channels >= 4
                && swizzle[2] < PIPE_SWIZZLE_0
                && swizzle[3] < PIPE_SWIZZLE_0) {
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            }
            else {
               if (swizzle[2] < PIPE_SWIZZLE_0) {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else {
                  x86_mov_imm(p->func, x86_make_disp(dst, 8),
                              imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }

               if (output_desc->nr_channels >= 4) {
                  if (swizzle[3] < PIPE_SWIZZLE_0) {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else {
                     x86_mov_imm(p->func, x86_make_disp(dst, 12),
                                 imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if ((x86_target_caps(p->func) & X86_SSE2)
            && input_desc->channel[0].size == 8
            && output_desc->channel[0].size == 16
            && output_desc->channel[0].normalized ==
            input_desc->channel[0].normalized &&
            (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                   && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
             || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED
                 && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = { 0, 1 };

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] == PIPE_SWIZZLE_0
             && i >= input_desc->nr_channels) {
            swizzle[i] = i;
         }
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         if (swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if (swizzle[i] < PIPE_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if (needed_chans > 0) {
         emit_load_sse2(p, dataXMM, src,
                        input_desc->channel[0].size *
                        input_desc->nr_channels >> 3);

         switch (input_desc->channel[0].type) {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if (input_desc->channel[0].normalized) {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if (input_desc->channel[0].normalized) {
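               /* Widen snorm8 to snorm16: put each byte in the top byte of
                * a zero word, then OR in shifted copies of its low bits so
                * the magnitude scales by roughly 257 (0x7f -> ~0x7fff).
                */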
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if (output_desc->channel[0].normalized)
            imms[1] =
               (output_desc->channel[0].type ==
                UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if (!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM,
                         (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) |
                         ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if (output_desc->nr_channels >= 4
          && swizzle[0] < PIPE_SWIZZLE_0
          && swizzle[1] < PIPE_SWIZZLE_0
          && swizzle[2] < PIPE_SWIZZLE_0
          && swizzle[3] < PIPE_SWIZZLE_0) {
         sse2_movq(p->func, dst, dataXMM);
      }
      else {
         if (swizzle[0] < PIPE_SWIZZLE_0) {
            if (output_desc->nr_channels >= 2
                && swizzle[1] < PIPE_SWIZZLE_0) {
               sse2_movd(p->func, dst, dataXMM);
            }
            else {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if (output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2),
                                imms[swizzle[1] - PIPE_SWIZZLE_0]);
            }
         }
         else {
            if (output_desc->nr_channels >= 2
                && swizzle[1] >= PIPE_SWIZZLE_0) {
               x86_mov_imm(p->func, dst,
                           (imms[swizzle[1] - PIPE_SWIZZLE_0] << 16) |
                           imms[swizzle[0] - PIPE_SWIZZLE_0]);
            }
            else {
               x86_mov16_imm(p->func, dst,
                             imms[swizzle[0] - PIPE_SWIZZLE_0]);
               if (output_desc->nr_channels >= 2) {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if (output_desc->nr_channels >= 3) {
            if (swizzle[2] < PIPE_SWIZZLE_0) {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] < PIPE_SWIZZLE_0) {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if (output_desc->nr_channels >= 4) {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6),
                                   imms[swizzle[3] - PIPE_SWIZZLE_0]);
                  }
               }
            }
            else {
               if (output_desc->nr_channels >= 4
                   && swizzle[3] >= PIPE_SWIZZLE_0) {
                  x86_mov_imm(p->func, x86_make_disp(dst, 4),
                              (imms[swizzle[3] - PIPE_SWIZZLE_0] << 16)
                              | imms[swizzle[2] - PIPE_SWIZZLE_0]);
               }
               else {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4),
                                imms[swizzle[2] - PIPE_SWIZZLE_0]);

                  if (output_desc->nr_channels >= 4) {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0],
                    sizeof(output_desc->channel[0]))) {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;

      if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4
          && output_desc->nr_channels == 4
          && swizzle[0] == PIPE_SWIZZLE_W
          && swizzle[1] == PIPE_SWIZZLE_Z
          && swizzle[2] == PIPE_SWIZZLE_Y
          && swizzle[3] == PIPE_SWIZZLE_X) {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return TRUE;
      }

      for (i = 0; i < output_desc->nr_channels; ++i) {
         switch (output_desc->channel[0].size) {
         case 8:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[0].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned v = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if (swizzle[i] >= PIPE_SWIZZLE_0) {
               unsigned l = 0;
               unsigned h = 0;
               if (swizzle[i] == PIPE_SWIZZLE_1) {
                  switch (output_desc->channel[1].type) {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else {
               if (x86_target_caps(p->func) & X86_SSE) {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM,
                              x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp,
                          x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return FALSE;
         }
      }
      return TRUE;
   }
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   else if ((x86_target_caps(p->func) & X86_SSE2) &&
            a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT &&
            (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
             || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      /* load */
      sse_movups(p->func, dataXMM, src);

      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3));
      }

      /* scale by 255.0 */
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));

      /* pack and emit */
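      /* cvtps2dq rounds to nearest (in the default MXCSR mode); the two
       * saturating packs then clamp each value into [0, 255] on the way
       * down to bytes.
       */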
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
      sse2_packssdw(p->func, dataXMM, dataXMM);
      sse2_packuswb(p->func, dataXMM, dataXMM);
      sse2_movd(p->func, dst, dataXMM);

      return TRUE;
   }

   return FALSE;
}


static boolean
translate_attr(struct translate_sse *p,
               const struct translate_element *a,
               struct x86_reg src, struct x86_reg dst)
{
   if (a->input_format == a->output_format) {
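      /* Same layout in and out: copy the size of one element,
       * i.e. util_format_get_stride(format, 1) bytes, verbatim.
       */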
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return TRUE;
   }

   return translate_attr_convert(p, a, src, dst);
}


static boolean
init_inputs(struct translate_sse *p, unsigned index_size)
{
   unsigned i;
   struct x86_reg instance_id =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   struct x86_reg start_instance =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            struct x86_reg tmp_EDX = p->tmp2_EDX;

            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);       /* EAX = EDX:EAX / ECX */
            }

            /* instance = (instance_id / divisor) + start_instance
             */
            x86_mov(p->func, tmp_EDX, start_instance);
            x86_add(p->func, tmp_EAX, tmp_EDX);

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}


static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
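   /* TRANSLATE_ELEMENT_INSTANCE_ID elements have no vertex buffer: return
    * the address of the instance_id field itself, so the attribute fetch
    * reads the current instance ID.
    */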
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                  get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                  get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}


static boolean
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if (!index_size && p->nr_buffer_variants == 1) {
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned start_instance,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            unsigned start_instance,
 *            unsigned instance_id,
 *            void *output_buffer )
 *
 * Lots of hardcoding.
 *
 * EBX -- pointer to current output vertex
 * ECX -- pointer to current attribute's source data
 */
static boolean
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
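   /* 0xff bytes read back as -1 through the int8_t maps, marking every
    * constant/register slot as unused.
    */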

   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address
       */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}


static void
translate_sse_set_buffer(struct translate *translate,
                         unsigned buf,
                         const void *ptr, unsigned stride, unsigned max_index)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *) ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0)
      debug_printf("%s %d/%d: %p %d\n",
                   __FUNCTION__, buf, p->nr_buffers, ptr, stride);
}


static void
translate_sse_release(struct translate *translate)
{
   struct translate_sse *p = (struct translate_sse *) translate;

   x86_release_func(&p->elt8_func);
   x86_release_func(&p->elt16_func);
   x86_release_func(&p->elt_func);
   x86_release_func(&p->linear_func);

   os_free_aligned(p);
}


struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (!p)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}


#else

struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}

#endif