translate_sse.c revision cdc920a0
/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"

#include "translate.h"


#if defined(PIPE_ARCH_X86)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X    0
#define Y    1
#define Z    2
#define W    3


typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
                                     unsigned instance_id,
                                     void *output_buffer);

typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer);

struct translate_buffer {
   const void *base_ptr;
   unsigned stride;
};

struct translate_buffer_varient {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                    /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID  1001


struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function *func;

   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   float identity[4];
   float float_255[4];
   float inv_255[4];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;

   run_func      gen_run;
   run_elts_func gen_run_elts;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
   struct x86_reg outbuf_ECX;
   struct x86_reg machine_EDX;
   struct x86_reg count_ESI;    /* decrements to zero */
};

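/* The generated functions receive a pointer to this struct as their first
 * argument and keep it in EDX for the whole run.  All runtime state --
 * buffer pointers and strides, the float constants above and the current
 * instance id -- is addressed through fixed displacements from that
 * register, computed at code-generation time by get_offset() below.
 */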
static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}



static struct x86_reg get_identity( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 6);

   if (!p->loaded_identity) {
      p->loaded_identity = TRUE;
      p->identity[0] = 0;
      p->identity[1] = 0;
      p->identity[2] = 0;
      p->identity[3] = 1;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->identity[0])));
   }

   return reg;
}

static struct x86_reg get_255( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 7);

   if (!p->loaded_255) {
      p->loaded_255 = TRUE;
      p->float_255[0] =
         p->float_255[1] =
         p->float_255[2] =
         p->float_255[3] = 255.0f;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->float_255[0])));
   }

   return reg;
}

static struct x86_reg get_inv_255( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 5);

   if (!p->loaded_inv_255) {
      p->loaded_inv_255 = TRUE;
      p->inv_255[0] =
         p->inv_255[1] =
         p->inv_255[2] =
         p->inv_255[3] = 1.0f / 255.0f;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->inv_255[0])));
   }

   return reg;
}

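/* emit_load_*(): fetch one source attribute into an XMM register as four
 * floats.  Float formats with fewer than four components are padded out
 * with the (0, 0, 0, 1) vector held in the register returned by
 * get_identity().
 */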
static void emit_load_R32G32B32A32( struct translate_sse *p,
                                    struct x86_reg data,
                                    struct x86_reg arg0 )
{
   sse_movups(p->func, data, arg0);
}

static void emit_load_R32G32B32( struct translate_sse *p,
                                 struct x86_reg data,
                                 struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(p->func, data, x86_make_disp(arg0, 8));
   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
   sse_movlps(p->func, data, arg0);
}

static void emit_load_R32G32( struct translate_sse *p,
                              struct x86_reg data,
                              struct x86_reg arg0 )
{
   /* 0 0 0 1
    * a b 0 1
    */
   sse_movups(p->func, data, get_identity(p) );
   sse_movlps(p->func, data, arg0);
}


static void emit_load_R32( struct translate_sse *p,
                           struct x86_reg data,
                           struct x86_reg arg0 )
{
   /* a 0 0 0
    * a 0 0 1
    */
   sse_movss(p->func, data, arg0);
   sse_orps(p->func, data, get_identity(p) );
}


static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
                                      struct x86_reg data,
                                      struct x86_reg src )
{
   /* Load and unpack twice:
    */
   sse_movss(p->func, data, src);
   sse2_punpcklbw(p->func, data, get_identity(p));
   sse2_punpcklbw(p->func, data, get_identity(p));

   /* Convert to float:
    */
   sse2_cvtdq2ps(p->func, data, data);

   /* Scale by 1/255.0
    */
   sse_mulps(p->func, data, get_inv_255(p));
}
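/* In scalar terms the sequence above is roughly (illustrative sketch only;
 * the generated code handles all four channels at once in a single XMM
 * register):
 *
 *    const uint8_t *in = src;
 *    float out[4];
 *    for (i = 0; i < 4; i++)
 *       out[i] = (float)in[i] * (1.0f / 255.0f);
 */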


static void emit_store_R32G32B32A32( struct translate_sse *p,
                                     struct x86_reg dest,
                                     struct x86_reg dataXMM )
{
   sse_movups(p->func, dest, dataXMM);
}

static void emit_store_R32G32B32( struct translate_sse *p,
                                  struct x86_reg dest,
                                  struct x86_reg dataXMM )
{
   /* Emit two, shuffle, emit one.
    */
   sse_movlps(p->func, dest, dataXMM);
   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
}

static void emit_store_R32G32( struct translate_sse *p,
                               struct x86_reg dest,
                               struct x86_reg dataXMM )
{
   sse_movlps(p->func, dest, dataXMM);
}

static void emit_store_R32( struct translate_sse *p,
                            struct x86_reg dest,
                            struct x86_reg dataXMM )
{
   sse_movss(p->func, dest, dataXMM);
}



static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
                                       struct x86_reg dest,
                                       struct x86_reg dataXMM )
{
   /* Scale by 255.0
    */
   sse_mulps(p->func, dataXMM, get_255(p));

   /* Pack and emit:
    */
   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
   sse2_packssdw(p->func, dataXMM, dataXMM);
   sse2_packuswb(p->func, dataXMM, dataXMM);
   sse_movss(p->func, dest, dataXMM);
}
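/* The store side of the UNORM path mirrors the load: scale back to 0..255,
 * convert to integers (cvtps2dq rounds according to MXCSR, round-to-nearest
 * by default), then narrow dwords -> words -> bytes with the two saturating
 * pack instructions before writing a single 32-bit value.
 */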




/* Extended swizzles?  Maybe later.
 */
static void emit_swizzle( struct translate_sse *p,
                          struct x86_reg dest,
                          struct x86_reg src,
                          unsigned char shuffle )
{
   sse_shufps(p->func, dest, src, shuffle);
}

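/* Emit code for one vertex element: load from the source address in the
 * element's input format, convert through an XMM register, and store to the
 * destination in the output format.  Returns FALSE for any format this
 * backend does not handle, so code generation can be abandoned.
 */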
static boolean translate_attr( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg srcECX,
                               struct x86_reg dstEAX)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

   switch (a->input_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_load_R32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_load_R32G32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_load_R32G32B32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_load_R32G32B32A32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_A8R8G8B8_UNORM:
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      break;
   default:
      return FALSE;
   }

   switch (a->output_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_store_R32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_store_R32G32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_store_R32G32B32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_A8R8G8B8_UNORM:
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   default:
      return FALSE;
   }

   return TRUE;
}


static boolean init_inputs( struct translate_sse *p,
                            boolean linear )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
                                              get_offset(p, &p->instance_id));

   for (i = 0; i < p->nr_buffer_varients; i++) {
      struct translate_buffer_varient *varient = &p->buffer_varient[i];
      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];

      if (linear || varient->instance_divisor) {
         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_EBX;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->machine_EDX;
               struct x86_reg tmp_ECX = p->outbuf_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               x86_push(p->func, tmp_EDX);
               x86_push(p->func, tmp_ECX);
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
               x86_pop(p->func, tmp_ECX);
               x86_pop(p->func, tmp_EDX);
            }
         } else {
            x86_mov(p->func, tmp_EAX, elt);
         }
         x86_imul(p->func, tmp_EAX, buf_stride);
         x86_add(p->func, tmp_EAX, buf_base_ptr);


         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (linear && p->nr_buffer_varients == 1)
            x86_mov(p->func, elt, tmp_EAX);
         else
            x86_mov(p->func, buf_ptr, tmp_EAX);
      }
   }

   return TRUE;
}

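/* Return a register or memory operand addressing the source data of buffer
 * varient var_idx for the current vertex.  Elements tagged with
 * ELEMENT_BUFFER_INSTANCE_ID read the stored instance id instead of a
 * vertex buffer.
 */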
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      boolean linear,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDX,
                           get_offset(p, &p->instance_id));
   }
   if (linear && p->nr_buffer_varients == 1) {
      return p->idx_EBX;
   }
   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
      struct x86_reg ptr = p->tmp_EAX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->tmp_EAX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));



      /* Calculate pointer to current attrib:
       */
      x86_mov(p->func, ptr, buf_stride);
      x86_imul(p->func, ptr, elt);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}

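/* Advance the source pointers after one vertex has been emitted.  In the
 * single-buffer linear case EBX itself holds the pointer and is bumped by
 * the stride; otherwise every varient with a zero instance divisor has its
 * stored ptr advanced; the indexed path just steps EBX to the next element.
 */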
static boolean incr_inputs( struct translate_sse *p,
                            boolean linear )
{
   if (linear && p->nr_buffer_varients == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x86_add(p->func, p->idx_EBX, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
      }
   }
   else if (linear) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_ptr);
            x86_add(p->func, p->tmp_EAX, buf_stride);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
   }

   return TRUE;
}

/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            unsigned instance_id,
 *            void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EBX -- start index or pointer to the current element
 * ECX -- pointer to current output vertex
 * EDX -- pointer to the translate_sse machine struct
 * ESI -- remaining vertex count
 * EAX -- scratch, usually the current source attribute pointer
 */
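/* What the generated loop does, as a rough C sketch (illustrative only;
 * the real code keeps everything in the registers listed above):
 *
 *    do {
 *       for (j = 0; j < key.nr_elements; j++) {
 *          src = buffer_ptr(element_to_buffer_varient[j]) + input_offset[j];
 *          dst = output_buffer + output_offset[j];
 *          load, convert and store one attribute;     -- translate_attr()
 *       }
 *       output_buffer += key.output_stride;
 *       advance source pointers or element index;     -- incr_inputs()
 *    } while (--count);
 */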
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  boolean linear )
{
   int fixup, label;
   unsigned j;

   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);

   p->func = func;
   p->loaded_inv_255 = FALSE;
   p->loaded_255 = FALSE;
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   /* Push a few regs?
    */
   x86_push(p->func, p->idx_EBX);
   x86_push(p->func, p->count_ESI);

   /* Load arguments into regs:
    */
   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
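   /* Argument numbering follows run_func/run_elts_func above: 1 = machine,
    * 2 = start or elts, 3 = count, 4 = instance_id, 5 = output_buffer.
    * The instance id is only fetched below when instancing is in use.
    */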

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, linear);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address:
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, linear, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func,
              p->outbuf_ECX,
              x86_make_disp(p->outbuf_ECX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, linear );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_ESI);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   x86_pop(p->func, p->count_ESI);
   x86_pop(p->func, p->idx_EBX);
   x86_ret(p->func);

   return TRUE;
}







static void translate_sse_set_buffer( struct translate *translate,
                                      unsigned buf,
                                      const void *ptr,
                                      unsigned stride )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *)ptr;
      p->buffer[buf].stride = stride;
   }

   if (0) debug_printf("%s %d/%d: %p %d\n",
                       __FUNCTION__, buf,
                       p->nr_buffers,
                       ptr, stride);
}


static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->linear_func );
   x86_release_func( &p->elt_func );

   FREE(p);
}

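/* Thin wrappers for the translate vtable: the public run()/run_elts()
 * entry points simply call into the code generated by build_vertex_emit().
 */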
static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
                                               const unsigned *elts,
                                               unsigned count,
                                               unsigned instance_id,
                                               void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run_elts( translate,
                    elts,
                    count,
                    instance_id,
                    output_buffer);
}

static void PIPE_CDECL translate_sse_run( struct translate *translate,
                                          unsigned start,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run( translate,
               start,
               count,
               instance_id,
               output_buffer);
}

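/* Create a translate instance backed by the SSE2 code generator: check for
 * SSE/SSE2 support, collapse the vertex elements onto buffer varients (one
 * per input_buffer/instance_divisor pair), then generate both the linear
 * and the indexed run functions.  Returns NULL on any failure.
 */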
struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (p == NULL)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;
   p->translate.run_elts = translate_sse_run_elts;
   p->translate.run = translate_sse_run;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer varient.
          */
         for (j = 0; j < p->nr_buffer_varients; j++) {
            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_varients) {
            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_varients++;
         }
         p->element_to_buffer_varient[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, TRUE))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, FALSE))
      goto fail;

   p->gen_run = (run_func)x86_get_func(&p->linear_func);
   if (p->gen_run == NULL)
      goto fail;

   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
   if (p->gen_run_elts == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}



#else

struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}

#endif