1/**************************************************************************
2 *
3 * Copyright 2007 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/*
29 * Binning code for triangles
30 */
31
32#include "util/u_math.h"
33#include "util/u_memory.h"
34#include "util/u_rect.h"
35#include "util/u_sse.h"
36#include "lp_perf.h"
37#include "lp_setup_context.h"
38#include "lp_rast.h"
39#include "lp_state_fs.h"
40#include "lp_state_setup.h"
41#include "lp_context.h"
42
43#include <inttypes.h>
44
45#define NUM_CHANNELS 4
46
47#if defined(PIPE_ARCH_SSE)
48#include <emmintrin.h>
49#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
50#include <altivec.h>
51#include "util/u_pwr8.h"
52#endif
53
54#if !defined(PIPE_ARCH_SSE)
55
56static inline int
57subpixel_snap(float a)
58{
59   return util_iround(FIXED_ONE * a);
60}
61
62#endif
63
/*
 * Position and area in fixed point coordinates.
 *
 * Consumed by do_triangle_ccw(), which only reads from it.
 *
 * NOTE(review): the SSE path reads x and y with an aligned 128-bit load
 * (_mm_load_si128), so instances presumably have to be 16-byte aligned -
 * confirm at the place where this struct is instantiated.
 */
struct fixed_position {
   /* Per-vertex x coordinates.  Indices 0..2 are used per vertex; the SIMD
    * paths load all four entries as a single vector.
    */
   int32_t x[4];
   /* Per-vertex y coordinates, same layout as x. */
   int32_t y[4];
   /* Used as dcdy of edge plane 0 (presumably x[0] - x[1]; confirm against
    * the code that fills this struct).
    */
   int32_t dx01;
   /* Used as dcdx of edge plane 0 (presumably y[0] - y[1]; confirm). */
   int32_t dy01;
   /* Used as dcdy of edge plane 2 (presumably x[2] - x[0]; confirm). */
   int32_t dx20;
   /* Used as dcdx of edge plane 2 (presumably y[2] - y[0]; confirm). */
   int32_t dy20;
   /* Triangle area term; do_triangle_ccw() asserts that it is positive. */
   int64_t area;
};
74
75
76/**
77 * Alloc space for a new triangle plus the input.a0/dadx/dady arrays
78 * immediately after it.
79 * The memory is allocated from the per-scene pool, not per-tile.
80 * \param tri_size  returns number of bytes allocated
81 * \param num_inputs  number of fragment shader inputs
82 * \return pointer to triangle space
83 */
84struct lp_rast_triangle *
85lp_setup_alloc_triangle(struct lp_scene *scene,
86                        unsigned nr_inputs,
87                        unsigned nr_planes,
88                        unsigned *tri_size)
89{
90   unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
91   unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
92   struct lp_rast_triangle *tri;
93
94   STATIC_ASSERT(sizeof(struct lp_rast_plane) % 8 == 0);
95
96   *tri_size = (sizeof(struct lp_rast_triangle) +
97                3 * input_array_sz +
98                plane_sz);
99
100   tri = lp_scene_alloc_aligned( scene, *tri_size, 16 );
101   if (!tri)
102      return NULL;
103
104   tri->inputs.stride = input_array_sz;
105
106   {
107      char *a = (char *)tri;
108      char *b = (char *)&GET_PLANES(tri)[nr_planes];
109      assert(b - a == *tri_size);
110   }
111
112   return tri;
113}
114
115void
116lp_setup_print_vertex(struct lp_setup_context *setup,
117                      const char *name,
118                      const float (*v)[4])
119{
120   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
121   int i, j;
122
123   debug_printf("   wpos (%s[0]) xyzw %f %f %f %f\n",
124                name,
125                v[0][0], v[0][1], v[0][2], v[0][3]);
126
127   for (i = 0; i < key->num_inputs; i++) {
128      const float *in = v[key->inputs[i].src_index];
129
130      debug_printf("  in[%d] (%s[%d]) %s%s%s%s ",
131                   i,
132                   name, key->inputs[i].src_index,
133                   (key->inputs[i].usage_mask & 0x1) ? "x" : " ",
134                   (key->inputs[i].usage_mask & 0x2) ? "y" : " ",
135                   (key->inputs[i].usage_mask & 0x4) ? "z" : " ",
136                   (key->inputs[i].usage_mask & 0x8) ? "w" : " ");
137
138      for (j = 0; j < 4; j++)
139         if (key->inputs[i].usage_mask & (1<<j))
140            debug_printf("%.5f ", in[j]);
141
142      debug_printf("\n");
143   }
144}
145
146
/**
 * Print triangle winding and vertex attribs (for debug).
 *
 * The winding is derived from the sign of the z component of the cross
 * product of the edges (v0 - v2) and (v1 - v2), using the positions in
 * attribute slot 0.
 */
void
lp_setup_print_triangle(struct lp_setup_context *setup,
                        const float (*v0)[4],
                        const float (*v1)[4],
                        const float (*v2)[4])
{
   const float ex = v0[0][0] - v2[0][0];
   const float ey = v0[0][1] - v2[0][1];
   const float fx = v1[0][0] - v2[0][0];
   const float fy = v1[0][1] - v2[0][1];

   /* det = cross(e,f).z */
   const float det = ex * fy - ey * fx;

   debug_printf("triangle\n");

   if (det > 0.0f) {
      debug_printf("   - cw\n");
   }
   else if (det < 0.0f) {
      debug_printf("   - ccw\n");
   }
   else {
      /* neither positive nor negative */
      debug_printf("   - zero area\n");
   }

   lp_setup_print_vertex(setup, "v0", v0);
   lp_setup_print_vertex(setup, "v1", v1);
   lp_setup_print_vertex(setup, "v2", v2);
}
178
179
180#define MAX_PLANES 8
181static unsigned
182lp_rast_tri_tab[MAX_PLANES+1] = {
183   0,               /* should be impossible */
184   LP_RAST_OP_TRIANGLE_1,
185   LP_RAST_OP_TRIANGLE_2,
186   LP_RAST_OP_TRIANGLE_3,
187   LP_RAST_OP_TRIANGLE_4,
188   LP_RAST_OP_TRIANGLE_5,
189   LP_RAST_OP_TRIANGLE_6,
190   LP_RAST_OP_TRIANGLE_7,
191   LP_RAST_OP_TRIANGLE_8
192};
193
194static unsigned
195lp_rast_32_tri_tab[MAX_PLANES+1] = {
196   0,               /* should be impossible */
197   LP_RAST_OP_TRIANGLE_32_1,
198   LP_RAST_OP_TRIANGLE_32_2,
199   LP_RAST_OP_TRIANGLE_32_3,
200   LP_RAST_OP_TRIANGLE_32_4,
201   LP_RAST_OP_TRIANGLE_32_5,
202   LP_RAST_OP_TRIANGLE_32_6,
203   LP_RAST_OP_TRIANGLE_32_7,
204   LP_RAST_OP_TRIANGLE_32_8
205};
206
207
208
209/**
210 * The primitive covers the whole tile- shade whole tile.
211 *
212 * \param tx, ty  the tile position in tiles, not pixels
213 */
214static boolean
215lp_setup_whole_tile(struct lp_setup_context *setup,
216                    const struct lp_rast_shader_inputs *inputs,
217                    int tx, int ty)
218{
219   struct lp_scene *scene = setup->scene;
220
221   LP_COUNT(nr_fully_covered_64);
222
223   /* if variant is opaque and scissor doesn't effect the tile */
224   if (inputs->opaque) {
225      /* Several things prevent this optimization from working:
226       * - For layered rendering we can't determine if this covers the same layer
227       * as previous rendering (or in case of clears those actually always cover
228       * all layers so optimization is impossible). Need to use fb_max_layer and
229       * not setup->layer_slot to determine this since even if there's currently
230       * no slot assigned previous rendering could have used one.
231       * - If there were any Begin/End query commands in the scene then those
232       * would get removed which would be very wrong. Furthermore, if queries
233       * were just active we also can't do the optimization since to get
234       * accurate query results we unfortunately need to execute the rendering
235       * commands.
236       */
237      if (!scene->fb.zsbuf && scene->fb_max_layer == 0 && !scene->had_queries) {
238         /*
239          * All previous rendering will be overwritten so reset the bin.
240          */
241         lp_scene_bin_reset( scene, tx, ty );
242      }
243
244      LP_COUNT(nr_shade_opaque_64);
245      return lp_scene_bin_cmd_with_state( scene, tx, ty,
246                                          setup->fs.stored,
247                                          LP_RAST_OP_SHADE_TILE_OPAQUE,
248                                          lp_rast_arg_inputs(inputs) );
249   } else {
250      LP_COUNT(nr_shade_64);
251      return lp_scene_bin_cmd_with_state( scene, tx, ty,
252                                          setup->fs.stored,
253                                          LP_RAST_OP_SHADE_TILE,
254                                          lp_rast_arg_inputs(inputs) );
255   }
256}
257
258
/**
 * Do basic setup for triangle rasterization and determine which
 * framebuffer tiles are touched.  Put the triangle in the scene's
 * bins for the tiles which we overlap.
 *
 * Steps, in order: pick viewport index and layer from one of the
 * vertices, compute the pixel bounding box and cull if it is empty or
 * misses the draw region, allocate the triangle, run the variant's
 * interpolant setup function, fill in the three edge planes (SSE, PWR8
 * or plain C variant), append any needed scissor planes, then hand the
 * result to lp_setup_bin_triangle().
 *
 * \param setup        setup context (scene, variant, scissor/viewport state)
 * \param position     fixed-point vertex positions and area; only read here
 * \param v0, v1, v2   vertex attribute arrays, four floats per slot
 * \param frontfacing  facing flag, passed to the setup function and stored
 *                     in the triangle inputs
 * \return TRUE if the triangle was culled, otherwise the result of
 *         lp_setup_bin_triangle(); FALSE if triangle allocation failed
 */
static boolean
do_triangle_ccw(struct lp_setup_context *setup,
                struct fixed_position* position,
                const float (*v0)[4],
                const float (*v1)[4],
                const float (*v2)[4],
                boolean frontfacing )
{
   struct lp_scene *scene = setup->scene;
   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
   struct lp_rast_triangle *tri;
   struct lp_rast_plane *plane;
   /* Only assigned when scissor_test is set; only read when nr_planes > 3,
    * which can only happen in that same case.
    */
   const struct u_rect *scissor;
   struct u_rect bbox, bboxpos;
   boolean s_planes[4];
   unsigned tri_bytes;
   int nr_planes = 3;
   unsigned viewport_index = 0;
   unsigned layer = 0;
   const float (*pv)[4];

   /* Area should always be positive here */
   assert(position->area > 0);

   if (0)
      lp_setup_print_triangle(setup, v0, v1, v2);

   /* pv is the vertex that supplies the per-primitive viewport index and
    * layer: the first vertex if flatshade_first is set, the last otherwise.
    */
   if (setup->flatshade_first) {
      pv = v0;
   }
   else {
      pv = v2;
   }
   /* Both values live in float attribute slots but are read back as
    * unsigned integers.  A slot index of 0 (or less) means "not present"
    * and leaves the default of 0.
    */
   if (setup->viewport_index_slot > 0) {
      unsigned *udata = (unsigned*)pv[setup->viewport_index_slot];
      viewport_index = lp_clamp_viewport_idx(*udata);
   }
   if (setup->layer_slot > 0) {
      layer = *(unsigned*)pv[setup->layer_slot];
      layer = MIN2(layer, scene->fb_max_layer);
   }

   /* Bounding rectangle (in pixels) */
   {
      /* Yes this is necessary to accurately calculate bounding boxes
       * with the two fill-conventions we support.  GL (normally) ends
       * up needing a bottom-left fill convention, which requires
       * slightly different rounding.
       */
      int adj = (setup->bottom_edge_rule != 0) ? 1 : 0;

      /* Inclusive x0, exclusive x1 */
      bbox.x0 =  MIN3(position->x[0], position->x[1], position->x[2]) >> FIXED_ORDER;
      bbox.x1 = (MAX3(position->x[0], position->x[1], position->x[2]) - 1) >> FIXED_ORDER;

      /* Inclusive / exclusive depending upon adj (bottom-left or top-right) */
      bbox.y0 = (MIN3(position->y[0], position->y[1], position->y[2]) + adj) >> FIXED_ORDER;
      bbox.y1 = (MAX3(position->y[0], position->y[1], position->y[2]) - 1 + adj) >> FIXED_ORDER;
   }

   /* Culled triangles are reported as success (TRUE): nothing to bin. */
   if (bbox.x1 < bbox.x0 ||
       bbox.y1 < bbox.y0) {
      if (0) debug_printf("empty bounding box\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   if (!u_rect_test_intersection(&setup->draw_regions[viewport_index], &bbox)) {
      if (0) debug_printf("offscreen\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   /* bbox keeps the original (possibly negative) extent; bboxpos is the
    * copy clamped to non-negative coordinates.  Both are passed on to
    * lp_setup_bin_triangle().
    */
   bboxpos = bbox;

   /* Can safely discard negative regions, but need to keep hold of
    * information about when the triangle extends past screen
    * boundaries.  See trimmed_box in lp_setup_bin_triangle().
    */
   bboxpos.x0 = MAX2(bboxpos.x0, 0);
   bboxpos.y0 = MAX2(bboxpos.y0, 0);

   /* NOTE(review): redundant, nr_planes is already initialized to 3 and
    * has not been modified since.
    */
   nr_planes = 3;
   /*
    * Determine how many scissor planes we need, that is drop scissor
    * edges if the bounding box of the tri is fully inside that edge.
    */
   if (setup->scissor_test) {
      /* why not just use draw_regions */
      scissor = &setup->scissors[viewport_index];
      scissor_planes_needed(s_planes, &bboxpos, scissor);
      nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
   }

   tri = lp_setup_alloc_triangle(scene,
                                 key->num_inputs,
                                 nr_planes,
                                 &tri_bytes);
   if (!tri)
      return FALSE;

#ifdef DEBUG
   /* Keep the x/y of each vertex position (attribute slot 0) in the
    * triangle for debug builds.
    */
   tri->v[0][0] = v0[0][0];
   tri->v[1][0] = v1[0][0];
   tri->v[2][0] = v2[0][0];
   tri->v[0][1] = v0[0][1];
   tri->v[1][1] = v1[0][1];
   tri->v[2][1] = v2[0][1];
#endif

   LP_COUNT(nr_tris);

   /* Setup parameter interpolants:
    */
   setup->setup.variant->jit_function(v0, v1, v2,
                                      frontfacing,
                                      GET_A0(&tri->inputs),
                                      GET_DADX(&tri->inputs),
                                      GET_DADY(&tri->inputs));

   tri->inputs.frontfacing = frontfacing;
   tri->inputs.disable = FALSE;
   tri->inputs.opaque = setup->fs.current.variant->opaque;
   tri->inputs.layer = layer;
   tri->inputs.viewport_index = viewport_index;

   if (0)
      lp_dump_setup_coef(&setup->setup.variant->key,
                         (const float (*)[4])GET_A0(&tri->inputs),
                         (const float (*)[4])GET_DADX(&tri->inputs),
                         (const float (*)[4])GET_DADY(&tri->inputs));

   plane = GET_PLANES(tri);

   /*
    * Fill in the three edge planes.  Exactly one of the following variants
    * runs: the SSE one (unconditional when compiled in, which makes the
    * plain C block below dead code in that build), the PWR8 one (only for
    * small framebuffers/triangles), or the plain C fallback that the
    * trailing "else" of either SIMD variant chains into.
    */
#if defined(PIPE_ARCH_SSE)
   if (1) {
      __m128i vertx, verty;
      __m128i shufx, shufy;
      __m128i dcdx, dcdy;
      __m128i cdx02, cdx13, cdy02, cdy13, c02, c13;
      __m128i c01, c23, unused;
      __m128i dcdx_neg_mask;
      __m128i dcdy_neg_mask;
      __m128i dcdx_zero_mask;
      __m128i top_left_flag, c_dec;
      __m128i eo, p0, p1, p2;
      __m128i zero = _mm_setzero_si128();

      vertx = _mm_load_si128((__m128i *)position->x); /* vertex x coords */
      verty = _mm_load_si128((__m128i *)position->y); /* vertex y coords */

      /* Rotate lanes 0..2 so that lane i holds vertex (i + 1) % 3;
       * lane 3 stays in place.
       */
      shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
      shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));

      /* Per-edge deltas: lane i = coord[i] - coord[(i + 1) % 3] */
      dcdx = _mm_sub_epi32(verty, shufy);
      dcdy = _mm_sub_epi32(vertx, shufx);

      /* All-ones lane masks for "negative" / "zero" */
      dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
      dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero);
      dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);

      top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0);

      /* Fill-convention correction: the lane is all ones (i.e. -1) where
       * c has to be bumped by one, which is done by subtracting it below.
       */
      c_dec = _mm_or_si128(dcdx_neg_mask,
                           _mm_and_si128(dcdx_zero_mask,
                                         _mm_xor_si128(dcdy_neg_mask,
                                                       top_left_flag)));

      /*
       * 64 bit arithmetic.
       * Note we need _signed_ mul (_mm_mul_epi32) which we emulate.
       */
      cdx02 = mm_mullohi_epi32(dcdx, vertx, &cdx13);
      cdy02 = mm_mullohi_epi32(dcdy, verty, &cdy13);
      c02 = _mm_sub_epi64(cdx02, cdy02);
      c13 = _mm_sub_epi64(cdx13, cdy13);
      /* Duplicating each 32-bit mask into both halves of a 64-bit lane
       * turns it into a 64-bit -1 or 0.
       */
      c02 = _mm_sub_epi64(c02, _mm_shuffle_epi32(c_dec,
                                                 _MM_SHUFFLE(2,2,0,0)));
      c13 = _mm_sub_epi64(c13, _mm_shuffle_epi32(c_dec,
                                                 _MM_SHUFFLE(3,3,1,1)));

      /*
       * Useful for very small fbs/tris (or fewer subpixel bits) only:
       * c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
       *                   mm_mullo_epi32(dcdy, verty));
       *
       * c = _mm_sub_epi32(c, c_dec);
       */

      /* Scale up to match c:
       */
      dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
      dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);

      /*
       * Calculate trivial reject values:
       * Note eo cannot overflow even if dcdx/dcdy would already have
       * 31 bits (which they shouldn't have). This is because eo
       * is never negative (albeit if we rely on that need to be careful...)
       */
      eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                         _mm_and_si128(dcdx_neg_mask, dcdx));

      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */

      /*
       * Pointless transpose which gets undone immediately in
       * rasterization.
       * It is actually difficult to do away with it - would essentially
       * need GET_PLANES_DX, GET_PLANES_DY etc., but the calculations
       * for this then would need to depend on the number of planes.
       * The transpose is quite special here due to c being 64bit...
       * The store has to be unaligned (unless we'd make the plane size
       * a multiple of 128), and of course storing eo separately...
       */
      c01 = _mm_unpacklo_epi64(c02, c13);
      c23 = _mm_unpackhi_epi64(c02, c13);
      transpose2_64_2_32(&c01, &c23, &dcdx, &dcdy,
                         &p0, &p1, &p2, &unused);
      /* Each 128-bit store covers the start of one plane; eo is written
       * separately, taking lane 0, then lane 1, then lane 2 of the eo
       * vector by shuffling the wanted lane down to position 0.
       */
      _mm_storeu_si128((__m128i *)&plane[0], p0);
      plane[0].eo = (uint32_t)_mm_cvtsi128_si32(eo);
      _mm_storeu_si128((__m128i *)&plane[1], p1);
      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(3,2,0,1));
      plane[1].eo = (uint32_t)_mm_cvtsi128_si32(eo);
      _mm_storeu_si128((__m128i *)&plane[2], p2);
      eo = _mm_shuffle_epi32(eo, _MM_SHUFFLE(0,0,0,2));
      plane[2].eo = (uint32_t)_mm_cvtsi128_si32(eo);
   } else
#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
   /*
    * XXX this code is effectively disabled for all practical purposes,
    * as the allowed fb size is tiny if FIXED_ORDER is 8.
    */
   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
       setup->fb.height <= MAX_FIXED_LENGTH32 &&
       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
      unsigned int bottom_edge;
      __m128i vertx, verty;
      __m128i shufx, shufy;
      __m128i dcdx, dcdy, c;
      __m128i unused;
      __m128i dcdx_neg_mask;
      __m128i dcdy_neg_mask;
      __m128i dcdx_zero_mask;
      __m128i top_left_flag;
      __m128i c_inc_mask, c_inc;
      __m128i eo, p0, p1, p2;
      __m128i_union vshuf_mask;
      __m128i zero = vec_splats((unsigned char) 0);
      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];

      /* Byte permutation mask for vec_perm below.  The enclosing #elif
       * already requires PIPE_ARCH_LITTLE_ENDIAN, so the #else branch is
       * never compiled here.
       */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      vshuf_mask.i[0] = 0x07060504;
      vshuf_mask.i[1] = 0x0B0A0908;
      vshuf_mask.i[2] = 0x03020100;
      vshuf_mask.i[3] = 0x0F0E0D0C;
#else
      vshuf_mask.i[0] = 0x00010203;
      vshuf_mask.i[1] = 0x0C0D0E0F;
      vshuf_mask.i[2] = 0x04050607;
      vshuf_mask.i[3] = 0x08090A0B;
#endif

      /* vertex x coords */
      vertx = vec_load_si128((const uint32_t *) position->x);
      /* vertex y coords */
      verty = vec_load_si128((const uint32_t *) position->y);

      shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
      shufy = vec_perm (verty, verty, vshuf_mask.m128i);

      dcdx = vec_sub_epi32(verty, shufy);
      dcdy = vec_sub_epi32(vertx, shufx);

      dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
      dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
      dcdy_neg_mask = vec_srai_epi32(dcdy, 31);

      bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
      top_left_flag = (__m128i) vec_splats(bottom_edge);

      c_inc_mask = vec_or(dcdx_neg_mask,
                                vec_and(dcdx_zero_mask,
                                              vec_xor(dcdy_neg_mask,
                                                            top_left_flag)));

      /* Turn the all-ones mask into a 0/1 increment */
      c_inc = vec_srli_epi32(c_inc_mask, 31);

      /* 32-bit only math here, unlike the 64-bit c of the other variants */
      c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
                        vec_mullo_epi32(dcdy, verty));

      c = vec_add_epi32(c, c_inc);

      /* Scale up to match c:
       */
      dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
      dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);

      /* Calculate trivial reject values:
       */
      eo = vec_sub_epi32(vec_andnot_si128(dcdy_neg_mask, dcdy),
                         vec_and(dcdx_neg_mask, dcdx));

      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */

      /* Pointless transpose which gets undone immediately in
       * rasterization:
       */
      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
                       &p0, &p1, &p2, &unused);

      /* Spill one transposed vector to memory and copy its four lanes
       * into the plane fields, widening c to 64 bits.
       */
#define STORE_PLANE(plane, vec) do {                  \
         vec_store_si128((uint32_t *)&temp_vec, vec); \
         plane.c    = (int64_t)temp_vec[0];           \
         plane.dcdx = temp_vec[1];                    \
         plane.dcdy = temp_vec[2];                    \
         plane.eo   = temp_vec[3];                    \
      } while(0)

      STORE_PLANE(plane[0], p0);
      STORE_PLANE(plane[1], p1);
      STORE_PLANE(plane[2], p2);
#undef STORE_PLANE
   } else
#endif
   {
      /* Plain C variant */
      int i;
      plane[0].dcdy = position->dx01;
      plane[1].dcdy = position->x[1] - position->x[2];
      plane[2].dcdy = position->dx20;
      plane[0].dcdx = position->dy01;
      plane[1].dcdx = position->y[1] - position->y[2];
      plane[2].dcdx = position->dy20;

      for (i = 0; i < 3; i++) {
         /* half-edge constants, will be iterated over the whole render
          * target.
          */
         plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
                      IMUL64(plane[i].dcdy, position->y[i]);

         /* correct for top-left vs. bottom-left fill convention.
          */
         if (plane[i].dcdx < 0) {
            /* both fill conventions want this - adjust for left edges */
            plane[i].c++;
         }
         else if (plane[i].dcdx == 0) {
            if (setup->bottom_edge_rule == 0){
               /* correct for top-left fill convention:
                */
               if (plane[i].dcdy > 0) plane[i].c++;
            }
            else {
               /* correct for bottom-left fill convention:
                */
               if (plane[i].dcdy < 0) plane[i].c++;
            }
         }

         /* Scale up to match c:
          */
         /* The asserts check that the shift does not lose any bits. */
         assert((plane[i].dcdx << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdx);
         assert((plane[i].dcdy << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdy);
         plane[i].dcdx <<= FIXED_ORDER;
         plane[i].dcdy <<= FIXED_ORDER;

         /* find trivial reject offsets for each edge for a single-pixel
          * sized block.  These will be scaled up at each recursive level to
          * match the active blocksize.  Scaling in this way works best if
          * the blocks are square.
          */
         plane[i].eo = 0;
         if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
         if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
      }
   }

   if (0) {
      debug_printf("p0: %"PRIx64"/%08x/%08x/%08x\n",
                   plane[0].c,
                   plane[0].dcdx,
                   plane[0].dcdy,
                   plane[0].eo);

      debug_printf("p1: %"PRIx64"/%08x/%08x/%08x\n",
                   plane[1].c,
                   plane[1].dcdx,
                   plane[1].dcdy,
                   plane[1].eo);

      debug_printf("p2: %"PRIx64"/%08x/%08x/%08x\n",
                   plane[2].c,
                   plane[2].dcdx,
                   plane[2].dcdy,
                   plane[2].eo);
   }


   /*
    * When rasterizing scissored tris, use the intersection of the
    * triangle bounding box and the scissor rect to generate the
    * scissor planes.
    *
    * This permits us to cut off the triangle "tails" that are present
    * in the intermediate recursive levels caused when two of the
    * triangles edges don't diverge quickly enough to trivially reject
    * exterior blocks from the triangle.
    *
    * It's not really clear if it's worth worrying about these tails,
    * but since we generate the planes for each scissored tri, it's
    * free to trim them in this case.
    *
    * Note that otherwise, the scissor planes only vary in 'C' value,
    * and even then only on state-changes.  Could alternatively store
    * these planes elsewhere.
    * (Or only store the c value together with a bit indicating which
    * scissor edge this is, so rasterization would treat them differently
    * (easier to evaluate) to ordinary planes.)
    */
   if (nr_planes > 3) {
      /* why not just use draw_regions */
      struct lp_rast_plane *plane_s = &plane[3];

      /* Scissor planes are appended after the three edge planes, in the
       * order x0, x1, y0, y1, skipping the ones flagged as not needed.
       *
       * NOTE(review): the shifts below use a literal 8 while the edge
       * planes above are scaled by FIXED_ORDER - presumably the two are
       * meant to be the same value; confirm before changing FIXED_ORDER.
       */
      if (s_planes[0]) {
         plane_s->dcdx = ~0U << 8;
         plane_s->dcdy = 0;
         plane_s->c = (1-scissor->x0) << 8;
         plane_s->eo = 1 << 8;
         plane_s++;
      }
      if (s_planes[1]) {
         plane_s->dcdx = 1 << 8;
         plane_s->dcdy = 0;
         plane_s->c = (scissor->x1+1) << 8;
         plane_s->eo = 0 << 8;
         plane_s++;
      }
      if (s_planes[2]) {
         plane_s->dcdx = 0;
         plane_s->dcdy = 1 << 8;
         plane_s->c = (1-scissor->y0) << 8;
         plane_s->eo = 1 << 8;
         plane_s++;
      }
      if (s_planes[3]) {
         plane_s->dcdx = 0;
         plane_s->dcdy = ~0U << 8;
         plane_s->c = (scissor->y1+1) << 8;
         plane_s->eo = 0;
         plane_s++;
      }
      /* Every plane counted into nr_planes must have been written. */
      assert(plane_s == &plane[nr_planes]);
   }

   return lp_setup_bin_triangle(setup, tri, &bbox, &bboxpos, nr_planes, viewport_index);
}
722
/*
 * Round the input down to the nearest power of two that is less than or
 * equal to it.
 *
 * \param n  value to round
 * \return the largest power of two <= n, or 0 if n is 0 (both code paths
 *         handle 0 explicitly, so callers need not check for it)
 */
static inline uint32_t
floor_pot(uint32_t n)
{
#if defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
   /* bsr leaves its destination undefined for a zero source */
   if (n == 0)
      return 0;

   /* n becomes the index of the highest set bit (0..31) */
   __asm__("bsr %1,%0"
          : "=r" (n)
          : "rm" (n)
          : "cc");
   /* Shift an unsigned one: with a signed int, 1 << 31 would overflow
    * (undefined behavior) for inputs that have bit 31 set.
    */
   return (uint32_t)1 << n;
#else
   /* Smear the highest set bit into every lower position... */
   n |= (n >>  1);
   n |= (n >>  2);
   n |= (n >>  4);
   n |= (n >>  8);
   n |= (n >> 16);
   /* ...then drop everything below it again. */
   return n - (n >> 1);
#endif
}
749
750
751boolean
752lp_setup_bin_triangle(struct lp_setup_context *setup,
753                      struct lp_rast_triangle *tri,
754                      const struct u_rect *bboxorig,
755                      const struct u_rect *bbox,
756                      int nr_planes,
757                      unsigned viewport_index)
758{
759   struct lp_scene *scene = setup->scene;
760   struct u_rect trimmed_box = *bbox;
761   int i;
762   /* What is the largest power-of-two boundary this triangle crosses:
763    */
764   int dx = floor_pot((bbox->x0 ^ bbox->x1) |
765		      (bbox->y0 ^ bbox->y1));
766
767   /* The largest dimension of the rasterized area of the triangle
768    * (aligned to a 4x4 grid), rounded down to the nearest power of two:
769    */
770   int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) |
771                 (bbox->y1 - (bbox->y0 & ~3)));
772   int sz = floor_pot(max_sz);
773
774   /*
775    * NOTE: It is important to use the original bounding box
776    * which might contain negative values here, because if the
777    * plane math may overflow or not with the 32bit rasterization
778    * functions depends on the original extent of the triangle.
779    */
780   int max_szorig = ((bboxorig->x1 - (bboxorig->x0 & ~3)) |
781                     (bboxorig->y1 - (bboxorig->y0 & ~3)));
782   boolean use_32bits = max_szorig <= MAX_FIXED_LENGTH32;
783
784   /* Now apply scissor, etc to the bounding box.  Could do this
785    * earlier, but it confuses the logic for tri-16 and would force
786    * the rasterizer to also respect scissor, etc, just for the rare
787    * cases where a small triangle extends beyond the scissor.
788    */
789   u_rect_find_intersection(&setup->draw_regions[viewport_index],
790                            &trimmed_box);
791
792   /* Determine which tile(s) intersect the triangle's bounding box
793    */
794   if (dx < TILE_SIZE)
795   {
796      int ix0 = bbox->x0 / TILE_SIZE;
797      int iy0 = bbox->y0 / TILE_SIZE;
798      unsigned px = bbox->x0 & 63 & ~3;
799      unsigned py = bbox->y0 & 63 & ~3;
800
801      assert(iy0 == bbox->y1 / TILE_SIZE &&
802	     ix0 == bbox->x1 / TILE_SIZE);
803
804      if (nr_planes == 3) {
805         if (sz < 4)
806         {
807            /* Triangle is contained in a single 4x4 stamp:
808             */
809            assert(px + 4 <= TILE_SIZE);
810            assert(py + 4 <= TILE_SIZE);
811            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
812                                                setup->fs.stored,
813                                                use_32bits ?
814                                                LP_RAST_OP_TRIANGLE_32_3_4 :
815                                                LP_RAST_OP_TRIANGLE_3_4,
816                                                lp_rast_arg_triangle_contained(tri, px, py) );
817         }
818
819         if (sz < 16)
820         {
821            /* Triangle is contained in a single 16x16 block:
822             */
823
824            /*
825             * The 16x16 block is only 4x4 aligned, and can exceed the tile
826             * dimensions if the triangle is 16 pixels in one dimension but 4
827             * in the other. So budge the 16x16 back inside the tile.
828             */
829            px = MIN2(px, TILE_SIZE - 16);
830            py = MIN2(py, TILE_SIZE - 16);
831
832            assert(px + 16 <= TILE_SIZE);
833            assert(py + 16 <= TILE_SIZE);
834
835            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
836                                                setup->fs.stored,
837                                                use_32bits ?
838                                                LP_RAST_OP_TRIANGLE_32_3_16 :
839                                                LP_RAST_OP_TRIANGLE_3_16,
840                                                lp_rast_arg_triangle_contained(tri, px, py) );
841         }
842      }
843      else if (nr_planes == 4 && sz < 16)
844      {
845         px = MIN2(px, TILE_SIZE - 16);
846         py = MIN2(py, TILE_SIZE - 16);
847
848         assert(px + 16 <= TILE_SIZE);
849         assert(py + 16 <= TILE_SIZE);
850
851         return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
852                                            setup->fs.stored,
853                                            use_32bits ?
854                                            LP_RAST_OP_TRIANGLE_32_4_16 :
855                                            LP_RAST_OP_TRIANGLE_4_16,
856                                            lp_rast_arg_triangle_contained(tri, px, py));
857      }
858
859
860      /* Triangle is contained in a single tile:
861       */
862      return lp_scene_bin_cmd_with_state(
863         scene, ix0, iy0, setup->fs.stored,
864         use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes],
865         lp_rast_arg_triangle(tri, (1<<nr_planes)-1));
866   }
867   else
868   {
869      struct lp_rast_plane *plane = GET_PLANES(tri);
870      int64_t c[MAX_PLANES];
871      int64_t ei[MAX_PLANES];
872
873      int64_t eo[MAX_PLANES];
874      int64_t xstep[MAX_PLANES];
875      int64_t ystep[MAX_PLANES];
876      int x, y;
877
878      int ix0 = trimmed_box.x0 / TILE_SIZE;
879      int iy0 = trimmed_box.y0 / TILE_SIZE;
880      int ix1 = trimmed_box.x1 / TILE_SIZE;
881      int iy1 = trimmed_box.y1 / TILE_SIZE;
882
883      for (i = 0; i < nr_planes; i++) {
884         c[i] = (plane[i].c +
885                 IMUL64(plane[i].dcdy, iy0) * TILE_SIZE -
886                 IMUL64(plane[i].dcdx, ix0) * TILE_SIZE);
887
888         ei[i] = (plane[i].dcdy -
889                  plane[i].dcdx -
890                  (int64_t)plane[i].eo) << TILE_ORDER;
891
892         eo[i] = (int64_t)plane[i].eo << TILE_ORDER;
893         xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER);
894         ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER;
895      }
896
897
898
899      /* Test tile-sized blocks against the triangle.
900       * Discard blocks fully outside the tri.  If the block is fully
901       * contained inside the tri, bin an lp_rast_shade_tile command.
902       * Else, bin a lp_rast_triangle command.
903       */
904      for (y = iy0; y <= iy1; y++)
905      {
906         boolean in = FALSE;  /* are we inside the triangle? */
907         int64_t cx[MAX_PLANES];
908
909         for (i = 0; i < nr_planes; i++)
910            cx[i] = c[i];
911
912         for (x = ix0; x <= ix1; x++)
913         {
914            int out = 0;
915            int partial = 0;
916
917            for (i = 0; i < nr_planes; i++) {
918               int64_t planeout = cx[i] + eo[i];
919               int64_t planepartial = cx[i] + ei[i] - 1;
920               out |= (int) (planeout >> 63);
921               partial |= ((int) (planepartial >> 63)) & (1<<i);
922            }
923
924            if (out) {
925               /* do nothing */
926               if (in)
927                  break;  /* exiting triangle, all done with this row */
928               LP_COUNT(nr_empty_64);
929            }
930            else if (partial) {
931               /* Not trivially accepted by at least one plane -
932                * rasterize/shade partial tile
933                */
934               int count = util_bitcount(partial);
935               in = TRUE;
936
937               if (!lp_scene_bin_cmd_with_state( scene, x, y,
938                                                 setup->fs.stored,
939                                                 use_32bits ?
940                                                 lp_rast_32_tri_tab[count] :
941                                                 lp_rast_tri_tab[count],
942                                                 lp_rast_arg_triangle(tri, partial) ))
943                  goto fail;
944
945               LP_COUNT(nr_partially_covered_64);
946            }
947            else {
948               /* triangle covers the whole tile- shade whole tile */
949               LP_COUNT(nr_fully_covered_64);
950               in = TRUE;
951               if (!lp_setup_whole_tile(setup, &tri->inputs, x, y))
952                  goto fail;
953            }
954
955            /* Iterate cx values across the region: */
956            for (i = 0; i < nr_planes; i++)
957               cx[i] += xstep[i];
958         }
959
960         /* Iterate c values down the region: */
961         for (i = 0; i < nr_planes; i++)
962            c[i] += ystep[i];
963      }
964   }
965
966   return TRUE;
967
968fail:
969   /* Need to disable any partially binned triangle.  This is easier
970    * than trying to locate all the triangle, shade-tile, etc,
971    * commands which may have been binned.
972    */
973   tri->inputs.disable = TRUE;
974   return FALSE;
975}
976
977
978/**
979 * Try to draw the triangle, restart the scene on failure.
980 */
981static void retry_triangle_ccw( struct lp_setup_context *setup,
982                                struct fixed_position* position,
983                                const float (*v0)[4],
984                                const float (*v1)[4],
985                                const float (*v2)[4],
986                                boolean front)
987{
988   if (!do_triangle_ccw( setup, position, v0, v1, v2, front ))
989   {
990      if (!lp_setup_flush_and_restart(setup))
991         return;
992
993      if (!do_triangle_ccw( setup, position, v0, v1, v2, front ))
994         return;
995   }
996}
997
998/**
999 * Calculate fixed position data for a triangle
1000 * It is unfortunate we need to do that here (as we need area
1001 * calculated in fixed point), as there's quite some code duplication
1002 * to what is done in the jit setup prog.
1003 */
1004static inline void
1005calc_fixed_position(struct lp_setup_context *setup,
1006                    struct fixed_position* position,
1007                    const float (*v0)[4],
1008                    const float (*v1)[4],
1009                    const float (*v2)[4])
1010{
1011   /*
1012    * The rounding may not be quite the same with PIPE_ARCH_SSE
1013    * (util_iround right now only does nearest/even on x87,
1014    * otherwise nearest/away-from-zero).
1015    * Both should be acceptable, I think.
1016    */
1017#if defined(PIPE_ARCH_SSE)
1018   __m128 v0r, v1r;
1019   __m128 vxy0xy2, vxy1xy0;
1020   __m128i vxy0xy2i, vxy1xy0i;
1021   __m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
1022   __m128 pix_offset = _mm_set1_ps(setup->pixel_offset);
1023   __m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
1024   v0r = _mm_castpd_ps(_mm_load_sd((double *)v0[0]));
1025   vxy0xy2 = _mm_loadh_pi(v0r, (__m64 *)v2[0]);
1026   v1r = _mm_castpd_ps(_mm_load_sd((double *)v1[0]));
1027   vxy1xy0 = _mm_movelh_ps(v1r, vxy0xy2);
1028   vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
1029   vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
1030   vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);
1031   vxy1xy0 = _mm_mul_ps(vxy1xy0, fixed_one);
1032   vxy0xy2i = _mm_cvtps_epi32(vxy0xy2);
1033   vxy1xy0i = _mm_cvtps_epi32(vxy1xy0);
1034   dxdy0120 = _mm_sub_epi32(vxy0xy2i, vxy1xy0i);
1035   _mm_store_si128((__m128i *)&position->dx01, dxdy0120);
1036   /*
1037    * For the mul, would need some more shuffles, plus emulation
1038    * for the signed mul (without sse41), so don't bother.
1039    */
1040   x0x2y0y2 = _mm_shuffle_epi32(vxy0xy2i, _MM_SHUFFLE(3,1,2,0));
1041   x1x0y1y0 = _mm_shuffle_epi32(vxy1xy0i, _MM_SHUFFLE(3,1,2,0));
1042   x0120 = _mm_unpacklo_epi32(x0x2y0y2, x1x0y1y0);
1043   y0120 = _mm_unpackhi_epi32(x0x2y0y2, x1x0y1y0);
1044   _mm_store_si128((__m128i *)&position->x[0], x0120);
1045   _mm_store_si128((__m128i *)&position->y[0], y0120);
1046
1047#else
1048   position->x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
1049   position->x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
1050   position->x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
1051   position->x[3] = 0; // should be unused
1052
1053   position->y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
1054   position->y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
1055   position->y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
1056   position->y[3] = 0; // should be unused
1057
1058   position->dx01 = position->x[0] - position->x[1];
1059   position->dy01 = position->y[0] - position->y[1];
1060
1061   position->dx20 = position->x[2] - position->x[0];
1062   position->dy20 = position->y[2] - position->y[0];
1063#endif
1064
1065   position->area = IMUL64(position->dx01, position->dy20) -
1066         IMUL64(position->dx20, position->dy01);
1067}
1068
1069
1070/**
1071 * Rotate a triangle, flipping its clockwise direction,
1072 * Swaps values for xy[0] and xy[1]
1073 */
1074static inline void
1075rotate_fixed_position_01( struct fixed_position* position )
1076{
1077   int x, y;
1078
1079   x = position->x[1];
1080   y = position->y[1];
1081   position->x[1] = position->x[0];
1082   position->y[1] = position->y[0];
1083   position->x[0] = x;
1084   position->y[0] = y;
1085
1086   position->dx01 = -position->dx01;
1087   position->dy01 = -position->dy01;
1088   position->dx20 = position->x[2] - position->x[0];
1089   position->dy20 = position->y[2] - position->y[0];
1090
1091   position->area = -position->area;
1092}
1093
1094
1095/**
1096 * Rotate a triangle, flipping its clockwise direction,
1097 * Swaps values for xy[1] and xy[2]
1098 */
1099static inline void
1100rotate_fixed_position_12( struct fixed_position* position )
1101{
1102   int x, y;
1103
1104   x = position->x[2];
1105   y = position->y[2];
1106   position->x[2] = position->x[1];
1107   position->y[2] = position->y[1];
1108   position->x[1] = x;
1109   position->y[1] = y;
1110
1111   x = position->dx01;
1112   y = position->dy01;
1113   position->dx01 = -position->dx20;
1114   position->dy01 = -position->dy20;
1115   position->dx20 = -x;
1116   position->dy20 = -y;
1117
1118   position->area = -position->area;
1119}
1120
1121
1122/**
1123 * Draw triangle if it's CW, cull otherwise.
1124 */
1125static void triangle_cw(struct lp_setup_context *setup,
1126                        const float (*v0)[4],
1127                        const float (*v1)[4],
1128                        const float (*v2)[4])
1129{
1130   PIPE_ALIGN_VAR(16) struct fixed_position position;
1131   struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
1132
1133   if (lp_context->active_statistics_queries) {
1134      lp_context->pipeline_statistics.c_primitives++;
1135   }
1136
1137   calc_fixed_position(setup, &position, v0, v1, v2);
1138
1139   if (position.area < 0) {
1140      if (setup->flatshade_first) {
1141         rotate_fixed_position_12(&position);
1142         retry_triangle_ccw(setup, &position, v0, v2, v1, !setup->ccw_is_frontface);
1143      } else {
1144         rotate_fixed_position_01(&position);
1145         retry_triangle_ccw(setup, &position, v1, v0, v2, !setup->ccw_is_frontface);
1146      }
1147   }
1148}
1149
1150
1151static void triangle_ccw(struct lp_setup_context *setup,
1152                         const float (*v0)[4],
1153                         const float (*v1)[4],
1154                         const float (*v2)[4])
1155{
1156   PIPE_ALIGN_VAR(16) struct fixed_position position;
1157   struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
1158
1159   if (lp_context->active_statistics_queries) {
1160      lp_context->pipeline_statistics.c_primitives++;
1161   }
1162
1163   calc_fixed_position(setup, &position, v0, v1, v2);
1164
1165   if (position.area > 0)
1166      retry_triangle_ccw(setup, &position, v0, v1, v2, setup->ccw_is_frontface);
1167}
1168
1169/**
1170 * Draw triangle whether it's CW or CCW.
1171 */
1172static void triangle_both(struct lp_setup_context *setup,
1173                          const float (*v0)[4],
1174                          const float (*v1)[4],
1175                          const float (*v2)[4])
1176{
1177   PIPE_ALIGN_VAR(16) struct fixed_position position;
1178   struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
1179
1180   if (lp_context->active_statistics_queries) {
1181      lp_context->pipeline_statistics.c_primitives++;
1182   }
1183
1184   calc_fixed_position(setup, &position, v0, v1, v2);
1185
1186   if (0) {
1187      assert(!util_is_inf_or_nan(v0[0][0]));
1188      assert(!util_is_inf_or_nan(v0[0][1]));
1189      assert(!util_is_inf_or_nan(v1[0][0]));
1190      assert(!util_is_inf_or_nan(v1[0][1]));
1191      assert(!util_is_inf_or_nan(v2[0][0]));
1192      assert(!util_is_inf_or_nan(v2[0][1]));
1193   }
1194
1195   if (position.area > 0)
1196      retry_triangle_ccw( setup, &position, v0, v1, v2, setup->ccw_is_frontface );
1197   else if (position.area < 0) {
1198      if (setup->flatshade_first) {
1199         rotate_fixed_position_12( &position );
1200         retry_triangle_ccw( setup, &position, v0, v2, v1, !setup->ccw_is_frontface );
1201      } else {
1202         rotate_fixed_position_01( &position );
1203         retry_triangle_ccw( setup, &position, v1, v0, v2, !setup->ccw_is_frontface );
1204      }
1205   }
1206}
1207
1208
/**
 * Triangle entry point that discards its input: nothing is read, binned
 * or counted.
 */
static void triangle_noop(struct lp_setup_context *setup,
                          const float (*v0)[4],
                          const float (*v1)[4],
                          const float (*v2)[4])
{
   /* Intentionally empty; the casts only mark the parameters as unused. */
   (void) setup;
   (void) v0;
   (void) v1;
   (void) v2;
}
1215
1216
1217void
1218lp_setup_choose_triangle(struct lp_setup_context *setup)
1219{
1220   if (setup->rasterizer_discard) {
1221      setup->triangle = triangle_noop;
1222      return;
1223   }
1224   switch (setup->cullmode) {
1225   case PIPE_FACE_NONE:
1226      setup->triangle = triangle_both;
1227      break;
1228   case PIPE_FACE_BACK:
1229      setup->triangle = setup->ccw_is_frontface ? triangle_ccw : triangle_cw;
1230      break;
1231   case PIPE_FACE_FRONT:
1232      setup->triangle = setup->ccw_is_frontface ? triangle_cw : triangle_ccw;
1233      break;
1234   default:
1235      setup->triangle = triangle_noop;
1236      break;
1237   }
1238}
1239