1/************************************************************************** 2 * 3 * Copyright 2007-2009 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28/* 29 * Rasterization for binned triangles within a tile 30 */ 31 32#include <limits.h> 33#include "util/u_math.h" 34#include "lp_debug.h" 35#include "lp_perf.h" 36#include "lp_rast_priv.h" 37 38/** 39 * Shade all pixels in a 4x4 block. 40 */ 41static void 42block_full_4(struct lp_rasterizer_task *task, 43 const struct lp_rast_triangle *tri, 44 int x, int y) 45{ 46 lp_rast_shade_quads_all(task, &tri->inputs, x, y); 47} 48 49 50/** 51 * Shade all pixels in a 16x16 block. 52 */ 53static void 54block_full_16(struct lp_rasterizer_task *task, 55 const struct lp_rast_triangle *tri, 56 int x, int y) 57{ 58 unsigned ix, iy; 59 assert(x % 16 == 0); 60 assert(y % 16 == 0); 61 for (iy = 0; iy < 16; iy += 4) 62 for (ix = 0; ix < 16; ix += 4) 63 block_full_4(task, tri, x + ix, y + iy); 64} 65 66static inline unsigned 67build_mask_linear(int32_t c, int32_t dcdx, int32_t dcdy) 68{ 69 unsigned mask = 0; 70 71 int32_t c0 = c; 72 int32_t c1 = c0 + dcdy; 73 int32_t c2 = c1 + dcdy; 74 int32_t c3 = c2 + dcdy; 75 76 mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0); 77 mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1); 78 mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2); 79 mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3); 80 mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4); 81 mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5); 82 mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6); 83 mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); 84 mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8); 85 mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9); 86 mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10); 87 mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11); 88 mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12); 89 mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13); 90 mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14); 91 mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15); 92 93 return mask; 94} 95 96 97static inline void 98build_masks(int32_t c, 99 int32_t cdiff, 100 int32_t dcdx, 101 int32_t dcdy, 102 unsigned *outmask, 103 unsigned *partmask) 104{ 105 *outmask |= build_mask_linear(c, dcdx, dcdy); 106 *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); 107} 108 109void 110lp_rast_triangle_3_16(struct lp_rasterizer_task *task, 111 const union lp_rast_cmd_arg arg) 112{ 113 union lp_rast_cmd_arg arg2; 114 arg2.triangle.tri = arg.triangle.tri; 115 arg2.triangle.plane_mask = (1<<3)-1; 116 lp_rast_triangle_3(task, arg2); 117} 118 119void 120lp_rast_triangle_3_4(struct lp_rasterizer_task *task, 121 const union lp_rast_cmd_arg arg) 122{ 123 lp_rast_triangle_3_16(task, arg); 124} 125 126void 127lp_rast_triangle_4_16(struct lp_rasterizer_task *task, 128 const union lp_rast_cmd_arg arg) 129{ 130 union lp_rast_cmd_arg arg2; 131 arg2.triangle.tri = arg.triangle.tri; 132 arg2.triangle.plane_mask = (1<<4)-1; 133 lp_rast_triangle_4(task, arg2); 134} 135 136void 137lp_rast_triangle_ms_3_16(struct lp_rasterizer_task *task, 138 const union lp_rast_cmd_arg arg) 139{ 140 union lp_rast_cmd_arg arg2; 141 arg2.triangle.tri = arg.triangle.tri; 142 arg2.triangle.plane_mask = (1<<3)-1; 143 lp_rast_triangle_ms_3(task, arg2); 144} 145 146void 147lp_rast_triangle_ms_3_4(struct lp_rasterizer_task *task, 148 const union lp_rast_cmd_arg arg) 149{ 150 lp_rast_triangle_ms_3_16(task, arg); 151} 152 153void 154lp_rast_triangle_ms_4_16(struct lp_rasterizer_task *task, 155 const union lp_rast_cmd_arg arg) 156{ 157 union lp_rast_cmd_arg arg2; 158 arg2.triangle.tri = arg.triangle.tri; 159 arg2.triangle.plane_mask = (1<<4)-1; 160 lp_rast_triangle_ms_4(task, arg2); 161} 162 163#if defined(PIPE_ARCH_SSE) 164 165#include <emmintrin.h> 166#include "util/u_sse.h" 167 168 169static inline void 170build_masks_sse(int c, 171 int cdiff, 172 int dcdx, 173 int dcdy, 174 unsigned *outmask, 175 unsigned *partmask) 176{ 177 __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 178 __m128i xdcdy = _mm_set1_epi32(dcdy); 179 180 /* Get values across the quad 181 */ 182 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); 183 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); 184 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); 185 186 { 187 __m128i cstep01, cstep23, result; 188 189 cstep01 = _mm_packs_epi32(cstep0, cstep1); 190 cstep23 = _mm_packs_epi32(cstep2, cstep3); 191 result = _mm_packs_epi16(cstep01, cstep23); 192 193 *outmask |= _mm_movemask_epi8(result); 194 } 195 196 197 { 198 __m128i cio4 = _mm_set1_epi32(cdiff); 199 __m128i cstep01, cstep23, result; 200 201 cstep0 = _mm_add_epi32(cstep0, cio4); 202 cstep1 = _mm_add_epi32(cstep1, cio4); 203 cstep2 = _mm_add_epi32(cstep2, cio4); 204 cstep3 = _mm_add_epi32(cstep3, cio4); 205 206 cstep01 = _mm_packs_epi32(cstep0, cstep1); 207 cstep23 = _mm_packs_epi32(cstep2, cstep3); 208 result = _mm_packs_epi16(cstep01, cstep23); 209 210 *partmask |= _mm_movemask_epi8(result); 211 } 212} 213 214 215static inline unsigned 216build_mask_linear_sse(int c, int dcdx, int dcdy) 217{ 218 __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 219 __m128i xdcdy = _mm_set1_epi32(dcdy); 220 221 /* Get values across the quad 222 */ 223 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); 224 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); 225 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); 226 227 /* pack pairs of results into epi16 228 */ 229 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); 230 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); 231 232 /* pack into epi8, preserving sign bits 233 */ 234 __m128i result = _mm_packs_epi16(cstep01, cstep23); 235 236 /* extract sign bits to create mask 237 */ 238 return _mm_movemask_epi8(result); 239} 240 241static inline unsigned 242sign_bits4(const __m128i *cstep, int cdiff) 243{ 244 245 /* Adjust the step values 246 */ 247 __m128i cio4 = _mm_set1_epi32(cdiff); 248 __m128i cstep0 = _mm_add_epi32(cstep[0], cio4); 249 __m128i cstep1 = _mm_add_epi32(cstep[1], cio4); 250 __m128i cstep2 = _mm_add_epi32(cstep[2], cio4); 251 __m128i cstep3 = _mm_add_epi32(cstep[3], cio4); 252 253 /* Pack down to epi8 254 */ 255 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); 256 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); 257 __m128i result = _mm_packs_epi16(cstep01, cstep23); 258 259 /* Extract the sign bits 260 */ 261 return _mm_movemask_epi8(result); 262} 263 264#define COLUMN0 ((1<<0)|(1<<4)|(1<<8) |(1<<12)) 265#define COLUMN1 ((1<<1)|(1<<5)|(1<<9) |(1<<13)) 266#define COLUMN2 ((1<<2)|(1<<6)|(1<<10)|(1<<14)) 267#define COLUMN3 ((1<<3)|(1<<7)|(1<<11)|(1<<15)) 268 269#define ROW0 ((1<<0) |(1<<1) |(1<<2) |(1<<3)) 270#define ROW1 ((1<<4) |(1<<5) |(1<<6) |(1<<7)) 271#define ROW2 ((1<<8) |(1<<9) |(1<<10)|(1<<11)) 272#define ROW3 ((1<<12)|(1<<13)|(1<<14)|(1<<15)) 273 274#define STAMP_SIZE 4 275static unsigned bottom_mask_tab[STAMP_SIZE] = { 276 ROW3, 277 ROW3 | ROW2, 278 ROW3 | ROW2 | ROW1, 279 ROW3 | ROW2 | ROW1 | ROW0, 280}; 281 282static unsigned right_mask_tab[STAMP_SIZE] = { 283 COLUMN3, 284 COLUMN3 | COLUMN2, 285 COLUMN3 | COLUMN2 | COLUMN1, 286 COLUMN3 | COLUMN2 | COLUMN1 | COLUMN0, 287}; 288 289 290#define NR_PLANES 3 291 292void 293lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 294 const union lp_rast_cmd_arg arg) 295{ 296 const struct lp_rast_triangle *tri = arg.triangle.tri; 297 const struct lp_rast_plane *plane = GET_PLANES(tri); 298 int x = (arg.triangle.plane_mask & 0xff) + task->x; 299 int y = (arg.triangle.plane_mask >> 8) + task->y; 300 unsigned i, j; 301 302 struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; 303 unsigned nr = 0; 304 305 /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */ 306 __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */ 307 __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); 308 __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); 309 __m128i zero = _mm_setzero_si128(); 310 311 __m128i c, dcdx, dcdy, rej4; 312 __m128i dcdx_neg_mask, dcdy_neg_mask; 313 __m128i dcdx2, dcdx3; 314 315 __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ 316 __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ 317 __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ 318 __m128i unused; 319 320 transpose4_epi32(&p0, &p1, &p2, &zero, 321 &c, &unused, &dcdx, &dcdy); 322 323 /* recalc eo - easier than trying to load as scalars / shuffle... */ 324 dcdx_neg_mask = _mm_srai_epi32(dcdx, 31); 325 dcdy_neg_mask = _mm_srai_epi32(dcdy, 31); 326 rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy), 327 _mm_and_si128(dcdx_neg_mask, dcdx)); 328 329 /* Adjust dcdx; 330 */ 331 dcdx = _mm_sub_epi32(zero, dcdx); 332 333 c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); 334 c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); 335 rej4 = _mm_slli_epi32(rej4, 2); 336 337 /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ 338 c = _mm_sub_epi32(c, _mm_set1_epi32(1)); 339 rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1)); 340 341 dcdx2 = _mm_add_epi32(dcdx, dcdx); 342 dcdx3 = _mm_add_epi32(dcdx2, dcdx); 343 344 transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, 345 &span_0, &span_1, &span_2, &unused); 346 347 for (i = 0; i < 4; i++) { 348 __m128i cx = c; 349 350 for (j = 0; j < 4; j++) { 351 __m128i c4rej = _mm_add_epi32(cx, rej4); 352 __m128i rej_masks = _mm_srai_epi32(c4rej, 31); 353 354 /* if (is_zero(rej_masks)) */ 355 if (_mm_movemask_epi8(rej_masks) == 0) { 356 __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0); 357 __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1); 358 __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2); 359 360 __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); 361 362 __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); 363 __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); 364 __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); 365 366 __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); 367 __m128i c_01 = _mm_packs_epi32(c_0, c_1); 368 369 __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); 370 __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); 371 __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); 372 373 __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); 374 375 __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); 376 __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); 377 __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); 378 379 __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); 380 __m128i c_23 = _mm_packs_epi32(c_2, c_3); 381 __m128i c_0123 = _mm_packs_epi16(c_01, c_23); 382 383 unsigned mask = _mm_movemask_epi8(c_0123); 384 385 out[nr].i = i; 386 out[nr].j = j; 387 out[nr].mask = mask; 388 if (mask != 0xffff) 389 nr++; 390 } 391 cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2)); 392 } 393 394 c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2)); 395 } 396 397 for (i = 0; i < nr; i++) 398 lp_rast_shade_quads_mask(task, 399 &tri->inputs, 400 x + 4 * out[i].j, 401 y + 4 * out[i].i, 402 0xffff & ~out[i].mask); 403} 404 405void 406lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, 407 const union lp_rast_cmd_arg arg) 408{ 409 const struct lp_rast_triangle *tri = arg.triangle.tri; 410 const struct lp_rast_plane *plane = GET_PLANES(tri); 411 unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; 412 unsigned y = (arg.triangle.plane_mask >> 8) + task->y; 413 414 /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */ 415 __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */ 416 __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]); 417 __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); 418 __m128i zero = _mm_setzero_si128(); 419 420 __m128i c, dcdx, dcdy; 421 __m128i dcdx2, dcdx3; 422 423 __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ 424 __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ 425 __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ 426 __m128i unused; 427 428 transpose4_epi32(&p0, &p1, &p2, &zero, 429 &c, &unused, &dcdx, &dcdy); 430 431 /* Adjust dcdx; 432 */ 433 dcdx = _mm_sub_epi32(zero, dcdx); 434 435 c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); 436 c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); 437 438 /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ 439 c = _mm_sub_epi32(c, _mm_set1_epi32(1)); 440 441 dcdx2 = _mm_add_epi32(dcdx, dcdx); 442 dcdx3 = _mm_add_epi32(dcdx2, dcdx); 443 444 transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, 445 &span_0, &span_1, &span_2, &unused); 446 447 448 { 449 __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0); 450 __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1); 451 __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2); 452 453 __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); 454 455 __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); 456 __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); 457 __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); 458 459 __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); 460 __m128i c_01 = _mm_packs_epi32(c_0, c_1); 461 462 __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); 463 __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); 464 __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); 465 466 __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); 467 468 __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); 469 __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); 470 __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); 471 472 __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); 473 __m128i c_23 = _mm_packs_epi32(c_2, c_3); 474 __m128i c_0123 = _mm_packs_epi16(c_01, c_23); 475 476 unsigned mask = _mm_movemask_epi8(c_0123); 477 478 if (mask != 0xffff) 479 lp_rast_shade_quads_mask(task, 480 &tri->inputs, 481 x, 482 y, 483 0xffff & ~mask); 484 } 485} 486 487#undef NR_PLANES 488 489#else 490 491#if defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN 492 493#include <altivec.h> 494#include "util/u_pwr8.h" 495 496static inline void 497build_masks_ppc(int c, 498 int cdiff, 499 int dcdx, 500 int dcdy, 501 unsigned *outmask, 502 unsigned *partmask) 503{ 504 __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 505 __m128i xdcdy = (__m128i) vec_splats(dcdy); 506 507 /* Get values across the quad 508 */ 509 __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); 510 __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); 511 __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); 512 513 { 514 __m128i cstep01, cstep23, result; 515 516 cstep01 = vec_packs_epi32(cstep0, cstep1); 517 cstep23 = vec_packs_epi32(cstep2, cstep3); 518 result = vec_packs_epi16(cstep01, cstep23); 519 520 *outmask |= vec_movemask_epi8(result); 521 } 522 523 524 { 525 __m128i cio4 = (__m128i) vec_splats(cdiff); 526 __m128i cstep01, cstep23, result; 527 528 cstep0 = vec_add_epi32(cstep0, cio4); 529 cstep1 = vec_add_epi32(cstep1, cio4); 530 cstep2 = vec_add_epi32(cstep2, cio4); 531 cstep3 = vec_add_epi32(cstep3, cio4); 532 533 cstep01 = vec_packs_epi32(cstep0, cstep1); 534 cstep23 = vec_packs_epi32(cstep2, cstep3); 535 result = vec_packs_epi16(cstep01, cstep23); 536 537 *partmask |= vec_movemask_epi8(result); 538 } 539} 540 541static inline unsigned 542build_mask_linear_ppc(int c, int dcdx, int dcdy) 543{ 544 __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 545 __m128i xdcdy = (__m128i) vec_splats(dcdy); 546 547 /* Get values across the quad 548 */ 549 __m128i cstep1 = vec_add_epi32(cstep0, xdcdy); 550 __m128i cstep2 = vec_add_epi32(cstep1, xdcdy); 551 __m128i cstep3 = vec_add_epi32(cstep2, xdcdy); 552 553 /* pack pairs of results into epi16 554 */ 555 __m128i cstep01 = vec_packs_epi32(cstep0, cstep1); 556 __m128i cstep23 = vec_packs_epi32(cstep2, cstep3); 557 558 /* pack into epi8, preserving sign bits 559 */ 560 __m128i result = vec_packs_epi16(cstep01, cstep23); 561 562 /* extract sign bits to create mask 563 */ 564 return vec_movemask_epi8(result); 565} 566 567static inline __m128i 568lp_plane_to_m128i(const struct lp_rast_plane *plane) 569{ 570 return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, 571 (int32_t)plane->dcdy, (int32_t)plane->eo); 572} 573 574#define NR_PLANES 3 575 576void 577lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 578 const union lp_rast_cmd_arg arg) 579{ 580 const struct lp_rast_triangle *tri = arg.triangle.tri; 581 const struct lp_rast_plane *plane = GET_PLANES(tri); 582 int x = (arg.triangle.plane_mask & 0xff) + task->x; 583 int y = (arg.triangle.plane_mask >> 8) + task->y; 584 unsigned i, j; 585 586 struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; 587 unsigned nr = 0; 588 589 __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ 590 __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ 591 __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ 592 __m128i zero = vec_splats((unsigned char) 0); 593 594 __m128i c; 595 __m128i dcdx; 596 __m128i dcdy; 597 __m128i rej4; 598 599 __m128i dcdx2; 600 __m128i dcdx3; 601 602 __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ 603 __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ 604 __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ 605 __m128i unused; 606 607 __m128i vshuf_mask0; 608 __m128i vshuf_mask1; 609 __m128i vshuf_mask2; 610 611#if UTIL_ARCH_LITTLE_ENDIAN 612 vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100); 613 vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504); 614 vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908); 615#else 616 vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F); 617 vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B); 618 vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607); 619#endif 620 621 transpose4_epi32(&p0, &p1, &p2, &zero, 622 &c, &dcdx, &dcdy, &rej4); 623 624 /* Adjust dcdx; 625 */ 626 dcdx = vec_sub_epi32(zero, dcdx); 627 628 c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x))); 629 c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y))); 630 rej4 = vec_slli_epi32(rej4, 2); 631 632 /* 633 * Adjust so we can just check the sign bit (< 0 comparison), 634 * instead of having to do a less efficient <= 0 comparison 635 */ 636 c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1)); 637 rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1)); 638 639 dcdx2 = vec_add_epi32(dcdx, dcdx); 640 dcdx3 = vec_add_epi32(dcdx2, dcdx); 641 642 transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, 643 &span_0, &span_1, &span_2, &unused); 644 645 for (i = 0; i < 4; i++) { 646 __m128i cx = c; 647 648 for (j = 0; j < 4; j++) { 649 __m128i c4rej = vec_add_epi32(cx, rej4); 650 __m128i rej_masks = vec_srai_epi32(c4rej, 31); 651 652 /* if (is_zero(rej_masks)) */ 653 if (vec_movemask_epi8(rej_masks) == 0) { 654 __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0); 655 __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1); 656 __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2); 657 658 __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0); 659 660 __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0)); 661 __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1)); 662 __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2)); 663 664 __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1); 665 __m128i c_01 = vec_packs_epi32(c_0, c_1); 666 667 __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0)); 668 __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1)); 669 __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2)); 670 671 __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2); 672 673 __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0)); 674 __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1)); 675 __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2)); 676 677 __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3); 678 __m128i c_23 = vec_packs_epi32(c_2, c_3); 679 __m128i c_0123 = vec_packs_epi16(c_01, c_23); 680 681 unsigned mask = vec_movemask_epi8(c_0123); 682 683 out[nr].i = i; 684 out[nr].j = j; 685 out[nr].mask = mask; 686 if (mask != 0xffff) 687 nr++; 688 } 689 cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2)); 690 } 691 692 c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2)); 693 } 694 695 for (i = 0; i < nr; i++) 696 lp_rast_shade_quads_mask(task, 697 &tri->inputs, 698 x + 4 * out[i].j, 699 y + 4 * out[i].i, 700 0xffff & ~out[i].mask); 701} 702 703#undef NR_PLANES 704 705#else 706 707void 708lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, 709 const union lp_rast_cmd_arg arg) 710{ 711 union lp_rast_cmd_arg arg2; 712 arg2.triangle.tri = arg.triangle.tri; 713 arg2.triangle.plane_mask = (1<<3)-1; 714 lp_rast_triangle_32_3(task, arg2); 715} 716 717#endif /* _ARCH_PWR8 && UTIL_ARCH_LITTLE_ENDIAN */ 718 719void 720lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, 721 const union lp_rast_cmd_arg arg) 722{ 723 union lp_rast_cmd_arg arg2; 724 arg2.triangle.tri = arg.triangle.tri; 725 arg2.triangle.plane_mask = (1<<4)-1; 726 lp_rast_triangle_32_4(task, arg2); 727} 728 729void 730lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, 731 const union lp_rast_cmd_arg arg) 732{ 733 lp_rast_triangle_32_3_16(task, arg); 734} 735 736#endif 737 738#if defined PIPE_ARCH_SSE 739#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_sse((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) 740#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_sse((int)c, dcdx, dcdy) 741#elif (defined(_ARCH_PWR8) && UTIL_ARCH_LITTLE_ENDIAN) 742#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_ppc((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) 743#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_ppc((int)c, dcdx, dcdy) 744#else 745#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask) 746#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy) 747#endif 748 749#define RASTER_64 1 750 751#define TAG(x) x##_1 752#define NR_PLANES 1 753#include "lp_rast_tri_tmp.h" 754 755#define TAG(x) x##_2 756#define NR_PLANES 2 757#include "lp_rast_tri_tmp.h" 758 759#define TAG(x) x##_3 760#define NR_PLANES 3 761/*#define TRI_4 lp_rast_triangle_3_4*/ 762/*#define TRI_16 lp_rast_triangle_3_16*/ 763#include "lp_rast_tri_tmp.h" 764 765#define TAG(x) x##_4 766#define NR_PLANES 4 767/*#define TRI_16 lp_rast_triangle_4_16*/ 768#include "lp_rast_tri_tmp.h" 769 770#define TAG(x) x##_5 771#define NR_PLANES 5 772#include "lp_rast_tri_tmp.h" 773 774#define TAG(x) x##_6 775#define NR_PLANES 6 776#include "lp_rast_tri_tmp.h" 777 778#define TAG(x) x##_7 779#define NR_PLANES 7 780#include "lp_rast_tri_tmp.h" 781 782#define TAG(x) x##_8 783#define NR_PLANES 8 784#include "lp_rast_tri_tmp.h" 785 786#undef RASTER_64 787 788#define TAG(x) x##_32_1 789#define NR_PLANES 1 790#include "lp_rast_tri_tmp.h" 791 792#define TAG(x) x##_32_2 793#define NR_PLANES 2 794#include "lp_rast_tri_tmp.h" 795 796#define TAG(x) x##_32_3 797#define NR_PLANES 3 798/*#define TRI_4 lp_rast_triangle_3_4*/ 799/*#define TRI_16 lp_rast_triangle_3_16*/ 800#include "lp_rast_tri_tmp.h" 801 802#define TAG(x) x##_32_4 803#define NR_PLANES 4 804#ifdef PIPE_ARCH_SSE 805#define TRI_16 lp_rast_triangle_32_4_16 806#endif 807#include "lp_rast_tri_tmp.h" 808 809#define TAG(x) x##_32_5 810#define NR_PLANES 5 811#include "lp_rast_tri_tmp.h" 812 813#define TAG(x) x##_32_6 814#define NR_PLANES 6 815#include "lp_rast_tri_tmp.h" 816 817#define TAG(x) x##_32_7 818#define NR_PLANES 7 819#include "lp_rast_tri_tmp.h" 820 821#define TAG(x) x##_32_8 822#define NR_PLANES 8 823#include "lp_rast_tri_tmp.h" 824 825#define MULTISAMPLE 1 826#define RASTER_64 1 827 828#define TAG(x) x##_ms_1 829#define NR_PLANES 1 830#include "lp_rast_tri_tmp.h" 831 832#define TAG(x) x##_ms_2 833#define NR_PLANES 2 834#include "lp_rast_tri_tmp.h" 835 836#define TAG(x) x##_ms_3 837#define NR_PLANES 3 838/*#define TRI_4 lp_rast_triangle_3_4*/ 839/*#define TRI_16 lp_rast_triangle_3_16*/ 840#include "lp_rast_tri_tmp.h" 841 842#define TAG(x) x##_ms_4 843#define NR_PLANES 4 844/*#define TRI_16 lp_rast_triangle_4_16*/ 845#include "lp_rast_tri_tmp.h" 846 847#define TAG(x) x##_ms_5 848#define NR_PLANES 5 849#include "lp_rast_tri_tmp.h" 850 851#define TAG(x) x##_ms_6 852#define NR_PLANES 6 853#include "lp_rast_tri_tmp.h" 854 855#define TAG(x) x##_ms_7 856#define NR_PLANES 7 857#include "lp_rast_tri_tmp.h" 858 859#define TAG(x) x##_ms_8 860#define NR_PLANES 8 861#include "lp_rast_tri_tmp.h" 862 863#undef RASTER_64 864