1/************************************************************************** 2 * 3 * Copyright 2010-2021 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 17 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, 18 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 20 * USE OR OTHER DEALINGS IN THE SOFTWARE. 21 * 22 * The above copyright notice and this permission notice (including the 23 * next paragraph) shall be included in all copies or substantial portions 24 * of the Software. 
 *
 **************************************************************************/


#include "pipe/p_config.h"

#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_pack_color.h"
#include "util/u_surface.h"
#include "util/u_sse.h"

#include "lp_jit.h"
#include "lp_rast.h"
#include "lp_debug.h"
#include "lp_state_fs.h"
#include "lp_linear_priv.h"


#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>


/* State for nearest-filtered texel fetches along a span.  Interpolant
 * start values and gradients are kept in floating point; the fetch
 * callback pulls one row of up to 64 texels into out[].
 */
struct nearest_sampler {
   PIPE_ALIGN_VAR(16) uint32_t out[64];  /* one fetched row, 16-byte aligned */

   const struct lp_jit_texture *texture;
   float fsrc_x;                /* src_x0: starting s, in texel units */
   float fsrc_y;                /* src_y0: starting t, in texel units */
   float fdsdx;                 /* ds/dx */
   float fdsdy;                 /* ds/dy */
   float fdtdx;                 /* dt/dx */
   float fdtdy;                 /* dt/dy */
   int width;                   /* span width in pixels */
   int y;                       /* current row, advanced by each fetch */

   /* Row-fetch routine chosen by init_nearest_sampler(). */
   const uint32_t *(*fetch)(struct nearest_sampler *samp);
};


/* Per-span linear interpolation state (SSE vectors), one row at a time. */
struct linear_interp {
   PIPE_ALIGN_VAR(16) uint32_t out[64];
   __m128i a0;
   __m128i dadx;
   __m128i dady;
   int width;                   /* rounded up to multiple of 4 */
   boolean is_constant;
};

/* Organize all the information needed for blending in one place.
 * Could have blend function pointer here, but we currently always
 * know which one we want to call.
 */
struct color_blend {
   const uint32_t *src;
   uint8_t *color;
   int stride;
   int width;                   /* the exact width */
};


/* Organize all the information needed for running each of the shaders
 * in one place.
 */
struct shader {
   PIPE_ALIGN_VAR(16) uint32_t out0[64];
   const uint32_t *src0;
   const uint32_t *src1;
   __m128i const0;
   int width;                   /* rounded up to multiple of 4 */
};


/* For a row of pixels, perform add/one/inv_src_alpha (ie
 * premultiplied alpha) blending between the incoming pixels and the
 * destination buffer.
102 * 103 * Used to implement the BLIT_RGBA + blend shader, there are no 104 * operations from the pixel shader left to implement at this level - 105 * effectively the pixel shader was just a texture fetch which has 106 * already been performed. This routine then purely implements 107 * blending. 108 */ 109static void 110blend_premul(struct color_blend *blend) 111{ 112 const uint32_t *src = blend->src; /* aligned */ 113 uint32_t *dst = (uint32_t *)blend->color; /* unaligned */ 114 int width = blend->width; 115 int i; 116 __m128i tmp; 117 union { __m128i m128; uint ui[4]; } dstreg; 118 119 blend->color += blend->stride; 120 121 for (i = 0; i + 3 < width; i += 4) { 122 tmp = _mm_loadu_si128((const __m128i *)&dst[i]); /* UNALIGNED READ */ 123 dstreg.m128 = util_sse2_blend_premul_4(*(const __m128i *)&src[i], 124 tmp); 125 _mm_storeu_si128((__m128i *)&dst[i], dstreg.m128); /* UNALIGNED WRITE */ 126 } 127 128 if (i < width) { 129 int j; 130 for (j = 0; j < width - i ; j++) { 131 dstreg.ui[j] = dst[i+j]; 132 } 133 dstreg.m128 = util_sse2_blend_premul_4(*(const __m128i *)&src[i], 134 dstreg.m128); 135 for (; i < width; i++) 136 dst[i] = dstreg.ui[i&3]; 137 } 138} 139 140 141static void 142blend_noop(struct color_blend *blend) 143{ 144 memcpy(blend->color, blend->src, blend->width * sizeof(unsigned)); 145 blend->color += blend->stride; 146} 147 148 149static void 150init_blend(struct color_blend *blend, 151 int x, int y, int width, int height, 152 uint8_t *color, 153 int stride) 154{ 155 blend->color = color + x * 4 + y * stride; 156 blend->stride = stride; 157 blend->width = width; 158} 159 160 161/* 162 * Perform nearest filtered lookup of a row of texels. Texture lookup 163 * is assumed to be axis aligned but with arbitrary scaling. 164 * 165 * Texture coordinate interpolation is performed in 24.8 fixed point. 
166 * Note that the longest span we will encounter is 64 pixels long, 167 * meaning that 8 fractional bits is more than sufficient to represent 168 * the shallowest gradient possible within this span. 169 * 170 * After 64 pixels (ie. in the next tile), the starting point will be 171 * recalculated with floating point arithmetic. 172 * 173 * XXX: migrate this to use Jose's quad blitter texture fetch routines. 174 */ 175static const uint32_t * 176fetch_row(struct nearest_sampler *samp) 177{ 178 int y = samp->y++; 179 uint32_t *row = samp->out; 180 const struct lp_jit_texture *texture = samp->texture; 181 int yy = util_iround(samp->fsrc_y + samp->fdtdy * y); 182 const uint32_t *src_row = 183 (const uint32_t *)((const uint8_t *)texture->base + 184 yy * texture->row_stride[0]); 185 int iscale_x = samp->fdsdx * 256; 186 int acc = samp->fsrc_x * 256 + 128; 187 int width = samp->width; 188 int i; 189 190 for (i = 0; i < width; i++) { 191 row[i] = src_row[acc>>8]; 192 acc += iscale_x; 193 } 194 195 return row; 196} 197 198/* Version of fetch_row which can cope with texture edges. In 199 * practise, aero never triggers this. 200 */ 201static const uint32_t * 202fetch_row_clamped(struct nearest_sampler *samp) 203{ 204 int y = samp->y++; 205 uint32_t *row = samp->out; 206 const struct lp_jit_texture *texture = samp->texture; 207 208 int yy = util_iround(samp->fsrc_y + samp->fdtdy * y); 209 210 const uint32_t *src_row = 211 (const uint32_t *)((const uint8_t *)texture->base + 212 CLAMP(yy, 0, texture->height-1) * 213 texture->row_stride[0]); 214 float src_x0 = samp->fsrc_x; 215 float scale_x = samp->fdsdx; 216 int width = samp->width; 217 int i; 218 219 for (i = 0; i < width; i++) { 220 row[i] = src_row[CLAMP(util_iround(src_x0 + i*scale_x),0,texture->width-1)]; 221 } 222 223 return row; 224} 225 226/* It vary rarely happens that some non-axis-aligned texturing creeps 227 * into the linear path. Handle it here. 
The alternative would be 228 * more pre-checking or an option to fallback by returning false from 229 * jit_linear. 230 */ 231static const uint32_t * 232fetch_row_xy_clamped(struct nearest_sampler *samp) 233{ 234 int y = samp->y++; 235 uint32_t *row = samp->out; 236 const struct lp_jit_texture *texture = samp->texture; 237 float yrow = samp->fsrc_y + samp->fdtdy * y; 238 float xrow = samp->fsrc_x + samp->fdsdy * y; 239 int width = samp->width; 240 int i; 241 242 for (i = 0; i < width; i++) { 243 int yy = util_iround(yrow + samp->fdtdx * i); 244 int xx = util_iround(xrow + samp->fdsdx * i); 245 246 const uint32_t *src_row = 247 (const uint32_t *)((const uint8_t *)texture->base + 248 CLAMP(yy, 0, texture->height-1) * 249 texture->row_stride[0]); 250 251 row[i] = src_row[CLAMP(xx,0,texture->width-1)]; 252 } 253 254 return row; 255} 256 257 258static boolean 259init_nearest_sampler(struct nearest_sampler *samp, 260 const struct lp_jit_texture *texture, 261 int x0, int y0, 262 int width, int height, 263 float s0, float dsdx, float dsdy, 264 float t0, float dtdx, float dtdy, 265 float w0, float dwdx, float dwdy) 266{ 267 int i; 268 float oow = 1.0f / w0; 269 270 if (dwdx != 0.0 || dwdy != 0.0) 271 return FALSE; 272 273 samp->texture = texture; 274 samp->width = width; 275 samp->fdsdx = dsdx * texture->width * oow; 276 samp->fdsdy = dsdy * texture->width * oow; 277 samp->fdtdx = dtdx * texture->height * oow; 278 samp->fdtdy = dtdy * texture->height * oow; 279 samp->fsrc_x = (samp->fdsdx * x0 + 280 samp->fdsdy * y0 + 281 s0 * texture->width * oow - 0.5f); 282 283 samp->fsrc_y = (samp->fdtdx * x0 + 284 samp->fdtdy * y0 + 285 t0 * texture->height * oow - 0.5f); 286 samp->y = 0; 287 288 /* Because we want to permit consumers of this data to round up to 289 * the next multiple of 4, and because we don't want valgrind to 290 * complain about uninitialized reads, set the last bit of the 291 * buffer to zero: 292 */ 293 for (i = width; i & 3; i++) 294 samp->out[i] = 0; 295 296 if 
(dsdy != 0 || dtdx != 0) 297 { 298 /* Arbitrary texture lookup: 299 */ 300 samp->fetch = fetch_row_xy_clamped; 301 } 302 else 303 { 304 /* Axis aligned stretch blit, abitrary scaling factors including 305 * flipped, minifying and magnifying: 306 */ 307 int isrc_x = util_iround(samp->fsrc_x); 308 int isrc_y = util_iround(samp->fsrc_y); 309 int isrc_x1 = util_iround(samp->fsrc_x + width * samp->fdsdx); 310 int isrc_y1 = util_iround(samp->fsrc_y + height * samp->fdtdy); 311 312 /* Look at the maximum and minimum texture coordinates we will be 313 * fetching and figure out if we need to use clamping. There is 314 * similar code in u_blit_sw.c which takes a better approach to 315 * this which could be substituted later. 316 */ 317 if (isrc_x <= texture->width && isrc_x >= 0 && 318 isrc_y <= texture->height && isrc_y >= 0 && 319 isrc_x1 <= texture->width && isrc_x1 >= 0 && 320 isrc_y1 <= texture->height && isrc_y1 >= 0) 321 { 322 samp->fetch = fetch_row; 323 } 324 else { 325 samp->fetch = fetch_row_clamped; 326 } 327 } 328 329 return TRUE; 330} 331 332 333static const uint32_t * 334shade_rgb1(struct shader *shader) 335{ 336 const __m128i rgb1 = _mm_set1_epi32(0xff000000); 337 const uint32_t *src0 = shader->src0; 338 uint32_t *dst = shader->out0; 339 int width = shader->width; 340 int i; 341 342 for (i = 0; i + 3 < width; i += 4) { 343 __m128i s = *(const __m128i *)&src0[i]; 344 *(__m128i *)&dst[i] = _mm_or_si128(s, rgb1); 345 } 346 347 return shader->out0; 348} 349 350 351static void 352init_shader(struct shader *shader, 353 int x, int y, int width, int height) 354{ 355 shader->width = align(width, 4); 356} 357 358 359/* Linear shader which implements the BLIT_RGBA shader with the 360 * additional constraints imposed by lp_setup_is_blit(). 
 */
static boolean
blit_rgba_blit(const struct lp_rast_state *state,
               unsigned x, unsigned y,
               unsigned width, unsigned height,
               const float (*a0)[4],
               const float (*dadx)[4],
               const float (*dady)[4],
               uint8_t *color,
               unsigned stride)
{
   const struct lp_jit_context *context = &state->jit_context;
   const struct lp_jit_texture *texture = &context->textures[0];
   const uint8_t *src;
   unsigned src_stride;
   int src_x, src_y;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   /* Require w==1.0:
    */
   if (a0[0][3] != 1.0 ||
       dadx[0][3] != 0.0 ||
       dady[0][3] != 0.0)
      return FALSE;

   /* Translate the texcoord plane start (a0[1]) into integer texel
    * offsets for the top-left of the destination rectangle.
    */
   src_x = x + util_iround(a0[1][0]*texture->width - 0.5f);
   src_y = y + util_iround(a0[1][1]*texture->height - 0.5f);

   src = texture->base;
   src_stride = texture->row_stride[0];

   /* Fall back to blit_rgba() if clamping required:
    */
   if (src_x < 0 ||
       src_y < 0 ||
       src_x + width > texture->width ||
       src_y + height > texture->height)
      return FALSE;

   /* Pure memcpy-style rectangle copy, no swizzle or blend. */
   util_copy_rect(color, PIPE_FORMAT_B8G8R8A8_UNORM, stride,
                  x, y,
                  width, height,
                  src, src_stride,
                  src_x, src_y);

   return TRUE;
}


/* Linear shader which implements the BLIT_RGB1 shader, with the
 * additional constraints imposed by lp_setup_is_blit().
413 */ 414static boolean 415blit_rgb1_blit(const struct lp_rast_state *state, 416 unsigned x, unsigned y, 417 unsigned width, unsigned height, 418 const float (*a0)[4], 419 const float (*dadx)[4], 420 const float (*dady)[4], 421 uint8_t *color, 422 unsigned stride) 423{ 424 const struct lp_jit_context *context = &state->jit_context; 425 const struct lp_jit_texture *texture = &context->textures[0]; 426 const uint8_t *src; 427 unsigned src_stride; 428 int src_x, src_y; 429 430 LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); 431 432 /* Require w==1.0: 433 */ 434 if (a0[0][3] != 1.0 || 435 dadx[0][3] != 0.0 || 436 dady[0][3] != 0.0) 437 return FALSE; 438 439 color += x * 4 + y * stride; 440 441 src_x = x + util_iround(a0[1][0]*texture->width - 0.5f); 442 src_y = y + util_iround(a0[1][1]*texture->height - 0.5f); 443 444 src = texture->base; 445 src_stride = texture->row_stride[0]; 446 src += src_x * 4; 447 src += src_y * src_stride; 448 449 if (src_x < 0 || 450 src_y < 0 || 451 src_x + width > texture->width || 452 src_y + height > texture->height) 453 return FALSE; 454 455 for (y = 0; y < height; y++) { 456 const uint32_t *src_row = (const uint32_t *)src; 457 uint32_t *dst_row = (uint32_t *)color; 458 459 for (x = 0; x < width; x++) { 460 *dst_row++ = *src_row++ | 0xff000000; 461 } 462 463 color += stride; 464 src += src_stride; 465 } 466 467 return TRUE; 468} 469 470 471/* Linear shader variant implementing the BLIT_RGBA shader without 472 * blending. 
473 */ 474static boolean 475blit_rgba(const struct lp_rast_state *state, 476 unsigned x, unsigned y, 477 unsigned width, unsigned height, 478 const float (*a0)[4], 479 const float (*dadx)[4], 480 const float (*dady)[4], 481 uint8_t *color, 482 unsigned stride) 483{ 484 const struct lp_jit_context *context = &state->jit_context; 485 struct nearest_sampler samp; 486 struct color_blend blend; 487 488 LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); 489 490 if (!init_nearest_sampler(&samp, 491 &context->textures[0], 492 x, y, width, height, 493 a0[1][0], dadx[1][0], dady[1][0], 494 a0[1][1], dadx[1][1], dady[1][1], 495 a0[0][3], dadx[0][3], dady[0][3])) 496 return FALSE; 497 498 init_blend(&blend, 499 x, y, width, height, 500 color, stride); 501 502 /* Rasterize the rectangle and run the shader: 503 */ 504 for (y = 0; y < height; y++) { 505 blend.src = samp.fetch(&samp); 506 blend_noop(&blend); 507 } 508 509 return TRUE; 510} 511 512 513static boolean 514blit_rgb1(const struct lp_rast_state *state, 515 unsigned x, unsigned y, 516 unsigned width, unsigned height, 517 const float (*a0)[4], 518 const float (*dadx)[4], 519 const float (*dady)[4], 520 uint8_t *color, 521 unsigned stride) 522{ 523 const struct lp_jit_context *context = &state->jit_context; 524 struct nearest_sampler samp; 525 struct color_blend blend; 526 struct shader shader; 527 528 LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); 529 530 if (!init_nearest_sampler(&samp, 531 &context->textures[0], 532 x, y, width, height, 533 a0[1][0], dadx[1][0], dady[1][0], 534 a0[1][1], dadx[1][1], dady[1][1], 535 a0[0][3], dadx[0][3], dady[0][3])) 536 return FALSE; 537 538 init_blend(&blend, 539 x, y, width, height, 540 color, stride); 541 542 543 init_shader(&shader, 544 x, y, width, height); 545 546 /* Rasterize the rectangle and run the shader: 547 */ 548 for (y = 0; y < height; y++) { 549 shader.src0 = samp.fetch(&samp); 550 blend.src = shade_rgb1(&shader); 551 blend_noop(&blend); 552 } 553 554 return TRUE; 555} 556 557 558/* 
Linear shader variant implementing the BLIT_RGBA shader with 559 * one/inv_src_alpha blending. 560 */ 561static boolean 562blit_rgba_blend_premul(const struct lp_rast_state *state, 563 unsigned x, unsigned y, 564 unsigned width, unsigned height, 565 const float (*a0)[4], 566 const float (*dadx)[4], 567 const float (*dady)[4], 568 uint8_t *color, 569 unsigned stride) 570{ 571 const struct lp_jit_context *context = &state->jit_context; 572 struct nearest_sampler samp; 573 struct color_blend blend; 574 575 LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); 576 577 if (!init_nearest_sampler(&samp, 578 &context->textures[0], 579 x, y, width, height, 580 a0[1][0], dadx[1][0], dady[1][0], 581 a0[1][1], dadx[1][1], dady[1][1], 582 a0[0][3], dadx[0][3], dady[0][3])) 583 return FALSE; 584 585 586 init_blend(&blend, 587 x, y, width, height, 588 color, stride); 589 590 /* Rasterize the rectangle and run the shader: 591 */ 592 for (y = 0; y < height; y++) { 593 blend.src = samp.fetch(&samp); 594 blend_premul(&blend); 595 } 596 597 return TRUE; 598} 599 600 601/* Linear shader which always emits red. Used for debugging. 602 */ 603static boolean 604linear_red(const struct lp_rast_state *state, 605 unsigned x, unsigned y, 606 unsigned width, unsigned height, 607 const float (*a0)[4], 608 const float (*dadx)[4], 609 const float (*dady)[4], 610 uint8_t *color, 611 unsigned stride) 612{ 613 union util_color uc; 614 615 util_pack_color_ub(0xff, 0, 0, 0xff, 616 PIPE_FORMAT_B8G8R8A8_UNORM, &uc); 617 618 util_fill_rect(color, 619 PIPE_FORMAT_B8G8R8A8_UNORM, 620 stride, 621 x, 622 y, 623 width, 624 height, 625 &uc); 626 627 return TRUE; 628} 629 630 631/* Noop linear shader variant, for debugging. 
 */
static boolean
linear_no_op(const struct lp_rast_state *state,
             unsigned x, unsigned y,
             unsigned width, unsigned height,
             const float (*a0)[4],
             const float (*dadx)[4],
             const float (*dady)[4],
             uint8_t *color,
             unsigned stride)
{
   return TRUE;
}

/* Check for ADD/ONE/INV_SRC_ALPHA, ie premultiplied-alpha blending.
 */
static boolean
is_one_inv_src_alpha_blend(const struct lp_fragment_shader_variant *variant)
{
   /* Both the rgb and alpha paths must be ADD with ONE/INV_SRC_ALPHA
    * factors, with no logicop and all four channels written, for
    * blend_premul() to be a valid substitute.
    */
   return
      !variant->key.blend.logicop_enable &&
      variant->key.blend.rt[0].blend_enable &&
      variant->key.blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
      variant->key.blend.rt[0].rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
      variant->key.blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
      variant->key.blend.rt[0].alpha_func == PIPE_BLEND_ADD &&
      variant->key.blend.rt[0].alpha_src_factor == PIPE_BLENDFACTOR_ONE &&
      variant->key.blend.rt[0].alpha_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
      variant->key.blend.rt[0].colormask == 0xf;
}


/* Examine the fragment shader variant and determine whether we can
 * substitute a fastpath linear shader implementation.
 */
void
llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant)
{
   /* Static sampler/texture state for unit 0 -- the only unit the
    * blit fastpaths read.
    */
   struct lp_sampler_static_state *samp0 = lp_fs_variant_key_sampler_idx(&variant->key, 0);

   /* Debug mode: replace all shading with solid red. */
   if (LP_PERF & PERF_NO_SHADE) {
      variant->jit_linear = linear_red;
      return;
   }

   if (!samp0)
      return;

   enum pipe_format tex_format = samp0->texture_state.format;
   if (variant->shader->kind == LP_FS_KIND_BLIT_RGBA &&
       tex_format == PIPE_FORMAT_B8G8R8A8_UNORM &&
       is_nearest_clamp_sampler(samp0)) {
      if (variant->opaque) {
         /* Opaque: no blending needed, so the memcpy-style blit is
          * also usable.
          */
         variant->jit_linear_blit = blit_rgba_blit;
         variant->jit_linear = blit_rgba;
      }
      else if (is_one_inv_src_alpha_blend(variant) &&
               util_get_cpu_caps()->has_sse2) {
         variant->jit_linear = blit_rgba_blend_premul;
      }
      return;
   }

   if (variant->shader->kind == LP_FS_KIND_BLIT_RGB1 &&
       variant->opaque &&
       (tex_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
        tex_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
       is_nearest_clamp_sampler(samp0)) {
      variant->jit_linear_blit = blit_rgb1_blit;
      variant->jit_linear = blit_rgb1;
      return;
   }

   /* Deliberately dead branch: flip to 1 to force the no-op shader
    * when debugging.
    */
   if (0) {
      variant->jit_linear = linear_no_op;
      return;
   }
}
#else
void
llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant)
{
   /* don't bother if there is no SSE */
}
#endif