1/**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file blend_jit.cpp 24 * 25 * @brief Implementation of the blend jitter 26 * 27 * Notes: 28 * 29 ******************************************************************************/ 30#include "jit_pch.hpp" 31#include "builder.h" 32#include "jit_api.h" 33#include "blend_jit.h" 34#include "gen_state_llvm.h" 35#include "functionpasses/passes.h" 36 37#include "util/compiler.h" 38 39// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized 40#define QUANTIZE_THRESHOLD 2 41 42using namespace llvm; 43using namespace SwrJit; 44 45////////////////////////////////////////////////////////////////////////// 46/// Interface to Jitting a blend shader 47////////////////////////////////////////////////////////////////////////// 48struct BlendJit : public Builder 49{ 50 BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; 51 52 template <bool Color, bool Alpha> 53 void GenerateBlendFactor(SWR_BLEND_FACTOR factor, 54 Value* constColor[4], 55 Value* src[4], 56 Value* src1[4], 57 Value* dst[4], 58 Value* result[4]) 59 { 60 Value* out[4]; 61 62 switch (factor) 63 { 64 case BLENDFACTOR_ONE: 65 out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f); 66 break; 67 case BLENDFACTOR_SRC_COLOR: 68 out[0] = src[0]; 69 out[1] = src[1]; 70 out[2] = src[2]; 71 out[3] = src[3]; 72 break; 73 case BLENDFACTOR_SRC_ALPHA: 74 out[0] = out[1] = out[2] = out[3] = src[3]; 75 break; 76 case BLENDFACTOR_DST_ALPHA: 77 out[0] = out[1] = out[2] = out[3] = dst[3]; 78 break; 79 case BLENDFACTOR_DST_COLOR: 80 out[0] = dst[0]; 81 out[1] = dst[1]; 82 out[2] = dst[2]; 83 out[3] = dst[3]; 84 break; 85 case BLENDFACTOR_SRC_ALPHA_SATURATE: 86 out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3])); 87 out[3] = VIMMED1(1.0f); 88 break; 89 case BLENDFACTOR_CONST_COLOR: 90 out[0] = constColor[0]; 91 out[1] = constColor[1]; 92 out[2] = constColor[2]; 93 out[3] = constColor[3]; 94 break; 95 case BLENDFACTOR_CONST_ALPHA: 96 out[0] = out[1] = out[2] = out[3] = constColor[3]; 97 break; 98 case BLENDFACTOR_SRC1_COLOR: 99 out[0] = src1[0]; 100 out[1] = src1[1]; 101 out[2] = src1[2]; 102 out[3] = src1[3]; 103 break; 104 case BLENDFACTOR_SRC1_ALPHA: 105 out[0] = out[1] = out[2] = out[3] = src1[3]; 106 break; 107 case BLENDFACTOR_ZERO: 108 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); 109 break; 110 case BLENDFACTOR_INV_SRC_COLOR: 111 out[0] = FSUB(VIMMED1(1.0f), src[0]); 112 out[1] = FSUB(VIMMED1(1.0f), src[1]); 113 out[2] = FSUB(VIMMED1(1.0f), src[2]); 114 out[3] = FSUB(VIMMED1(1.0f), src[3]); 115 break; 116 case BLENDFACTOR_INV_SRC_ALPHA: 117 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]); 118 break; 119 case BLENDFACTOR_INV_DST_ALPHA: 120 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]); 121 break; 122 case BLENDFACTOR_INV_DST_COLOR: 123 out[0] = FSUB(VIMMED1(1.0f), dst[0]); 124 out[1] = FSUB(VIMMED1(1.0f), dst[1]); 125 out[2] = FSUB(VIMMED1(1.0f), dst[2]); 126 out[3] = FSUB(VIMMED1(1.0f), dst[3]); 127 break; 128 case BLENDFACTOR_INV_CONST_COLOR: 129 out[0] = FSUB(VIMMED1(1.0f), constColor[0]); 130 out[1] = FSUB(VIMMED1(1.0f), constColor[1]); 131 out[2] = FSUB(VIMMED1(1.0f), constColor[2]); 132 out[3] = FSUB(VIMMED1(1.0f), constColor[3]); 133 break; 134 case BLENDFACTOR_INV_CONST_ALPHA: 135 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]); 136 break; 137 case BLENDFACTOR_INV_SRC1_COLOR: 138 out[0] = FSUB(VIMMED1(1.0f), src1[0]); 139 out[1] = FSUB(VIMMED1(1.0f), src1[1]); 140 out[2] = FSUB(VIMMED1(1.0f), src1[2]); 141 out[3] = FSUB(VIMMED1(1.0f), src1[3]); 142 break; 143 case BLENDFACTOR_INV_SRC1_ALPHA: 144 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]); 145 break; 146 default: 147 SWR_INVALID("Unsupported blend factor: %d", factor); 148 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); 149 break; 150 } 151 152 if (Color) 153 { 154 result[0] = out[0]; 155 result[1] = out[1]; 156 result[2] = out[2]; 157 } 158 159 if (Alpha) 160 { 161 result[3] = out[3]; 162 } 163 } 164 165 void Clamp(SWR_FORMAT format, Value* src[4]) 166 { 167 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 168 SWR_TYPE type = info.type[0]; 169 170 switch (type) 171 { 172 default: 173 break; 174 175 case SWR_TYPE_UNORM: 176 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f)); 177 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f)); 178 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f)); 179 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f)); 180 break; 181 182 case SWR_TYPE_SNORM: 183 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f)); 184 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f)); 185 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f)); 186 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f)); 187 break; 188 189 case SWR_TYPE_UNKNOWN: 190 SWR_INVALID("Unsupported format type: %d", type); 191 } 192 } 193 194 void ApplyDefaults(SWR_FORMAT format, Value* src[4]) 195 { 196 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 197 198 bool valid[] = {false, false, false, false}; 199 for (uint32_t c = 0; c < info.numComps; ++c) 200 { 201 valid[info.swizzle[c]] = true; 202 } 203 204 for (uint32_t c = 0; c < 4; ++c) 205 { 206 if (!valid[c]) 207 { 208 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty); 209 } 210 } 211 } 212 213 void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4]) 214 { 215 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 216 217 for (uint32_t c = 0; c < info.numComps; ++c) 218 { 219 if (info.type[c] == SWR_TYPE_UNUSED) 220 { 221 src[info.swizzle[c]] = 222 BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty); 223 } 224 } 225 } 226 227 void Quantize(SWR_FORMAT format, Value* src[4]) 228 { 229 const SWR_FORMAT_INFO& info = GetFormatInfo(format); 230 for (uint32_t c = 0; c < info.numComps; ++c) 231 { 232 if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED) 233 { 234 uint32_t swizComp = info.swizzle[c]; 235 float factor = (float)((1 << info.bpc[c]) - 1); 236 switch (info.type[c]) 237 { 238 case SWR_TYPE_UNORM: 239 src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f)); 240 src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO)); 241 src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor)); 242 break; 243 default: 244 SWR_INVALID("Unsupported format type: %d", info.type[c]); 245 } 246 } 247 } 248 } 249 250 template <bool Color, bool Alpha> 251 void BlendFunc(SWR_BLEND_OP blendOp, 252 Value* src[4], 253 Value* srcFactor[4], 254 Value* dst[4], 255 Value* dstFactor[4], 256 Value* result[4]) 257 { 258 Value* out[4]; 259 Value* srcBlend[4]; 260 Value* dstBlend[4]; 261 for (uint32_t i = 0; i < 4; ++i) 262 { 263 srcBlend[i] = FMUL(src[i], srcFactor[i]); 264 dstBlend[i] = FMUL(dst[i], dstFactor[i]); 265 } 266 267 switch (blendOp) 268 { 269 case BLENDOP_ADD: 270 out[0] = FADD(srcBlend[0], dstBlend[0]); 271 out[1] = FADD(srcBlend[1], dstBlend[1]); 272 out[2] = FADD(srcBlend[2], dstBlend[2]); 273 out[3] = FADD(srcBlend[3], dstBlend[3]); 274 break; 275 276 case BLENDOP_SUBTRACT: 277 out[0] = FSUB(srcBlend[0], dstBlend[0]); 278 out[1] = FSUB(srcBlend[1], dstBlend[1]); 279 out[2] = FSUB(srcBlend[2], dstBlend[2]); 280 out[3] = FSUB(srcBlend[3], dstBlend[3]); 281 break; 282 283 case BLENDOP_REVSUBTRACT: 284 out[0] = FSUB(dstBlend[0], srcBlend[0]); 285 out[1] = FSUB(dstBlend[1], srcBlend[1]); 286 out[2] = FSUB(dstBlend[2], srcBlend[2]); 287 out[3] = FSUB(dstBlend[3], srcBlend[3]); 288 break; 289 290 case BLENDOP_MIN: 291 out[0] = VMINPS(src[0], dst[0]); 292 out[1] = VMINPS(src[1], dst[1]); 293 out[2] = VMINPS(src[2], dst[2]); 294 out[3] = VMINPS(src[3], dst[3]); 295 break; 296 297 case BLENDOP_MAX: 298 out[0] = VMAXPS(src[0], dst[0]); 299 out[1] = VMAXPS(src[1], dst[1]); 300 out[2] = VMAXPS(src[2], dst[2]); 301 out[3] = VMAXPS(src[3], dst[3]); 302 break; 303 304 default: 305 SWR_INVALID("Unsupported blend operation: %d", blendOp); 306 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); 307 break; 308 } 309 310 if (Color) 311 { 312 result[0] = out[0]; 313 result[1] = out[1]; 314 result[2] = out[2]; 315 } 316 317 if (Alpha) 318 { 319 result[3] = out[3]; 320 } 321 } 322 323 void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4]) 324 { 325 // Op: (s == PS output, d = RT contents) 326 switch (logicOp) 327 { 328 case LOGICOP_CLEAR: 329 result[0] = VIMMED1(0); 330 result[1] = VIMMED1(0); 331 result[2] = VIMMED1(0); 332 result[3] = VIMMED1(0); 333 break; 334 335 case LOGICOP_NOR: 336 // ~(s | d) 337 result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); 338 result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); 339 result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); 340 result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); 341 break; 342 343 case LOGICOP_AND_INVERTED: 344 // ~s & d 345 // todo: use avx andnot instr when I can find the intrinsic to call 346 result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); 347 result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); 348 result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); 349 result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); 350 break; 351 352 case LOGICOP_COPY_INVERTED: 353 // ~s 354 result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF)); 355 result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF)); 356 result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF)); 357 result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF)); 358 break; 359 360 case LOGICOP_AND_REVERSE: 361 // s & ~d 362 // todo: use avx andnot instr when I can find the intrinsic to call 363 result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); 364 result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); 365 result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); 366 result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); 367 break; 368 369 case LOGICOP_INVERT: 370 // ~d 371 result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF)); 372 result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF)); 373 result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF)); 374 result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF)); 375 break; 376 377 case LOGICOP_XOR: 378 // s ^ d 379 result[0] = XOR(src[0], dst[0]); 380 result[1] = XOR(src[1], dst[1]); 381 result[2] = XOR(src[2], dst[2]); 382 result[3] = XOR(src[3], dst[3]); 383 break; 384 385 case LOGICOP_NAND: 386 // ~(s & d) 387 result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); 388 result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); 389 result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); 390 result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); 391 break; 392 393 case LOGICOP_AND: 394 // s & d 395 result[0] = AND(src[0], dst[0]); 396 result[1] = AND(src[1], dst[1]); 397 result[2] = AND(src[2], dst[2]); 398 result[3] = AND(src[3], dst[3]); 399 break; 400 401 case LOGICOP_EQUIV: 402 // ~(s ^ d) 403 result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); 404 result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); 405 result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); 406 result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); 407 break; 408 409 case LOGICOP_NOOP: 410 result[0] = dst[0]; 411 result[1] = dst[1]; 412 result[2] = dst[2]; 413 result[3] = dst[3]; 414 break; 415 416 case LOGICOP_OR_INVERTED: 417 // ~s | d 418 result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); 419 result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); 420 result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); 421 result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); 422 break; 423 424 case LOGICOP_COPY: 425 result[0] = src[0]; 426 result[1] = src[1]; 427 result[2] = src[2]; 428 result[3] = src[3]; 429 break; 430 431 case LOGICOP_OR_REVERSE: 432 // s | ~d 433 result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); 434 result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); 435 result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); 436 result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); 437 break; 438 439 case LOGICOP_OR: 440 // s | d 441 result[0] = OR(src[0], dst[0]); 442 result[1] = OR(src[1], dst[1]); 443 result[2] = OR(src[2], dst[2]); 444 result[3] = OR(src[3], dst[3]); 445 break; 446 447 case LOGICOP_SET: 448 result[0] = VIMMED1(0xFFFFFFFF); 449 result[1] = VIMMED1(0xFFFFFFFF); 450 result[2] = VIMMED1(0xFFFFFFFF); 451 result[3] = VIMMED1(0xFFFFFFFF); 452 break; 453 454 default: 455 SWR_INVALID("Unsupported logic operation: %d", logicOp); 456 result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f); 457 break; 458 } 459 } 460 461 void 462 AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask) 463 { 464 // load uint32_t reference 465 Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference})); 466 467 // load alpha 468 Value* pAlpha = LOAD(ppAlpha, {0, 0}); 469 470 Value* pTest = nullptr; 471 if (state.alphaTestFormat == ALPHA_TEST_UNORM8) 472 { 473 // convert float alpha to unorm8 474 Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f)); 475 pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty); 476 477 // compare 478 switch (state.alphaTestFunction) 479 { 480 case ZFUNC_ALWAYS: 481 pTest = VIMMED1(true); 482 break; 483 case ZFUNC_NEVER: 484 pTest = VIMMED1(false); 485 break; 486 case ZFUNC_LT: 487 pTest = ICMP_ULT(pAlphaU8, pRef); 488 break; 489 case ZFUNC_EQ: 490 pTest = ICMP_EQ(pAlphaU8, pRef); 491 break; 492 case ZFUNC_LE: 493 pTest = ICMP_ULE(pAlphaU8, pRef); 494 break; 495 case ZFUNC_GT: 496 pTest = ICMP_UGT(pAlphaU8, pRef); 497 break; 498 case ZFUNC_NE: 499 pTest = ICMP_NE(pAlphaU8, pRef); 500 break; 501 case ZFUNC_GE: 502 pTest = ICMP_UGE(pAlphaU8, pRef); 503 break; 504 default: 505 SWR_INVALID("Invalid alpha test function"); 506 break; 507 } 508 } 509 else 510 { 511 // cast ref to float 512 pRef = BITCAST(pRef, mSimdFP32Ty); 513 514 // compare 515 switch (state.alphaTestFunction) 516 { 517 case ZFUNC_ALWAYS: 518 pTest = VIMMED1(true); 519 break; 520 case ZFUNC_NEVER: 521 pTest = VIMMED1(false); 522 break; 523 case ZFUNC_LT: 524 pTest = FCMP_OLT(pAlpha, pRef); 525 break; 526 case ZFUNC_EQ: 527 pTest = FCMP_OEQ(pAlpha, pRef); 528 break; 529 case ZFUNC_LE: 530 pTest = FCMP_OLE(pAlpha, pRef); 531 break; 532 case ZFUNC_GT: 533 pTest = FCMP_OGT(pAlpha, pRef); 534 break; 535 case ZFUNC_NE: 536 pTest = FCMP_ONE(pAlpha, pRef); 537 break; 538 case ZFUNC_GE: 539 pTest = FCMP_OGE(pAlpha, pRef); 540 break; 541 default: 542 SWR_INVALID("Invalid alpha test function"); 543 break; 544 } 545 } 546 547 // load current mask 548 Value* pMask = LOAD(ppMask); 549 550 // convert to int1 mask 551 pMask = MASK(pMask); 552 553 // and with alpha test result 554 pMask = AND(pMask, pTest); 555 556 // convert back to vector mask 557 pMask = VMASK(pMask); 558 559 // store new mask 560 STORE(pMask, ppMask); 561 } 562 563 Function* Create(const BLEND_COMPILE_STATE& state) 564 { 565 std::stringstream fnName("BLND_", 566 std::ios_base::in | std::ios_base::out | std::ios_base::ate); 567 fnName << ComputeCRC(0, &state, sizeof(state)); 568 569 // blend function signature 570 // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*); 571 572 std::vector<Type*> args{ 573 PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT* 574 }; 575 576 // std::vector<Type*> args{ 577 // PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT* 578 //}; 579 580 FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); 581 Function* blendFunc = Function::Create( 582 fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); 583 blendFunc->getParent()->setModuleIdentifier(blendFunc->getName()); 584 585 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc); 586 587 IRB()->SetInsertPoint(entry); 588 589 // arguments 590 auto argitr = blendFunc->arg_begin(); 591 Value* pBlendContext = &*argitr++; 592 pBlendContext->setName("pBlendContext"); 593 Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState}); 594 pBlendState->setName("pBlendState"); 595 Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src}); 596 pSrc->setName("src"); 597 Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1}); 598 pSrc1->setName("src1"); 599 Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha}); 600 pSrc0Alpha->setName("src0alpha"); 601 Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum}); 602 sampleNum->setName("sampleNum"); 603 Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst}); 604 pDst->setName("pDst"); 605 Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result}); 606 pResult->setName("result"); 607 Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask}); 608 ppoMask->setName("ppoMask"); 609 Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask}); 610 ppMask->setName("pMask"); 611 612 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, 613 "Unsupported hot tile format"); 614 Value* dst[4]; 615 Value* constantColor[4]; 616 Value* src[4]; 617 Value* src1[4]; 618 Value* result[4]; 619 for (uint32_t i = 0; i < 4; ++i) 620 { 621 // load hot tile 622 dst[i] = LOAD(pDst, {0, i}); 623 624 // load constant color 625 constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i})); 626 627 // load src 628 src[i] = LOAD(pSrc, {0, i}); 629 630 // load src1 631 src1[i] = LOAD(pSrc1, {0, i}); 632 } 633 Value* currentSampleMask = VIMMED1(-1); 634 if (state.desc.alphaToCoverageEnable) 635 { 636 Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); 637 uint32_t bits = (1 << state.desc.numSamples) - 1; 638 currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); 639 currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty); 640 } 641 642 // alpha test 643 if (state.desc.alphaTestEnable) 644 { 645 // Gather for archrast stats 646 STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested}); 647 AlphaTest(state, pBlendState, pSrc0Alpha, ppMask); 648 } 649 else 650 { 651 // Gather for archrast stats 652 STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested}); 653 } 654 655 // color blend 656 if (state.blendState.blendEnable) 657 { 658 // Gather for archrast stats 659 STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended}); 660 661 // clamp sources 662 Clamp(state.format, src); 663 Clamp(state.format, src1); 664 Clamp(state.format, dst); 665 Clamp(state.format, constantColor); 666 667 // apply defaults to hottile contents to take into account missing components 668 ApplyDefaults(state.format, dst); 669 670 // Force defaults for unused 'X' components 671 ApplyUnusedDefaults(state.format, dst); 672 673 // Quantize low precision components 674 Quantize(state.format, dst); 675 676 // special case clamping for R11G11B10_float which has no sign bit 677 if (state.format == R11G11B10_FLOAT) 678 { 679 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f)); 680 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f)); 681 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f)); 682 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f)); 683 } 684 685 Value* srcFactor[4]; 686 Value* dstFactor[4]; 687 if (state.desc.independentAlphaBlendEnable) 688 { 689 GenerateBlendFactor<true, false>( 690 state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); 691 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, 692 constantColor, 693 src, 694 src1, 695 dst, 696 srcFactor); 697 698 GenerateBlendFactor<true, false>( 699 state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); 700 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, 701 constantColor, 702 src, 703 src1, 704 dst, 705 dstFactor); 706 707 BlendFunc<true, false>( 708 state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); 709 BlendFunc<false, true>( 710 state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); 711 } 712 else 713 { 714 GenerateBlendFactor<true, true>( 715 state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); 716 GenerateBlendFactor<true, true>( 717 state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); 718 719 BlendFunc<true, true>( 720 state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); 721 } 722 723 // store results out 724 for (uint32_t i = 0; i < 4; ++i) 725 { 726 STORE(result[i], pResult, {0, i}); 727 } 728 } 729 else 730 { 731 // Gather for archrast stats 732 STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended}); 733 } 734 735 if (state.blendState.logicOpEnable) 736 { 737 const SWR_FORMAT_INFO& info = GetFormatInfo(state.format); 738 Value* vMask[4]; 739 float scale[4]; 740 741 if (!state.blendState.blendEnable) 742 { 743 Clamp(state.format, src); 744 Clamp(state.format, dst); 745 } 746 747 for (uint32_t i = 0; i < 4; i++) 748 { 749 if (info.type[i] == SWR_TYPE_UNUSED) 750 { 751 continue; 752 } 753 754 if (info.bpc[i] >= 32) 755 { 756 vMask[i] = VIMMED1(0xFFFFFFFF); 757 scale[i] = 0xFFFFFFFF; 758 } 759 else 760 { 761 vMask[i] = VIMMED1((1 << info.bpc[i]) - 1); 762 if (info.type[i] == SWR_TYPE_SNORM) 763 scale[i] = (1 << (info.bpc[i] - 1)) - 1; 764 else 765 scale[i] = (1 << info.bpc[i]) - 1; 766 } 767 768 switch (info.type[i]) 769 { 770 default: 771 SWR_INVALID("Unsupported type for logic op: %d", info.type[i]); 772 break; 773 774 case SWR_TYPE_UNKNOWN: 775 case SWR_TYPE_UNUSED: 776 FALLTHROUGH; 777 778 case SWR_TYPE_UINT: 779 case SWR_TYPE_SINT: 780 src[i] = BITCAST(src[i], mSimdInt32Ty); 781 dst[i] = BITCAST(dst[i], mSimdInt32Ty); 782 break; 783 case SWR_TYPE_SNORM: 784 src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty); 785 dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty); 786 break; 787 case SWR_TYPE_UNORM: 788 src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty); 789 dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty); 790 break; 791 } 792 } 793 794 LogicOpFunc(state.blendState.logicOpFunc, src, dst, result); 795 796 // store results out 797 for (uint32_t i = 0; i < 4; ++i) 798 { 799 if (info.type[i] == SWR_TYPE_UNUSED) 800 { 801 continue; 802 } 803 804 // clear upper bits from PS output not in RT format after doing logic op 805 result[i] = AND(result[i], vMask[i]); 806 807 switch (info.type[i]) 808 { 809 default: 810 SWR_INVALID("Unsupported type for logic op: %d", info.type[i]); 811 break; 812 813 case SWR_TYPE_UNKNOWN: 814 case SWR_TYPE_UNUSED: 815 FALLTHROUGH; 816 817 case SWR_TYPE_UINT: 818 case SWR_TYPE_SINT: 819 result[i] = BITCAST(result[i], mSimdFP32Ty); 820 break; 821 case SWR_TYPE_SNORM: 822 result[i] = SHL(result[i], C(32 - info.bpc[i])); 823 result[i] = ASHR(result[i], C(32 - info.bpc[i])); 824 result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i])); 825 break; 826 case SWR_TYPE_UNORM: 827 result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i])); 828 break; 829 } 830 831 STORE(result[i], pResult, {0, i}); 832 } 833 } 834 835 if (state.desc.oMaskEnable) 836 { 837 assert(!(state.desc.alphaToCoverageEnable)); 838 // load current mask 839 Value* oMask = LOAD(ppoMask); 840 currentSampleMask = AND(oMask, currentSampleMask); 841 } 842 843 if (state.desc.sampleMaskEnable) 844 { 845 Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask}); 846 currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask); 847 } 848 849 if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || 850 state.desc.oMaskEnable) 851 { 852 // load coverage mask and mask off any lanes with no samples 853 Value* pMask = LOAD(ppMask); 854 Value* sampleMasked = SHL(C(1), sampleNum); 855 currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked)); 856 currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty); 857 Value* outputMask = AND(pMask, currentSampleMask); 858 // store new mask 859 STORE(outputMask, GEP(ppMask, C(0))); 860 } 861 862 RET_VOID(); 863 864 JitManager::DumpToFile(blendFunc, ""); 865 866 ::FunctionPassManager passes(JM()->mpCurrentModule); 867 868 passes.add(createBreakCriticalEdgesPass()); 869 passes.add(createCFGSimplificationPass()); 870 passes.add(createEarlyCSEPass()); 871 passes.add(createPromoteMemoryToRegisterPass()); 872 passes.add(createCFGSimplificationPass()); 873 passes.add(createEarlyCSEPass()); 874 passes.add(createInstructionCombiningPass()); 875#if LLVM_VERSION_MAJOR <= 11 876 passes.add(createConstantPropagationPass()); 877#endif 878 passes.add(createSCCPPass()); 879 passes.add(createAggressiveDCEPass()); 880 881 passes.add(createLowerX86Pass(this)); 882 883 passes.run(*blendFunc); 884 885 JitManager::DumpToFile(blendFunc, "optimized"); 886 887 return blendFunc; 888 } 889}; 890 891////////////////////////////////////////////////////////////////////////// 892/// @brief JITs from fetch shader IR 893/// @param hJitMgr - JitManager handle 894/// @param func - LLVM function IR 895/// @return PFN_FETCH_FUNC - pointer to fetch code 896PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) 897{ 898 const llvm::Function* func = (const llvm::Function*)hFunc; 899 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 900 PFN_BLEND_JIT_FUNC pfnBlend; 901 pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); 902 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot 903 // add new IR to the module 904 pJitMgr->mIsModuleFinalized = true; 905 906 return pfnBlend; 907} 908 909////////////////////////////////////////////////////////////////////////// 910/// @brief JIT compiles blend shader 911/// @param hJitMgr - JitManager handle 912/// @param state - blend state to build function from 913extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, 914 const BLEND_COMPILE_STATE& state) 915{ 916 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); 917 918 pJitMgr->SetupNewModule(); 919 920 BlendJit theJit(pJitMgr); 921 HANDLE hFunc = theJit.Create(state); 922 923 return JitBlendFunc(hJitMgr, hFunc); 924} 925