1/****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file blend_jit.cpp
24 *
25 * @brief Implementation of the blend jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30#include "jit_pch.hpp"
31#include "builder.h"
32#include "jit_api.h"
33#include "blend_jit.h"
34#include "gen_state_llvm.h"
35#include "functionpasses/passes.h"
36
37#include "util/compiler.h"
38
39// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
40#define QUANTIZE_THRESHOLD 2
41
42using namespace llvm;
43using namespace SwrJit;
44
45//////////////////////////////////////////////////////////////////////////
46/// Interface to Jitting a blend shader
47//////////////////////////////////////////////////////////////////////////
48struct BlendJit : public Builder
49{
50    BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
51
52    template <bool Color, bool Alpha>
53    void GenerateBlendFactor(SWR_BLEND_FACTOR factor,
54                             Value*           constColor[4],
55                             Value*           src[4],
56                             Value*           src1[4],
57                             Value*           dst[4],
58                             Value*           result[4])
59    {
60        Value* out[4];
61
62        switch (factor)
63        {
64        case BLENDFACTOR_ONE:
65            out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
66            break;
67        case BLENDFACTOR_SRC_COLOR:
68            out[0] = src[0];
69            out[1] = src[1];
70            out[2] = src[2];
71            out[3] = src[3];
72            break;
73        case BLENDFACTOR_SRC_ALPHA:
74            out[0] = out[1] = out[2] = out[3] = src[3];
75            break;
76        case BLENDFACTOR_DST_ALPHA:
77            out[0] = out[1] = out[2] = out[3] = dst[3];
78            break;
79        case BLENDFACTOR_DST_COLOR:
80            out[0] = dst[0];
81            out[1] = dst[1];
82            out[2] = dst[2];
83            out[3] = dst[3];
84            break;
85        case BLENDFACTOR_SRC_ALPHA_SATURATE:
86            out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
87            out[3]                   = VIMMED1(1.0f);
88            break;
89        case BLENDFACTOR_CONST_COLOR:
90            out[0] = constColor[0];
91            out[1] = constColor[1];
92            out[2] = constColor[2];
93            out[3] = constColor[3];
94            break;
95        case BLENDFACTOR_CONST_ALPHA:
96            out[0] = out[1] = out[2] = out[3] = constColor[3];
97            break;
98        case BLENDFACTOR_SRC1_COLOR:
99            out[0] = src1[0];
100            out[1] = src1[1];
101            out[2] = src1[2];
102            out[3] = src1[3];
103            break;
104        case BLENDFACTOR_SRC1_ALPHA:
105            out[0] = out[1] = out[2] = out[3] = src1[3];
106            break;
107        case BLENDFACTOR_ZERO:
108            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
109            break;
110        case BLENDFACTOR_INV_SRC_COLOR:
111            out[0] = FSUB(VIMMED1(1.0f), src[0]);
112            out[1] = FSUB(VIMMED1(1.0f), src[1]);
113            out[2] = FSUB(VIMMED1(1.0f), src[2]);
114            out[3] = FSUB(VIMMED1(1.0f), src[3]);
115            break;
116        case BLENDFACTOR_INV_SRC_ALPHA:
117            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
118            break;
119        case BLENDFACTOR_INV_DST_ALPHA:
120            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
121            break;
122        case BLENDFACTOR_INV_DST_COLOR:
123            out[0] = FSUB(VIMMED1(1.0f), dst[0]);
124            out[1] = FSUB(VIMMED1(1.0f), dst[1]);
125            out[2] = FSUB(VIMMED1(1.0f), dst[2]);
126            out[3] = FSUB(VIMMED1(1.0f), dst[3]);
127            break;
128        case BLENDFACTOR_INV_CONST_COLOR:
129            out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
130            out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
131            out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
132            out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
133            break;
134        case BLENDFACTOR_INV_CONST_ALPHA:
135            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
136            break;
137        case BLENDFACTOR_INV_SRC1_COLOR:
138            out[0] = FSUB(VIMMED1(1.0f), src1[0]);
139            out[1] = FSUB(VIMMED1(1.0f), src1[1]);
140            out[2] = FSUB(VIMMED1(1.0f), src1[2]);
141            out[3] = FSUB(VIMMED1(1.0f), src1[3]);
142            break;
143        case BLENDFACTOR_INV_SRC1_ALPHA:
144            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
145            break;
146        default:
147            SWR_INVALID("Unsupported blend factor: %d", factor);
148            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
149            break;
150        }
151
152        if (Color)
153        {
154            result[0] = out[0];
155            result[1] = out[1];
156            result[2] = out[2];
157        }
158
159        if (Alpha)
160        {
161            result[3] = out[3];
162        }
163    }
164
165    void Clamp(SWR_FORMAT format, Value* src[4])
166    {
167        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
168        SWR_TYPE               type = info.type[0];
169
170        switch (type)
171        {
172        default:
173            break;
174
175        case SWR_TYPE_UNORM:
176            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
177            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
178            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
179            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
180            break;
181
182        case SWR_TYPE_SNORM:
183            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
184            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
185            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
186            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
187            break;
188
189        case SWR_TYPE_UNKNOWN:
190            SWR_INVALID("Unsupported format type: %d", type);
191        }
192    }
193
194    void ApplyDefaults(SWR_FORMAT format, Value* src[4])
195    {
196        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
197
198        bool valid[] = {false, false, false, false};
199        for (uint32_t c = 0; c < info.numComps; ++c)
200        {
201            valid[info.swizzle[c]] = true;
202        }
203
204        for (uint32_t c = 0; c < 4; ++c)
205        {
206            if (!valid[c])
207            {
208                src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
209            }
210        }
211    }
212
213    void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
214    {
215        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
216
217        for (uint32_t c = 0; c < info.numComps; ++c)
218        {
219            if (info.type[c] == SWR_TYPE_UNUSED)
220            {
221                src[info.swizzle[c]] =
222                    BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
223            }
224        }
225    }
226
227    void Quantize(SWR_FORMAT format, Value* src[4])
228    {
229        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
230        for (uint32_t c = 0; c < info.numComps; ++c)
231        {
232            if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
233            {
234                uint32_t swizComp = info.swizzle[c];
235                float    factor   = (float)((1 << info.bpc[c]) - 1);
236                switch (info.type[c])
237                {
238                case SWR_TYPE_UNORM:
239                    src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
240                    src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
241                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
242                    break;
243                default:
244                    SWR_INVALID("Unsupported format type: %d", info.type[c]);
245                }
246            }
247        }
248    }
249
250    template <bool Color, bool Alpha>
251    void BlendFunc(SWR_BLEND_OP blendOp,
252                   Value*       src[4],
253                   Value*       srcFactor[4],
254                   Value*       dst[4],
255                   Value*       dstFactor[4],
256                   Value*       result[4])
257    {
258        Value* out[4];
259        Value* srcBlend[4];
260        Value* dstBlend[4];
261        for (uint32_t i = 0; i < 4; ++i)
262        {
263            srcBlend[i] = FMUL(src[i], srcFactor[i]);
264            dstBlend[i] = FMUL(dst[i], dstFactor[i]);
265        }
266
267        switch (blendOp)
268        {
269        case BLENDOP_ADD:
270            out[0] = FADD(srcBlend[0], dstBlend[0]);
271            out[1] = FADD(srcBlend[1], dstBlend[1]);
272            out[2] = FADD(srcBlend[2], dstBlend[2]);
273            out[3] = FADD(srcBlend[3], dstBlend[3]);
274            break;
275
276        case BLENDOP_SUBTRACT:
277            out[0] = FSUB(srcBlend[0], dstBlend[0]);
278            out[1] = FSUB(srcBlend[1], dstBlend[1]);
279            out[2] = FSUB(srcBlend[2], dstBlend[2]);
280            out[3] = FSUB(srcBlend[3], dstBlend[3]);
281            break;
282
283        case BLENDOP_REVSUBTRACT:
284            out[0] = FSUB(dstBlend[0], srcBlend[0]);
285            out[1] = FSUB(dstBlend[1], srcBlend[1]);
286            out[2] = FSUB(dstBlend[2], srcBlend[2]);
287            out[3] = FSUB(dstBlend[3], srcBlend[3]);
288            break;
289
290        case BLENDOP_MIN:
291            out[0] = VMINPS(src[0], dst[0]);
292            out[1] = VMINPS(src[1], dst[1]);
293            out[2] = VMINPS(src[2], dst[2]);
294            out[3] = VMINPS(src[3], dst[3]);
295            break;
296
297        case BLENDOP_MAX:
298            out[0] = VMAXPS(src[0], dst[0]);
299            out[1] = VMAXPS(src[1], dst[1]);
300            out[2] = VMAXPS(src[2], dst[2]);
301            out[3] = VMAXPS(src[3], dst[3]);
302            break;
303
304        default:
305            SWR_INVALID("Unsupported blend operation: %d", blendOp);
306            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
307            break;
308        }
309
310        if (Color)
311        {
312            result[0] = out[0];
313            result[1] = out[1];
314            result[2] = out[2];
315        }
316
317        if (Alpha)
318        {
319            result[3] = out[3];
320        }
321    }
322
323    void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
324    {
325        // Op: (s == PS output, d = RT contents)
326        switch (logicOp)
327        {
328        case LOGICOP_CLEAR:
329            result[0] = VIMMED1(0);
330            result[1] = VIMMED1(0);
331            result[2] = VIMMED1(0);
332            result[3] = VIMMED1(0);
333            break;
334
335        case LOGICOP_NOR:
336            // ~(s | d)
337            result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
338            result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
339            result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
340            result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
341            break;
342
343        case LOGICOP_AND_INVERTED:
344            // ~s & d
345            // todo: use avx andnot instr when I can find the intrinsic to call
346            result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
347            result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
348            result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
349            result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
350            break;
351
352        case LOGICOP_COPY_INVERTED:
353            // ~s
354            result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
355            result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
356            result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
357            result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
358            break;
359
360        case LOGICOP_AND_REVERSE:
361            // s & ~d
362            // todo: use avx andnot instr when I can find the intrinsic to call
363            result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
364            result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
365            result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
366            result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
367            break;
368
369        case LOGICOP_INVERT:
370            // ~d
371            result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
372            result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
373            result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
374            result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
375            break;
376
377        case LOGICOP_XOR:
378            // s ^ d
379            result[0] = XOR(src[0], dst[0]);
380            result[1] = XOR(src[1], dst[1]);
381            result[2] = XOR(src[2], dst[2]);
382            result[3] = XOR(src[3], dst[3]);
383            break;
384
385        case LOGICOP_NAND:
386            // ~(s & d)
387            result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
388            result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
389            result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
390            result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
391            break;
392
393        case LOGICOP_AND:
394            // s & d
395            result[0] = AND(src[0], dst[0]);
396            result[1] = AND(src[1], dst[1]);
397            result[2] = AND(src[2], dst[2]);
398            result[3] = AND(src[3], dst[3]);
399            break;
400
401        case LOGICOP_EQUIV:
402            // ~(s ^ d)
403            result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
404            result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
405            result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
406            result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
407            break;
408
409        case LOGICOP_NOOP:
410            result[0] = dst[0];
411            result[1] = dst[1];
412            result[2] = dst[2];
413            result[3] = dst[3];
414            break;
415
416        case LOGICOP_OR_INVERTED:
417            // ~s | d
418            result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
419            result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
420            result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
421            result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
422            break;
423
424        case LOGICOP_COPY:
425            result[0] = src[0];
426            result[1] = src[1];
427            result[2] = src[2];
428            result[3] = src[3];
429            break;
430
431        case LOGICOP_OR_REVERSE:
432            // s | ~d
433            result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
434            result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
435            result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
436            result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
437            break;
438
439        case LOGICOP_OR:
440            // s | d
441            result[0] = OR(src[0], dst[0]);
442            result[1] = OR(src[1], dst[1]);
443            result[2] = OR(src[2], dst[2]);
444            result[3] = OR(src[3], dst[3]);
445            break;
446
447        case LOGICOP_SET:
448            result[0] = VIMMED1(0xFFFFFFFF);
449            result[1] = VIMMED1(0xFFFFFFFF);
450            result[2] = VIMMED1(0xFFFFFFFF);
451            result[3] = VIMMED1(0xFFFFFFFF);
452            break;
453
454        default:
455            SWR_INVALID("Unsupported logic operation: %d", logicOp);
456            result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
457            break;
458        }
459    }
460
461    void
462    AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
463    {
464        // load uint32_t reference
465        Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference}));
466
467        // load alpha
468        Value* pAlpha = LOAD(ppAlpha, {0, 0});
469
470        Value* pTest = nullptr;
471        if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
472        {
473            // convert float alpha to unorm8
474            Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
475            pAlphaU8        = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
476
477            // compare
478            switch (state.alphaTestFunction)
479            {
480            case ZFUNC_ALWAYS:
481                pTest = VIMMED1(true);
482                break;
483            case ZFUNC_NEVER:
484                pTest = VIMMED1(false);
485                break;
486            case ZFUNC_LT:
487                pTest = ICMP_ULT(pAlphaU8, pRef);
488                break;
489            case ZFUNC_EQ:
490                pTest = ICMP_EQ(pAlphaU8, pRef);
491                break;
492            case ZFUNC_LE:
493                pTest = ICMP_ULE(pAlphaU8, pRef);
494                break;
495            case ZFUNC_GT:
496                pTest = ICMP_UGT(pAlphaU8, pRef);
497                break;
498            case ZFUNC_NE:
499                pTest = ICMP_NE(pAlphaU8, pRef);
500                break;
501            case ZFUNC_GE:
502                pTest = ICMP_UGE(pAlphaU8, pRef);
503                break;
504            default:
505                SWR_INVALID("Invalid alpha test function");
506                break;
507            }
508        }
509        else
510        {
511            // cast ref to float
512            pRef = BITCAST(pRef, mSimdFP32Ty);
513
514            // compare
515            switch (state.alphaTestFunction)
516            {
517            case ZFUNC_ALWAYS:
518                pTest = VIMMED1(true);
519                break;
520            case ZFUNC_NEVER:
521                pTest = VIMMED1(false);
522                break;
523            case ZFUNC_LT:
524                pTest = FCMP_OLT(pAlpha, pRef);
525                break;
526            case ZFUNC_EQ:
527                pTest = FCMP_OEQ(pAlpha, pRef);
528                break;
529            case ZFUNC_LE:
530                pTest = FCMP_OLE(pAlpha, pRef);
531                break;
532            case ZFUNC_GT:
533                pTest = FCMP_OGT(pAlpha, pRef);
534                break;
535            case ZFUNC_NE:
536                pTest = FCMP_ONE(pAlpha, pRef);
537                break;
538            case ZFUNC_GE:
539                pTest = FCMP_OGE(pAlpha, pRef);
540                break;
541            default:
542                SWR_INVALID("Invalid alpha test function");
543                break;
544            }
545        }
546
547        // load current mask
548        Value* pMask = LOAD(ppMask);
549
550        // convert to int1 mask
551        pMask = MASK(pMask);
552
553        // and with alpha test result
554        pMask = AND(pMask, pTest);
555
556        // convert back to vector mask
557        pMask = VMASK(pMask);
558
559        // store new mask
560        STORE(pMask, ppMask);
561    }
562
563    Function* Create(const BLEND_COMPILE_STATE& state)
564    {
565        std::stringstream fnName("BLND_",
566                                 std::ios_base::in | std::ios_base::out | std::ios_base::ate);
567        fnName << ComputeCRC(0, &state, sizeof(state));
568
569        // blend function signature
570        // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);
571
572        std::vector<Type*> args{
573            PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT*
574        };
575
576        // std::vector<Type*> args{
577        //    PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT*
578        //};
579
580        FunctionType* fTy       = FunctionType::get(IRB()->getVoidTy(), args, false);
581        Function*     blendFunc = Function::Create(
582            fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
583        blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
584
585        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
586
587        IRB()->SetInsertPoint(entry);
588
589        // arguments
590        auto   argitr        = blendFunc->arg_begin();
591        Value* pBlendContext = &*argitr++;
592        pBlendContext->setName("pBlendContext");
593        Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState});
594        pBlendState->setName("pBlendState");
595        Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src});
596        pSrc->setName("src");
597        Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1});
598        pSrc1->setName("src1");
599        Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha});
600        pSrc0Alpha->setName("src0alpha");
601        Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum});
602        sampleNum->setName("sampleNum");
603        Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst});
604        pDst->setName("pDst");
605        Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result});
606        pResult->setName("result");
607        Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask});
608        ppoMask->setName("ppoMask");
609        Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask});
610        ppMask->setName("pMask");
611
612        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
613                      "Unsupported hot tile format");
614        Value* dst[4];
615        Value* constantColor[4];
616        Value* src[4];
617        Value* src1[4];
618        Value* result[4];
619        for (uint32_t i = 0; i < 4; ++i)
620        {
621            // load hot tile
622            dst[i] = LOAD(pDst, {0, i});
623
624            // load constant color
625            constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i}));
626
627            // load src
628            src[i] = LOAD(pSrc, {0, i});
629
630            // load src1
631            src1[i] = LOAD(pSrc1, {0, i});
632        }
633        Value* currentSampleMask = VIMMED1(-1);
634        if (state.desc.alphaToCoverageEnable)
635        {
636            Value*   pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
637            uint32_t bits        = (1 << state.desc.numSamples) - 1;
638            currentSampleMask    = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
639            currentSampleMask    = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
640        }
641
642        // alpha test
643        if (state.desc.alphaTestEnable)
644        {
645            // Gather for archrast stats
646            STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
647            AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
648        }
649        else
650        {
651            // Gather for archrast stats
652            STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
653        }
654
655        // color blend
656        if (state.blendState.blendEnable)
657        {
658            // Gather for archrast stats
659            STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
660
661            // clamp sources
662            Clamp(state.format, src);
663            Clamp(state.format, src1);
664            Clamp(state.format, dst);
665            Clamp(state.format, constantColor);
666
667            // apply defaults to hottile contents to take into account missing components
668            ApplyDefaults(state.format, dst);
669
670            // Force defaults for unused 'X' components
671            ApplyUnusedDefaults(state.format, dst);
672
673            // Quantize low precision components
674            Quantize(state.format, dst);
675
676            // special case clamping for R11G11B10_float which has no sign bit
677            if (state.format == R11G11B10_FLOAT)
678            {
679                dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
680                dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
681                dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
682                dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
683            }
684
685            Value* srcFactor[4];
686            Value* dstFactor[4];
687            if (state.desc.independentAlphaBlendEnable)
688            {
689                GenerateBlendFactor<true, false>(
690                    state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
691                GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor,
692                                                 constantColor,
693                                                 src,
694                                                 src1,
695                                                 dst,
696                                                 srcFactor);
697
698                GenerateBlendFactor<true, false>(
699                    state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
700                GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor,
701                                                 constantColor,
702                                                 src,
703                                                 src1,
704                                                 dst,
705                                                 dstFactor);
706
707                BlendFunc<true, false>(
708                    state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
709                BlendFunc<false, true>(
710                    state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
711            }
712            else
713            {
714                GenerateBlendFactor<true, true>(
715                    state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
716                GenerateBlendFactor<true, true>(
717                    state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
718
719                BlendFunc<true, true>(
720                    state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
721            }
722
723            // store results out
724            for (uint32_t i = 0; i < 4; ++i)
725            {
726                STORE(result[i], pResult, {0, i});
727            }
728        }
729        else
730        {
731            // Gather for archrast stats
732            STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
733        }
734
735        if (state.blendState.logicOpEnable)
736        {
737            const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
738            Value*                 vMask[4];
739            float                  scale[4];
740
741            if (!state.blendState.blendEnable)
742            {
743                Clamp(state.format, src);
744                Clamp(state.format, dst);
745            }
746
747            for (uint32_t i = 0; i < 4; i++)
748            {
749                if (info.type[i] == SWR_TYPE_UNUSED)
750                {
751                    continue;
752                }
753
754                if (info.bpc[i] >= 32)
755                {
756                    vMask[i] = VIMMED1(0xFFFFFFFF);
757                    scale[i] = 0xFFFFFFFF;
758                }
759                else
760                {
761                    vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
762                    if (info.type[i] == SWR_TYPE_SNORM)
763                        scale[i] = (1 << (info.bpc[i] - 1)) - 1;
764                    else
765                        scale[i] = (1 << info.bpc[i]) - 1;
766                }
767
768                switch (info.type[i])
769                {
770                default:
771                    SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
772                    break;
773
774                case SWR_TYPE_UNKNOWN:
775                case SWR_TYPE_UNUSED:
776                    FALLTHROUGH;
777
778                case SWR_TYPE_UINT:
779                case SWR_TYPE_SINT:
780                    src[i] = BITCAST(src[i], mSimdInt32Ty);
781                    dst[i] = BITCAST(dst[i], mSimdInt32Ty);
782                    break;
783                case SWR_TYPE_SNORM:
784                    src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
785                    dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
786                    break;
787                case SWR_TYPE_UNORM:
788                    src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
789                    dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
790                    break;
791                }
792            }
793
794            LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
795
796            // store results out
797            for (uint32_t i = 0; i < 4; ++i)
798            {
799                if (info.type[i] == SWR_TYPE_UNUSED)
800                {
801                    continue;
802                }
803
804                // clear upper bits from PS output not in RT format after doing logic op
805                result[i] = AND(result[i], vMask[i]);
806
807                switch (info.type[i])
808                {
809                default:
810                    SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
811                    break;
812
813                case SWR_TYPE_UNKNOWN:
814                case SWR_TYPE_UNUSED:
815                    FALLTHROUGH;
816
817                case SWR_TYPE_UINT:
818                case SWR_TYPE_SINT:
819                    result[i] = BITCAST(result[i], mSimdFP32Ty);
820                    break;
821                case SWR_TYPE_SNORM:
822                    result[i] = SHL(result[i], C(32 - info.bpc[i]));
823                    result[i] = ASHR(result[i], C(32 - info.bpc[i]));
824                    result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
825                    break;
826                case SWR_TYPE_UNORM:
827                    result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
828                    break;
829                }
830
831                STORE(result[i], pResult, {0, i});
832            }
833        }
834
835        if (state.desc.oMaskEnable)
836        {
837            assert(!(state.desc.alphaToCoverageEnable));
838            // load current mask
839            Value* oMask      = LOAD(ppoMask);
840            currentSampleMask = AND(oMask, currentSampleMask);
841        }
842
843        if (state.desc.sampleMaskEnable)
844        {
845            Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask});
846            currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
847        }
848
849        if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
850            state.desc.oMaskEnable)
851        {
852            // load coverage mask and mask off any lanes with no samples
853            Value* pMask        = LOAD(ppMask);
854            Value* sampleMasked = SHL(C(1), sampleNum);
855            currentSampleMask   = AND(currentSampleMask, VBROADCAST(sampleMasked));
856            currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
857            Value* outputMask = AND(pMask, currentSampleMask);
858            // store new mask
859            STORE(outputMask, GEP(ppMask, C(0)));
860        }
861
862        RET_VOID();
863
864        JitManager::DumpToFile(blendFunc, "");
865
866        ::FunctionPassManager passes(JM()->mpCurrentModule);
867
868        passes.add(createBreakCriticalEdgesPass());
869        passes.add(createCFGSimplificationPass());
870        passes.add(createEarlyCSEPass());
871        passes.add(createPromoteMemoryToRegisterPass());
872        passes.add(createCFGSimplificationPass());
873        passes.add(createEarlyCSEPass());
874        passes.add(createInstructionCombiningPass());
875#if LLVM_VERSION_MAJOR <= 11
876        passes.add(createConstantPropagationPass());
877#endif
878        passes.add(createSCCPPass());
879        passes.add(createAggressiveDCEPass());
880
881        passes.add(createLowerX86Pass(this));
882
883        passes.run(*blendFunc);
884
885        JitManager::DumpToFile(blendFunc, "optimized");
886
887        return blendFunc;
888    }
889};
890
891//////////////////////////////////////////////////////////////////////////
892/// @brief JITs from fetch shader IR
893/// @param hJitMgr - JitManager handle
894/// @param func   - LLVM function IR
895/// @return PFN_FETCH_FUNC - pointer to fetch code
896PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
897{
898    const llvm::Function* func    = (const llvm::Function*)hFunc;
899    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
900    PFN_BLEND_JIT_FUNC    pfnBlend;
901    pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
902    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
903    // add new IR to the module
904    pJitMgr->mIsModuleFinalized = true;
905
906    return pfnBlend;
907}
908
909//////////////////////////////////////////////////////////////////////////
910/// @brief JIT compiles blend shader
911/// @param hJitMgr - JitManager handle
912/// @param state   - blend state to build function from
913extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE                     hJitMgr,
914                                                      const BLEND_COMPILE_STATE& state)
915{
916    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
917
918    pJitMgr->SetupNewModule();
919
920    BlendJit theJit(pJitMgr);
921    HANDLE   hFunc = theJit.Create(state);
922
923    return JitBlendFunc(hJitMgr, hFunc);
924}
925