Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file
     10 /// This is the parent TargetLowering class for hardware code gen
     11 /// targets.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "AMDGPUISelLowering.h"
     16 #include "AMDGPU.h"
     17 #include "AMDGPUInstrInfo.h"
     18 #include "AMDGPUMachineFunction.h"
     19 #include "GCNSubtarget.h"
     20 #include "SIMachineFunctionInfo.h"
     21 #include "llvm/CodeGen/Analysis.h"
     22 #include "llvm/IR/DiagnosticInfo.h"
     23 #include "llvm/IR/IntrinsicsAMDGPU.h"
     24 #include "llvm/Support/CommandLine.h"
     25 #include "llvm/Support/KnownBits.h"
     26 #include "llvm/Target/TargetMachine.h"
     27 
     28 using namespace llvm;
     29 
     30 #include "AMDGPUGenCallingConv.inc"
     31 
     32 static cl::opt<bool> AMDGPUBypassSlowDiv(
     33   "amdgpu-bypass-slow-div",
     34   cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
     35   cl::init(true));
     36 
     37 // Find a larger type to do a load / store of a vector with.
     38 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
     39   unsigned StoreSize = VT.getStoreSizeInBits();
     40   if (StoreSize <= 32)
     41     return EVT::getIntegerVT(Ctx, StoreSize);
     42 
     43   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
     44   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
     45 }
     46 
     47 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
     48   EVT VT = Op.getValueType();
     49   KnownBits Known = DAG.computeKnownBits(Op);
     50   return VT.getSizeInBits() - Known.countMinLeadingZeros();
     51 }
     52 
     53 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
     54   EVT VT = Op.getValueType();
     55 
     56   // In order for this to be a signed 24-bit value, bit 23, must
     57   // be a sign bit.
     58   return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
     59 }
     60 
     61 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     62                                            const AMDGPUSubtarget &STI)
     63     : TargetLowering(TM), Subtarget(&STI) {
     64   // Lower floating point store/load to integer store/load to reduce the number
     65   // of patterns in tablegen.
     66   setOperationAction(ISD::LOAD, MVT::f32, Promote);
     67   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
     68 
     69   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
     70   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
     71 
     72   setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
     73   AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
     74 
     75   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
     76   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
     77 
     78   setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
     79   AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
     80 
     81   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
     82   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
     83 
     84   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
     85   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
     86 
     87   setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
     88   AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
     89 
     90   setOperationAction(ISD::LOAD, MVT::i64, Promote);
     91   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
     92 
     93   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
     94   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
     95 
     96   setOperationAction(ISD::LOAD, MVT::f64, Promote);
     97   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
     98 
     99   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
    100   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
    101 
    102   setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
    103   AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
    104 
    105   setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
    106   AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
    107 
    108   setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
    109   AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
    110 
    111   setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
    112   AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
    113 
    114   setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
    115   AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
    116 
    117   setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
    118   AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
    119 
    120   // There are no 64-bit extloads. These should be done as a 32-bit extload and
    121   // an extension to 64-bit.
    122   for (MVT VT : MVT::integer_valuetypes()) {
    123     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
    124     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
    125     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
    126   }
    127 
    128   for (MVT VT : MVT::integer_valuetypes()) {
    129     if (VT == MVT::i64)
    130       continue;
    131 
    132     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    133     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
    134     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
    135     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
    136 
    137     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    138     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
    139     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
    140     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
    141 
    142     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    143     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
    144     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
    145     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
    146   }
    147 
    148   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
    149     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
    150     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
    151     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
    152     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
    153     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
    154     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
    155     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
    156     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
    157     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
    158     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
    159     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
    160     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
    161     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
    162     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
    163     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
    164   }
    165 
    166   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
    167   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
    168   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
    169   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
    170   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
    171   setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
    172   setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
    173 
    174   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
    175   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
    176   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
    177   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
    178   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
    179 
    180   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
    181   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
    182   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
    183   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
    184   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
    185 
    186   setOperationAction(ISD::STORE, MVT::f32, Promote);
    187   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
    188 
    189   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
    190   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
    191 
    192   setOperationAction(ISD::STORE, MVT::v3f32, Promote);
    193   AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
    194 
    195   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
    196   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
    197 
    198   setOperationAction(ISD::STORE, MVT::v5f32, Promote);
    199   AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
    200 
    201   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
    202   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
    203 
    204   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
    205   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
    206 
    207   setOperationAction(ISD::STORE, MVT::v32f32, Promote);
    208   AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
    209 
    210   setOperationAction(ISD::STORE, MVT::i64, Promote);
    211   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
    212 
    213   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
    214   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
    215 
    216   setOperationAction(ISD::STORE, MVT::f64, Promote);
    217   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
    218 
    219   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
    220   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
    221 
    222   setOperationAction(ISD::STORE, MVT::v4i64, Promote);
    223   AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
    224 
    225   setOperationAction(ISD::STORE, MVT::v4f64, Promote);
    226   AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
    227 
    228   setOperationAction(ISD::STORE, MVT::v8i64, Promote);
    229   AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
    230 
    231   setOperationAction(ISD::STORE, MVT::v8f64, Promote);
    232   AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
    233 
    234   setOperationAction(ISD::STORE, MVT::v16i64, Promote);
    235   AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
    236 
    237   setOperationAction(ISD::STORE, MVT::v16f64, Promote);
    238   AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
    239 
    240   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
    241   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
    242   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
    243   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
    244 
    245   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
    246   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
    247   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
    248   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
    249 
    250   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
    251   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
    252   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
    253   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
    254   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
    255   setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
    256   setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
    257 
    258   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
    259   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
    260 
    261   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
    262   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
    263 
    264   setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
    265   setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
    266   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
    267   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
    268 
    269   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
    270   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
    271 
    272   setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
    273   setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
    274   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
    275   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
    276   setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
    277   setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
    278   setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
    279 
    280   setOperationAction(ISD::Constant, MVT::i32, Legal);
    281   setOperationAction(ISD::Constant, MVT::i64, Legal);
    282   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    283   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
    284 
    285   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
    286   setOperationAction(ISD::BRIND, MVT::Other, Expand);
    287 
    288   // This is totally unsupported, just custom lower to produce an error.
    289   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
    290 
    291   // Library functions.  These default to Expand, but we have instructions
    292   // for them.
    293   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
    294   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
    295   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
    296   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
    297   setOperationAction(ISD::FABS,   MVT::f32, Legal);
    298   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    299   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
    300   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    301   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
    302   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
    303 
    304   setOperationAction(ISD::FROUND, MVT::f32, Custom);
    305   setOperationAction(ISD::FROUND, MVT::f64, Custom);
    306 
    307   setOperationAction(ISD::FLOG, MVT::f32, Custom);
    308   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
    309   setOperationAction(ISD::FEXP, MVT::f32, Custom);
    310 
    311 
    312   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
    313   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
    314 
    315   setOperationAction(ISD::FREM, MVT::f16, Custom);
    316   setOperationAction(ISD::FREM, MVT::f32, Custom);
    317   setOperationAction(ISD::FREM, MVT::f64, Custom);
    318 
    319   // Expand to fneg + fadd.
    320   setOperationAction(ISD::FSUB, MVT::f64, Expand);
    321 
    322   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
    323   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
    324   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
    325   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
    326   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
    327   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
    328   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
    329   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
    330   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
    331   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
    332   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
    333   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
    334   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
    335   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
    336   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
    337   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
    338   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
    339   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
    340   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
    341   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
    342   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
    343   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
    344   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
    345   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
    346   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
    347   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
    348   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
    349   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
    350   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
    351   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
    352 
    353   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
    354   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
    355   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
    356 
    357   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
    358   for (MVT VT : ScalarIntVTs) {
    359     // These should use [SU]DIVREM, so set them to expand
    360     setOperationAction(ISD::SDIV, VT, Expand);
    361     setOperationAction(ISD::UDIV, VT, Expand);
    362     setOperationAction(ISD::SREM, VT, Expand);
    363     setOperationAction(ISD::UREM, VT, Expand);
    364 
    365     // GPU does not have divrem function for signed or unsigned.
    366     setOperationAction(ISD::SDIVREM, VT, Custom);
    367     setOperationAction(ISD::UDIVREM, VT, Custom);
    368 
    369     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
    370     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    371     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    372 
    373     setOperationAction(ISD::BSWAP, VT, Expand);
    374     setOperationAction(ISD::CTTZ, VT, Expand);
    375     setOperationAction(ISD::CTLZ, VT, Expand);
    376 
    377     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
    378     setOperationAction(ISD::ADDC, VT, Legal);
    379     setOperationAction(ISD::SUBC, VT, Legal);
    380     setOperationAction(ISD::ADDE, VT, Legal);
    381     setOperationAction(ISD::SUBE, VT, Legal);
    382   }
    383 
    384   // The hardware supports 32-bit FSHR, but not FSHL.
    385   setOperationAction(ISD::FSHR, MVT::i32, Legal);
    386 
    387   // The hardware supports 32-bit ROTR, but not ROTL.
    388   setOperationAction(ISD::ROTL, MVT::i32, Expand);
    389   setOperationAction(ISD::ROTL, MVT::i64, Expand);
    390   setOperationAction(ISD::ROTR, MVT::i64, Expand);
    391 
    392   setOperationAction(ISD::MULHU, MVT::i16, Expand);
    393   setOperationAction(ISD::MULHS, MVT::i16, Expand);
    394 
    395   setOperationAction(ISD::MUL, MVT::i64, Expand);
    396   setOperationAction(ISD::MULHU, MVT::i64, Expand);
    397   setOperationAction(ISD::MULHS, MVT::i64, Expand);
    398   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    399   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    400   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    401   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    402   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
    403 
    404   setOperationAction(ISD::SMIN, MVT::i32, Legal);
    405   setOperationAction(ISD::UMIN, MVT::i32, Legal);
    406   setOperationAction(ISD::SMAX, MVT::i32, Legal);
    407   setOperationAction(ISD::UMAX, MVT::i32, Legal);
    408 
    409   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    410   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
    411   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
    412   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    413 
    414   static const MVT::SimpleValueType VectorIntTypes[] = {
    415     MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
    416   };
    417 
    418   for (MVT VT : VectorIntTypes) {
    419     // Expand the following operations for the current type by default.
    420     setOperationAction(ISD::ADD,  VT, Expand);
    421     setOperationAction(ISD::AND,  VT, Expand);
    422     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    423     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    424     setOperationAction(ISD::MUL,  VT, Expand);
    425     setOperationAction(ISD::MULHU, VT, Expand);
    426     setOperationAction(ISD::MULHS, VT, Expand);
    427     setOperationAction(ISD::OR,   VT, Expand);
    428     setOperationAction(ISD::SHL,  VT, Expand);
    429     setOperationAction(ISD::SRA,  VT, Expand);
    430     setOperationAction(ISD::SRL,  VT, Expand);
    431     setOperationAction(ISD::ROTL, VT, Expand);
    432     setOperationAction(ISD::ROTR, VT, Expand);
    433     setOperationAction(ISD::SUB,  VT, Expand);
    434     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    435     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    436     setOperationAction(ISD::SDIV, VT, Expand);
    437     setOperationAction(ISD::UDIV, VT, Expand);
    438     setOperationAction(ISD::SREM, VT, Expand);
    439     setOperationAction(ISD::UREM, VT, Expand);
    440     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    441     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    442     setOperationAction(ISD::SDIVREM, VT, Expand);
    443     setOperationAction(ISD::UDIVREM, VT, Expand);
    444     setOperationAction(ISD::SELECT, VT, Expand);
    445     setOperationAction(ISD::VSELECT, VT, Expand);
    446     setOperationAction(ISD::SELECT_CC, VT, Expand);
    447     setOperationAction(ISD::XOR,  VT, Expand);
    448     setOperationAction(ISD::BSWAP, VT, Expand);
    449     setOperationAction(ISD::CTPOP, VT, Expand);
    450     setOperationAction(ISD::CTTZ, VT, Expand);
    451     setOperationAction(ISD::CTLZ, VT, Expand);
    452     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    453     setOperationAction(ISD::SETCC, VT, Expand);
    454   }
    455 
    456   static const MVT::SimpleValueType FloatVectorTypes[] = {
    457      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
    458   };
    459 
    460   for (MVT VT : FloatVectorTypes) {
    461     setOperationAction(ISD::FABS, VT, Expand);
    462     setOperationAction(ISD::FMINNUM, VT, Expand);
    463     setOperationAction(ISD::FMAXNUM, VT, Expand);
    464     setOperationAction(ISD::FADD, VT, Expand);
    465     setOperationAction(ISD::FCEIL, VT, Expand);
    466     setOperationAction(ISD::FCOS, VT, Expand);
    467     setOperationAction(ISD::FDIV, VT, Expand);
    468     setOperationAction(ISD::FEXP2, VT, Expand);
    469     setOperationAction(ISD::FEXP, VT, Expand);
    470     setOperationAction(ISD::FLOG2, VT, Expand);
    471     setOperationAction(ISD::FREM, VT, Expand);
    472     setOperationAction(ISD::FLOG, VT, Expand);
    473     setOperationAction(ISD::FLOG10, VT, Expand);
    474     setOperationAction(ISD::FPOW, VT, Expand);
    475     setOperationAction(ISD::FFLOOR, VT, Expand);
    476     setOperationAction(ISD::FTRUNC, VT, Expand);
    477     setOperationAction(ISD::FMUL, VT, Expand);
    478     setOperationAction(ISD::FMA, VT, Expand);
    479     setOperationAction(ISD::FRINT, VT, Expand);
    480     setOperationAction(ISD::FNEARBYINT, VT, Expand);
    481     setOperationAction(ISD::FSQRT, VT, Expand);
    482     setOperationAction(ISD::FSIN, VT, Expand);
    483     setOperationAction(ISD::FSUB, VT, Expand);
    484     setOperationAction(ISD::FNEG, VT, Expand);
    485     setOperationAction(ISD::VSELECT, VT, Expand);
    486     setOperationAction(ISD::SELECT_CC, VT, Expand);
    487     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    488     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
    489     setOperationAction(ISD::SETCC, VT, Expand);
    490     setOperationAction(ISD::FCANONICALIZE, VT, Expand);
    491   }
    492 
    493   // This causes using an unrolled select operation rather than expansion with
    494   // bit operations. This is in general better, but the alternative using BFI
    495   // instructions may be better if the select sources are SGPRs.
    496   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
    497   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
    498 
    499   setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
    500   AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
    501 
    502   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
    503   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
    504 
    505   setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
    506   AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
    507 
    508   // There are no libcalls of any kind.
    509   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
    510     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
    511 
    512   setSchedulingPreference(Sched::RegPressure);
    513   setJumpIsExpensive(true);
    514 
    515   // FIXME: This is only partially true. If we have to do vector compares, any
    516   // SGPR pair can be a condition register. If we have a uniform condition, we
    517   // are better off doing SALU operations, where there is only one SCC. For now,
    518   // we don't have a way of knowing during instruction selection if a condition
    519   // will be uniform and we always use vector compares. Assume we are using
    520   // vector compares until that is fixed.
    521   setHasMultipleConditionRegisters(true);
    522 
    523   setMinCmpXchgSizeInBits(32);
    524   setSupportsUnalignedAtomics(false);
    525 
    526   PredictableSelectIsExpensive = false;
    527 
    528   // We want to find all load dependencies for long chains of stores to enable
    529   // merging into very wide vectors. The problem is with vectors with > 4
    530   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
    531   // vectors are a legal type, even though we have to split the loads
    532   // usually. When we can more precisely specify load legality per address
    533   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
    534   // smarter so that they can figure out what to do in 2 iterations without all
    535   // N > 4 stores on the same chain.
    536   GatherAllAliasesMaxDepth = 16;
    537 
    538   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
    539   // about these during lowering.
    540   MaxStoresPerMemcpy  = 0xffffffff;
    541   MaxStoresPerMemmove = 0xffffffff;
    542   MaxStoresPerMemset  = 0xffffffff;
    543 
    544   // The expansion for 64-bit division is enormous.
    545   if (AMDGPUBypassSlowDiv)
    546     addBypassSlowDiv(64, 32);
    547 
    548   setTargetDAGCombine(ISD::BITCAST);
    549   setTargetDAGCombine(ISD::SHL);
    550   setTargetDAGCombine(ISD::SRA);
    551   setTargetDAGCombine(ISD::SRL);
    552   setTargetDAGCombine(ISD::TRUNCATE);
    553   setTargetDAGCombine(ISD::MUL);
    554   setTargetDAGCombine(ISD::MULHU);
    555   setTargetDAGCombine(ISD::MULHS);
    556   setTargetDAGCombine(ISD::SELECT);
    557   setTargetDAGCombine(ISD::SELECT_CC);
    558   setTargetDAGCombine(ISD::STORE);
    559   setTargetDAGCombine(ISD::FADD);
    560   setTargetDAGCombine(ISD::FSUB);
    561   setTargetDAGCombine(ISD::FNEG);
    562   setTargetDAGCombine(ISD::FABS);
    563   setTargetDAGCombine(ISD::AssertZext);
    564   setTargetDAGCombine(ISD::AssertSext);
    565   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
    566 }
    567 
    568 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
    569   if (getTargetMachine().Options.NoSignedZerosFPMath)
    570     return true;
    571 
    572   const auto Flags = Op.getNode()->getFlags();
    573   if (Flags.hasNoSignedZeros())
    574     return true;
    575 
    576   return false;
    577 }
    578 
    579 //===----------------------------------------------------------------------===//
    580 // Target Information
    581 //===----------------------------------------------------------------------===//
    582 
    583 LLVM_READNONE
    584 static bool fnegFoldsIntoOp(unsigned Opc) {
    585   switch (Opc) {
    586   case ISD::FADD:
    587   case ISD::FSUB:
    588   case ISD::FMUL:
    589   case ISD::FMA:
    590   case ISD::FMAD:
    591   case ISD::FMINNUM:
    592   case ISD::FMAXNUM:
    593   case ISD::FMINNUM_IEEE:
    594   case ISD::FMAXNUM_IEEE:
    595   case ISD::FSIN:
    596   case ISD::FTRUNC:
    597   case ISD::FRINT:
    598   case ISD::FNEARBYINT:
    599   case ISD::FCANONICALIZE:
    600   case AMDGPUISD::RCP:
    601   case AMDGPUISD::RCP_LEGACY:
    602   case AMDGPUISD::RCP_IFLAG:
    603   case AMDGPUISD::SIN_HW:
    604   case AMDGPUISD::FMUL_LEGACY:
    605   case AMDGPUISD::FMIN_LEGACY:
    606   case AMDGPUISD::FMAX_LEGACY:
    607   case AMDGPUISD::FMED3:
    608     // TODO: handle llvm.amdgcn.fma.legacy
    609     return true;
    610   default:
    611     return false;
    612   }
    613 }
    614 
    615 /// \p returns true if the operation will definitely need to use a 64-bit
    616 /// encoding, and thus will use a VOP3 encoding regardless of the source
    617 /// modifiers.
    618 LLVM_READONLY
    619 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
    620   return N->getNumOperands() > 2 || VT == MVT::f64;
    621 }
    622 
    623 // Most FP instructions support source modifiers, but this could be refined
    624 // slightly.
    625 LLVM_READONLY
    626 static bool hasSourceMods(const SDNode *N) {
    627   if (isa<MemSDNode>(N))
    628     return false;
    629 
    630   switch (N->getOpcode()) {
    631   case ISD::CopyToReg:
    632   case ISD::SELECT:
    633   case ISD::FDIV:
    634   case ISD::FREM:
    635   case ISD::INLINEASM:
    636   case ISD::INLINEASM_BR:
    637   case AMDGPUISD::DIV_SCALE:
    638   case ISD::INTRINSIC_W_CHAIN:
    639 
    640   // TODO: Should really be looking at the users of the bitcast. These are
    641   // problematic because bitcasts are used to legalize all stores to integer
    642   // types.
    643   case ISD::BITCAST:
    644     return false;
    645   case ISD::INTRINSIC_WO_CHAIN: {
    646     switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
    647     case Intrinsic::amdgcn_interp_p1:
    648     case Intrinsic::amdgcn_interp_p2:
    649     case Intrinsic::amdgcn_interp_mov:
    650     case Intrinsic::amdgcn_interp_p1_f16:
    651     case Intrinsic::amdgcn_interp_p2_f16:
    652       return false;
    653     default:
    654       return true;
    655     }
    656   }
    657   default:
    658     return true;
    659   }
    660 }
    661 
    662 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
    663                                                  unsigned CostThreshold) {
    664   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
    665   // it is truly free to use a source modifier in all cases. If there are
    666   // multiple users but for each one will necessitate using VOP3, there will be
    667   // a code size increase. Try to avoid increasing code size unless we know it
    668   // will save on the instruction count.
    669   unsigned NumMayIncreaseSize = 0;
    670   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
    671 
    672   // XXX - Should this limit number of uses to check?
    673   for (const SDNode *U : N->uses()) {
    674     if (!hasSourceMods(U))
    675       return false;
    676 
    677     if (!opMustUseVOP3Encoding(U, VT)) {
    678       if (++NumMayIncreaseSize > CostThreshold)
    679         return false;
    680     }
    681   }
    682 
    683   return true;
    684 }
    685 
    686 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
    687                                               ISD::NodeType ExtendKind) const {
    688   assert(!VT.isVector() && "only scalar expected");
    689 
    690   // Round to the next multiple of 32-bits.
    691   unsigned Size = VT.getSizeInBits();
    692   if (Size <= 32)
    693     return MVT::i32;
    694   return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
    695 }
    696 
    697 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
    698   return MVT::i32;
    699 }
    700 
    701 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
    702   return true;
    703 }
    704 
    705 // The backend supports 32 and 64 bit floating point immediates.
    706 // FIXME: Why are we reporting vectors of FP immediates as legal?
    707 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
    708                                         bool ForCodeSize) const {
    709   EVT ScalarVT = VT.getScalarType();
    710   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
    711          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
    712 }
    713 
    714 // We don't want to shrink f64 / f32 constants.
    715 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
    716   EVT ScalarVT = VT.getScalarType();
    717   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
    718 }
    719 
    720 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
    721                                                  ISD::LoadExtType ExtTy,
    722                                                  EVT NewVT) const {
    723   // TODO: This may be worth removing. Check regression tests for diffs.
    724   if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
    725     return false;
    726 
    727   unsigned NewSize = NewVT.getStoreSizeInBits();
    728 
    729   // If we are reducing to a 32-bit load or a smaller multi-dword load,
    730   // this is always better.
    731   if (NewSize >= 32)
    732     return true;
    733 
    734   EVT OldVT = N->getValueType(0);
    735   unsigned OldSize = OldVT.getStoreSizeInBits();
    736 
    737   MemSDNode *MN = cast<MemSDNode>(N);
    738   unsigned AS = MN->getAddressSpace();
    739   // Do not shrink an aligned scalar load to sub-dword.
    740   // Scalar engine cannot do sub-dword loads.
    741   if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
    742       (AS == AMDGPUAS::CONSTANT_ADDRESS ||
    743        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
    744        (isa<LoadSDNode>(N) &&
    745         AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
    746       AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
    747     return false;
    748 
    749   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
    750   // extloads, so doing one requires using a buffer_load. In cases where we
    751   // still couldn't use a scalar load, using the wider load shouldn't really
    752   // hurt anything.
    753 
    754   // If the old size already had to be an extload, there's no harm in continuing
    755   // to reduce the width.
    756   return (OldSize < 32);
    757 }
    758 
    759 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
    760                                                    const SelectionDAG &DAG,
    761                                                    const MachineMemOperand &MMO) const {
    762 
    763   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
    764 
    765   if (LoadTy.getScalarType() == MVT::i32)
    766     return false;
    767 
    768   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
    769   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
    770 
    771   if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
    772     return false;
    773 
    774   bool Fast = false;
    775   return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
    776                                         CastTy, MMO, &Fast) &&
    777          Fast;
    778 }
    779 
    780 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
    781 // profitable with the expansion for 64-bit since it's generally good to
    782 // speculate things.
    783 // FIXME: These should really have the size as a parameter.
    784 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
    785   return true;
    786 }
    787 
    788 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
    789   return true;
    790 }
    791 
    792 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
    793   switch (N->getOpcode()) {
    794   case ISD::EntryToken:
    795   case ISD::TokenFactor:
    796     return true;
    797   case ISD::INTRINSIC_WO_CHAIN: {
    798     unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    799     switch (IntrID) {
    800     case Intrinsic::amdgcn_readfirstlane:
    801     case Intrinsic::amdgcn_readlane:
    802       return true;
    803     }
    804     return false;
    805   }
    806   case ISD::LOAD:
    807     if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
    808         AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    809       return true;
    810     return false;
    811   }
    812   return false;
    813 }
    814 
    815 SDValue AMDGPUTargetLowering::getNegatedExpression(
    816     SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
    817     NegatibleCost &Cost, unsigned Depth) const {
    818 
    819   switch (Op.getOpcode()) {
    820   case ISD::FMA:
    821   case ISD::FMAD: {
    822     // Negating a fma is not free if it has users without source mods.
    823     if (!allUsesHaveSourceMods(Op.getNode()))
    824       return SDValue();
    825     break;
    826   }
    827   default:
    828     break;
    829   }
    830 
    831   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
    832                                               ForCodeSize, Cost, Depth);
    833 }
    834 
    835 //===---------------------------------------------------------------------===//
    836 // Target Properties
    837 //===---------------------------------------------------------------------===//
    838 
    839 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
    840   assert(VT.isFloatingPoint());
    841 
    842   // Packed operations do not have a fabs modifier.
    843   return VT == MVT::f32 || VT == MVT::f64 ||
    844          (Subtarget->has16BitInsts() && VT == MVT::f16);
    845 }
    846 
    847 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
    848   assert(VT.isFloatingPoint());
    849   return VT == MVT::f32 || VT == MVT::f64 ||
    850          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
    851          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
    852 }
    853 
    854 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
    855                                                          unsigned NumElem,
    856                                                          unsigned AS) const {
    857   return true;
    858 }
    859 
    860 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
    861   // There are few operations which truly have vector input operands. Any vector
    862   // operation is going to involve operations on each component, and a
    863   // build_vector will be a copy per element, so it always makes sense to use a
    864   // build_vector input in place of the extracted element to avoid a copy into a
    865   // super register.
    866   //
    867   // We should probably only do this if all users are extracts only, but this
    868   // should be the common case.
    869   return true;
    870 }
    871 
    872 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
    873   // Truncate is just accessing a subregister.
    874 
    875   unsigned SrcSize = Source.getSizeInBits();
    876   unsigned DestSize = Dest.getSizeInBits();
    877 
    878   return DestSize < SrcSize && DestSize % 32 == 0 ;
    879 }
    880 
    881 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
    882   // Truncate is just accessing a subregister.
    883 
    884   unsigned SrcSize = Source->getScalarSizeInBits();
    885   unsigned DestSize = Dest->getScalarSizeInBits();
    886 
    887   if (DestSize== 16 && Subtarget->has16BitInsts())
    888     return SrcSize >= 32;
    889 
    890   return DestSize < SrcSize && DestSize % 32 == 0;
    891 }
    892 
    893 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
    894   unsigned SrcSize = Src->getScalarSizeInBits();
    895   unsigned DestSize = Dest->getScalarSizeInBits();
    896 
    897   if (SrcSize == 16 && Subtarget->has16BitInsts())
    898     return DestSize >= 32;
    899 
    900   return SrcSize == 32 && DestSize == 64;
    901 }
    902 
    903 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
    904   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
    905   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
    906   // this will enable reducing 64-bit operations the 32-bit, which is always
    907   // good.
    908 
    909   if (Src == MVT::i16)
    910     return Dest == MVT::i32 ||Dest == MVT::i64 ;
    911 
    912   return Src == MVT::i32 && Dest == MVT::i64;
    913 }
    914 
    915 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
    916   return isZExtFree(Val.getValueType(), VT2);
    917 }
    918 
    919 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
    920   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
    921   // limited number of native 64-bit operations. Shrinking an operation to fit
    922   // in a single 32-bit register should always be helpful. As currently used,
    923   // this is much less general than the name suggests, and is only used in
    924   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
    925   // not profitable, and may actually be harmful.
    926   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
    927 }
    928 
    929 //===---------------------------------------------------------------------===//
    930 // TargetLowering Callbacks
    931 //===---------------------------------------------------------------------===//
    932 
    933 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
    934                                                   bool IsVarArg) {
    935   switch (CC) {
    936   case CallingConv::AMDGPU_VS:
    937   case CallingConv::AMDGPU_GS:
    938   case CallingConv::AMDGPU_PS:
    939   case CallingConv::AMDGPU_CS:
    940   case CallingConv::AMDGPU_HS:
    941   case CallingConv::AMDGPU_ES:
    942   case CallingConv::AMDGPU_LS:
    943     return CC_AMDGPU;
    944   case CallingConv::C:
    945   case CallingConv::Fast:
    946   case CallingConv::Cold:
    947     return CC_AMDGPU_Func;
    948   case CallingConv::AMDGPU_Gfx:
    949     return CC_SI_Gfx;
    950   case CallingConv::AMDGPU_KERNEL:
    951   case CallingConv::SPIR_KERNEL:
    952   default:
    953     report_fatal_error("Unsupported calling convention for call");
    954   }
    955 }
    956 
    957 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
    958                                                     bool IsVarArg) {
    959   switch (CC) {
    960   case CallingConv::AMDGPU_KERNEL:
    961   case CallingConv::SPIR_KERNEL:
    962     llvm_unreachable("kernels should not be handled here");
    963   case CallingConv::AMDGPU_VS:
    964   case CallingConv::AMDGPU_GS:
    965   case CallingConv::AMDGPU_PS:
    966   case CallingConv::AMDGPU_CS:
    967   case CallingConv::AMDGPU_HS:
    968   case CallingConv::AMDGPU_ES:
    969   case CallingConv::AMDGPU_LS:
    970     return RetCC_SI_Shader;
    971   case CallingConv::AMDGPU_Gfx:
    972     return RetCC_SI_Gfx;
    973   case CallingConv::C:
    974   case CallingConv::Fast:
    975   case CallingConv::Cold:
    976     return RetCC_AMDGPU_Func;
    977   default:
    978     report_fatal_error("Unsupported calling convention.");
    979   }
    980 }
    981 
    982 /// The SelectionDAGBuilder will automatically promote function arguments
    983 /// with illegal types.  However, this does not work for the AMDGPU targets
    984 /// since the function arguments are stored in memory as these illegal types.
    985 /// In order to handle this properly we need to get the original types sizes
    986 /// from the LLVM IR Function and fixup the ISD:InputArg values before
    987 /// passing them to AnalyzeFormalArguments()
    988 
    989 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
    990 /// input values across multiple registers.  Each item in the Ins array
    991 /// represents a single value that will be stored in registers.  Ins[x].VT is
    992 /// the value type of the value that will be stored in the register, so
    993 /// whatever SDNode we lower the argument to needs to be this type.
    994 ///
    995 /// In order to correctly lower the arguments we need to know the size of each
    996 /// argument.  Since Ins[x].VT gives us the size of the register that will
    997 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
    998 /// for the orignal function argument so that we can deduce the correct memory
    999 /// type to use for Ins[x].  In most cases the correct memory type will be
   1000 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
   1001 /// we have a kernel argument of type v8i8, this argument will be split into
   1002 /// 8 parts and each part will be represented by its own item in the Ins array.
   1003 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
   1004 /// the argument before it was split.  From this, we deduce that the memory type
   1005 /// for each individual part is i8.  We pass the memory type as LocVT to the
   1006 /// calling convention analysis function and the register type (Ins[x].VT) as
   1007 /// the ValVT.
   1008 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
   1009   CCState &State,
   1010   const SmallVectorImpl<ISD::InputArg> &Ins) const {
   1011   const MachineFunction &MF = State.getMachineFunction();
   1012   const Function &Fn = MF.getFunction();
   1013   LLVMContext &Ctx = Fn.getParent()->getContext();
   1014   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
   1015   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
   1016   CallingConv::ID CC = Fn.getCallingConv();
   1017 
   1018   Align MaxAlign = Align(1);
   1019   uint64_t ExplicitArgOffset = 0;
   1020   const DataLayout &DL = Fn.getParent()->getDataLayout();
   1021 
   1022   unsigned InIndex = 0;
   1023 
   1024   for (const Argument &Arg : Fn.args()) {
   1025     const bool IsByRef = Arg.hasByRefAttr();
   1026     Type *BaseArgTy = Arg.getType();
   1027     Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
   1028     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
   1029     if (!Alignment)
   1030       Alignment = DL.getABITypeAlign(MemArgTy);
   1031     MaxAlign = max(Alignment, MaxAlign);
   1032     uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
   1033 
   1034     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
   1035     ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
   1036 
   1037     // We're basically throwing away everything passed into us and starting over
   1038     // to get accurate in-memory offsets. The "PartOffset" is completely useless
   1039     // to us as computed in Ins.
   1040     //
   1041     // We also need to figure out what type legalization is trying to do to get
   1042     // the correct memory offsets.
   1043 
   1044     SmallVector<EVT, 16> ValueVTs;
   1045     SmallVector<uint64_t, 16> Offsets;
   1046     ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
   1047 
   1048     for (unsigned Value = 0, NumValues = ValueVTs.size();
   1049          Value != NumValues; ++Value) {
   1050       uint64_t BasePartOffset = Offsets[Value];
   1051 
   1052       EVT ArgVT = ValueVTs[Value];
   1053       EVT MemVT = ArgVT;
   1054       MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
   1055       unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
   1056 
   1057       if (NumRegs == 1) {
   1058         // This argument is not split, so the IR type is the memory type.
   1059         if (ArgVT.isExtended()) {
   1060           // We have an extended type, like i24, so we should just use the
   1061           // register type.
   1062           MemVT = RegisterVT;
   1063         } else {
   1064           MemVT = ArgVT;
   1065         }
   1066       } else if (ArgVT.isVector() && RegisterVT.isVector() &&
   1067                  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
   1068         assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
   1069         // We have a vector value which has been split into a vector with
   1070         // the same scalar type, but fewer elements.  This should handle
   1071         // all the floating-point vector types.
   1072         MemVT = RegisterVT;
   1073       } else if (ArgVT.isVector() &&
   1074                  ArgVT.getVectorNumElements() == NumRegs) {
   1075         // This arg has been split so that each element is stored in a separate
   1076         // register.
   1077         MemVT = ArgVT.getScalarType();
   1078       } else if (ArgVT.isExtended()) {
   1079         // We have an extended type, like i65.
   1080         MemVT = RegisterVT;
   1081       } else {
   1082         unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
   1083         assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
   1084         if (RegisterVT.isInteger()) {
   1085           MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
   1086         } else if (RegisterVT.isVector()) {
   1087           assert(!RegisterVT.getScalarType().isFloatingPoint());
   1088           unsigned NumElements = RegisterVT.getVectorNumElements();
   1089           assert(MemoryBits % NumElements == 0);
   1090           // This vector type has been split into another vector type with
   1091           // a different elements size.
   1092           EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
   1093                                            MemoryBits / NumElements);
   1094           MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
   1095         } else {
   1096           llvm_unreachable("cannot deduce memory type.");
   1097         }
   1098       }
   1099 
   1100       // Convert one element vectors to scalar.
   1101       if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
   1102         MemVT = MemVT.getScalarType();
   1103 
   1104       // Round up vec3/vec5 argument.
   1105       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
   1106         assert(MemVT.getVectorNumElements() == 3 ||
   1107                MemVT.getVectorNumElements() == 5);
   1108         MemVT = MemVT.getPow2VectorType(State.getContext());
   1109       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
   1110         MemVT = MemVT.getRoundIntegerType(State.getContext());
   1111       }
   1112 
   1113       unsigned PartOffset = 0;
   1114       for (unsigned i = 0; i != NumRegs; ++i) {
   1115         State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
   1116                                                BasePartOffset + PartOffset,
   1117                                                MemVT.getSimpleVT(),
   1118                                                CCValAssign::Full));
   1119         PartOffset += MemVT.getStoreSize();
   1120       }
   1121     }
   1122   }
   1123 }
   1124 
   1125 SDValue AMDGPUTargetLowering::LowerReturn(
   1126   SDValue Chain, CallingConv::ID CallConv,
   1127   bool isVarArg,
   1128   const SmallVectorImpl<ISD::OutputArg> &Outs,
   1129   const SmallVectorImpl<SDValue> &OutVals,
   1130   const SDLoc &DL, SelectionDAG &DAG) const {
   1131   // FIXME: Fails for r600 tests
   1132   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
   1133   // "wave terminate should not have return values");
   1134   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
   1135 }
   1136 
   1137 //===---------------------------------------------------------------------===//
   1138 // Target specific lowering
   1139 //===---------------------------------------------------------------------===//
   1140 
   1141 /// Selects the correct CCAssignFn for a given CallingConvention value.
   1142 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   1143                                                     bool IsVarArg) {
   1144   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
   1145 }
   1146 
   1147 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
   1148                                                       bool IsVarArg) {
   1149   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
   1150 }
   1151 
   1152 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
   1153                                                   SelectionDAG &DAG,
   1154                                                   MachineFrameInfo &MFI,
   1155                                                   int ClobberedFI) const {
   1156   SmallVector<SDValue, 8> ArgChains;
   1157   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
   1158   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
   1159 
   1160   // Include the original chain at the beginning of the list. When this is
   1161   // used by target LowerCall hooks, this helps legalize find the
   1162   // CALLSEQ_BEGIN node.
   1163   ArgChains.push_back(Chain);
   1164 
   1165   // Add a chain value for each stack argument corresponding
   1166   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
   1167                             UE = DAG.getEntryNode().getNode()->use_end();
   1168        U != UE; ++U) {
   1169     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
   1170       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
   1171         if (FI->getIndex() < 0) {
   1172           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
   1173           int64_t InLastByte = InFirstByte;
   1174           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
   1175 
   1176           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
   1177               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
   1178             ArgChains.push_back(SDValue(L, 1));
   1179         }
   1180       }
   1181     }
   1182   }
   1183 
   1184   // Build a tokenfactor for all the chains.
   1185   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
   1186 }
   1187 
   1188 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
   1189                                                  SmallVectorImpl<SDValue> &InVals,
   1190                                                  StringRef Reason) const {
   1191   SDValue Callee = CLI.Callee;
   1192   SelectionDAG &DAG = CLI.DAG;
   1193 
   1194   const Function &Fn = DAG.getMachineFunction().getFunction();
   1195 
   1196   StringRef FuncName("<unknown>");
   1197 
   1198   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
   1199     FuncName = G->getSymbol();
   1200   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
   1201     FuncName = G->getGlobal()->getName();
   1202 
   1203   DiagnosticInfoUnsupported NoCalls(
   1204     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
   1205   DAG.getContext()->diagnose(NoCalls);
   1206 
   1207   if (!CLI.IsTailCall) {
   1208     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
   1209       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
   1210   }
   1211 
   1212   return DAG.getEntryNode();
   1213 }
   1214 
   1215 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
   1216                                         SmallVectorImpl<SDValue> &InVals) const {
   1217   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
   1218 }
   1219 
   1220 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   1221                                                       SelectionDAG &DAG) const {
   1222   const Function &Fn = DAG.getMachineFunction().getFunction();
   1223 
   1224   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
   1225                                             SDLoc(Op).getDebugLoc());
   1226   DAG.getContext()->diagnose(NoDynamicAlloca);
   1227   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
   1228   return DAG.getMergeValues(Ops, SDLoc());
   1229 }
   1230 
   1231 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   1232                                              SelectionDAG &DAG) const {
   1233   switch (Op.getOpcode()) {
   1234   default:
   1235     Op->print(errs(), &DAG);
   1236     llvm_unreachable("Custom lowering code for this "
   1237                      "instruction is not implemented yet!");
   1238     break;
   1239   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
   1240   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   1241   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
   1242   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
   1243   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
   1244   case ISD::FREM: return LowerFREM(Op, DAG);
   1245   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
   1246   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
   1247   case ISD::FRINT: return LowerFRINT(Op, DAG);
   1248   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
   1249   case ISD::FROUND: return LowerFROUND(Op, DAG);
   1250   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   1251   case ISD::FLOG:
   1252     return LowerFLOG(Op, DAG, numbers::ln2f);
   1253   case ISD::FLOG10:
   1254     return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
   1255   case ISD::FEXP:
   1256     return lowerFEXP(Op, DAG);
   1257   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
   1258   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   1259   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
   1260   case ISD::FP_TO_SINT:
   1261   case ISD::FP_TO_UINT:
   1262     return LowerFP_TO_INT(Op, DAG);
   1263   case ISD::CTTZ:
   1264   case ISD::CTTZ_ZERO_UNDEF:
   1265   case ISD::CTLZ:
   1266   case ISD::CTLZ_ZERO_UNDEF:
   1267     return LowerCTLZ_CTTZ(Op, DAG);
   1268   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
   1269   }
   1270   return Op;
   1271 }
   1272 
   1273 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
   1274                                               SmallVectorImpl<SDValue> &Results,
   1275                                               SelectionDAG &DAG) const {
   1276   switch (N->getOpcode()) {
   1277   case ISD::SIGN_EXTEND_INREG:
   1278     // Different parts of legalization seem to interpret which type of
   1279     // sign_extend_inreg is the one to check for custom lowering. The extended
   1280     // from type is what really matters, but some places check for custom
   1281     // lowering of the result type. This results in trying to use
   1282     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
   1283     // nothing here and let the illegal result integer be handled normally.
   1284     return;
   1285   default:
   1286     return;
   1287   }
   1288 }
   1289 
   1290 bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
   1291   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
   1292   if (!GVar || !GVar->hasInitializer())
   1293     return false;
   1294 
   1295   return !isa<UndefValue>(GVar->getInitializer());
   1296 }
   1297 
   1298 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   1299                                                  SDValue Op,
   1300                                                  SelectionDAG &DAG) const {
   1301 
   1302   const DataLayout &DL = DAG.getDataLayout();
   1303   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   1304   const GlobalValue *GV = G->getGlobal();
   1305 
   1306   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
   1307       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
   1308     if (!MFI->isModuleEntryFunction() &&
   1309         !GV->getName().equals("llvm.amdgcn.module.lds")) {
   1310       SDLoc DL(Op);
   1311       const Function &Fn = DAG.getMachineFunction().getFunction();
   1312       DiagnosticInfoUnsupported BadLDSDecl(
   1313         Fn, "local memory global used by non-kernel function",
   1314         DL.getDebugLoc(), DS_Warning);
   1315       DAG.getContext()->diagnose(BadLDSDecl);
   1316 
   1317       // We currently don't have a way to correctly allocate LDS objects that
   1318       // aren't directly associated with a kernel. We do force inlining of
   1319       // functions that use local objects. However, if these dead functions are
   1320       // not eliminated, we don't want a compile time error. Just emit a warning
   1321       // and a trap, since there should be no callable path here.
   1322       SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
   1323       SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
   1324                                         Trap, DAG.getRoot());
   1325       DAG.setRoot(OutputChain);
   1326       return DAG.getUNDEF(Op.getValueType());
   1327     }
   1328 
   1329     // XXX: What does the value of G->getOffset() mean?
   1330     assert(G->getOffset() == 0 &&
   1331          "Do not know what to do with an non-zero offset");
   1332 
   1333     // TODO: We could emit code to handle the initialization somewhere.
   1334     if (!hasDefinedInitializer(GV)) {
   1335       unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
   1336       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
   1337     }
   1338   }
   1339 
   1340   const Function &Fn = DAG.getMachineFunction().getFunction();
   1341   DiagnosticInfoUnsupported BadInit(
   1342       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
   1343   DAG.getContext()->diagnose(BadInit);
   1344   return SDValue();
   1345 }
   1346 
   1347 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
   1348                                                   SelectionDAG &DAG) const {
   1349   SmallVector<SDValue, 8> Args;
   1350 
   1351   EVT VT = Op.getValueType();
   1352   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
   1353     SDLoc SL(Op);
   1354     SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
   1355     SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
   1356 
   1357     SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
   1358     return DAG.getNode(ISD::BITCAST, SL, VT, BV);
   1359   }
   1360 
   1361   for (const SDUse &U : Op->ops())
   1362     DAG.ExtractVectorElements(U.get(), Args);
   1363 
   1364   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
   1365 }
   1366 
   1367 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
   1368                                                      SelectionDAG &DAG) const {
   1369 
   1370   SmallVector<SDValue, 8> Args;
   1371   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   1372   EVT VT = Op.getValueType();
   1373   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
   1374                             VT.getVectorNumElements());
   1375 
   1376   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
   1377 }
   1378 
   1379 /// Generate Min/Max node
   1380 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
   1381                                                    SDValue LHS, SDValue RHS,
   1382                                                    SDValue True, SDValue False,
   1383                                                    SDValue CC,
   1384                                                    DAGCombinerInfo &DCI) const {
   1385   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
   1386     return SDValue();
   1387 
   1388   SelectionDAG &DAG = DCI.DAG;
   1389   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
   1390   switch (CCOpcode) {
   1391   case ISD::SETOEQ:
   1392   case ISD::SETONE:
   1393   case ISD::SETUNE:
   1394   case ISD::SETNE:
   1395   case ISD::SETUEQ:
   1396   case ISD::SETEQ:
   1397   case ISD::SETFALSE:
   1398   case ISD::SETFALSE2:
   1399   case ISD::SETTRUE:
   1400   case ISD::SETTRUE2:
   1401   case ISD::SETUO:
   1402   case ISD::SETO:
   1403     break;
   1404   case ISD::SETULE:
   1405   case ISD::SETULT: {
   1406     if (LHS == True)
   1407       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
   1408     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
   1409   }
   1410   case ISD::SETOLE:
   1411   case ISD::SETOLT:
   1412   case ISD::SETLE:
   1413   case ISD::SETLT: {
   1414     // Ordered. Assume ordered for undefined.
   1415 
   1416     // Only do this after legalization to avoid interfering with other combines
   1417     // which might occur.
   1418     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
   1419         !DCI.isCalledByLegalizer())
   1420       return SDValue();
   1421 
   1422     // We need to permute the operands to get the correct NaN behavior. The
   1423     // selected operand is the second one based on the failing compare with NaN,
   1424     // so permute it based on the compare type the hardware uses.
   1425     if (LHS == True)
   1426       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
   1427     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
   1428   }
   1429   case ISD::SETUGE:
   1430   case ISD::SETUGT: {
   1431     if (LHS == True)
   1432       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
   1433     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
   1434   }
   1435   case ISD::SETGT:
   1436   case ISD::SETGE:
   1437   case ISD::SETOGE:
   1438   case ISD::SETOGT: {
   1439     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
   1440         !DCI.isCalledByLegalizer())
   1441       return SDValue();
   1442 
   1443     if (LHS == True)
   1444       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
   1445     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
   1446   }
   1447   case ISD::SETCC_INVALID:
   1448     llvm_unreachable("Invalid setcc condcode!");
   1449   }
   1450   return SDValue();
   1451 }
   1452 
   1453 std::pair<SDValue, SDValue>
   1454 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
   1455   SDLoc SL(Op);
   1456 
   1457   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
   1458 
   1459   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
   1460   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
   1461 
   1462   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
   1463   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
   1464 
   1465   return std::make_pair(Lo, Hi);
   1466 }
   1467 
   1468 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
   1469   SDLoc SL(Op);
   1470 
   1471   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
   1472   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
   1473   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
   1474 }
   1475 
   1476 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
   1477   SDLoc SL(Op);
   1478 
   1479   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
   1480   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
   1481   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
   1482 }
   1483 
   1484 // Split a vector type into two parts. The first part is a power of two vector.
   1485 // The second part is whatever is left over, and is a scalar if it would
   1486 // otherwise be a 1-vector.
   1487 std::pair<EVT, EVT>
   1488 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
   1489   EVT LoVT, HiVT;
   1490   EVT EltVT = VT.getVectorElementType();
   1491   unsigned NumElts = VT.getVectorNumElements();
   1492   unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
   1493   LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
   1494   HiVT = NumElts - LoNumElts == 1
   1495              ? EltVT
   1496              : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
   1497   return std::make_pair(LoVT, HiVT);
   1498 }
   1499 
   1500 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
   1501 // scalar.
   1502 std::pair<SDValue, SDValue>
   1503 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
   1504                                   const EVT &LoVT, const EVT &HiVT,
   1505                                   SelectionDAG &DAG) const {
   1506   assert(LoVT.getVectorNumElements() +
   1507                  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
   1508              N.getValueType().getVectorNumElements() &&
   1509          "More vector elements requested than available!");
   1510   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
   1511                            DAG.getVectorIdxConstant(0, DL));
   1512   SDValue Hi = DAG.getNode(
   1513       HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
   1514       HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
   1515   return std::make_pair(Lo, Hi);
   1516 }
   1517 
   1518 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
   1519                                               SelectionDAG &DAG) const {
   1520   LoadSDNode *Load = cast<LoadSDNode>(Op);
   1521   EVT VT = Op.getValueType();
   1522   SDLoc SL(Op);
   1523 
   1524 
   1525   // If this is a 2 element vector, we really want to scalarize and not create
   1526   // weird 1 element vectors.
   1527   if (VT.getVectorNumElements() == 2) {
   1528     SDValue Ops[2];
   1529     std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
   1530     return DAG.getMergeValues(Ops, SL);
   1531   }
   1532 
   1533   SDValue BasePtr = Load->getBasePtr();
   1534   EVT MemVT = Load->getMemoryVT();
   1535 
   1536   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
   1537 
   1538   EVT LoVT, HiVT;
   1539   EVT LoMemVT, HiMemVT;
   1540   SDValue Lo, Hi;
   1541 
   1542   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
   1543   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
   1544   std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
   1545 
   1546   unsigned Size = LoMemVT.getStoreSize();
   1547   unsigned BaseAlign = Load->getAlignment();
   1548   unsigned HiAlign = MinAlign(BaseAlign, Size);
   1549 
   1550   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
   1551                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
   1552                                   BaseAlign, Load->getMemOperand()->getFlags());
   1553   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
   1554   SDValue HiLoad =
   1555       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
   1556                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
   1557                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
   1558 
   1559   SDValue Join;
   1560   if (LoVT == HiVT) {
   1561     // This is the case that the vector is power of two so was evenly split.
   1562     Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
   1563   } else {
   1564     Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
   1565                        DAG.getVectorIdxConstant(0, SL));
   1566     Join = DAG.getNode(
   1567         HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
   1568         VT, Join, HiLoad,
   1569         DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
   1570   }
   1571 
   1572   SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
   1573                                      LoLoad.getValue(1), HiLoad.getValue(1))};
   1574 
   1575   return DAG.getMergeValues(Ops, SL);
   1576 }
   1577 
   1578 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
   1579                                                      SelectionDAG &DAG) const {
   1580   LoadSDNode *Load = cast<LoadSDNode>(Op);
   1581   EVT VT = Op.getValueType();
   1582   SDValue BasePtr = Load->getBasePtr();
   1583   EVT MemVT = Load->getMemoryVT();
   1584   SDLoc SL(Op);
   1585   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
   1586   unsigned BaseAlign = Load->getAlignment();
   1587   unsigned NumElements = MemVT.getVectorNumElements();
   1588 
   1589   // Widen from vec3 to vec4 when the load is at least 8-byte aligned
   1590   // or 16-byte fully dereferenceable. Otherwise, split the vector load.
   1591   if (NumElements != 3 ||
   1592       (BaseAlign < 8 &&
   1593        !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
   1594     return SplitVectorLoad(Op, DAG);
   1595 
   1596   assert(NumElements == 3);
   1597 
   1598   EVT WideVT =
   1599       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
   1600   EVT WideMemVT =
   1601       EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
   1602   SDValue WideLoad = DAG.getExtLoad(
   1603       Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
   1604       WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
   1605   return DAG.getMergeValues(
   1606       {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
   1607                    DAG.getVectorIdxConstant(0, SL)),
   1608        WideLoad.getValue(1)},
   1609       SL);
   1610 }
   1611 
   1612 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
   1613                                                SelectionDAG &DAG) const {
   1614   StoreSDNode *Store = cast<StoreSDNode>(Op);
   1615   SDValue Val = Store->getValue();
   1616   EVT VT = Val.getValueType();
   1617 
   1618   // If this is a 2 element vector, we really want to scalarize and not create
   1619   // weird 1 element vectors.
   1620   if (VT.getVectorNumElements() == 2)
   1621     return scalarizeVectorStore(Store, DAG);
   1622 
   1623   EVT MemVT = Store->getMemoryVT();
   1624   SDValue Chain = Store->getChain();
   1625   SDValue BasePtr = Store->getBasePtr();
   1626   SDLoc SL(Op);
   1627 
   1628   EVT LoVT, HiVT;
   1629   EVT LoMemVT, HiMemVT;
   1630   SDValue Lo, Hi;
   1631 
   1632   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
   1633   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
   1634   std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
   1635 
   1636   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
   1637 
   1638   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
   1639   unsigned BaseAlign = Store->getAlignment();
   1640   unsigned Size = LoMemVT.getStoreSize();
   1641   unsigned HiAlign = MinAlign(BaseAlign, Size);
   1642 
   1643   SDValue LoStore =
   1644       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
   1645                         Store->getMemOperand()->getFlags());
   1646   SDValue HiStore =
   1647       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
   1648                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
   1649 
   1650   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
   1651 }
   1652 
   1653 // This is a shortcut for integer division because we have fast i32<->f32
   1654 // conversions, and fast f32 reciprocal instructions. The fractional part of a
   1655 // float is enough to accurately represent up to a 24-bit signed integer.
   1656 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
   1657                                             bool Sign) const {
   1658   SDLoc DL(Op);
   1659   EVT VT = Op.getValueType();
   1660   SDValue LHS = Op.getOperand(0);
   1661   SDValue RHS = Op.getOperand(1);
   1662   MVT IntVT = MVT::i32;
   1663   MVT FltVT = MVT::f32;
   1664 
   1665   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
   1666   if (LHSSignBits < 9)
   1667     return SDValue();
   1668 
   1669   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
   1670   if (RHSSignBits < 9)
   1671     return SDValue();
   1672 
   1673   unsigned BitSize = VT.getSizeInBits();
   1674   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
   1675   unsigned DivBits = BitSize - SignBits;
   1676   if (Sign)
   1677     ++DivBits;
   1678 
   1679   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
   1680   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
   1681 
   1682   SDValue jq = DAG.getConstant(1, DL, IntVT);
   1683 
   1684   if (Sign) {
   1685     // char|short jq = ia ^ ib;
   1686     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
   1687 
   1688     // jq = jq >> (bitsize - 2)
   1689     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
   1690                      DAG.getConstant(BitSize - 2, DL, VT));
   1691 
   1692     // jq = jq | 0x1
   1693     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
   1694   }
   1695 
   1696   // int ia = (int)LHS;
   1697   SDValue ia = LHS;
   1698 
   1699   // int ib, (int)RHS;
   1700   SDValue ib = RHS;
   1701 
   1702   // float fa = (float)ia;
   1703   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
   1704 
   1705   // float fb = (float)ib;
   1706   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
   1707 
   1708   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
   1709                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
   1710 
   1711   // fq = trunc(fq);
   1712   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
   1713 
   1714   // float fqneg = -fq;
   1715   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
   1716 
   1717   MachineFunction &MF = DAG.getMachineFunction();
   1718   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
   1719 
   1720   // float fr = mad(fqneg, fb, fa);
   1721   unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
   1722                     (unsigned)ISD::FMA :
   1723                     !MFI->getMode().allFP32Denormals() ?
   1724                     (unsigned)ISD::FMAD :
   1725                     (unsigned)AMDGPUISD::FMAD_FTZ;
   1726   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
   1727 
   1728   // int iq = (int)fq;
   1729   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
   1730 
   1731   // fr = fabs(fr);
   1732   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
   1733 
   1734   // fb = fabs(fb);
   1735   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
   1736 
   1737   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   1738 
   1739   // int cv = fr >= fb;
   1740   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
   1741 
   1742   // jq = (cv ? jq : 0);
   1743   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
   1744 
   1745   // dst = iq + jq;
   1746   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
   1747 
   1748   // Rem needs compensation, it's easier to recompute it
   1749   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
   1750   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
   1751 
   1752   // Truncate to number of bits this divide really is.
   1753   if (Sign) {
   1754     SDValue InRegSize
   1755       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
   1756     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
   1757     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
   1758   } else {
   1759     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
   1760     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
   1761     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
   1762   }
   1763 
   1764   return DAG.getMergeValues({ Div, Rem }, DL);
   1765 }
   1766 
/// Expand a 64-bit unsigned divide/remainder.
///
/// Appends two i64 values to \p Results: the quotient first, then the
/// remainder. One of three expansions is emitted, tried in this order:
///  1. If the high 32 bits of both operands are known zero, a 32-bit UDIVREM
///     on the low halves whose results are paired with a zero high word.
///  2. If i64 is a legal type, an f32 reciprocal estimate refined with integer
///     multiplies/adds, followed by up to two corrections that are computed
///     unconditionally and chosen with selects.
///  3. Otherwise a bit-at-a-time expansion over the low 32 bits (labelled
///     "r600" below).
void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
                                      SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &Results) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

  EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

  // Half-width constants; also used as EXTRACT_ELEMENT indices (0 = lo, 1 = hi).
  SDValue One = DAG.getConstant(1, DL, HalfVT);
  SDValue Zero = DAG.getConstant(0, DL, HalfVT);

  // HiLo split
  SDValue LHS = Op.getOperand(0);
  SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
  SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);

  SDValue RHS = Op.getOperand(1);
  SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
  SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);

  // Both operands fit in 32 bits: divide the low halves and use zero for the
  // high word of both results.
  if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
      DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

    SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
                              LHS_Lo, RHS_Lo);

    SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
    SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
    Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
    return;
  }

  if (isTypeLegal(MVT::i64)) {
    MachineFunction &MF = DAG.getMachineFunction();
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    // Compute denominator reciprocal.
    // Multiply-add opcode: FMA when the subtarget has no f32 mad/mac
    // instructions, otherwise FMAD or FMAD_FTZ depending on the f32 denormal
    // mode.
    unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
                    (unsigned)ISD::FMA :
                    !MFI->getMode().allFP32Denormals() ?
                    (unsigned)ISD::FMAD :
                    (unsigned)AMDGPUISD::FMAD_FTZ;

    // f32 bit patterns used below: 0x4f800000 = 2^32, 0x5f7ffffc = just under
    // 2^64, 0x2f800000 = 2^-32, 0xcf800000 = -2^32.
    SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
    SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
    // Mad1 = Cvt_Hi * 2^32 + Cvt_Lo: RHS as an f32.
    SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
      DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
      Cvt_Lo);
    SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
    // Scale the reciprocal by the constant just under 2^64.
    SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
      DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
    // Split the scaled value into 32-bit words: Trunc = trunc(Mul1 * 2^-32) is
    // the high word, Mad2 = Mul1 - Trunc * 2^32 is the low word.
    SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
      DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
    SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
    SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
      DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
      Mul1);
    SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
    SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
    SDValue Rcp64 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

    SDValue Zero64 = DAG.getConstant(0, DL, VT);
    SDValue One64  = DAG.getConstant(1, DL, VT);
    SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
    SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

    // First refinement: Add1 = Rcp64 + mulhu(Rcp64, -RHS * Rcp64), with the
    // 64-bit add done as two 32-bit adds with carry.
    SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
    SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
    SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
    SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    Zero);
    SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                    One);

    SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
                                  Mulhi1_Lo, Zero1);
    SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
                                  Mulhi1_Hi, Add1_Lo.getValue(1));
    // High-word sum without the carry from the low word; the carry is added
    // back in Add2_HiC below.
    SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
    SDValue Add1 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

    // Second refinement, same shape as the first but starting from Add1.
    SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
    SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
    SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    Zero);
    SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                    One);

    SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
                                  Mulhi2_Lo, Zero1);
    // Add2_HiC folds in the carry out of Add1_Lo; Add2_Hi then adds the carry
    // out of Add2_Lo.
    SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
                                   Mulhi2_Hi, Add1_Lo.getValue(1));
    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
                                  Zero, Add2_Lo.getValue(1));
    SDValue Add2 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
    // Quotient estimate.
    SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

    SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

    // Remainder estimate Sub1 = LHS - RHS * Mulhi3, as two 32-bit subtracts
    // with borrow.
    SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
    SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
    SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
                                  Mul3_Lo, Zero1);
    SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
                                  Mul3_Hi, Sub1_Lo.getValue(1));
    // High-word difference without the borrow from the low word; the borrow is
    // applied in Sub2_Mi below.
    SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
    SDValue Sub1 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

    // C3 is all-ones when Sub1 >= RHS (unsigned 64-bit compare built from the
    // halves: use the low-word compare when the high words are equal,
    // otherwise the high-word compare), and zero otherwise.
    SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
    SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

    // TODO: Here and below portions of the code can be enclosed into if/endif.
    // Currently control flow is unconditional and we have 4 selects after
    // potential endif to substitute PHIs.

    // if C3 != 0 ...
    // First correction: Sub2 = Sub1 - RHS, Add3 = Mulhi3 + 1.
    SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
                                  RHS_Hi, Sub1_Lo.getValue(1));
    SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  Zero, Sub2_Lo.getValue(1));
    SDValue Sub2 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

    SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

    // C6: same 64-bit unsigned ">= RHS" test as C3, applied to Sub2.
    SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
                                 ISD::SETUGE);
    SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

    // if (C6 != 0)
    // Second correction: Sub3 = Sub2 - RHS, Add4 = Add3 + 1.
    SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

    SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
                                  RHS_Lo, Zero1);
    SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
                                  RHS_Hi, Sub2_Lo.getValue(1));
    SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
                                  Zero, Sub3_Lo.getValue(1));
    SDValue Sub3 = DAG.getBitcast(VT,
                        DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

    // endif C6
    // endif C3

    // Pick the uncorrected, once-corrected or twice-corrected quotient and
    // remainder according to C3 and C6.
    SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
    SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

    SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
    SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

    Results.push_back(Div);
    Results.push_back(Rem);

    return;
  }

  // r600 expansion.
  // Get Speculative values
  SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
  SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

  // The speculative LHS_Hi / RHS_Lo results are only used when RHS_Hi == 0.
  // Otherwise the running remainder starts as LHS_Hi and the high quotient
  // word is zero.
  SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
  SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
  REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

  SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
  SDValue DIV_Lo = Zero;

  const unsigned halfBitWidth = HalfVT.getSizeInBits();

  // Process the bits of LHS_Lo from most to least significant: shift each bit
  // into REM, and whenever REM >= RHS set that bit in DIV_Lo and subtract RHS.
  for (unsigned i = 0; i < halfBitWidth; ++i) {
    const unsigned bitPos = halfBitWidth - i - 1;
    SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
    // Get value of high bit
    SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
    HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
    HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

    // Shift
    REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
    // Add LHS high bit
    REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

    SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
    SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

    DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

    // Update REM
    SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
    REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
  }

  SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
  DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
  Results.push_back(DIV);
  Results.push_back(REM);
}
   1981 
   1982 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   1983                                            SelectionDAG &DAG) const {
   1984   SDLoc DL(Op);
   1985   EVT VT = Op.getValueType();
   1986 
   1987   if (VT == MVT::i64) {
   1988     SmallVector<SDValue, 2> Results;
   1989     LowerUDIVREM64(Op, DAG, Results);
   1990     return DAG.getMergeValues(Results, DL);
   1991   }
   1992 
   1993   if (VT == MVT::i32) {
   1994     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
   1995       return Res;
   1996   }
   1997 
   1998   SDValue X = Op.getOperand(0);
   1999   SDValue Y = Op.getOperand(1);
   2000 
   2001   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
   2002   // algorithm used here.
   2003 
   2004   // Initial estimate of inv(y).
   2005   SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
   2006 
   2007   // One round of UNR.
   2008   SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
   2009   SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
   2010   Z = DAG.getNode(ISD::ADD, DL, VT, Z,
   2011                   DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
   2012 
   2013   // Quotient/remainder estimate.
   2014   SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
   2015   SDValue R =
   2016       DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
   2017 
   2018   // First quotient/remainder refinement.
   2019   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   2020   SDValue One = DAG.getConstant(1, DL, VT);
   2021   SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
   2022   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
   2023                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
   2024   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
   2025                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
   2026 
   2027   // Second quotient/remainder refinement.
   2028   Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
   2029   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
   2030                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
   2031   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
   2032                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
   2033 
   2034   return DAG.getMergeValues({Q, R}, DL);
   2035 }
   2036 
   2037 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
   2038                                            SelectionDAG &DAG) const {
   2039   SDLoc DL(Op);
   2040   EVT VT = Op.getValueType();
   2041 
   2042   SDValue LHS = Op.getOperand(0);
   2043   SDValue RHS = Op.getOperand(1);
   2044 
   2045   SDValue Zero = DAG.getConstant(0, DL, VT);
   2046   SDValue NegOne = DAG.getConstant(-1, DL, VT);
   2047 
   2048   if (VT == MVT::i32) {
   2049     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
   2050       return Res;
   2051   }
   2052 
   2053   if (VT == MVT::i64 &&
   2054       DAG.ComputeNumSignBits(LHS) > 32 &&
   2055       DAG.ComputeNumSignBits(RHS) > 32) {
   2056     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
   2057 
   2058     //HiLo split
   2059     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
   2060     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
   2061     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
   2062                                  LHS_Lo, RHS_Lo);
   2063     SDValue Res[2] = {
   2064       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
   2065       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
   2066     };
   2067     return DAG.getMergeValues(Res, DL);
   2068   }
   2069 
   2070   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
   2071   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
   2072   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
   2073   SDValue RSign = LHSign; // Remainder sign is the same as LHS
   2074 
   2075   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
   2076   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
   2077 
   2078   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
   2079   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
   2080 
   2081   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
   2082   SDValue Rem = Div.getValue(1);
   2083 
   2084   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
   2085   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
   2086 
   2087   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
   2088   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
   2089 
   2090   SDValue Res[2] = {
   2091     Div,
   2092     Rem
   2093   };
   2094   return DAG.getMergeValues(Res, DL);
   2095 }
   2096 
   2097 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
   2098 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
   2099   SDLoc SL(Op);
   2100   EVT VT = Op.getValueType();
   2101   auto Flags = Op->getFlags();
   2102   SDValue X = Op.getOperand(0);
   2103   SDValue Y = Op.getOperand(1);
   2104 
   2105   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
   2106   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
   2107   SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
   2108   // TODO: For f32 use FMAD instead if !hasFastFMA32?
   2109   return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
   2110 }
   2111 
   2112 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
   2113   SDLoc SL(Op);
   2114   SDValue Src = Op.getOperand(0);
   2115 
   2116   // result = trunc(src)
   2117   // if (src > 0.0 && src != result)
   2118   //   result += 1.0
   2119 
   2120   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
   2121 
   2122   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
   2123   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
   2124 
   2125   EVT SetCCVT =
   2126       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
   2127 
   2128   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
   2129   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
   2130   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
   2131 
   2132   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
   2133   // TODO: Should this propagate fast-math-flags?
   2134   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
   2135 }
   2136 
   2137 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
   2138                                   SelectionDAG &DAG) {
   2139   const unsigned FractBits = 52;
   2140   const unsigned ExpBits = 11;
   2141 
   2142   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
   2143                                 Hi,
   2144                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
   2145                                 DAG.getConstant(ExpBits, SL, MVT::i32));
   2146   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
   2147                             DAG.getConstant(1023, SL, MVT::i32));
   2148 
   2149   return Exp;
   2150 }
   2151 
   2152 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
   2153   SDLoc SL(Op);
   2154   SDValue Src = Op.getOperand(0);
   2155 
   2156   assert(Op.getValueType() == MVT::f64);
   2157 
   2158   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
   2159   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
   2160 
   2161   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
   2162 
   2163   // Extract the upper half, since this is where we will find the sign and
   2164   // exponent.
   2165   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
   2166 
   2167   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
   2168 
   2169   const unsigned FractBits = 52;
   2170 
   2171   // Extract the sign bit.
   2172   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
   2173   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
   2174 
   2175   // Extend back to 64-bits.
   2176   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
   2177   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
   2178 
   2179   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
   2180   const SDValue FractMask
   2181     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
   2182 
   2183   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
   2184   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
   2185   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
   2186 
   2187   EVT SetCCVT =
   2188       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
   2189 
   2190   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
   2191 
   2192   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
   2193   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
   2194 
   2195   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
   2196   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
   2197 
   2198   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
   2199 }
   2200 
   2201 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
   2202   SDLoc SL(Op);
   2203   SDValue Src = Op.getOperand(0);
   2204 
   2205   assert(Op.getValueType() == MVT::f64);
   2206 
   2207   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
   2208   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
   2209   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
   2210 
   2211   // TODO: Should this propagate fast-math-flags?
   2212 
   2213   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
   2214   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
   2215 
   2216   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
   2217 
   2218   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
   2219   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
   2220 
   2221   EVT SetCCVT =
   2222       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
   2223   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
   2224 
   2225   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
   2226 }
   2227 
   2228 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
   2229   // FNEARBYINT and FRINT are the same, except in their handling of FP
   2230   // exceptions. Those aren't really meaningful for us, and OpenCL only has
   2231   // rint, so just treat them as equivalent.
   2232   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
   2233 }
   2234 
   2235 // XXX - May require not supporting f32 denormals?
   2236 
   2237 // Don't handle v2f16. The extra instructions to scalarize and repack around the
   2238 // compare and vselect end up producing worse code than scalarizing the whole
   2239 // operation.
   2240 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   2241   SDLoc SL(Op);
   2242   SDValue X = Op.getOperand(0);
   2243   EVT VT = Op.getValueType();
   2244 
   2245   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
   2246 
   2247   // TODO: Should this propagate fast-math-flags?
   2248 
   2249   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
   2250 
   2251   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
   2252 
   2253   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
   2254   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
   2255   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
   2256 
   2257   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
   2258 
   2259   EVT SetCCVT =
   2260       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   2261 
   2262   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
   2263 
   2264   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
   2265 
   2266   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
   2267 }
   2268 
   2269 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   2270   SDLoc SL(Op);
   2271   SDValue Src = Op.getOperand(0);
   2272 
   2273   // result = trunc(src);
   2274   // if (src < 0.0 && src != result)
   2275   //   result += -1.0.
   2276 
   2277   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
   2278 
   2279   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
   2280   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
   2281 
   2282   EVT SetCCVT =
   2283       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
   2284 
   2285   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
   2286   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
   2287   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
   2288 
   2289   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
   2290   // TODO: Should this propagate fast-math-flags?
   2291   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
   2292 }
   2293 
   2294 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
   2295                                         double Log2BaseInverted) const {
   2296   EVT VT = Op.getValueType();
   2297 
   2298   SDLoc SL(Op);
   2299   SDValue Operand = Op.getOperand(0);
   2300   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
   2301   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
   2302 
   2303   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
   2304 }
   2305 
   2306 // exp2(M_LOG2E_F * f);
   2307 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
   2308   EVT VT = Op.getValueType();
   2309   SDLoc SL(Op);
   2310   SDValue Src = Op.getOperand(0);
   2311 
   2312   const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
   2313   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
   2314   return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
   2315 }
   2316 
   2317 static bool isCtlzOpc(unsigned Opc) {
   2318   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
   2319 }
   2320 
   2321 static bool isCttzOpc(unsigned Opc) {
   2322   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
   2323 }
   2324 
   2325 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
   2326   SDLoc SL(Op);
   2327   SDValue Src = Op.getOperand(0);
   2328   bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
   2329                    Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
   2330 
   2331   unsigned ISDOpc, NewOpc;
   2332   if (isCtlzOpc(Op.getOpcode())) {
   2333     ISDOpc = ISD::CTLZ_ZERO_UNDEF;
   2334     NewOpc = AMDGPUISD::FFBH_U32;
   2335   } else if (isCttzOpc(Op.getOpcode())) {
   2336     ISDOpc = ISD::CTTZ_ZERO_UNDEF;
   2337     NewOpc = AMDGPUISD::FFBL_B32;
   2338   } else
   2339     llvm_unreachable("Unexpected OPCode!!!");
   2340 
   2341 
   2342   if (ZeroUndef && Src.getValueType() == MVT::i32)
   2343     return DAG.getNode(NewOpc, SL, MVT::i32, Src);
   2344 
   2345   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
   2346 
   2347   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
   2348   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
   2349 
   2350   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
   2351   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
   2352 
   2353   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
   2354                                    *DAG.getContext(), MVT::i32);
   2355 
   2356   SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
   2357   SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
   2358 
   2359   SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
   2360   SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
   2361 
   2362   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
   2363   SDValue Add, NewOpr;
   2364   if (isCtlzOpc(Op.getOpcode())) {
   2365     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
   2366     // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
   2367     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
   2368   } else {
   2369     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
   2370     // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
   2371     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
   2372   }
   2373 
   2374   if (!ZeroUndef) {
   2375     // Test if the full 64-bit input is zero.
   2376 
   2377     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
   2378     // which we probably don't want.
   2379     SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
   2380     SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
   2381     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
   2382 
   2383     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
   2384     // with the same cycles, otherwise it is slower.
   2385     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
   2386     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
   2387 
   2388     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
   2389 
   2390     // The instruction returns -1 for 0 input, but the defined intrinsic
   2391     // behavior is to return the number of bits.
   2392     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
   2393                          SrcIsZero, Bits32, NewOpr);
   2394   }
   2395 
   2396   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
   2397 }
   2398 
   2399 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
   2400                                                bool Signed) const {
   2401   // Unsigned
   2402   // cul2f(ulong u)
   2403   //{
   2404   //  uint lz = clz(u);
   2405   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
   2406   //  u = (u << lz) & 0x7fffffffffffffffUL;
   2407   //  ulong t = u & 0xffffffffffUL;
   2408   //  uint v = (e << 23) | (uint)(u >> 40);
   2409   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
   2410   //  return as_float(v + r);
   2411   //}
   2412   // Signed
   2413   // cl2f(long l)
   2414   //{
   2415   //  long s = l >> 63;
   2416   //  float r = cul2f((l + s) ^ s);
   2417   //  return s ? -r : r;
   2418   //}
   2419 
   2420   SDLoc SL(Op);
   2421   SDValue Src = Op.getOperand(0);
   2422   SDValue L = Src;
   2423 
   2424   SDValue S;
   2425   if (Signed) {
   2426     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
   2427     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
   2428 
   2429     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
   2430     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
   2431   }
   2432 
   2433   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
   2434                                    *DAG.getContext(), MVT::f32);
   2435 
   2436 
   2437   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
   2438   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
   2439   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
   2440   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
   2441 
   2442   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
   2443   SDValue E = DAG.getSelect(SL, MVT::i32,
   2444     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
   2445     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
   2446     ZeroI32);
   2447 
   2448   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
   2449     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
   2450     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
   2451 
   2452   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
   2453                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
   2454 
   2455   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
   2456                              U, DAG.getConstant(40, SL, MVT::i64));
   2457 
   2458   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
   2459     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
   2460     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
   2461 
   2462   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
   2463   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
   2464   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
   2465 
   2466   SDValue One = DAG.getConstant(1, SL, MVT::i32);
   2467 
   2468   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
   2469 
   2470   SDValue R = DAG.getSelect(SL, MVT::i32,
   2471     RCmp,
   2472     One,
   2473     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
   2474   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
   2475   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
   2476 
   2477   if (!Signed)
   2478     return R;
   2479 
   2480   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
   2481   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
   2482 }
   2483 
   2484 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
   2485                                                bool Signed) const {
   2486   SDLoc SL(Op);
   2487   SDValue Src = Op.getOperand(0);
   2488 
   2489   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
   2490 
   2491   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
   2492                            DAG.getConstant(0, SL, MVT::i32));
   2493   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
   2494                            DAG.getConstant(1, SL, MVT::i32));
   2495 
   2496   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
   2497                               SL, MVT::f64, Hi);
   2498 
   2499   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
   2500 
   2501   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
   2502                               DAG.getConstant(32, SL, MVT::i32));
   2503   // TODO: Should this propagate fast-math-flags?
   2504   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
   2505 }
   2506 
   2507 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
   2508                                                SelectionDAG &DAG) const {
   2509   // TODO: Factor out code common with LowerSINT_TO_FP.
   2510   EVT DestVT = Op.getValueType();
   2511   SDValue Src = Op.getOperand(0);
   2512   EVT SrcVT = Src.getValueType();
   2513 
   2514   if (SrcVT == MVT::i16) {
   2515     if (DestVT == MVT::f16)
   2516       return Op;
   2517     SDLoc DL(Op);
   2518 
   2519     // Promote src to i32
   2520     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
   2521     return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
   2522   }
   2523 
   2524   assert(SrcVT == MVT::i64 && "operation should be legal");
   2525 
   2526   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
   2527     SDLoc DL(Op);
   2528 
   2529     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
   2530     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
   2531     SDValue FPRound =
   2532         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
   2533 
   2534     return FPRound;
   2535   }
   2536 
   2537   if (DestVT == MVT::f32)
   2538     return LowerINT_TO_FP32(Op, DAG, false);
   2539 
   2540   assert(DestVT == MVT::f64);
   2541   return LowerINT_TO_FP64(Op, DAG, false);
   2542 }
   2543 
   2544 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
   2545                                               SelectionDAG &DAG) const {
   2546   EVT DestVT = Op.getValueType();
   2547 
   2548   SDValue Src = Op.getOperand(0);
   2549   EVT SrcVT = Src.getValueType();
   2550 
   2551   if (SrcVT == MVT::i16) {
   2552     if (DestVT == MVT::f16)
   2553       return Op;
   2554 
   2555     SDLoc DL(Op);
   2556     // Promote src to i32
   2557     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
   2558     return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
   2559   }
   2560 
   2561   assert(SrcVT == MVT::i64 && "operation should be legal");
   2562 
   2563   // TODO: Factor out code common with LowerUINT_TO_FP.
   2564 
   2565   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
   2566     SDLoc DL(Op);
   2567     SDValue Src = Op.getOperand(0);
   2568 
   2569     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
   2570     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
   2571     SDValue FPRound =
   2572         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
   2573 
   2574     return FPRound;
   2575   }
   2576 
   2577   if (DestVT == MVT::f32)
   2578     return LowerINT_TO_FP32(Op, DAG, true);
   2579 
   2580   assert(DestVT == MVT::f64);
   2581   return LowerINT_TO_FP64(Op, DAG, true);
   2582 }
   2583 
   2584 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
   2585                                                bool Signed) const {
   2586   SDLoc SL(Op);
   2587 
   2588   SDValue Src = Op.getOperand(0);
   2589 
   2590   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
   2591 
   2592   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
   2593                                  MVT::f64);
   2594   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
   2595                                  MVT::f64);
   2596   // TODO: Should this propagate fast-math-flags?
   2597   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
   2598 
   2599   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
   2600 
   2601 
   2602   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
   2603 
   2604   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
   2605                            MVT::i32, FloorMul);
   2606   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
   2607 
   2608   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
   2609 
   2610   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
   2611 }
   2612 
   2613 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
   2614   SDLoc DL(Op);
   2615   SDValue N0 = Op.getOperand(0);
   2616 
   2617   // Convert to target node to get known bits
   2618   if (N0.getValueType() == MVT::f32)
   2619     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
   2620 
   2621   if (getTargetMachine().Options.UnsafeFPMath) {
   2622     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
   2623     return SDValue();
   2624   }
   2625 
   2626   assert(N0.getSimpleValueType() == MVT::f64);
   2627 
   2628   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
   2629   const unsigned ExpMask = 0x7ff;
   2630   const unsigned ExpBiasf64 = 1023;
   2631   const unsigned ExpBiasf16 = 15;
   2632   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
   2633   SDValue One = DAG.getConstant(1, DL, MVT::i32);
   2634   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
   2635   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
   2636                            DAG.getConstant(32, DL, MVT::i64));
   2637   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
   2638   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
   2639   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
   2640                           DAG.getConstant(20, DL, MVT::i64));
   2641   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
   2642                   DAG.getConstant(ExpMask, DL, MVT::i32));
   2643   // Subtract the fp64 exponent bias (1023) to get the real exponent and
   2644   // add the f16 bias (15) to get the biased exponent for the f16 format.
   2645   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
   2646                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
   2647 
   2648   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
   2649                           DAG.getConstant(8, DL, MVT::i32));
   2650   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
   2651                   DAG.getConstant(0xffe, DL, MVT::i32));
   2652 
   2653   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
   2654                                   DAG.getConstant(0x1ff, DL, MVT::i32));
   2655   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
   2656 
   2657   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
   2658   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
   2659 
   2660   // (M != 0 ? 0x0200 : 0) | 0x7c00;
   2661   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
   2662       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
   2663                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
   2664 
   2665   // N = M | (E << 12);
   2666   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
   2667       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
   2668                   DAG.getConstant(12, DL, MVT::i32)));
   2669 
   2670   // B = clamp(1-E, 0, 13);
   2671   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
   2672                                   One, E);
   2673   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
   2674   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
   2675                   DAG.getConstant(13, DL, MVT::i32));
   2676 
   2677   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
   2678                                    DAG.getConstant(0x1000, DL, MVT::i32));
   2679 
   2680   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
   2681   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
   2682   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
   2683   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
   2684 
   2685   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
   2686   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
   2687                               DAG.getConstant(0x7, DL, MVT::i32));
   2688   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
   2689                   DAG.getConstant(2, DL, MVT::i32));
   2690   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
   2691                                One, Zero, ISD::SETEQ);
   2692   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
   2693                                One, Zero, ISD::SETGT);
   2694   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
   2695   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
   2696 
   2697   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
   2698                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
   2699   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
   2700                       I, V, ISD::SETEQ);
   2701 
   2702   // Extract the sign bit.
   2703   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
   2704                             DAG.getConstant(16, DL, MVT::i32));
   2705   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
   2706                      DAG.getConstant(0x8000, DL, MVT::i32));
   2707 
   2708   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
   2709   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
   2710 }
   2711 
   2712 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
   2713                                              SelectionDAG &DAG) const {
   2714   SDValue Src = Op.getOperand(0);
   2715   unsigned OpOpcode = Op.getOpcode();
   2716   EVT SrcVT = Src.getValueType();
   2717   EVT DestVT = Op.getValueType();
   2718 
   2719   // Will be selected natively
   2720   if (SrcVT == MVT::f16 && DestVT == MVT::i16)
   2721     return Op;
   2722 
   2723   // Promote i16 to i32
   2724   if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
   2725     SDLoc DL(Op);
   2726 
   2727     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
   2728     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
   2729   }
   2730 
   2731   if (SrcVT == MVT::f16 ||
   2732       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
   2733     SDLoc DL(Op);
   2734 
   2735     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
   2736     unsigned Ext =
   2737         OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
   2738     return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
   2739   }
   2740 
   2741   if (DestVT == MVT::i64 && SrcVT == MVT::f64)
   2742     return LowerFP64_TO_INT(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
   2743 
   2744   return SDValue();
   2745 }
   2746 
   2747 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
   2748                                                      SelectionDAG &DAG) const {
   2749   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
   2750   MVT VT = Op.getSimpleValueType();
   2751   MVT ScalarVT = VT.getScalarType();
   2752 
   2753   assert(VT.isVector());
   2754 
   2755   SDValue Src = Op.getOperand(0);
   2756   SDLoc DL(Op);
   2757 
   2758   // TODO: Don't scalarize on Evergreen?
   2759   unsigned NElts = VT.getVectorNumElements();
   2760   SmallVector<SDValue, 8> Args;
   2761   DAG.ExtractVectorElements(Src, Args, 0, NElts);
   2762 
   2763   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
   2764   for (unsigned I = 0; I < NElts; ++I)
   2765     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
   2766 
   2767   return DAG.getBuildVector(VT, DL, Args);
   2768 }
   2769 
   2770 //===----------------------------------------------------------------------===//
   2771 // Custom DAG optimizations
   2772 //===----------------------------------------------------------------------===//
   2773 
   2774 static bool isU24(SDValue Op, SelectionDAG &DAG) {
   2775   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
   2776 }
   2777 
   2778 static bool isI24(SDValue Op, SelectionDAG &DAG) {
   2779   EVT VT = Op.getValueType();
   2780   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
   2781                                      // as unsigned 24-bit values.
   2782     AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
   2783 }
   2784 
   2785 static SDValue simplifyMul24(SDNode *Node24,
   2786                              TargetLowering::DAGCombinerInfo &DCI) {
   2787   SelectionDAG &DAG = DCI.DAG;
   2788   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   2789   bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
   2790 
   2791   SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
   2792   SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
   2793   unsigned NewOpcode = Node24->getOpcode();
   2794   if (IsIntrin) {
   2795     unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
   2796     NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
   2797       AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
   2798   }
   2799 
   2800   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
   2801 
   2802   // First try to simplify using SimplifyMultipleUseDemandedBits which allows
   2803   // the operands to have other uses, but will only perform simplifications that
   2804   // involve bypassing some nodes for this user.
   2805   SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
   2806   SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
   2807   if (DemandedLHS || DemandedRHS)
   2808     return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
   2809                        DemandedLHS ? DemandedLHS : LHS,
   2810                        DemandedRHS ? DemandedRHS : RHS);
   2811 
   2812   // Now try SimplifyDemandedBits which can simplify the nodes used by our
   2813   // operands if this node is the only user.
   2814   if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
   2815     return SDValue(Node24, 0);
   2816   if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
   2817     return SDValue(Node24, 0);
   2818 
   2819   return SDValue();
   2820 }
   2821 
   2822 template <typename IntTy>
   2823 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
   2824                                uint32_t Width, const SDLoc &DL) {
   2825   if (Width + Offset < 32) {
   2826     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
   2827     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
   2828     return DAG.getConstant(Result, DL, MVT::i32);
   2829   }
   2830 
   2831   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
   2832 }
   2833 
   2834 static bool hasVolatileUser(SDNode *Val) {
   2835   for (SDNode *U : Val->uses()) {
   2836     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
   2837       if (M->isVolatile())
   2838         return true;
   2839     }
   2840   }
   2841 
   2842   return false;
   2843 }
   2844 
   2845 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
   2846   // i32 vectors are the canonical memory type.
   2847   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
   2848     return false;
   2849 
   2850   if (!VT.isByteSized())
   2851     return false;
   2852 
   2853   unsigned Size = VT.getStoreSize();
   2854 
   2855   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
   2856     return false;
   2857 
   2858   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
   2859     return false;
   2860 
   2861   return true;
   2862 }
   2863 
   2864 // Replace load of an illegal type with a store of a bitcast to a friendlier
   2865 // type.
   2866 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
   2867                                                  DAGCombinerInfo &DCI) const {
   2868   if (!DCI.isBeforeLegalize())
   2869     return SDValue();
   2870 
   2871   LoadSDNode *LN = cast<LoadSDNode>(N);
   2872   if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
   2873     return SDValue();
   2874 
   2875   SDLoc SL(N);
   2876   SelectionDAG &DAG = DCI.DAG;
   2877   EVT VT = LN->getMemoryVT();
   2878 
   2879   unsigned Size = VT.getStoreSize();
   2880   Align Alignment = LN->getAlign();
   2881   if (Alignment < Size && isTypeLegal(VT)) {
   2882     bool IsFast;
   2883     unsigned AS = LN->getAddressSpace();
   2884 
   2885     // Expand unaligned loads earlier than legalization. Due to visitation order
   2886     // problems during legalization, the emitted instructions to pack and unpack
   2887     // the bytes again are not eliminated in the case of an unaligned copy.
   2888     if (!allowsMisalignedMemoryAccesses(
   2889             VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
   2890       SDValue Ops[2];
   2891 
   2892       if (VT.isVector())
   2893         std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
   2894       else
   2895         std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
   2896 
   2897       return DAG.getMergeValues(Ops, SDLoc(N));
   2898     }
   2899 
   2900     if (!IsFast)
   2901       return SDValue();
   2902   }
   2903 
   2904   if (!shouldCombineMemoryType(VT))
   2905     return SDValue();
   2906 
   2907   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
   2908 
   2909   SDValue NewLoad
   2910     = DAG.getLoad(NewVT, SL, LN->getChain(),
   2911                   LN->getBasePtr(), LN->getMemOperand());
   2912 
   2913   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
   2914   DCI.CombineTo(N, BC, NewLoad.getValue(1));
   2915   return SDValue(N, 0);
   2916 }
   2917 
   2918 // Replace store of an illegal type with a store of a bitcast to a friendlier
   2919 // type.
   2920 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
   2921                                                   DAGCombinerInfo &DCI) const {
   2922   if (!DCI.isBeforeLegalize())
   2923     return SDValue();
   2924 
   2925   StoreSDNode *SN = cast<StoreSDNode>(N);
   2926   if (!SN->isSimple() || !ISD::isNormalStore(SN))
   2927     return SDValue();
   2928 
   2929   EVT VT = SN->getMemoryVT();
   2930   unsigned Size = VT.getStoreSize();
   2931 
   2932   SDLoc SL(N);
   2933   SelectionDAG &DAG = DCI.DAG;
   2934   Align Alignment = SN->getAlign();
   2935   if (Alignment < Size && isTypeLegal(VT)) {
   2936     bool IsFast;
   2937     unsigned AS = SN->getAddressSpace();
   2938 
   2939     // Expand unaligned stores earlier than legalization. Due to visitation
   2940     // order problems during legalization, the emitted instructions to pack and
   2941     // unpack the bytes again are not eliminated in the case of an unaligned
   2942     // copy.
   2943     if (!allowsMisalignedMemoryAccesses(
   2944             VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
   2945       if (VT.isVector())
   2946         return scalarizeVectorStore(SN, DAG);
   2947 
   2948       return expandUnalignedStore(SN, DAG);
   2949     }
   2950 
   2951     if (!IsFast)
   2952       return SDValue();
   2953   }
   2954 
   2955   if (!shouldCombineMemoryType(VT))
   2956     return SDValue();
   2957 
   2958   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
   2959   SDValue Val = SN->getValue();
   2960 
   2961   //DCI.AddToWorklist(Val.getNode());
   2962 
   2963   bool OtherUses = !Val.hasOneUse();
   2964   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
   2965   if (OtherUses) {
   2966     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
   2967     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
   2968   }
   2969 
   2970   return DAG.getStore(SN->getChain(), SL, CastVal,
   2971                       SN->getBasePtr(), SN->getMemOperand());
   2972 }
   2973 
   2974 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
   2975 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
   2976 // issues.
   2977 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
   2978                                                         DAGCombinerInfo &DCI) const {
   2979   SelectionDAG &DAG = DCI.DAG;
   2980   SDValue N0 = N->getOperand(0);
   2981 
   2982   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
   2983   //     (vt2 (truncate (assertzext vt0:x, vt1)))
   2984   if (N0.getOpcode() == ISD::TRUNCATE) {
   2985     SDValue N1 = N->getOperand(1);
   2986     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
   2987     SDLoc SL(N);
   2988 
   2989     SDValue Src = N0.getOperand(0);
   2990     EVT SrcVT = Src.getValueType();
   2991     if (SrcVT.bitsGE(ExtVT)) {
   2992       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
   2993       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
   2994     }
   2995   }
   2996 
   2997   return SDValue();
   2998 }
   2999 
   3000 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
   3001   SDNode *N, DAGCombinerInfo &DCI) const {
   3002   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
   3003   switch (IID) {
   3004   case Intrinsic::amdgcn_mul_i24:
   3005   case Intrinsic::amdgcn_mul_u24:
   3006     return simplifyMul24(N, DCI);
   3007   case Intrinsic::amdgcn_fract:
   3008   case Intrinsic::amdgcn_rsq:
   3009   case Intrinsic::amdgcn_rcp_legacy:
   3010   case Intrinsic::amdgcn_rsq_legacy:
   3011   case Intrinsic::amdgcn_rsq_clamp:
   3012   case Intrinsic::amdgcn_ldexp: {
   3013     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
   3014     SDValue Src = N->getOperand(1);
   3015     return Src.isUndef() ? Src : SDValue();
   3016   }
   3017   default:
   3018     return SDValue();
   3019   }
   3020 }
   3021 
   3022 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
   3023 /// binary operation \p Opc to it with the corresponding constant operands.
   3024 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
   3025   DAGCombinerInfo &DCI, const SDLoc &SL,
   3026   unsigned Opc, SDValue LHS,
   3027   uint32_t ValLo, uint32_t ValHi) const {
   3028   SelectionDAG &DAG = DCI.DAG;
   3029   SDValue Lo, Hi;
   3030   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
   3031 
   3032   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
   3033   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
   3034 
   3035   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
   3036   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
   3037 
   3038   // Re-visit the ands. It's possible we eliminated one of them and it could
   3039   // simplify the vector.
   3040   DCI.AddToWorklist(Lo.getNode());
   3041   DCI.AddToWorklist(Hi.getNode());
   3042 
   3043   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
   3044   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
   3045 }
   3046 
   3047 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   3048                                                 DAGCombinerInfo &DCI) const {
   3049   EVT VT = N->getValueType(0);
   3050 
   3051   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   3052   if (!RHS)
   3053     return SDValue();
   3054 
   3055   SDValue LHS = N->getOperand(0);
   3056   unsigned RHSVal = RHS->getZExtValue();
   3057   if (!RHSVal)
   3058     return LHS;
   3059 
   3060   SDLoc SL(N);
   3061   SelectionDAG &DAG = DCI.DAG;
   3062 
   3063   switch (LHS->getOpcode()) {
   3064   default:
   3065     break;
   3066   case ISD::ZERO_EXTEND:
   3067   case ISD::SIGN_EXTEND:
   3068   case ISD::ANY_EXTEND: {
   3069     SDValue X = LHS->getOperand(0);
   3070 
   3071     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
   3072         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
   3073       // Prefer build_vector as the canonical form if packed types are legal.
   3074       // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
   3075       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
   3076        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
   3077       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
   3078     }
   3079 
   3080     // shl (ext x) => zext (shl x), if shift does not overflow int
   3081     if (VT != MVT::i64)
   3082       break;
   3083     KnownBits Known = DAG.computeKnownBits(X);
   3084     unsigned LZ = Known.countMinLeadingZeros();
   3085     if (LZ < RHSVal)
   3086       break;
   3087     EVT XVT = X.getValueType();
   3088     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
   3089     return DAG.getZExtOrTrunc(Shl, SL, VT);
   3090   }
   3091   }
   3092 
   3093   if (VT != MVT::i64)
   3094     return SDValue();
   3095 
   3096   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
   3097 
   3098   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   3099   // common case, splitting this into a move and a 32-bit shift is faster and
   3100   // the same code size.
   3101   if (RHSVal < 32)
   3102     return SDValue();
   3103 
   3104   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
   3105 
   3106   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
   3107   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
   3108 
   3109   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
   3110 
   3111   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
   3112   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
   3113 }
   3114 
   3115 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
   3116                                                 DAGCombinerInfo &DCI) const {
   3117   if (N->getValueType(0) != MVT::i64)
   3118     return SDValue();
   3119 
   3120   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   3121   if (!RHS)
   3122     return SDValue();
   3123 
   3124   SelectionDAG &DAG = DCI.DAG;
   3125   SDLoc SL(N);
   3126   unsigned RHSVal = RHS->getZExtValue();
   3127 
   3128   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
   3129   if (RHSVal == 32) {
   3130     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
   3131     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
   3132                                    DAG.getConstant(31, SL, MVT::i32));
   3133 
   3134     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
   3135     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
   3136   }
   3137 
   3138   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
   3139   if (RHSVal == 63) {
   3140     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
   3141     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
   3142                                    DAG.getConstant(31, SL, MVT::i32));
   3143     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
   3144     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
   3145   }
   3146 
   3147   return SDValue();
   3148 }
   3149 
   3150 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   3151                                                 DAGCombinerInfo &DCI) const {
   3152   auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
   3153   if (!RHS)
   3154     return SDValue();
   3155 
   3156   EVT VT = N->getValueType(0);
   3157   SDValue LHS = N->getOperand(0);
   3158   unsigned ShiftAmt = RHS->getZExtValue();
   3159   SelectionDAG &DAG = DCI.DAG;
   3160   SDLoc SL(N);
   3161 
   3162   // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
   3163   // this improves the ability to match BFE patterns in isel.
   3164   if (LHS.getOpcode() == ISD::AND) {
   3165     if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
   3166       if (Mask->getAPIntValue().isShiftedMask() &&
   3167           Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
   3168         return DAG.getNode(
   3169             ISD::AND, SL, VT,
   3170             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
   3171             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
   3172       }
   3173     }
   3174   }
   3175 
   3176   if (VT != MVT::i64)
   3177     return SDValue();
   3178 
   3179   if (ShiftAmt < 32)
   3180     return SDValue();
   3181 
   3182   // srl i64:x, C for C >= 32
   3183   // =>
   3184   //   build_pair (srl hi_32(x), C - 32), 0
   3185   SDValue One = DAG.getConstant(1, SL, MVT::i32);
   3186   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
   3187 
   3188   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
   3189   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
   3190 
   3191   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
   3192   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
   3193 
   3194   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
   3195 
   3196   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
   3197 }
   3198 
   3199 SDValue AMDGPUTargetLowering::performTruncateCombine(
   3200   SDNode *N, DAGCombinerInfo &DCI) const {
   3201   SDLoc SL(N);
   3202   SelectionDAG &DAG = DCI.DAG;
   3203   EVT VT = N->getValueType(0);
   3204   SDValue Src = N->getOperand(0);
   3205 
   3206   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
   3207   if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
   3208     SDValue Vec = Src.getOperand(0);
   3209     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
   3210       SDValue Elt0 = Vec.getOperand(0);
   3211       EVT EltVT = Elt0.getValueType();
   3212       if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
   3213         if (EltVT.isFloatingPoint()) {
   3214           Elt0 = DAG.getNode(ISD::BITCAST, SL,
   3215                              EltVT.changeTypeToInteger(), Elt0);
   3216         }
   3217 
   3218         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
   3219       }
   3220     }
   3221   }
   3222 
   3223   // Equivalent of above for accessing the high element of a vector as an
   3224   // integer operation.
   3225   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
   3226   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
   3227     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
   3228       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
   3229         SDValue BV = stripBitcast(Src.getOperand(0));
   3230         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
   3231             BV.getValueType().getVectorNumElements() == 2) {
   3232           SDValue SrcElt = BV.getOperand(1);
   3233           EVT SrcEltVT = SrcElt.getValueType();
   3234           if (SrcEltVT.isFloatingPoint()) {
   3235             SrcElt = DAG.getNode(ISD::BITCAST, SL,
   3236                                  SrcEltVT.changeTypeToInteger(), SrcElt);
   3237           }
   3238 
   3239           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
   3240         }
   3241       }
   3242     }
   3243   }
   3244 
   3245   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
   3246   //
   3247   // i16 (trunc (srl i64:x, K)), K <= 16 ->
   3248   //     i16 (trunc (srl (i32 (trunc x), K)))
   3249   if (VT.getScalarSizeInBits() < 32) {
   3250     EVT SrcVT = Src.getValueType();
   3251     if (SrcVT.getScalarSizeInBits() > 32 &&
   3252         (Src.getOpcode() == ISD::SRL ||
   3253          Src.getOpcode() == ISD::SRA ||
   3254          Src.getOpcode() == ISD::SHL)) {
   3255       SDValue Amt = Src.getOperand(1);
   3256       KnownBits Known = DAG.computeKnownBits(Amt);
   3257       unsigned Size = VT.getScalarSizeInBits();
   3258       if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
   3259           (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
   3260         EVT MidVT = VT.isVector() ?
   3261           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
   3262                            VT.getVectorNumElements()) : MVT::i32;
   3263 
   3264         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
   3265         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
   3266                                     Src.getOperand(0));
   3267         DCI.AddToWorklist(Trunc.getNode());
   3268 
   3269         if (Amt.getValueType() != NewShiftVT) {
   3270           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
   3271           DCI.AddToWorklist(Amt.getNode());
   3272         }
   3273 
   3274         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
   3275                                           Trunc, Amt);
   3276         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
   3277       }
   3278     }
   3279   }
   3280 
   3281   return SDValue();
   3282 }
   3283 
   3284 // We need to specifically handle i64 mul here to avoid unnecessary conversion
   3285 // instructions. If we only match on the legalized i64 mul expansion,
   3286 // SimplifyDemandedBits will be unable to remove them because there will be
   3287 // multiple uses due to the separate mul + mulh[su].
   3288 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
   3289                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
   3290   if (Size <= 32) {
   3291     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
   3292     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
   3293   }
   3294 
   3295   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
   3296   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
   3297 
   3298   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
   3299   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
   3300 
   3301   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
   3302 }
   3303 
   3304 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   3305                                                 DAGCombinerInfo &DCI) const {
   3306   EVT VT = N->getValueType(0);
   3307 
   3308   // Don't generate 24-bit multiplies on values that are in SGPRs, since
   3309   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
   3310   // unnecessarily). isDivergent() is used as an approximation of whether the
   3311   // value is in an SGPR.
   3312   if (!N->isDivergent())
   3313     return SDValue();
   3314 
   3315   unsigned Size = VT.getSizeInBits();
   3316   if (VT.isVector() || Size > 64)
   3317     return SDValue();
   3318 
   3319   // There are i16 integer mul/mad.
   3320   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
   3321     return SDValue();
   3322 
   3323   SelectionDAG &DAG = DCI.DAG;
   3324   SDLoc DL(N);
   3325 
   3326   SDValue N0 = N->getOperand(0);
   3327   SDValue N1 = N->getOperand(1);
   3328 
   3329   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
   3330   // in the source into any_extends if the result of the mul is truncated. Since
   3331   // we can assume the high bits are whatever we want, use the underlying value
   3332   // to avoid the unknown high bits from interfering.
   3333   if (N0.getOpcode() == ISD::ANY_EXTEND)
   3334     N0 = N0.getOperand(0);
   3335 
   3336   if (N1.getOpcode() == ISD::ANY_EXTEND)
   3337     N1 = N1.getOperand(0);
   3338 
   3339   SDValue Mul;
   3340 
   3341   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
   3342     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
   3343     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
   3344     Mul = getMul24(DAG, DL, N0, N1, Size, false);
   3345   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
   3346     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
   3347     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
   3348     Mul = getMul24(DAG, DL, N0, N1, Size, true);
   3349   } else {
   3350     return SDValue();
   3351   }
   3352 
   3353   // We need to use sext even for MUL_U24, because MUL_U24 is used
   3354   // for signed multiply of 8 and 16-bit types.
   3355   return DAG.getSExtOrTrunc(Mul, DL, VT);
   3356 }
   3357 
   3358 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
   3359                                                   DAGCombinerInfo &DCI) const {
   3360   EVT VT = N->getValueType(0);
   3361 
   3362   if (!Subtarget->hasMulI24() || VT.isVector())
   3363     return SDValue();
   3364 
   3365   SelectionDAG &DAG = DCI.DAG;
   3366   SDLoc DL(N);
   3367 
   3368   SDValue N0 = N->getOperand(0);
   3369   SDValue N1 = N->getOperand(1);
   3370 
   3371   if (!isI24(N0, DAG) || !isI24(N1, DAG))
   3372     return SDValue();
   3373 
   3374   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
   3375   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
   3376 
   3377   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
   3378   DCI.AddToWorklist(Mulhi.getNode());
   3379   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
   3380 }
   3381 
   3382 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
   3383                                                   DAGCombinerInfo &DCI) const {
   3384   EVT VT = N->getValueType(0);
   3385 
   3386   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
   3387     return SDValue();
   3388 
   3389   SelectionDAG &DAG = DCI.DAG;
   3390   SDLoc DL(N);
   3391 
   3392   SDValue N0 = N->getOperand(0);
   3393   SDValue N1 = N->getOperand(1);
   3394 
   3395   if (!isU24(N0, DAG) || !isU24(N1, DAG))
   3396     return SDValue();
   3397 
   3398   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
   3399   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
   3400 
   3401   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
   3402   DCI.AddToWorklist(Mulhi.getNode());
   3403   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
   3404 }
   3405 
   3406 static bool isNegativeOne(SDValue Val) {
   3407   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
   3408     return C->isAllOnesValue();
   3409   return false;
   3410 }
   3411 
   3412 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
   3413                                           SDValue Op,
   3414                                           const SDLoc &DL,
   3415                                           unsigned Opc) const {
   3416   EVT VT = Op.getValueType();
   3417   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
   3418   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
   3419                               LegalVT != MVT::i16))
   3420     return SDValue();
   3421 
   3422   if (VT != MVT::i32)
   3423     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
   3424 
   3425   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
   3426   if (VT != MVT::i32)
   3427     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
   3428 
   3429   return FFBX;
   3430 }
   3431 
   3432 // The native instructions return -1 on 0 input. Optimize out a select that
   3433 // produces -1 on 0.
   3434 //
   3435 // TODO: If zero is not undef, we could also do this if the output is compared
   3436 // against the bitwidth.
   3437 //
   3438 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
   3439 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
   3440                                                  SDValue LHS, SDValue RHS,
   3441                                                  DAGCombinerInfo &DCI) const {
   3442   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
   3443   if (!CmpRhs || !CmpRhs->isNullValue())
   3444     return SDValue();
   3445 
   3446   SelectionDAG &DAG = DCI.DAG;
   3447   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   3448   SDValue CmpLHS = Cond.getOperand(0);
   3449 
   3450   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
   3451   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
   3452   if (CCOpcode == ISD::SETEQ &&
   3453       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
   3454       RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
   3455     unsigned Opc =
   3456         isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
   3457     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
   3458   }
   3459 
   3460   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
   3461   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
   3462   if (CCOpcode == ISD::SETNE &&
   3463       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
   3464       LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
   3465     unsigned Opc =
   3466         isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
   3467 
   3468     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
   3469   }
   3470 
   3471   return SDValue();
   3472 }
   3473 
   3474 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
   3475                                          unsigned Op,
   3476                                          const SDLoc &SL,
   3477                                          SDValue Cond,
   3478                                          SDValue N1,
   3479                                          SDValue N2) {
   3480   SelectionDAG &DAG = DCI.DAG;
   3481   EVT VT = N1.getValueType();
   3482 
   3483   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
   3484                                   N1.getOperand(0), N2.getOperand(0));
   3485   DCI.AddToWorklist(NewSelect.getNode());
   3486   return DAG.getNode(Op, SL, VT, NewSelect);
   3487 }
   3488 
   3489 // Pull a free FP operation out of a select so it may fold into uses.
   3490 //
   3491 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
   3492 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
   3493 //
   3494 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
   3495 // select c, (fabs x), +k -> fabs (select c, x, k)
   3496 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   3497                                     SDValue N) {
   3498   SelectionDAG &DAG = DCI.DAG;
   3499   SDValue Cond = N.getOperand(0);
   3500   SDValue LHS = N.getOperand(1);
   3501   SDValue RHS = N.getOperand(2);
   3502 
   3503   EVT VT = N.getValueType();
   3504   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
   3505       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
   3506     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
   3507                                      SDLoc(N), Cond, LHS, RHS);
   3508   }
   3509 
   3510   bool Inv = false;
   3511   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
   3512     std::swap(LHS, RHS);
   3513     Inv = true;
   3514   }
   3515 
   3516   // TODO: Support vector constants.
   3517   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
   3518   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
   3519     SDLoc SL(N);
   3520     // If one side is an fneg/fabs and the other is a constant, we can push the
   3521     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
   3522     SDValue NewLHS = LHS.getOperand(0);
   3523     SDValue NewRHS = RHS;
   3524 
   3525     // Careful: if the neg can be folded up, don't try to pull it back down.
   3526     bool ShouldFoldNeg = true;
   3527 
   3528     if (NewLHS.hasOneUse()) {
   3529       unsigned Opc = NewLHS.getOpcode();
   3530       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
   3531         ShouldFoldNeg = false;
   3532       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
   3533         ShouldFoldNeg = false;
   3534     }
   3535 
   3536     if (ShouldFoldNeg) {
   3537       if (LHS.getOpcode() == ISD::FNEG)
   3538         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
   3539       else if (CRHS->isNegative())
   3540         return SDValue();
   3541 
   3542       if (Inv)
   3543         std::swap(NewLHS, NewRHS);
   3544 
   3545       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
   3546                                       Cond, NewLHS, NewRHS);
   3547       DCI.AddToWorklist(NewSelect.getNode());
   3548       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
   3549     }
   3550   }
   3551 
   3552   return SDValue();
   3553 }
   3554 
   3555 
   3556 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
   3557                                                    DAGCombinerInfo &DCI) const {
   3558   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
   3559     return Folded;
   3560 
   3561   SDValue Cond = N->getOperand(0);
   3562   if (Cond.getOpcode() != ISD::SETCC)
   3563     return SDValue();
   3564 
   3565   EVT VT = N->getValueType(0);
   3566   SDValue LHS = Cond.getOperand(0);
   3567   SDValue RHS = Cond.getOperand(1);
   3568   SDValue CC = Cond.getOperand(2);
   3569 
   3570   SDValue True = N->getOperand(1);
   3571   SDValue False = N->getOperand(2);
   3572 
   3573   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
   3574     SelectionDAG &DAG = DCI.DAG;
   3575     if (DAG.isConstantValueOfAnyType(True) &&
   3576         !DAG.isConstantValueOfAnyType(False)) {
   3577       // Swap cmp + select pair to move constant to false input.
   3578       // This will allow using VOPC cndmasks more often.
   3579       // select (setcc x, y), k, x -> select (setccinv x, y), x, k
   3580 
   3581       SDLoc SL(N);
   3582       ISD::CondCode NewCC =
   3583           getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
   3584 
   3585       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
   3586       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
   3587     }
   3588 
   3589     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
   3590       SDValue MinMax
   3591         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
   3592       // Revisit this node so we can catch min3/max3/med3 patterns.
   3593       //DCI.AddToWorklist(MinMax.getNode());
   3594       return MinMax;
   3595     }
   3596   }
   3597 
   3598   // There's no reason to not do this if the condition has other uses.
   3599   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
   3600 }
   3601 
   3602 static bool isInv2Pi(const APFloat &APF) {
   3603   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
   3604   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
   3605   static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
   3606 
   3607   return APF.bitwiseIsEqual(KF16) ||
   3608          APF.bitwiseIsEqual(KF32) ||
   3609          APF.bitwiseIsEqual(KF64);
   3610 }
   3611 
   3612 // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
   3613 // additional cost to negate them.
   3614 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
   3615   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
   3616     if (C->isZero() && !C->isNegative())
   3617       return true;
   3618 
   3619     if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
   3620       return true;
   3621   }
   3622 
   3623   return false;
   3624 }
   3625 
   3626 static unsigned inverseMinMax(unsigned Opc) {
   3627   switch (Opc) {
   3628   case ISD::FMAXNUM:
   3629     return ISD::FMINNUM;
   3630   case ISD::FMINNUM:
   3631     return ISD::FMAXNUM;
   3632   case ISD::FMAXNUM_IEEE:
   3633     return ISD::FMINNUM_IEEE;
   3634   case ISD::FMINNUM_IEEE:
   3635     return ISD::FMAXNUM_IEEE;
   3636   case AMDGPUISD::FMAX_LEGACY:
   3637     return AMDGPUISD::FMIN_LEGACY;
   3638   case AMDGPUISD::FMIN_LEGACY:
   3639     return  AMDGPUISD::FMAX_LEGACY;
   3640   default:
   3641     llvm_unreachable("invalid min/max opcode");
   3642   }
   3643 }
   3644 
   3645 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   3646                                                  DAGCombinerInfo &DCI) const {
   3647   SelectionDAG &DAG = DCI.DAG;
   3648   SDValue N0 = N->getOperand(0);
   3649   EVT VT = N->getValueType(0);
   3650 
   3651   unsigned Opc = N0.getOpcode();
   3652 
   3653   // If the input has multiple uses and we can either fold the negate down, or
   3654   // the other uses cannot, give up. This both prevents unprofitable
   3655   // transformations and infinite loops: we won't repeatedly try to fold around
   3656   // a negate that has no 'good' form.
   3657   if (N0.hasOneUse()) {
   3658     // This may be able to fold into the source, but at a code size cost. Don't
   3659     // fold if the fold into the user is free.
   3660     if (allUsesHaveSourceMods(N, 0))
   3661       return SDValue();
   3662   } else {
   3663     if (fnegFoldsIntoOp(Opc) &&
   3664         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
   3665       return SDValue();
   3666   }
   3667 
   3668   SDLoc SL(N);
   3669   switch (Opc) {
   3670   case ISD::FADD: {
   3671     if (!mayIgnoreSignedZero(N0))
   3672       return SDValue();
   3673 
   3674     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
   3675     SDValue LHS = N0.getOperand(0);
   3676     SDValue RHS = N0.getOperand(1);
   3677 
   3678     if (LHS.getOpcode() != ISD::FNEG)
   3679       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
   3680     else
   3681       LHS = LHS.getOperand(0);
   3682 
   3683     if (RHS.getOpcode() != ISD::FNEG)
   3684       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
   3685     else
   3686       RHS = RHS.getOperand(0);
   3687 
   3688     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
   3689     if (Res.getOpcode() != ISD::FADD)
   3690       return SDValue(); // Op got folded away.
   3691     if (!N0.hasOneUse())
   3692       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
   3693     return Res;
   3694   }
   3695   case ISD::FMUL:
   3696   case AMDGPUISD::FMUL_LEGACY: {
   3697     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
   3698     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
   3699     SDValue LHS = N0.getOperand(0);
   3700     SDValue RHS = N0.getOperand(1);
   3701 
   3702     if (LHS.getOpcode() == ISD::FNEG)
   3703       LHS = LHS.getOperand(0);
   3704     else if (RHS.getOpcode() == ISD::FNEG)
   3705       RHS = RHS.getOperand(0);
   3706     else
   3707       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
   3708 
   3709     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
   3710     if (Res.getOpcode() != Opc)
   3711       return SDValue(); // Op got folded away.
   3712     if (!N0.hasOneUse())
   3713       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
   3714     return Res;
   3715   }
   3716   case ISD::FMA:
   3717   case ISD::FMAD: {
   3718     // TODO: handle llvm.amdgcn.fma.legacy
   3719     if (!mayIgnoreSignedZero(N0))
   3720       return SDValue();
   3721 
   3722     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
   3723     SDValue LHS = N0.getOperand(0);
   3724     SDValue MHS = N0.getOperand(1);
   3725     SDValue RHS = N0.getOperand(2);
   3726 
   3727     if (LHS.getOpcode() == ISD::FNEG)
   3728       LHS = LHS.getOperand(0);
   3729     else if (MHS.getOpcode() == ISD::FNEG)
   3730       MHS = MHS.getOperand(0);
   3731     else
   3732       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
   3733 
   3734     if (RHS.getOpcode() != ISD::FNEG)
   3735       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
   3736     else
   3737       RHS = RHS.getOperand(0);
   3738 
   3739     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
   3740     if (Res.getOpcode() != Opc)
   3741       return SDValue(); // Op got folded away.
   3742     if (!N0.hasOneUse())
   3743       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
   3744     return Res;
   3745   }
   3746   case ISD::FMAXNUM:
   3747   case ISD::FMINNUM:
   3748   case ISD::FMAXNUM_IEEE:
   3749   case ISD::FMINNUM_IEEE:
   3750   case AMDGPUISD::FMAX_LEGACY:
   3751   case AMDGPUISD::FMIN_LEGACY: {
   3752     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
   3753     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
   3754     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
   3755     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
   3756 
   3757     SDValue LHS = N0.getOperand(0);
   3758     SDValue RHS = N0.getOperand(1);
   3759 
   3760     // 0 doesn't have a negated inline immediate.
   3761     // TODO: This constant check should be generalized to other operations.
   3762     if (isConstantCostlierToNegate(RHS))
   3763       return SDValue();
   3764 
   3765     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
   3766     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
   3767     unsigned Opposite = inverseMinMax(Opc);
   3768 
   3769     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
   3770     if (Res.getOpcode() != Opposite)
   3771       return SDValue(); // Op got folded away.
   3772     if (!N0.hasOneUse())
   3773       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
   3774     return Res;
   3775   }
   3776   case AMDGPUISD::FMED3: {
   3777     SDValue Ops[3];
   3778     for (unsigned I = 0; I < 3; ++I)
   3779       Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
   3780 
   3781     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
   3782     if (Res.getOpcode() != AMDGPUISD::FMED3)
   3783       return SDValue(); // Op got folded away.
   3784 
   3785     if (!N0.hasOneUse()) {
   3786       SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
   3787       DAG.ReplaceAllUsesWith(N0, Neg);
   3788 
   3789       for (SDNode *U : Neg->uses())
   3790         DCI.AddToWorklist(U);
   3791     }
   3792 
   3793     return Res;
   3794   }
   3795   case ISD::FP_EXTEND:
   3796   case ISD::FTRUNC:
   3797   case ISD::FRINT:
   3798   case ISD::FNEARBYINT: // XXX - Should fround be handled?
   3799   case ISD::FSIN:
   3800   case ISD::FCANONICALIZE:
   3801   case AMDGPUISD::RCP:
   3802   case AMDGPUISD::RCP_LEGACY:
   3803   case AMDGPUISD::RCP_IFLAG:
   3804   case AMDGPUISD::SIN_HW: {
   3805     SDValue CvtSrc = N0.getOperand(0);
   3806     if (CvtSrc.getOpcode() == ISD::FNEG) {
   3807       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
   3808       // (fneg (rcp (fneg x))) -> (rcp x)
   3809       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
   3810     }
   3811 
   3812     if (!N0.hasOneUse())
   3813       return SDValue();
   3814 
   3815     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
   3816     // (fneg (rcp x)) -> (rcp (fneg x))
   3817     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
   3818     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
   3819   }
   3820   case ISD::FP_ROUND: {
   3821     SDValue CvtSrc = N0.getOperand(0);
   3822 
   3823     if (CvtSrc.getOpcode() == ISD::FNEG) {
   3824       // (fneg (fp_round (fneg x))) -> (fp_round x)
   3825       return DAG.getNode(ISD::FP_ROUND, SL, VT,
   3826                          CvtSrc.getOperand(0), N0.getOperand(1));
   3827     }
   3828 
   3829     if (!N0.hasOneUse())
   3830       return SDValue();
   3831 
   3832     // (fneg (fp_round x)) -> (fp_round (fneg x))
   3833     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
   3834     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
   3835   }
   3836   case ISD::FP16_TO_FP: {
   3837     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
   3838     // f16, but legalization of f16 fneg ends up pulling it out of the source.
   3839     // Put the fneg back as a legal source operation that can be matched later.
   3840     SDLoc SL(N);
   3841 
   3842     SDValue Src = N0.getOperand(0);
   3843     EVT SrcVT = Src.getValueType();
   3844 
   3845     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
   3846     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
   3847                                   DAG.getConstant(0x8000, SL, SrcVT));
   3848     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
   3849   }
   3850   default:
   3851     return SDValue();
   3852   }
   3853 }
   3854 
   3855 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
   3856                                                  DAGCombinerInfo &DCI) const {
   3857   SelectionDAG &DAG = DCI.DAG;
   3858   SDValue N0 = N->getOperand(0);
   3859 
   3860   if (!N0.hasOneUse())
   3861     return SDValue();
   3862 
   3863   switch (N0.getOpcode()) {
   3864   case ISD::FP16_TO_FP: {
   3865     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
   3866     SDLoc SL(N);
   3867     SDValue Src = N0.getOperand(0);
   3868     EVT SrcVT = Src.getValueType();
   3869 
   3870     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
   3871     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
   3872                                   DAG.getConstant(0x7fff, SL, SrcVT));
   3873     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
   3874   }
   3875   default:
   3876     return SDValue();
   3877   }
   3878 }
   3879 
   3880 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
   3881                                                 DAGCombinerInfo &DCI) const {
   3882   const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
   3883   if (!CFP)
   3884     return SDValue();
   3885 
   3886   // XXX - Should this flush denormals?
   3887   const APFloat &Val = CFP->getValueAPF();
   3888   APFloat One(Val.getSemantics(), "1.0");
   3889   return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
   3890 }
   3891 
   3892 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
   3893                                                 DAGCombinerInfo &DCI) const {
   3894   SelectionDAG &DAG = DCI.DAG;
   3895   SDLoc DL(N);
   3896 
   3897   switch(N->getOpcode()) {
   3898   default:
   3899     break;
   3900   case ISD::BITCAST: {
   3901     EVT DestVT = N->getValueType(0);
   3902 
   3903     // Push casts through vector builds. This helps avoid emitting a large
   3904     // number of copies when materializing floating point vector constants.
   3905     //
   3906     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
   3907     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
   3908     if (DestVT.isVector()) {
   3909       SDValue Src = N->getOperand(0);
   3910       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
   3911         EVT SrcVT = Src.getValueType();
   3912         unsigned NElts = DestVT.getVectorNumElements();
   3913 
   3914         if (SrcVT.getVectorNumElements() == NElts) {
   3915           EVT DestEltVT = DestVT.getVectorElementType();
   3916 
   3917           SmallVector<SDValue, 8> CastedElts;
   3918           SDLoc SL(N);
   3919           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
   3920             SDValue Elt = Src.getOperand(I);
   3921             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
   3922           }
   3923 
   3924           return DAG.getBuildVector(DestVT, SL, CastedElts);
   3925         }
   3926       }
   3927     }
   3928 
   3929     if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
   3930       break;
   3931 
   3932     // Fold bitcasts of constants.
   3933     //
   3934     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
   3935     // TODO: Generalize and move to DAGCombiner
   3936     SDValue Src = N->getOperand(0);
   3937     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
   3938       SDLoc SL(N);
   3939       uint64_t CVal = C->getZExtValue();
   3940       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
   3941                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
   3942                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
   3943       return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
   3944     }
   3945 
   3946     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
   3947       const APInt &Val = C->getValueAPF().bitcastToAPInt();
   3948       SDLoc SL(N);
   3949       uint64_t CVal = Val.getZExtValue();
   3950       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
   3951                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
   3952                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
   3953 
   3954       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
   3955     }
   3956 
   3957     break;
   3958   }
   3959   case ISD::SHL: {
   3960     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
   3961       break;
   3962 
   3963     return performShlCombine(N, DCI);
   3964   }
   3965   case ISD::SRL: {
   3966     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
   3967       break;
   3968 
   3969     return performSrlCombine(N, DCI);
   3970   }
   3971   case ISD::SRA: {
   3972     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
   3973       break;
   3974 
   3975     return performSraCombine(N, DCI);
   3976   }
   3977   case ISD::TRUNCATE:
   3978     return performTruncateCombine(N, DCI);
   3979   case ISD::MUL:
   3980     return performMulCombine(N, DCI);
   3981   case ISD::MULHS:
   3982     return performMulhsCombine(N, DCI);
   3983   case ISD::MULHU:
   3984     return performMulhuCombine(N, DCI);
   3985   case AMDGPUISD::MUL_I24:
   3986   case AMDGPUISD::MUL_U24:
   3987   case AMDGPUISD::MULHI_I24:
   3988   case AMDGPUISD::MULHI_U24:
   3989     return simplifyMul24(N, DCI);
   3990   case ISD::SELECT:
   3991     return performSelectCombine(N, DCI);
   3992   case ISD::FNEG:
   3993     return performFNegCombine(N, DCI);
   3994   case ISD::FABS:
   3995     return performFAbsCombine(N, DCI);
   3996   case AMDGPUISD::BFE_I32:
   3997   case AMDGPUISD::BFE_U32: {
   3998     assert(!N->getValueType(0).isVector() &&
   3999            "Vector handling of BFE not implemented");
   4000     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
   4001     if (!Width)
   4002       break;
   4003 
   4004     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
   4005     if (WidthVal == 0)
   4006       return DAG.getConstant(0, DL, MVT::i32);
   4007 
   4008     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
   4009     if (!Offset)
   4010       break;
   4011 
   4012     SDValue BitsFrom = N->getOperand(0);
   4013     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
   4014 
   4015     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
   4016 
   4017     if (OffsetVal == 0) {
   4018       // This is already sign / zero extended, so try to fold away extra BFEs.
   4019       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
   4020 
   4021       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
   4022       if (OpSignBits >= SignBits)
   4023         return BitsFrom;
   4024 
   4025       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
   4026       if (Signed) {
   4027         // This is a sign_extend_inreg. Replace it to take advantage of existing
   4028         // DAG Combines. If not eliminated, we will match back to BFE during
   4029         // selection.
   4030 
   4031         // TODO: The sext_inreg of extended types ends, although we can could
   4032         // handle them in a single BFE.
   4033         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
   4034                            DAG.getValueType(SmallVT));
   4035       }
   4036 
   4037       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
   4038     }
   4039 
   4040     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
   4041       if (Signed) {
   4042         return constantFoldBFE<int32_t>(DAG,
   4043                                         CVal->getSExtValue(),
   4044                                         OffsetVal,
   4045                                         WidthVal,
   4046                                         DL);
   4047       }
   4048 
   4049       return constantFoldBFE<uint32_t>(DAG,
   4050                                        CVal->getZExtValue(),
   4051                                        OffsetVal,
   4052                                        WidthVal,
   4053                                        DL);
   4054     }
   4055 
   4056     if ((OffsetVal + WidthVal) >= 32 &&
   4057         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
   4058       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
   4059       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
   4060                          BitsFrom, ShiftVal);
   4061     }
   4062 
   4063     if (BitsFrom.hasOneUse()) {
   4064       APInt Demanded = APInt::getBitsSet(32,
   4065                                          OffsetVal,
   4066                                          OffsetVal + WidthVal);
   4067 
   4068       KnownBits Known;
   4069       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
   4070                                             !DCI.isBeforeLegalizeOps());
   4071       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   4072       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
   4073           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
   4074         DCI.CommitTargetLoweringOpt(TLO);
   4075       }
   4076     }
   4077 
   4078     break;
   4079   }
   4080   case ISD::LOAD:
   4081     return performLoadCombine(N, DCI);
   4082   case ISD::STORE:
   4083     return performStoreCombine(N, DCI);
   4084   case AMDGPUISD::RCP:
   4085   case AMDGPUISD::RCP_IFLAG:
   4086     return performRcpCombine(N, DCI);
   4087   case ISD::AssertZext:
   4088   case ISD::AssertSext:
   4089     return performAssertSZExtCombine(N, DCI);
   4090   case ISD::INTRINSIC_WO_CHAIN:
   4091     return performIntrinsicWOChainCombine(N, DCI);
   4092   }
   4093   return SDValue();
   4094 }
   4095 
   4096 //===----------------------------------------------------------------------===//
   4097 // Helper functions
   4098 //===----------------------------------------------------------------------===//
   4099 
   4100 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
   4101                                                    const TargetRegisterClass *RC,
   4102                                                    Register Reg, EVT VT,
   4103                                                    const SDLoc &SL,
   4104                                                    bool RawReg) const {
   4105   MachineFunction &MF = DAG.getMachineFunction();
   4106   MachineRegisterInfo &MRI = MF.getRegInfo();
   4107   Register VReg;
   4108 
   4109   if (!MRI.isLiveIn(Reg)) {
   4110     VReg = MRI.createVirtualRegister(RC);
   4111     MRI.addLiveIn(Reg, VReg);
   4112   } else {
   4113     VReg = MRI.getLiveInVirtReg(Reg);
   4114   }
   4115 
   4116   if (RawReg)
   4117     return DAG.getRegister(VReg, VT);
   4118 
   4119   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
   4120 }
   4121 
   4122 // This may be called multiple times, and nothing prevents creating multiple
   4123 // objects at the same offset. See if we already defined this object.
   4124 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
   4125                                        int64_t Offset) {
   4126   for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
   4127     if (MFI.getObjectOffset(I) == Offset) {
   4128       assert(MFI.getObjectSize(I) == Size);
   4129       return I;
   4130     }
   4131   }
   4132 
   4133   return MFI.CreateFixedObject(Size, Offset, true);
   4134 }
   4135 
   4136 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
   4137                                                   EVT VT,
   4138                                                   const SDLoc &SL,
   4139                                                   int64_t Offset) const {
   4140   MachineFunction &MF = DAG.getMachineFunction();
   4141   MachineFrameInfo &MFI = MF.getFrameInfo();
   4142   int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
   4143 
   4144   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
   4145   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
   4146 
   4147   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
   4148                      MachineMemOperand::MODereferenceable |
   4149                          MachineMemOperand::MOInvariant);
   4150 }
   4151 
   4152 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
   4153                                                    const SDLoc &SL,
   4154                                                    SDValue Chain,
   4155                                                    SDValue ArgVal,
   4156                                                    int64_t Offset) const {
   4157   MachineFunction &MF = DAG.getMachineFunction();
   4158   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
   4159 
   4160   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
   4161   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
   4162                                MachineMemOperand::MODereferenceable);
   4163   return Store;
   4164 }
   4165 
   4166 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
   4167                                              const TargetRegisterClass *RC,
   4168                                              EVT VT, const SDLoc &SL,
   4169                                              const ArgDescriptor &Arg) const {
   4170   assert(Arg && "Attempting to load missing argument");
   4171 
   4172   SDValue V = Arg.isRegister() ?
   4173     CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
   4174     loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
   4175 
   4176   if (!Arg.isMasked())
   4177     return V;
   4178 
   4179   unsigned Mask = Arg.getMask();
   4180   unsigned Shift = countTrailingZeros<unsigned>(Mask);
   4181   V = DAG.getNode(ISD::SRL, SL, VT, V,
   4182                   DAG.getShiftAmountConstant(Shift, VT, SL));
   4183   return DAG.getNode(ISD::AND, SL, VT, V,
   4184                      DAG.getConstant(Mask >> Shift, SL, VT));
   4185 }
   4186 
   4187 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
   4188     const MachineFunction &MF, const ImplicitParameter Param) const {
   4189   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
   4190   const AMDGPUSubtarget &ST =
   4191       AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
   4192   unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
   4193   const Align Alignment = ST.getAlignmentForImplicitArgPtr();
   4194   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
   4195                        ExplicitArgOffset;
   4196   switch (Param) {
   4197   case GRID_DIM:
   4198     return ArgOffset;
   4199   case GRID_OFFSET:
   4200     return ArgOffset + 4;
   4201   }
   4202   llvm_unreachable("unexpected implicit parameter type");
   4203 }
   4204 
   4205 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
   4206 
   4207 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   4208   switch ((AMDGPUISD::NodeType)Opcode) {
   4209   case AMDGPUISD::FIRST_NUMBER: break;
   4210   // AMDIL DAG nodes
   4211   NODE_NAME_CASE(UMUL);
   4212   NODE_NAME_CASE(BRANCH_COND);
   4213 
   4214   // AMDGPU DAG nodes
   4215   NODE_NAME_CASE(IF)
   4216   NODE_NAME_CASE(ELSE)
   4217   NODE_NAME_CASE(LOOP)
   4218   NODE_NAME_CASE(CALL)
   4219   NODE_NAME_CASE(TC_RETURN)
   4220   NODE_NAME_CASE(TRAP)
   4221   NODE_NAME_CASE(RET_FLAG)
   4222   NODE_NAME_CASE(RETURN_TO_EPILOG)
   4223   NODE_NAME_CASE(ENDPGM)
   4224   NODE_NAME_CASE(DWORDADDR)
   4225   NODE_NAME_CASE(FRACT)
   4226   NODE_NAME_CASE(SETCC)
   4227   NODE_NAME_CASE(SETREG)
   4228   NODE_NAME_CASE(DENORM_MODE)
   4229   NODE_NAME_CASE(FMA_W_CHAIN)
   4230   NODE_NAME_CASE(FMUL_W_CHAIN)
   4231   NODE_NAME_CASE(CLAMP)
   4232   NODE_NAME_CASE(COS_HW)
   4233   NODE_NAME_CASE(SIN_HW)
   4234   NODE_NAME_CASE(FMAX_LEGACY)
   4235   NODE_NAME_CASE(FMIN_LEGACY)
   4236   NODE_NAME_CASE(FMAX3)
   4237   NODE_NAME_CASE(SMAX3)
   4238   NODE_NAME_CASE(UMAX3)
   4239   NODE_NAME_CASE(FMIN3)
   4240   NODE_NAME_CASE(SMIN3)
   4241   NODE_NAME_CASE(UMIN3)
   4242   NODE_NAME_CASE(FMED3)
   4243   NODE_NAME_CASE(SMED3)
   4244   NODE_NAME_CASE(UMED3)
   4245   NODE_NAME_CASE(FDOT2)
   4246   NODE_NAME_CASE(URECIP)
   4247   NODE_NAME_CASE(DIV_SCALE)
   4248   NODE_NAME_CASE(DIV_FMAS)
   4249   NODE_NAME_CASE(DIV_FIXUP)
   4250   NODE_NAME_CASE(FMAD_FTZ)
   4251   NODE_NAME_CASE(RCP)
   4252   NODE_NAME_CASE(RSQ)
   4253   NODE_NAME_CASE(RCP_LEGACY)
   4254   NODE_NAME_CASE(RCP_IFLAG)
   4255   NODE_NAME_CASE(FMUL_LEGACY)
   4256   NODE_NAME_CASE(RSQ_CLAMP)
   4257   NODE_NAME_CASE(LDEXP)
   4258   NODE_NAME_CASE(FP_CLASS)
   4259   NODE_NAME_CASE(DOT4)
   4260   NODE_NAME_CASE(CARRY)
   4261   NODE_NAME_CASE(BORROW)
   4262   NODE_NAME_CASE(BFE_U32)
   4263   NODE_NAME_CASE(BFE_I32)
   4264   NODE_NAME_CASE(BFI)
   4265   NODE_NAME_CASE(BFM)
   4266   NODE_NAME_CASE(FFBH_U32)
   4267   NODE_NAME_CASE(FFBH_I32)
   4268   NODE_NAME_CASE(FFBL_B32)
   4269   NODE_NAME_CASE(MUL_U24)
   4270   NODE_NAME_CASE(MUL_I24)
   4271   NODE_NAME_CASE(MULHI_U24)
   4272   NODE_NAME_CASE(MULHI_I24)
   4273   NODE_NAME_CASE(MAD_U24)
   4274   NODE_NAME_CASE(MAD_I24)
   4275   NODE_NAME_CASE(MAD_I64_I32)
   4276   NODE_NAME_CASE(MAD_U64_U32)
   4277   NODE_NAME_CASE(PERM)
   4278   NODE_NAME_CASE(TEXTURE_FETCH)
   4279   NODE_NAME_CASE(R600_EXPORT)
   4280   NODE_NAME_CASE(CONST_ADDRESS)
   4281   NODE_NAME_CASE(REGISTER_LOAD)
   4282   NODE_NAME_CASE(REGISTER_STORE)
   4283   NODE_NAME_CASE(SAMPLE)
   4284   NODE_NAME_CASE(SAMPLEB)
   4285   NODE_NAME_CASE(SAMPLED)
   4286   NODE_NAME_CASE(SAMPLEL)
   4287   NODE_NAME_CASE(CVT_F32_UBYTE0)
   4288   NODE_NAME_CASE(CVT_F32_UBYTE1)
   4289   NODE_NAME_CASE(CVT_F32_UBYTE2)
   4290   NODE_NAME_CASE(CVT_F32_UBYTE3)
   4291   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
   4292   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
   4293   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
   4294   NODE_NAME_CASE(CVT_PK_I16_I32)
   4295   NODE_NAME_CASE(CVT_PK_U16_U32)
   4296   NODE_NAME_CASE(FP_TO_FP16)
   4297   NODE_NAME_CASE(FP16_ZEXT)
   4298   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
   4299   NODE_NAME_CASE(CONST_DATA_PTR)
   4300   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
   4301   NODE_NAME_CASE(LDS)
   4302   NODE_NAME_CASE(DUMMY_CHAIN)
   4303   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
   4304   NODE_NAME_CASE(LOAD_D16_HI)
   4305   NODE_NAME_CASE(LOAD_D16_LO)
   4306   NODE_NAME_CASE(LOAD_D16_HI_I8)
   4307   NODE_NAME_CASE(LOAD_D16_HI_U8)
   4308   NODE_NAME_CASE(LOAD_D16_LO_I8)
   4309   NODE_NAME_CASE(LOAD_D16_LO_U8)
   4310   NODE_NAME_CASE(STORE_MSKOR)
   4311   NODE_NAME_CASE(LOAD_CONSTANT)
   4312   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   4313   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
   4314   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
   4315   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
   4316   NODE_NAME_CASE(DS_ORDERED_COUNT)
   4317   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   4318   NODE_NAME_CASE(ATOMIC_INC)
   4319   NODE_NAME_CASE(ATOMIC_DEC)
   4320   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   4321   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   4322   NODE_NAME_CASE(BUFFER_LOAD)
   4323   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
   4324   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
   4325   NODE_NAME_CASE(BUFFER_LOAD_BYTE)
   4326   NODE_NAME_CASE(BUFFER_LOAD_SHORT)
   4327   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   4328   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
   4329   NODE_NAME_CASE(SBUFFER_LOAD)
   4330   NODE_NAME_CASE(BUFFER_STORE)
   4331   NODE_NAME_CASE(BUFFER_STORE_BYTE)
   4332   NODE_NAME_CASE(BUFFER_STORE_SHORT)
   4333   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
   4334   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
   4335   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
   4336   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
   4337   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
   4338   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
   4339   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
   4340   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
   4341   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
   4342   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
   4343   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
   4344   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
   4345   NODE_NAME_CASE(BUFFER_ATOMIC_INC)
   4346   NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
   4347   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
   4348   NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
   4349   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
   4350   NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
   4351   NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
   4352 
   4353   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   4354   }
   4355   return nullptr;
   4356 }
   4357 
   4358 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
   4359                                               SelectionDAG &DAG, int Enabled,
   4360                                               int &RefinementSteps,
   4361                                               bool &UseOneConstNR,
   4362                                               bool Reciprocal) const {
   4363   EVT VT = Operand.getValueType();
   4364 
   4365   if (VT == MVT::f32) {
   4366     RefinementSteps = 0;
   4367     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
   4368   }
   4369 
   4370   // TODO: There is also f64 rsq instruction, but the documentation is less
   4371   // clear on its precision.
   4372 
   4373   return SDValue();
   4374 }
   4375 
   4376 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
   4377                                                SelectionDAG &DAG, int Enabled,
   4378                                                int &RefinementSteps) const {
   4379   EVT VT = Operand.getValueType();
   4380 
   4381   if (VT == MVT::f32) {
   4382     // Reciprocal, < 1 ulp error.
   4383     //
   4384     // This reciprocal approximation converges to < 0.5 ulp error with one
   4385     // newton rhapson performed with two fused multiple adds (FMAs).
   4386 
   4387     RefinementSteps = 0;
   4388     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
   4389   }
   4390 
   4391   // TODO: There is also f64 rcp instruction, but the documentation is less
   4392   // clear on its precision.
   4393 
   4394   return SDValue();
   4395 }
   4396 
   4397 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
   4398     const SDValue Op, KnownBits &Known,
   4399     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
   4400 
   4401   Known.resetAll(); // Don't know anything.
   4402 
   4403   unsigned Opc = Op.getOpcode();
   4404 
   4405   switch (Opc) {
   4406   default:
   4407     break;
   4408   case AMDGPUISD::CARRY:
   4409   case AMDGPUISD::BORROW: {
   4410     Known.Zero = APInt::getHighBitsSet(32, 31);
   4411     break;
   4412   }
   4413 
   4414   case AMDGPUISD::BFE_I32:
   4415   case AMDGPUISD::BFE_U32: {
   4416     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   4417     if (!CWidth)
   4418       return;
   4419 
   4420     uint32_t Width = CWidth->getZExtValue() & 0x1f;
   4421 
   4422     if (Opc == AMDGPUISD::BFE_U32)
   4423       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
   4424 
   4425     break;
   4426   }
   4427   case AMDGPUISD::FP_TO_FP16:
   4428   case AMDGPUISD::FP16_ZEXT: {
   4429     unsigned BitWidth = Known.getBitWidth();
   4430 
   4431     // High bits are zero.
   4432     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
   4433     break;
   4434   }
   4435   case AMDGPUISD::MUL_U24:
   4436   case AMDGPUISD::MUL_I24: {
   4437     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
   4438     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
   4439     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
   4440                       RHSKnown.countMinTrailingZeros();
   4441     Known.Zero.setLowBits(std::min(TrailZ, 32u));
   4442     // Skip extra check if all bits are known zeros.
   4443     if (TrailZ >= 32)
   4444       break;
   4445 
   4446     // Truncate to 24 bits.
   4447     LHSKnown = LHSKnown.trunc(24);
   4448     RHSKnown = RHSKnown.trunc(24);
   4449 
   4450     if (Opc == AMDGPUISD::MUL_I24) {
   4451       unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
   4452       unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
   4453       unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
   4454       if (MaxValBits >= 32)
   4455         break;
   4456       bool LHSNegative = LHSKnown.isNegative();
   4457       bool LHSNonNegative = LHSKnown.isNonNegative();
   4458       bool LHSPositive = LHSKnown.isStrictlyPositive();
   4459       bool RHSNegative = RHSKnown.isNegative();
   4460       bool RHSNonNegative = RHSKnown.isNonNegative();
   4461       bool RHSPositive = RHSKnown.isStrictlyPositive();
   4462 
   4463       if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
   4464         Known.Zero.setHighBits(32 - MaxValBits);
   4465       else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
   4466         Known.One.setHighBits(32 - MaxValBits);
   4467     } else {
   4468       unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
   4469       unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
   4470       unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
   4471       if (MaxValBits >= 32)
   4472         break;
   4473       Known.Zero.setHighBits(32 - MaxValBits);
   4474     }
   4475     break;
   4476   }
   4477   case AMDGPUISD::PERM: {
   4478     ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   4479     if (!CMask)
   4480       return;
   4481 
   4482     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
   4483     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
   4484     unsigned Sel = CMask->getZExtValue();
   4485 
   4486     for (unsigned I = 0; I < 32; I += 8) {
   4487       unsigned SelBits = Sel & 0xff;
   4488       if (SelBits < 4) {
   4489         SelBits *= 8;
   4490         Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
   4491         Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
   4492       } else if (SelBits < 7) {
   4493         SelBits = (SelBits & 3) * 8;
   4494         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
   4495         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
   4496       } else if (SelBits == 0x0c) {
   4497         Known.Zero |= 0xFFull << I;
   4498       } else if (SelBits > 0x0c) {
   4499         Known.One |= 0xFFull << I;
   4500       }
   4501       Sel >>= 8;
   4502     }
   4503     break;
   4504   }
   4505   case AMDGPUISD::BUFFER_LOAD_UBYTE:  {
   4506     Known.Zero.setHighBits(24);
   4507     break;
   4508   }
   4509   case AMDGPUISD::BUFFER_LOAD_USHORT: {
   4510     Known.Zero.setHighBits(16);
   4511     break;
   4512   }
   4513   case AMDGPUISD::LDS: {
   4514     auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
   4515     Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
   4516 
   4517     Known.Zero.setHighBits(16);
   4518     Known.Zero.setLowBits(Log2(Alignment));
   4519     break;
   4520   }
   4521   case ISD::INTRINSIC_WO_CHAIN: {
   4522     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   4523     switch (IID) {
   4524     case Intrinsic::amdgcn_mbcnt_lo:
   4525     case Intrinsic::amdgcn_mbcnt_hi: {
   4526       const GCNSubtarget &ST =
   4527           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
   4528       // These return at most the wavefront size - 1.
   4529       unsigned Size = Op.getValueType().getSizeInBits();
   4530       Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
   4531       break;
   4532     }
   4533     default:
   4534       break;
   4535     }
   4536   }
   4537   }
   4538 }
   4539 
   4540 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
   4541     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
   4542     unsigned Depth) const {
   4543   switch (Op.getOpcode()) {
   4544   case AMDGPUISD::BFE_I32: {
   4545     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   4546     if (!Width)
   4547       return 1;
   4548 
   4549     unsigned SignBits = 32 - Width->getZExtValue() + 1;
   4550     if (!isNullConstant(Op.getOperand(1)))
   4551       return SignBits;
   4552 
   4553     // TODO: Could probably figure something out with non-0 offsets.
   4554     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
   4555     return std::max(SignBits, Op0SignBits);
   4556   }
   4557 
   4558   case AMDGPUISD::BFE_U32: {
   4559     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   4560     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
   4561   }
   4562 
   4563   case AMDGPUISD::CARRY:
   4564   case AMDGPUISD::BORROW:
   4565     return 31;
   4566   case AMDGPUISD::BUFFER_LOAD_BYTE:
   4567     return 25;
   4568   case AMDGPUISD::BUFFER_LOAD_SHORT:
   4569     return 17;
   4570   case AMDGPUISD::BUFFER_LOAD_UBYTE:
   4571     return 24;
   4572   case AMDGPUISD::BUFFER_LOAD_USHORT:
   4573     return 16;
   4574   case AMDGPUISD::FP_TO_FP16:
   4575   case AMDGPUISD::FP16_ZEXT:
   4576     return 16;
   4577   default:
   4578     return 1;
   4579   }
   4580 }
   4581 
   4582 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
   4583   GISelKnownBits &Analysis, Register R,
   4584   const APInt &DemandedElts, const MachineRegisterInfo &MRI,
   4585   unsigned Depth) const {
   4586   const MachineInstr *MI = MRI.getVRegDef(R);
   4587   if (!MI)
   4588     return 1;
   4589 
   4590   // TODO: Check range metadata on MMO.
   4591   switch (MI->getOpcode()) {
   4592   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
   4593     return 25;
   4594   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
   4595     return 17;
   4596   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
   4597     return 24;
   4598   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
   4599     return 16;
   4600   default:
   4601     return 1;
   4602   }
   4603 }
   4604 
   4605 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
   4606                                                         const SelectionDAG &DAG,
   4607                                                         bool SNaN,
   4608                                                         unsigned Depth) const {
   4609   unsigned Opcode = Op.getOpcode();
   4610   switch (Opcode) {
   4611   case AMDGPUISD::FMIN_LEGACY:
   4612   case AMDGPUISD::FMAX_LEGACY: {
   4613     if (SNaN)
   4614       return true;
   4615 
   4616     // TODO: Can check no nans on one of the operands for each one, but which
   4617     // one?
   4618     return false;
   4619   }
   4620   case AMDGPUISD::FMUL_LEGACY:
   4621   case AMDGPUISD::CVT_PKRTZ_F16_F32: {
   4622     if (SNaN)
   4623       return true;
   4624     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
   4625            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
   4626   }
   4627   case AMDGPUISD::FMED3:
   4628   case AMDGPUISD::FMIN3:
   4629   case AMDGPUISD::FMAX3:
   4630   case AMDGPUISD::FMAD_FTZ: {
   4631     if (SNaN)
   4632       return true;
   4633     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
   4634            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
   4635            DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
   4636   }
   4637   case AMDGPUISD::CVT_F32_UBYTE0:
   4638   case AMDGPUISD::CVT_F32_UBYTE1:
   4639   case AMDGPUISD::CVT_F32_UBYTE2:
   4640   case AMDGPUISD::CVT_F32_UBYTE3:
   4641     return true;
   4642 
   4643   case AMDGPUISD::RCP:
   4644   case AMDGPUISD::RSQ:
   4645   case AMDGPUISD::RCP_LEGACY:
   4646   case AMDGPUISD::RSQ_CLAMP: {
   4647     if (SNaN)
   4648       return true;
   4649 
   4650     // TODO: Need is known positive check.
   4651     return false;
   4652   }
   4653   case AMDGPUISD::LDEXP:
   4654   case AMDGPUISD::FRACT: {
   4655     if (SNaN)
   4656       return true;
   4657     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
   4658   }
   4659   case AMDGPUISD::DIV_SCALE:
   4660   case AMDGPUISD::DIV_FMAS:
   4661   case AMDGPUISD::DIV_FIXUP:
   4662     // TODO: Refine on operands.
   4663     return SNaN;
   4664   case AMDGPUISD::SIN_HW:
   4665   case AMDGPUISD::COS_HW: {
   4666     // TODO: Need check for infinity
   4667     return SNaN;
   4668   }
   4669   case ISD::INTRINSIC_WO_CHAIN: {
   4670     unsigned IntrinsicID
   4671       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   4672     // TODO: Handle more intrinsics
   4673     switch (IntrinsicID) {
   4674     case Intrinsic::amdgcn_cubeid:
   4675       return true;
   4676 
   4677     case Intrinsic::amdgcn_frexp_mant: {
   4678       if (SNaN)
   4679         return true;
   4680       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
   4681     }
   4682     case Intrinsic::amdgcn_cvt_pkrtz: {
   4683       if (SNaN)
   4684         return true;
   4685       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
   4686              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
   4687     }
   4688     case Intrinsic::amdgcn_rcp:
   4689     case Intrinsic::amdgcn_rsq:
   4690     case Intrinsic::amdgcn_rcp_legacy:
   4691     case Intrinsic::amdgcn_rsq_legacy:
   4692     case Intrinsic::amdgcn_rsq_clamp: {
   4693       if (SNaN)
   4694         return true;
   4695 
   4696       // TODO: Need is known positive check.
   4697       return false;
   4698     }
   4699     case Intrinsic::amdgcn_trig_preop:
   4700     case Intrinsic::amdgcn_fdot2:
   4701       // TODO: Refine on operand
   4702       return SNaN;
   4703     case Intrinsic::amdgcn_fma_legacy:
   4704       if (SNaN)
   4705         return true;
   4706       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
   4707              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
   4708              DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
   4709     default:
   4710       return false;
   4711     }
   4712   }
   4713   default:
   4714     return false;
   4715   }
   4716 }
   4717 
   4718 TargetLowering::AtomicExpansionKind
   4719 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   4720   switch (RMW->getOperation()) {
   4721   case AtomicRMWInst::Nand:
   4722   case AtomicRMWInst::FAdd:
   4723   case AtomicRMWInst::FSub:
   4724     return AtomicExpansionKind::CmpXChg;
   4725   default:
   4726     return AtomicExpansionKind::None;
   4727   }
   4728 }
   4729