Home | History | Annotate | Line # | Download | only in i386
i386-expand.cc revision 1.1.1.1
      1 /* Copyright (C) 1988-2022 Free Software Foundation, Inc.
      2 
      3 This file is part of GCC.
      4 
      5 GCC is free software; you can redistribute it and/or modify
      6 it under the terms of the GNU General Public License as published by
      7 the Free Software Foundation; either version 3, or (at your option)
      8 any later version.
      9 
     10 GCC is distributed in the hope that it will be useful,
     11 but WITHOUT ANY WARRANTY; without even the implied warranty of
     12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13 GNU General Public License for more details.
     14 
     15 You should have received a copy of the GNU General Public License
     16 along with GCC; see the file COPYING3.  If not see
     17 <http://www.gnu.org/licenses/>.  */
     18 
     19 #define IN_TARGET_CODE 1
     20 
     21 #include "config.h"
     22 #include "system.h"
     23 #include "coretypes.h"
     24 #include "backend.h"
     25 #include "rtl.h"
     26 #include "tree.h"
     27 #include "memmodel.h"
     28 #include "gimple.h"
     29 #include "cfghooks.h"
     30 #include "cfgloop.h"
     31 #include "df.h"
     32 #include "tm_p.h"
     33 #include "stringpool.h"
     34 #include "expmed.h"
     35 #include "optabs.h"
     36 #include "regs.h"
     37 #include "emit-rtl.h"
     38 #include "recog.h"
     39 #include "cgraph.h"
     40 #include "diagnostic.h"
     41 #include "cfgbuild.h"
     42 #include "alias.h"
     43 #include "fold-const.h"
     44 #include "attribs.h"
     45 #include "calls.h"
     46 #include "stor-layout.h"
     47 #include "varasm.h"
     48 #include "output.h"
     49 #include "insn-attr.h"
     50 #include "flags.h"
     51 #include "except.h"
     52 #include "explow.h"
     53 #include "expr.h"
     54 #include "cfgrtl.h"
     55 #include "common/common-target.h"
     56 #include "langhooks.h"
     57 #include "reload.h"
     58 #include "gimplify.h"
     59 #include "dwarf2.h"
     60 #include "tm-constrs.h"
     61 #include "cselib.h"
     62 #include "sched-int.h"
     63 #include "opts.h"
     64 #include "tree-pass.h"
     65 #include "context.h"
     66 #include "pass_manager.h"
     67 #include "target-globals.h"
     68 #include "gimple-iterator.h"
     69 #include "tree-vectorizer.h"
     70 #include "shrink-wrap.h"
     71 #include "builtins.h"
     72 #include "rtl-iter.h"
     73 #include "tree-iterator.h"
     74 #include "dbgcnt.h"
     75 #include "case-cfn-macros.h"
     76 #include "dojump.h"
     77 #include "fold-const-call.h"
     78 #include "tree-vrp.h"
     79 #include "tree-ssanames.h"
     80 #include "selftest.h"
     81 #include "selftest-rtl.h"
     82 #include "print-rtl.h"
     83 #include "intl.h"
     84 #include "ifcvt.h"
     85 #include "symbol-summary.h"
     86 #include "ipa-prop.h"
     87 #include "ipa-fnsummary.h"
     88 #include "wide-int-bitmask.h"
     89 #include "tree-vector-builder.h"
     90 #include "debug.h"
     91 #include "dwarf2out.h"
     92 #include "i386-options.h"
     93 #include "i386-builtins.h"
     94 #include "i386-expand.h"
     95 
     96 /* Split one or more double-mode RTL references into pairs of half-mode
     97    references.  The RTL can be REG, offsettable MEM, integer constant, or
     98    CONST_DOUBLE.  "operands" is a pointer to an array of double-mode RTLs to
     99    split and "num" is its length.  lo_half and hi_half are output arrays
    100    that parallel "operands".  */
    101 
void
split_double_mode (machine_mode mode, rtx operands[],
		   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  /* Cache of the most recently split MEM operand; a later operand that
     is rtx_equal_p to it reuses the same half references instead of
     re-deriving them.  */
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  /* Pick the mode that holds exactly one half of MODE.  */
  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    case E_P2HImode:
      half_mode = HImode;
      break;
    case E_P2QImode:
      half_mode = QImode;
      break;
    default:
      gcc_unreachable ();
    }

  /* Byte offset of the high half within a double-mode value.  */
  byte = GET_MODE_SIZE (half_mode);

  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuse to split volatile memory addresses,
         but we still have to handle it.  */
      if (MEM_P (op))
	{
	  if (mem_op && rtx_equal_p (op, mem_op))
	    {
	      /* Same MEM as an operand already split: reuse its halves.  */
	      lo_half[num] = lo_half[mem_num];
	      hi_half[num] = hi_half[mem_num];
	    }
	  else
	    {
	      mem_op = op;
	      mem_num = num;
	      lo_half[num] = adjust_address (op, half_mode, 0);
	      hi_half[num] = adjust_address (op, half_mode, byte);
	    }
	}
      else
	{
	  /* Constants have VOIDmode, so split them using MODE instead.  */
	  lo_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), 0);

	  rtx tmp = simplify_gen_subreg (half_mode, op,
					 GET_MODE (op) == VOIDmode
					 ? mode : GET_MODE (op), byte);
	  /* simplify_gen_subreg will return NULL RTX for the
	     high half of the paradoxical subreg.  Use a fresh pseudo
	     for those undefined bits.  */
	  hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
	}
    }
}
    167 
    168 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
    169    for the target.  */
    170 
    171 void
    172 ix86_expand_clear (rtx dest)
    173 {
    174   rtx tmp;
    175 
    176   /* We play register width games, which are only valid after reload.  */
    177   gcc_assert (reload_completed);
    178 
    179   /* Avoid HImode and its attendant prefix byte.  */
    180   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
    181     dest = gen_rtx_REG (SImode, REGNO (dest));
    182   tmp = gen_rtx_SET (dest, const0_rtx);
    183 
    184   if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
    185     {
    186       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
    187       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
    188     }
    189 
    190   emit_insn (tmp);
    191 }
    192 
    193 /* Return true if V can be broadcasted from an integer of WIDTH bits
    194    which is returned in VAL_BROADCAST.  Otherwise, return false.  */
    195 
    196 static bool
    197 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
    198 		HOST_WIDE_INT &val_broadcast)
    199 {
    200   wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
    201   val_broadcast = wi::extract_uhwi (val, 0, width);
    202   for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
    203     {
    204       HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
    205       if (val_broadcast != each)
    206 	return false;
    207     }
    208   val_broadcast = sext_hwi (val_broadcast, width);
    209   return true;
    210 }
    211 
    212 /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.  */
    213 
static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  Also bail out if
     OP doesn't exactly fill MODE, or is already a standard (cheap)
     SSE constant.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode)
      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
	  != GET_MODE_BITSIZE (mode)))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  /* Find the narrowest element width VAL can be broadcast from.
     QImode/HImode broadcasts require AVX2; DImode requires a
     64-bit target.  */
  if (TARGET_AVX2
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
			 val_broadcast))
    broadcast_mode = QImode;
  else if (TARGET_AVX2
	   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
			      val_broadcast))
    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
			   val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT
	   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
			      val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcasted from VAL: all remaining
     HOST_WIDE_INT elements must equal the first.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  /* Build the broadcast in a vector of BROADCAST_MODE elements, then
     view the result as MODE.  */
  unsigned int nunits = (GET_MODE_SIZE (mode)
			 / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
					       target,
					       GEN_INT (val_broadcast));
  gcc_assert (ok);
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
    270 
/* Expand a scalar move of MODE from OPERANDS[1] to OPERANDS[0],
   legitimizing TLS and PIC symbolic addresses and forcing awkward
   constants into registers or the constant pool as needed.  */

void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  /* ADDEND holds the constant offset when OP1 is
     (const (plus (symbol_ref) (const_int))).  */
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload.
     Recurse to move OP1 into a fresh pseudo first; the code below then
     emits the pseudo -> OP0 move.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      /* Only (const (plus (symbol_ref) (...))) is handled here; any
	 other CONST is left to the generic path below.  */
      if (GET_CODE (tmp) != PLUS
	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
	break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
	op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
	{
	  /* Load the external function address via GOT slot to avoid PLT.  */
	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
				(TARGET_64BIT
				 ? UNSPEC_GOTPCREL
				 : UNSPEC_GOT));
	  op1 = gen_rtx_CONST (Pmode, op1);
	  op1 = gen_const_mem (Pmode, op1);
	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
	}
      else
	{
	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
	  if (tmp)
	    {
	      op1 = tmp;
	      /* With no addend the PE/COFF result needs no further
		 arithmetic.  */
	      if (!addend)
		break;
	    }
	  else
	    {
	      /* Not a PE/COFF symbol: restore the original operand and
		 take the generic path.  */
	      op1 = operands[1];
	      break;
	    }
	}

      if (addend)
	{
	  /* Re-apply the constant offset, preferably computing straight
	     into OP0.  */
	  op1 = force_operand (op1, NULL_RTX);
	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
				     op0, 1, OPTAB_DIRECT);
	}
      else
	op1 = force_operand (op1, op0);

      if (op1 == op0)
	return;

      /* Legitimized addresses are Pmode; adjust to MODE.  */
      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;

    case SUBREG:
      /* As not all values in XFmode are representable in real_value,
	 we might be called with unfoldable SUBREGs of constants.  */
      if (mode == XFmode
	  && CONSTANT_P (SUBREG_REG (op1))
	  && can_create_pseudo_p ())
	{
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      /* Darwin 32-bit has its own PIC legitimization.  */
      if (TARGET_MACHO && !TARGET_64BIT)
	{
#if TARGET_MACHO
	  /* dynamic-no-pic */
	  if (MACHOPIC_INDIRECT)
	    {
	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
			 ? op0 : gen_reg_rtx (Pmode);
	      op1 = machopic_indirect_data_reference (op1, temp);
	      if (MACHOPIC_PURE)
		op1 = machopic_legitimize_pic_address (op1, mode,
						       temp == op1 ? 0 : temp);
	    }
	  if (op0 != op1 && GET_CODE (op0) != MEM)
	    {
	      rtx insn = gen_rtx_SET (op0, op1);
	      emit_insn (insn);
	      return;
	    }
	  if (GET_CODE (op0) == MEM)
	    op1 = force_reg (Pmode, op1);
	  else
	    {
	      rtx temp = op0;
	      if (GET_CODE (temp) != REG)
		temp = gen_reg_rtx (Pmode);
	      temp = legitimize_pic_address (op1, temp);
	      if (temp == op0)
		return;
	      op1 = temp;
	    }
      /* dynamic-no-pic */
#endif
	}
      else
	{
	  if (MEM_P (op0))
	    op1 = force_reg (mode, op1);
	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
	    {
	      /* After reload no pseudo is available, so reuse OP0 as
		 the PIC temporary.  */
	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
	      op1 = legitimize_pic_address (op1, reg);
	      if (op0 == op1)
		return;
	      op1 = convert_to_mode (mode, op1, 1);
	    }
	}
    }
  else
    {
      /* A MEM -> MEM move needs a register intermediary, except for
	 same-size pushes which the push patterns handle directly.  */
      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
	  && MEM_P (op1))
	op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
	  && ! general_no_elim_operand (op1, mode))
	op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
	 to get them CSEed.  */
      if (can_create_pseudo_p ()
	  && (mode == DImode) && TARGET_64BIT
	  && immediate_operand (op1, mode)
	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
	  && !register_operand (op0, mode)
	  && optimize)
	op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
	{
	  if (CONST_DOUBLE_P (op1))
	    {
	      /* If we are loading a floating point constant to a
		 register, force the value to memory now, since we'll
		 get better code out the back end.  */

	      op1 = validize_mem (force_const_mem (mode, op1));
	      if (!register_operand (op0, mode))
		{
		  rtx temp = gen_reg_rtx (mode);
		  emit_insn (gen_rtx_SET (temp, op1));
		  emit_move_insn (op0, temp);
		  return;
		}
	    }
	  else if (GET_MODE_SIZE (mode) >= 16)
	    {
	      /* Try to turn a wide integer constant into a cheaper
		 vector broadcast.  */
	      rtx tmp = ix86_convert_const_wide_int_to_broadcast
		(GET_MODE (op0), op1);
	      if (tmp != nullptr)
		op1 = tmp;
	    }
	}
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
    474 
    475 /* OP is a memref of CONST_VECTOR, return scalar constant mem
    476    if CONST_VECTOR is a vec_duplicate, else return NULL.  */
    477 static rtx
    478 ix86_broadcast_from_constant (machine_mode mode, rtx op)
    479 {
    480   int nunits = GET_MODE_NUNITS (mode);
    481   if (nunits < 2)
    482     return nullptr;
    483 
    484   /* Don't use integer vector broadcast if we can't move from GPR to SSE
    485      register directly.  */
    486   if (!TARGET_INTER_UNIT_MOVES_TO_VEC
    487       && INTEGRAL_MODE_P (mode))
    488     return nullptr;
    489 
    490   /* Convert CONST_VECTOR to a non-standard SSE constant integer
    491      broadcast only if vector broadcast is available.  */
    492   if (!(TARGET_AVX2
    493 	|| (TARGET_AVX
    494 	    && (GET_MODE_INNER (mode) == SImode
    495 		|| GET_MODE_INNER (mode) == DImode))
    496 	|| FLOAT_MODE_P (mode))
    497       || standard_sse_constant_p (op, mode))
    498     return nullptr;
    499 
    500   /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
    501      We can still put 64-bit integer constant in memory when
    502      avx512 embed broadcast is available.  */
    503   if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
    504       && (!TARGET_AVX512F
    505 	  || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
    506     return nullptr;
    507 
    508   if (GET_MODE_INNER (mode) == TImode)
    509     return nullptr;
    510 
    511   rtx constant = get_pool_constant (XEXP (op, 0));
    512   if (GET_CODE (constant) != CONST_VECTOR)
    513     return nullptr;
    514 
    515   /* There could be some rtx like
    516      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
    517      but with "*.LC1" refer to V2DI constant vector.  */
    518   if (GET_MODE (constant) != mode)
    519     {
    520       constant = simplify_subreg (mode, constant, GET_MODE (constant),
    521 				  0);
    522       if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
    523 	return nullptr;
    524     }
    525 
    526   rtx first = XVECEXP (constant, 0, 0);
    527 
    528   for (int i = 1; i < nunits; ++i)
    529     {
    530       rtx tmp = XVECEXP (constant, 0, i);
    531       /* Vector duplicate value.  */
    532       if (!rtx_equal_p (tmp, first))
    533 	return nullptr;
    534     }
    535 
    536   return first;
    537 }
    538 
/* Expand a vector move of MODE from OPERANDS[1] to OPERANDS[0],
   handling constants, misaligned memory, and broadcast shortcuts.  */

void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
			? GET_MODE_BITSIZE (mode)
			: GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
	  || (SUBREG_P (op1)
	      && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
	   && !standard_sse_constant_p (op1, mode))
	  /* ix86_expand_vector_move_misalign() does not like constants.  */
	  || (SSE_REG_MODE_P (mode)
	      && MEM_P (op0)
	      && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
	{
	  /* Put the inner constant in the pool (or a register if it
	     cannot go to memory) and re-wrap it as a subreg of MODE.  */
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      else
	{
	  /* NOTE: this local deliberately shadows the MODE parameter
	     with OP0's mode.  */
	  machine_mode mode = GET_MODE (op0);
	  /* Prefer a register broadcast over a constant-pool load.  */
	  rtx tmp = ix86_convert_const_wide_int_to_broadcast
	    (mode, op1);
	  if (tmp == nullptr)
	    op1 = validize_mem (force_const_mem (mode, op1));
	  else
	    op1 = tmp;
	}
    }

  /* When loading a vector of >= 16 bytes from the constant pool, try
     broadcasting a single scalar element instead of a full load.  */
  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
	  && SYMBOL_REF_P (XEXP (op1, 0))
	  && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
	{
	  /* Broadcast to XMM/YMM/ZMM register from an integer
	     constant or scalar mem.  */
	  op1 = gen_reg_rtx (mode);
	  if (FLOAT_MODE_P (mode)
	      || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
	    first = force_const_mem (GET_MODE_INNER (mode), first);
	  bool ok = ix86_expand_vector_init_duplicate (false, mode,
						       op1, first);
	  gcc_assert (ok);
	  emit_move_insn (op0, op1);
	  return;
	}
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
	 arguments in memory.  */
      if (!register_operand (op0, mode)
	  && !register_operand (op1, mode))
	{
	  rtx scratch = ix86_gen_scratch_sse_rtx (mode);
	  emit_move_insn (scratch, op1);
	  op1 = scratch;
	}

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Special case TImode to V1TImode conversions, via V2DI.  */
  if (mode == V1TImode
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && can_create_pseudo_p ())
    {
      /* Split the TImode value into two DImode halves and re-concat
	 them as a V2DI, then view that as V1TI.  */
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
      return;
    }

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
    677 
    678 /* Split 32-byte AVX unaligned load and store if needed.  */
    679 
static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  /* Generator for the 128-bit extract insn matching the mode.  */
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  /* If the tuning says not to split this direction, emit the plain
     256-bit unaligned move.  */
  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      /* Canonicalize all integer modes to V32QI; a register
	 destination gets a fresh V32QI pseudo and is copied back at
	 the end.  */
      if (mode != V32QImode)
	{
	  if (!MEM_P (op0))
	    {
	      orig_op0 = op0;
	      op0 = gen_reg_rtx (V32QImode);
	    }
	  else
	    op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
	  mode = V32QImode;
	}
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  /* Pick the vextractf128 variant and the 128-bit half mode.  */
  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      /* Split load: two 128-bit loads, then concatenate.  */
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      /* Split store: extract each 128-bit half into memory.  */
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  /* Copy the canonicalized V32QI result back to the original
     register destination.  */
  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
    763 
    764 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
    765    straight to ix86_expand_vector_move.  */
    766 /* Code generation for scalar reg-reg moves of single and double precision data:
    767      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
    768        movaps reg, reg
    769      else
    770        movss reg, reg
    771      if (x86_sse_partial_reg_dependency == true)
    772        movapd reg, reg
    773      else
    774        movsd reg, reg
    775 
    776    Code generation for scalar loads of double precision data:
    777      if (x86_sse_split_regs == true)
    778        movlpd mem, reg      (gas syntax)
    779      else
    780        movsd mem, reg
    781 
    782    Code generation for unaligned packed loads of single precision data
    783    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
    784      if (x86_sse_unaligned_move_optimal)
    785        movups mem, reg
    786 
    787      if (x86_sse_partial_reg_dependency == true)
    788        {
    789          xorps  reg, reg
    790          movlps mem, reg
    791          movhps mem+8, reg
    792        }
    793      else
    794        {
    795          movlps mem, reg
    796          movhps mem+8, reg
    797        }
    798 
    799    Code generation for unaligned packed loads of double precision data
    800    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
    801      if (x86_sse_unaligned_move_optimal)
    802        movupd mem, reg
    803 
    804      if (x86_sse_split_regs == true)
    805        {
    806          movlpd mem, reg
    807          movhpd mem+8, reg
    808        }
    809      else
    810        {
    811          movsd  mem, reg
    812          movhpd mem+8, reg
    813        }
    814  */
    815 
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
	ix86_avx256_split_vector_move_misalign (op0, op1);
      else
	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
	emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* When the tuning says unaligned moves are cheap, a plain move
     pattern suffices.  */
  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      /* Unaligned load: build the value from two 64-bit halves.  */
      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

	  /* When SSE registers are split into halves, we can avoid
	     writing to the top half twice.  */
	  if (TARGET_SSE_SPLIT_REGS)
	    {
	      emit_clobber (op0);
	      zero = op0;
	    }
	  else
	    {
	      /* ??? Not sure about the best option for the Intel chips.
		 The following would seem to satisfy; the register is
		 entirely cleared, breaking the dependency chain.  We
		 then store to the upper half, with a dependency depth
		 of one.  A rumor has it that Intel recommends two movsd
		 followed by an unpacklpd, but this is unconfirmed.  And
		 given that the dependency depth of the unpacklpd would
		 still be one, I'm not sure why this would be better.  */
	      zero = CONST0_RTX (V2DFmode);
	    }

	  m = adjust_address (op1, DFmode, 0);
	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
	  m = adjust_address (op1, DFmode, 8);
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
        {
	  rtx t;

	  /* Non-V4SF modes are loaded through a V4SF temporary.  */
	  if (mode != V4SFmode)
	    t = gen_reg_rtx (V4SFmode);
	  else
	    t = op0;

	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (t, CONST0_RTX (V4SFmode));
	  else
	    emit_clobber (t);

	  m = adjust_address (op1, V2SFmode, 0);
	  emit_insn (gen_sse_loadlps (t, t, m));
	  m = adjust_address (op1, V2SFmode, 8);
	  emit_insn (gen_sse_loadhps (t, t, m));
	  if (mode != V4SFmode)
	    emit_move_insn (op0, gen_lowpart (mode, t));
	}
    }
  else if (MEM_P (op0))
    {
      /* Unaligned store: write the value as two 64-bit halves.  */
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  m = adjust_address (op0, DFmode, 0);
	  emit_insn (gen_sse2_storelpd (m, op1));
	  m = adjust_address (op0, DFmode, 8);
	  emit_insn (gen_sse2_storehpd (m, op1));
	}
      else
	{
	  if (mode != V4SFmode)
	    op1 = gen_lowpart (V4SFmode, op1);

	  m = adjust_address (op0, V2SFmode, 0);
	  emit_insn (gen_sse_storelps (m, op1));
	  m = adjust_address (op0, V2SFmode, 8);
	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
	}
    }
  else
    gcc_unreachable ();
}
    933 
    934 /* Move bits 64:95 to bits 32:63.  */
    935 
    936 void
    937 ix86_move_vector_high_sse_to_mmx (rtx op)
    938 {
    939   rtx mask = gen_rtx_PARALLEL (VOIDmode,
    940 			       gen_rtvec (4, GEN_INT (0), GEN_INT (2),
    941 					  GEN_INT (0), GEN_INT (0)));
    942   rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
    943   op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
    944   rtx insn = gen_rtx_SET (dest, op);
    945   emit_insn (insn);
    946 }
    947 
    948 /* Split MMX pack with signed/unsigned saturation with SSE/SSE2.  */
    949 
    950 void
    951 ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
    952 {
    953   rtx op0 = operands[0];
    954   rtx op1 = operands[1];
    955   rtx op2 = operands[2];
    956 
    957   machine_mode dmode = GET_MODE (op0);
    958   machine_mode smode = GET_MODE (op1);
    959   machine_mode inner_dmode = GET_MODE_INNER (dmode);
    960   machine_mode inner_smode = GET_MODE_INNER (smode);
    961 
    962   /* Get the corresponding SSE mode for destination.  */
    963   int nunits = 16 / GET_MODE_SIZE (inner_dmode);
    964   machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
    965 					    nunits).require ();
    966   machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
    967 						 nunits / 2).require ();
    968 
    969   /* Get the corresponding SSE mode for source.  */
    970   nunits = 16 / GET_MODE_SIZE (inner_smode);
    971   machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
    972 					    nunits).require ();
    973 
    974   /* Generate SSE pack with signed/unsigned saturation.  */
    975   rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
    976   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
    977   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
    978 
    979   op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
    980   op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
    981   rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
    982 						    op1, op2));
    983   emit_insn (insn);
    984 
    985   ix86_move_vector_high_sse_to_mmx (op0);
    986 }
    987 
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.

   OPERANDS[0] is the MMX-mode destination, OPERANDS[1] and
   OPERANDS[2] the two inputs to interleave.  HIGH_P selects the
   punpckhXX variant.  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  /* Pick the 16-byte SSE mode matching the MMX element type, the
     32-byte mode used to concatenate both inputs, and the selector
     that interleaves the low elements of the two input halves
     (element i of input 1 followed by element i of input 2).  */
  switch (mode)
    {
    case E_V4QImode:
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (16,
					  GEN_INT (0), GEN_INT (16),
					  GEN_INT (1), GEN_INT (17),
					  GEN_INT (2), GEN_INT (18),
					  GEN_INT (3), GEN_INT (19),
					  GEN_INT (4), GEN_INT (20),
					  GEN_INT (5), GEN_INT (21),
					  GEN_INT (6), GEN_INT (22),
					  GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
    case E_V2HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (8,
					  GEN_INT (0), GEN_INT (8),
					  GEN_INT (1), GEN_INT (9),
					  GEN_INT (2), GEN_INT (10),
					  GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits.  For punpckhXX the desired MMX
     result is in the upper half of the SSE interleave, so shuffle it
     down to the low 64 bits of DEST.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
	{
	  mask = gen_rtx_PARALLEL (VOIDmode,
				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
					      GEN_INT (4), GEN_INT (5)));
	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
	}
      else
	{
	  int sz = GET_MODE_SIZE (mode);

	  /* For 4-byte MMX modes the result is in bits 32:63; for
	     8-byte modes in bits 64:127.  Select accordingly.  */
	  if (sz == 4)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
						GEN_INT (0), GEN_INT (1)));
	  else if (sz == 8)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
						GEN_INT (0), GEN_INT (1)));
	  else
	    gcc_unreachable ();

	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
	}

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
   1097 
   1098 /* Helper function of ix86_fixup_binary_operands to canonicalize
   1099    operand order.  Returns true if the operands should be swapped.  */
   1100 
   1101 static bool
   1102 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
   1103 			     rtx operands[])
   1104 {
   1105   rtx dst = operands[0];
   1106   rtx src1 = operands[1];
   1107   rtx src2 = operands[2];
   1108 
   1109   /* If the operation is not commutative, we can't do anything.  */
   1110   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
   1111       && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
   1112     return false;
   1113 
   1114   /* Highest priority is that src1 should match dst.  */
   1115   if (rtx_equal_p (dst, src1))
   1116     return false;
   1117   if (rtx_equal_p (dst, src2))
   1118     return true;
   1119 
   1120   /* Next highest priority is that immediate constants come second.  */
   1121   if (immediate_operand (src2, mode))
   1122     return false;
   1123   if (immediate_operand (src1, mode))
   1124     return true;
   1125 
   1126   /* Lowest priority is that memory references should come second.  */
   1127   if (MEM_P (src2))
   1128     return false;
   1129   if (MEM_P (src1))
   1130     return true;
   1131 
   1132   return false;
   1133 }
   1134 
   1135 
   1136 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   1137    destination to use for the operation.  If different from the true
   1138    destination in operands[0], a copy operation will be required.  */
   1139 
   1140 rtx
   1141 ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
   1142 			    rtx operands[])
   1143 {
   1144   rtx dst = operands[0];
   1145   rtx src1 = operands[1];
   1146   rtx src2 = operands[2];
   1147 
   1148   /* Canonicalize operand order.  */
   1149   if (ix86_swap_binary_operands_p (code, mode, operands))
   1150     {
   1151       /* It is invalid to swap operands of different modes.  */
   1152       gcc_assert (GET_MODE (src1) == GET_MODE (src2));
   1153 
   1154       std::swap (src1, src2);
   1155     }
   1156 
   1157   /* Both source operands cannot be in memory.  */
   1158   if (MEM_P (src1) && MEM_P (src2))
   1159     {
   1160       /* Optimization: Only read from memory once.  */
   1161       if (rtx_equal_p (src1, src2))
   1162 	{
   1163 	  src2 = force_reg (mode, src2);
   1164 	  src1 = src2;
   1165 	}
   1166       else if (rtx_equal_p (dst, src1))
   1167 	src2 = force_reg (mode, src2);
   1168       else
   1169 	src1 = force_reg (mode, src1);
   1170     }
   1171 
   1172   /* If the destination is memory, and we do not have matching source
   1173      operands, do things in registers.  */
   1174   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
   1175     dst = gen_reg_rtx (mode);
   1176 
   1177   /* Source 1 cannot be a constant.  */
   1178   if (CONSTANT_P (src1))
   1179     src1 = force_reg (mode, src1);
   1180 
   1181   /* Source 1 cannot be a non-matching memory.  */
   1182   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
   1183     src1 = force_reg (mode, src1);
   1184 
   1185   /* Improve address combine.  */
   1186   if (code == PLUS
   1187       && GET_MODE_CLASS (mode) == MODE_INT
   1188       && MEM_P (src2))
   1189     src2 = force_reg (mode, src2);
   1190 
   1191   operands[1] = src1;
   1192   operands[2] = src2;
   1193   return dst;
   1194 }
   1195 
   1196 /* Similarly, but assume that the destination has already been
   1197    set up properly.  */
   1198 
   1199 void
   1200 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
   1201 				    machine_mode mode, rtx operands[])
   1202 {
   1203   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
   1204   gcc_assert (dst == operands[0]);
   1205 }
   1206 
   1207 /* Attempt to expand a binary operator.  Make the expansion closer to the
   1208    actual machine, then just general_operand, which will allow 3 separate
   1209    memory references (one output, two input) in a single insn.  */
   1210 
   1211 void
   1212 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
   1213 			     rtx operands[])
   1214 {
   1215   rtx src1, src2, dst, op, clob;
   1216 
   1217   dst = ix86_fixup_binary_operands (code, mode, operands);
   1218   src1 = operands[1];
   1219   src2 = operands[2];
   1220 
   1221  /* Emit the instruction.  */
   1222 
   1223   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
   1224 
   1225   if (reload_completed
   1226       && code == PLUS
   1227       && !rtx_equal_p (dst, src1))
   1228     {
   1229       /* This is going to be an LEA; avoid splitting it later.  */
   1230       emit_insn (op);
   1231     }
   1232   else
   1233     {
   1234       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
   1235       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
   1236     }
   1237 
   1238   /* Fix up the destination if needed.  */
   1239   if (dst != operands[0])
   1240     emit_move_insn (operands[0], dst);
   1241 }
   1242 
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  /* OP1 is whichever input operand is a SUBREG (if any); OP2 is the
     other one.  */
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  /* Perform the operation directly in the float vector mode
	     of the subreg'd inputs.  */
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (GET_CODE (op2) == CONST_VECTOR)
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      /* Both inputs are subregs; restore the original
		 operand order before stripping them.  */
	      op1 = operands[1];
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  break;
	}
    }
  /* Fall back to performing the operation in MODE itself.  */
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}
   1320 
   1321 /* Return TRUE or FALSE depending on whether the binary operator meets the
   1322    appropriate constraints.  */
   1323 
   1324 bool
   1325 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
   1326 			 rtx operands[3])
   1327 {
   1328   rtx dst = operands[0];
   1329   rtx src1 = operands[1];
   1330   rtx src2 = operands[2];
   1331 
   1332   /* Both source operands cannot be in memory.  */
   1333   if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
   1334       && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
   1335     return false;
   1336 
   1337   /* Canonicalize operand order for commutative operators.  */
   1338   if (ix86_swap_binary_operands_p (code, mode, operands))
   1339     std::swap (src1, src2);
   1340 
   1341   /* If the destination is memory, we must have a matching source operand.  */
   1342   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
   1343     return false;
   1344 
   1345   /* Source 1 cannot be a constant.  */
   1346   if (CONSTANT_P (src1))
   1347     return false;
   1348 
   1349   /* Source 1 cannot be a non-matching memory.  */
   1350   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
   1351     /* Support "andhi/andsi/anddi" as a zero-extending move.  */
   1352     return (code == AND
   1353 	    && (mode == HImode
   1354 		|| mode == SImode
   1355 		|| (TARGET_64BIT && mode == DImode))
   1356 	    && satisfies_constraint_L (src2));
   1357 
   1358   return true;
   1359 }
   1360 
   1361 /* Attempt to expand a unary operator.  Make the expansion closer to the
   1362    actual machine, then just general_operand, which will allow 2 separate
   1363    memory references (one output, one input) in a single insn.  */
   1364 
   1365 void
   1366 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
   1367 			    rtx operands[])
   1368 {
   1369   bool matching_memory = false;
   1370   rtx src, dst, op, clob;
   1371 
   1372   dst = operands[0];
   1373   src = operands[1];
   1374 
   1375   /* If the destination is memory, and we do not have matching source
   1376      operands, do things in registers.  */
   1377   if (MEM_P (dst))
   1378     {
   1379       if (rtx_equal_p (dst, src))
   1380 	matching_memory = true;
   1381       else
   1382 	dst = gen_reg_rtx (mode);
   1383     }
   1384 
   1385   /* When source operand is memory, destination must match.  */
   1386   if (MEM_P (src) && !matching_memory)
   1387     src = force_reg (mode, src);
   1388 
   1389   /* Emit the instruction.  */
   1390 
   1391   op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
   1392 
   1393   if (code == NOT)
   1394     emit_insn (op);
   1395   else
   1396     {
   1397       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
   1398       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
   1399     }
   1400 
   1401   /* Fix up the destination if needed.  */
   1402   if (dst != operands[0])
   1403     emit_move_insn (operands[0], dst);
   1404 }
   1405 
   1406 /* Predict just emitted jump instruction to be taken with probability PROB.  */
   1407 
   1408 static void
   1409 predict_jump (int prob)
   1410 {
   1411   rtx_insn *insn = get_last_insn ();
   1412   gcc_assert (JUMP_P (insn));
   1413   add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
   1414 }
   1415 
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].

   OPERANDS[0]/[1] receive quotient and remainder; OPERANDS[2]/[3]
   are dividend and divisor.  UNSIGNED_P selects UDIV/UMOD.  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  /* Select the full-width divmod generator; the _zext_ variants are
     used when the quotient and/or remainder destination is wider
     than MODE.  */
  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divimod if dividend and divisor are within
     the range [0-255].  Testing (operands[2] | operands[3]) against
     -0x100 checks that neither operand has a bit set at or above
     bit 8.  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  /* 50/50 guess: no information about the operand ranges.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divimod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
			    operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  /* Build REG_EQUAL notes describing the full-width results.  */
  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
	div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
			       GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
   1527 
   1528 /* Emit x86 binary operand CODE in mode MODE, where the first operand
   1529    matches destination.  RTX includes clobber of FLAGS_REG.  */
   1530 
   1531 void
   1532 ix86_emit_binop (enum rtx_code code, machine_mode mode,
   1533 		 rtx dst, rtx src)
   1534 {
   1535   rtx op, clob;
   1536 
   1537   op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
   1538   clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
   1539 
   1540   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
   1541 }
   1542 
   1543 /* Return true if regno1 def is nearest to the insn.  */
   1544 
   1545 static bool
   1546 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
   1547 {
   1548   rtx_insn *prev = insn;
   1549   rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
   1550 
   1551   if (insn == start)
   1552     return false;
   1553   while (prev && prev != start)
   1554     {
   1555       if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
   1556 	{
   1557 	  prev = PREV_INSN (prev);
   1558 	  continue;
   1559 	}
   1560       if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
   1561 	return true;
   1562       else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
   1563 	return false;
   1564       prev = PREV_INSN (prev);
   1565     }
   1566 
   1567   /* None of the regs is defined in the bb.  */
   1568   return false;
   1569 }
   1570 
/* INSN_UID of the last insn emitted by zero store peephole2s.
   Zero-initialized at program start (static storage duration).  */
int ix86_last_zero_store_uid;
   1573 
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.

   INSN is the lea being split, OPERANDS[0] its destination and
   OPERANDS[1] the address, MODE the mode to operate in.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  /* Decompose the address into base + index * scale + disp.  */
  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  /* regno0 = destination, regno1 = base, regno2 = index.  */
  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  /* Replace the scaled index by SCALE repeated additions.  */
	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling, but emit it as MULT instead
	     to avoid it being immediately peephole2 optimized back
	     into lea.  */
	  ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      /* Pure displacement: a plain move suffices.  */
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      /* Unscaled base and/or index, plus optional displacement.  */
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  /* Both base and index present; if the destination already
	     holds one of them, just add the other.  */
	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      rtx tmp1;

	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
   1693 
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.

   Strategy: values >= 2**31 cannot be converted by the signed
   cvtt* instructions, so subtract 2**31 from those lanes before
   converting and XOR the sign bit back in afterwards.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss (value, value, input));
      else
	emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  /* LARGE = mask of lanes where 2**31 <= VALUE.  */
  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  /* ZERO_OR_TWO31 = 2**31 in "large" lanes, 0 elsewhere.  */
  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  /* Bias large lanes down into signed range before converting.  */
  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  /* Turn the all-ones comparison mask into 0x80000000 per lane...  */
  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  /* ... and restore the 2**31 bias by flipping the sign bit of the
     converted result in those lanes.  */
  emit_insn (gen_xorv4si3 (value, value, large));
}
   1753 
   1754 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
   1755 						 machine_mode mode, rtx target,
   1756 						 rtx var, int one_var);
   1757 
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  /* Get the 64-bit input into the low half of an SSE register,
     choosing the cheapest transfer the target supports.  */
  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      /* The clobber tells the register allocator the full register
	 is written before the partial DImode store.  */
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  /* Constant vector of the two exponent words to pair with the
     input halves below.  */
  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  /* The result is in the low element of FP_XMM.  */
  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
   1825 
/* Not used, but eases macroization of patterns.  This stub exists so
   machine-description iterators can name an expander uniformly for every
   mode combination; it must never actually be reached at run time.  */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  gcc_unreachable ();
}
   1832 
   1833 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
   1834 
/* Convert an unsigned SImode value into a DFmode.  Only currently used
   for SSE, but applicable anywhere.  */

void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  /* Bias the input by -2^31 (adding INT_MIN just flips the sign bit),
     so the signed int->double conversion below sees input - 2^31.  */
  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  /* Undo the bias by adding 2^31.0 back in DFmode, which is exact.  */
  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

  if (x != target)
    emit_move_insn (target, x);
}
   1862 
/* Convert a signed DImode value into a DFmode.  Only used for SSE in
   32-bit mode; otherwise we have a direct convert instruction.
   Computes (double) hi * 2^32 + (double) (unsigned) lo.  */

void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  /* The high 32 bits carry the sign, so a signed conversion is right.  */
  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  /* fp_hi *= 2^32.  */
  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  /* The low 32 bits are an unsigned contribution.  */
  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
   1888 
/* Convert an unsigned SImode value into a SFmode, using only SSE.
   For x86_32, -mfpmath=sse, !optimize_size only.
   Splits the input into 16-bit halves and computes
   (float) hi * 2^16 + (float) lo, each half being exactly
   representable after signed conversion.  */
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  /* x = 2^16 as an SFmode constant.  */
  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
				      NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
				      NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      /* Fold the multiply and add into a single fma: fp_hi * 2^16 + fp_lo.  */
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
				   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
				   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
	emit_move_insn (target, fp_hi);
    }
}
   1923 
/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
   a vector of unsigned ints VAL to vector of floats TARGET.
   Vector analogue of ix86_expand_convert_uns_sisf_sse: split each
   lane into 16-bit halves, convert both halves signed, and combine
   as hi * 2^16 + lo.  */

void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  /* tmp[1] = val & 0xffff; tmp[2] = val >> 16 (per lane).  */
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  /* tmp[3] = (float) lo halves; tmp[4] = (float) hi halves.  */
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  /* tmp[5] = broadcast 2^16.  */
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      /* target = hi * 2^16 + lo in a single fma.  */
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
				    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
				    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
	emit_move_insn (target, tmp[7]);
    }
}
   1968 
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.
   Returns the adjusted FP vector; stores the integer correction mask
   through XORP.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  /* two31r = broadcast of 2^31 in the element mode.  */
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  /* tmp[0] = all-ones mask in lanes where 2^31 <= val.  */
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  /* tmp[1] = 2^31 in the lanes that need adjustment, else 0.  */
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  /* *xorp = 0x80000000 in the adjusted lanes: either shift the mask
     left by 31, or AND it with a 0x80000000 broadcast when a full-width
     V8SImode shift isn't available without AVX2.  */
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  /* Return val, with 2^31 subtracted from the too-large lanes.  */
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}
   2020 
/* Generate code for floating point ABS or NEG.  CODE is ABS or NEG,
   MODE the scalar or vector FP mode, OPERANDS[0]/[1] dest and source.
   Emits a PARALLEL carrying the mask USE (SSE case) and/or a flags
   CLOBBER so later splitters know what the insn may touch.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  /* Vectors, TFmode and HFmode are always done in SSE registers;
     HFmode is handled in a V8HF vector.  */
  if (vector_mode || mode == TFmode || mode == HFmode)
    {
      use_sse = true;
      if (mode == HFmode)
	vmode = V8HFmode;
    }
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
	par = gen_rtvec (2, set, use);
      else
	{
	  /* Scalar SSE case may later be split to integer code,
	     so also clobber the flags register.  */
          clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
        }
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
   2081 
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations.
   OPERANDS[0] and OPERANDS[1] must already match; the sign bit is
   cleared (ABS) or flipped (NEG) by an integer AND/XOR (or a
   ZERO_EXTRACT store on 64-bit DFmode) on the word holding it.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      /* Sign bit is bit 31 of the single 32-bit word.  */
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  /* Operate directly on bit 63 via a ZERO_EXTRACT destination:
	     store 0 for ABS, the complement of the old bit for NEG.  */
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  /* On 32-bit, the sign bit lives in the high SImode word.  */
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      /* The sign+exponent word is the second (64-bit) or third (32-bit)
	 SImode word of the 80-bit value; mask only the 16 used bits.  */
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  /* The integer AND/XOR clobbers the flags.  */
  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
   2168 
/* Expand a copysign operation.  Special case operand 0 being a constant.
   OPERANDS[0] = copysign (OPERANDS[1], OPERANDS[2]): magnitude from
   operand 1, sign from operand 2.  Implemented with bitwise AND/IOR on
   the value viewed in a vector mode.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  /* Pick the vector mode the scalar is embedded in.  */
  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else if (mode == TFmode)
    vmode = mode;
  else
    gcc_unreachable ();

  /* copysign (x, x) is just x.  */
  if (rtx_equal_p (operands[1], operands[2]))
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* Work in a vector-mode view of the destination if we can get one;
     otherwise compute into a fresh vector register and copy out.  */
  dest = operands[0];
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (operands[1]))
    {
      /* Strip the sign from the constant magnitude up front.  */
      op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
      /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  */
      if (op0 == CONST0_RTX (mode))
	{
	  emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
	  if (dest)
	    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
	  return;
	}

      if (GET_MODE_SIZE (mode) < 16)
	op0 = ix86_build_const_vector (vmode, false, op0);
      op0 = force_reg (vmode, op0);
    }
  else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);

  /* result = (op0 & ~signmask) | (op1 & signmask).  */
  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  emit_move_insn (op2, gen_rtx_AND (vmode,
				    gen_rtx_NOT (vmode, mask),
				    op0));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
   2234 
/* Expand an xorsign operation: OPERANDS[0] = OPERANDS[1] with its sign
   flipped when OPERANDS[2] is negative, i.e. op1's sign bit is XORed
   into op0.  Done with bitwise ops on a vector-mode view.  */

void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  /* Pick the vector mode the scalar is embedded in.  */
  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  temp = gen_reg_rtx (vmode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  /* temp = op1 & signmask — isolate op1's sign bit.  */
  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  /* result = op0 ^ temp.  */
  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  /* Write through a vector-mode view of dest when possible, else via a
     fresh register and a final scalar move.  */
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  emit_insn (gen_rtx_SET (vdest, x));

  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
   2278 
   2279 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
   2280 
/* Emit a conditional branch to LABEL taken when OP0 CODE OP1 holds.
   Handles integer-vector equality via ptest, simple scalar modes via
   ix86_expand_compare, and double-word (DImode on 32-bit, TImode)
   comparisons by splitting into word-sized compares.  */
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      /* ptest sets ZF iff its operand is all zeros.  */
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
			      gen_rtx_UNSPEC (CCmode,
					      gen_rtvec (2, tmp, tmp),
					      UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_HFmode:
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      /* Modes with a direct compare: emit the compare and one jump.  */
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* For 32-bit target DI comparison may be performed on
	 SSE registers.  To allow this we should avoid split
	 to SI mode which is achieved by doing xor in DI mode
	 and then comparing with zero (which is recognized by
	 STV pass).  We don't compare using xor when optimizing
	 for size.  */
      if (!optimize_insn_for_size_p ()
	  && TARGET_STV
	  && (code == EQ || code == NE))
	{
	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
	  op1 = const0_rtx;
	}
      /* FALLTHRU */
    case E_TImode:
      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	/* Canonicalize so a constant, if any, is the second operand.  */
	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
	   avoid two branches.  This costs one extra insn, so disable when
	   optimizing for size.  */

	if ((code == EQ || code == NE)
	    && (!optimize_insn_for_size_p ()
	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
	  {
	    rtx xor0, xor1;

	    xor1 = hi[0];
	    if (hi[1] != const0_rtx)
	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
				   NULL_RTX, 0, OPTAB_WIDEN);

	    xor0 = lo[0];
	    if (lo[1] != const0_rtx)
	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
				   NULL_RTX, 0, OPTAB_WIDEN);

	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
				NULL_RTX, 0, OPTAB_WIDEN);

	    ix86_expand_branch (code, tmp, const0_rtx, label);
	    return;
	  }

	/* Otherwise, if we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      /* Legitimize operands for the cmp and sbb patterns.  */
	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      /* Compare low words, then subtract high words with borrow;
	         the flags then reflect the full double-word compare.  */
	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }

	  default:
	    break;
	  }

	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code1 = code;
	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:
	    break;

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	  case NE:   code2 = UNKNOWN; break;

	  default:
	    gcc_unreachable ();
	  }

	/*
	 * a < b =>
	 *    if (hi(a) < hi(b)) goto true;
	 *    if (hi(a) > hi(b)) goto false;
	 *    if (lo(a) < lo(b)) goto true;
	 *  false:
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	/* Low words are compared unsigned regardless of signedness.  */
	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);
	return;
      }

    default:
      /* Already-computed condition codes also take the simple path.  */
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
   2511 
   2512 /* Figure out whether to use unordered fp comparisons.  */
   2513 
   2514 static bool
   2515 ix86_unordered_fp_compare (enum rtx_code code)
   2516 {
   2517   if (!TARGET_IEEE_FP)
   2518     return false;
   2519 
   2520   switch (code)
   2521     {
   2522     case LT:
   2523     case LE:
   2524     case GT:
   2525     case GE:
   2526     case LTGT:
   2527       return false;
   2528 
   2529     case EQ:
   2530     case NE:
   2531 
   2532     case UNORDERED:
   2533     case ORDERED:
   2534     case UNLT:
   2535     case UNLE:
   2536     case UNGT:
   2537     case UNGE:
   2538     case UNEQ:
   2539       return true;
   2540 
   2541     default:
   2542       gcc_unreachable ();
   2543     }
   2544 }
   2545 
   2546 /* Return a comparison we can do and that it is equivalent to
   2547    swap_condition (code) apart possibly from orderedness.
   2548    But, never change orderedness if TARGET_IEEE_FP, returning
   2549    UNKNOWN in that case if necessary.  */
   2550 
   2551 static enum rtx_code
   2552 ix86_fp_swap_condition (enum rtx_code code)
   2553 {
   2554   switch (code)
   2555     {
   2556     case GT:                   /* GTU - CF=0 & ZF=0 */
   2557       return TARGET_IEEE_FP ? UNKNOWN : UNLT;
   2558     case GE:                   /* GEU - CF=0 */
   2559       return TARGET_IEEE_FP ? UNKNOWN : UNLE;
   2560     case UNLT:                 /* LTU - CF=1 */
   2561       return TARGET_IEEE_FP ? UNKNOWN : GT;
   2562     case UNLE:                 /* LEU - CF=1 | ZF=1 */
   2563       return TARGET_IEEE_FP ? UNKNOWN : GE;
   2564     default:
   2565       return swap_condition (code);
   2566     }
   2567 }
   2568 
   2569 /* Return cost of comparison CODE using the best strategy for performance.
   2570    All following functions do use number of instructions as a cost metrics.
   2571    In future this should be tweaked to compute bytes for optimize_size and
   2572    take into account performance of various instructions on various CPUs.  */
   2573 
   2574 static int
   2575 ix86_fp_comparison_cost (enum rtx_code code)
   2576 {
   2577   int arith_cost;
   2578 
   2579   /* The cost of code using bit-twiddling on %ah.  */
   2580   switch (code)
   2581     {
   2582     case UNLE:
   2583     case UNLT:
   2584     case LTGT:
   2585     case GT:
   2586     case GE:
   2587     case UNORDERED:
   2588     case ORDERED:
   2589     case UNEQ:
   2590       arith_cost = 4;
   2591       break;
   2592     case LT:
   2593     case NE:
   2594     case EQ:
   2595     case UNGE:
   2596       arith_cost = TARGET_IEEE_FP ? 5 : 4;
   2597       break;
   2598     case LE:
   2599     case UNGT:
   2600       arith_cost = TARGET_IEEE_FP ? 6 : 4;
   2601       break;
   2602     default:
   2603       gcc_unreachable ();
   2604     }
   2605 
   2606   switch (ix86_fp_comparison_strategy (code))
   2607     {
   2608     case IX86_FPCMP_COMI:
   2609       return arith_cost > 4 ? 3 : 2;
   2610     case IX86_FPCMP_SAHF:
   2611       return arith_cost > 4 ? 4 : 3;
   2612     default:
   2613       return arith_cost;
   2614     }
   2615 }
   2616 
/* Swap, force into registers, or otherwise massage the two operands
   to a fp comparison.  The operands are updated in place; the new
   comparison code is returned.  */

static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  /* Swapping may not be possible without changing orderedness;
	     ix86_fp_swap_condition returns UNKNOWN in that case.  */
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    /* Not an x87 load constant: place it in memory.  */
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
   2698 
/* Generate insn patterns to do a floating point compare of OPERANDS.
   Returns the condition (a relational rtx on FLAGS_REG vs const0_rtx)
   that the flags consumer (bcc, setcc or cmov) should test.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  /* Canonicalize operand placement for the compare insn; this may
     swap the operands and change CODE accordingly.  */
  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    /* Wrap in UNSPEC_NOTRAP so a quiet compare is emitted for
       predicates that must not trap on quiet NaNs.  */
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      /* fcomi/ucomi sets EFLAGS directly.  */
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      /* fnstsw into a scratch, then sahf copies the x87 status-word
	 condition bits (held in AH) into EFLAGS.  */
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      /* Neither fcomi nor sahf available: fnstsw, then do arithmetic
	 on the status-word bits in the high byte of the scratch.
	 Masks below select the x87 condition bits C0 (0x01),
	 C2 (0x04) and C3 (0x40); 0x45 is C0|C2|C3.  */
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      /* ZF set iff none of C0/C2/C3 set, i.e. op0 > op1.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      /* IEEE GT must reject unordered (C2): isolate the bits,
		 decrement so only the all-clear case wraps, and
		 compare against 0x44.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      /* Exactly C0 set: op0 < op1 and not unordered.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      /* Just test C0.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      /* Neither C0 nor C2 set.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      /* IEEE GE: flip C0 after masking; NE iff op0 >= op1.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      /* C3 set and C0/C2 clear: equal and ordered.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      /* Just test C3.  */
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      /* NE is true when not (equal and ordered).  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  /* C2 set iff unordered.  */
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
	break;

    default:
      gcc_unreachable();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
   2859 
   2860 /* Generate insn patterns to do an integer compare of OPERANDS.  */
   2861 
   2862 static rtx
   2863 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
   2864 {
   2865   machine_mode cmpmode;
   2866   rtx tmp, flags;
   2867 
   2868   /* Swap operands to emit carry flag comparison.  */
   2869   if ((code == GTU || code == LEU)
   2870       && nonimmediate_operand (op1, VOIDmode))
   2871     {
   2872       std::swap (op0, op1);
   2873       code = swap_condition (code);
   2874     }
   2875 
   2876   cmpmode = SELECT_CC_MODE (code, op0, op1);
   2877   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
   2878 
   2879   /* This is very simple, but making the interface the same as in the
   2880      FP case makes the rest of the code easier.  */
   2881   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
   2882   emit_insn (gen_rtx_SET (flags, tmp));
   2883 
   2884   /* Return the test that should be put into the flags user, i.e.
   2885      the bcc, scc, or cmov instruction.  */
   2886   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
   2887 }
   2888 
   2889 static rtx
   2890 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
   2891 {
   2892   rtx ret;
   2893 
   2894   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
   2895     ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
   2896 
   2897   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
   2898     {
   2899       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
   2900       ret = ix86_expand_fp_compare (code, op0, op1);
   2901     }
   2902   else
   2903     ret = ix86_expand_int_compare (code, op0, op1);
   2904 
   2905   return ret;
   2906 }
   2907 
   2908 void
   2909 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
   2910 {
   2911   rtx ret;
   2912 
   2913   gcc_assert (GET_MODE (dest) == QImode);
   2914 
   2915   ret = ix86_expand_compare (code, op0, op1);
   2916   PUT_MODE (ret, QImode);
   2917   emit_insn (gen_rtx_SET (dest, ret));
   2918 }
   2919 
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
{
  /* The jump ladder below reads FLAGS_REG in CCFPmode, so the
     compare must have gone through fcomi/sahf, not arithmetic.  */
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = gen_label_rtx ();
  rtx l1 = gen_label_rtx ();
  /* l2 is the unordered target; only needed under IEEE semantics.  */
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;
  if (l2)
    {
      /* Branch to l2 (result 2) if the operands are unordered.  */
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
    }
  /* Branch to l0 (result 0) on equality; UNEQ because unordered has
     already been dispatched (or is not honored).  */
  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::unlikely ());
  /* Branch to l1 (result 1) if op0 > op1; fall through for less.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::even ());
  /* Fall-through: op0 < op1.  */
  emit_move_insn (dest, constm1_rtx);
  emit_jump (lend);
  emit_label (l0);
  emit_move_insn (dest, const0_rtx);
  emit_jump (lend);
  emit_label (l1);
  emit_move_insn (dest, const1_rtx);
  emit_jump (lend);
  if (l2)
    {
      emit_label (l2);
      emit_move_insn (dest, const2_rtx);
    }
  emit_label (lend);
}
   2968 
   2969 /* Expand comparison setting or clearing carry flag.  Return true when
   2970    successful and set pop for the operation.  */
   2971 static bool
   2972 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
   2973 {
   2974   machine_mode mode
   2975     = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
   2976 
   2977   /* Do not handle double-mode compares that go through special path.  */
   2978   if (mode == (TARGET_64BIT ? TImode : DImode))
   2979     return false;
   2980 
   2981   if (SCALAR_FLOAT_MODE_P (mode))
   2982     {
   2983       rtx compare_op;
   2984       rtx_insn *compare_seq;
   2985 
   2986       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
   2987 
   2988       /* Shortcut:  following common codes never translate
   2989 	 into carry flag compares.  */
   2990       if (code == EQ || code == NE || code == UNEQ || code == LTGT
   2991 	  || code == ORDERED || code == UNORDERED)
   2992 	return false;
   2993 
   2994       /* These comparisons require zero flag; swap operands so they won't.  */
   2995       if ((code == GT || code == UNLE || code == LE || code == UNGT)
   2996 	  && !TARGET_IEEE_FP)
   2997 	{
   2998 	  std::swap (op0, op1);
   2999 	  code = swap_condition (code);
   3000 	}
   3001 
   3002       /* Try to expand the comparison and verify that we end up with
   3003 	 carry flag based comparison.  This fails to be true only when
   3004 	 we decide to expand comparison using arithmetic that is not
   3005 	 too common scenario.  */
   3006       start_sequence ();
   3007       compare_op = ix86_expand_fp_compare (code, op0, op1);
   3008       compare_seq = get_insns ();
   3009       end_sequence ();
   3010 
   3011       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
   3012         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
   3013       else
   3014 	code = GET_CODE (compare_op);
   3015 
   3016       if (code != LTU && code != GEU)
   3017 	return false;
   3018 
   3019       emit_insn (compare_seq);
   3020       *pop = compare_op;
   3021       return true;
   3022     }
   3023 
   3024   if (!INTEGRAL_MODE_P (mode))
   3025     return false;
   3026 
   3027   switch (code)
   3028     {
   3029     case LTU:
   3030     case GEU:
   3031       break;
   3032 
   3033     /* Convert a==0 into (unsigned)a<1.  */
   3034     case EQ:
   3035     case NE:
   3036       if (op1 != const0_rtx)
   3037 	return false;
   3038       op1 = const1_rtx;
   3039       code = (code == EQ ? LTU : GEU);
   3040       break;
   3041 
   3042     /* Convert a>b into b<a or a>=b-1.  */
   3043     case GTU:
   3044     case LEU:
   3045       if (CONST_INT_P (op1))
   3046 	{
   3047 	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
   3048 	  /* Bail out on overflow.  We still can swap operands but that
   3049 	     would force loading of the constant into register.  */
   3050 	  if (op1 == const0_rtx
   3051 	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
   3052 	    return false;
   3053 	  code = (code == GTU ? GEU : LTU);
   3054 	}
   3055       else
   3056 	{
   3057 	  std::swap (op0, op1);
   3058 	  code = (code == GTU ? LTU : GEU);
   3059 	}
   3060       break;
   3061 
   3062     /* Convert a>=0 into (unsigned)a<0x80000000.  */
   3063     case LT:
   3064     case GE:
   3065       if (mode == DImode || op1 != const0_rtx)
   3066 	return false;
   3067       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
   3068       code = (code == LT ? GEU : LTU);
   3069       break;
   3070     case LE:
   3071     case GT:
   3072       if (mode == DImode || op1 != constm1_rtx)
   3073 	return false;
   3074       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
   3075       code = (code == LE ? GEU : LTU);
   3076       break;
   3077 
   3078     default:
   3079       return false;
   3080     }
   3081   /* Swapping operands may cause constant to appear as first operand.  */
   3082   if (!nonimmediate_operand (op0, VOIDmode))
   3083     {
   3084       if (!can_create_pseudo_p ())
   3085 	return false;
   3086       op0 = force_reg (mode, op0);
   3087     }
   3088   *pop = ix86_expand_compare (code, op0, op1);
   3089   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
   3090   return true;
   3091 }
   3092 
/* Expand conditional increment or decrement using adb/sbb instructions.
   The default case using setcc followed by the conditional move can be
   done by generic code.  */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  /* Only +1/-1 adjustments can be folded into a bare adc/sbb.  */
  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  /* The comparison must reduce to a carry-flag (LTU/GEU) test.  */
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
     return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  /* Normalize to a carry-set (LTU) test: reverse the condition in
     place and compensate by using -1 as the constant addend.  */
  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		    (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
   3147 
/* Expand operands[0] = operands[1] ? operands[2] : operands[3] as an
   integer conditional move, trying branch-free sbb/setcc/lea forms
   before falling back to a cmov.  Returns true on success, false if
   the caller should expand some other way.  */
bool
ix86_expand_int_movcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]), compare_code;
  rtx_insn *compare_seq;
  rtx compare_op;
  machine_mode mode = GET_MODE (operands[0]);
  bool sign_bit_compare_p = false;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);
  rtx op2 = operands[2];
  rtx op3 = operands[3];

  /* Double-word compares are handled elsewhere.  */
  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* Expand the comparison into a held-back sequence; it is only
     emitted if we reach the generic cmov form at the bottom.  */
  start_sequence ();
  compare_op = ix86_expand_compare (code, op0, op1);
  compare_seq = get_insns ();
  end_sequence ();

  compare_code = GET_CODE (compare_op);

  /* x<0 / x>=0 and x>-1 / x<=-1 test only the sign bit and can be
     done with a shift instead of sbb.  */
  if ((op1 == const0_rtx && (code == GE || code == LT))
      || (op1 == constm1_rtx && (code == GT || code == LE)))
    sign_bit_compare_p = true;

  /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
     but if op1 is a constant, the latter form allows more optimizations,
     either through the last 2 ops being constant handling, or the one
     constant and one variable cases.  On the other side, for cmov the
     former might be better as we don't need to load the constant into
     another register.  */
  if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
    op2 = op1;
  /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1.  */
  else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
    op3 = op1;

  /* Don't attempt mode expansion here -- if we had to expand 5 or 6
     HImode insns, we'd be swallowed in word prefix ops.  */

  /* Both arms constant: compute the result arithmetically from the
     flags (sbb/setcc plus add/or/and/lea), with no branch or cmov.  */
  if ((mode != HImode || TARGET_FAST_PREFIX)
      && (mode != (TARGET_64BIT ? TImode : DImode))
      && CONST_INT_P (op2)
      && CONST_INT_P (op3))
    {
      rtx out = operands[0];
      HOST_WIDE_INT ct = INTVAL (op2);
      HOST_WIDE_INT cf = INTVAL (op3);
      HOST_WIDE_INT diff;

      diff = ct - cf;
      /*  Sign bit compares are better done using shifts than we do by using
	  sbb.  */
      if (sign_bit_compare_p
	  || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
	{
	  /* Detect overlap between destination and compare sources.  */
	  rtx tmp = out;

          if (!sign_bit_compare_p)
	    {
	      rtx flags;
	      bool fpcmp = false;

	      compare_code = GET_CODE (compare_op);

	      flags = XEXP (compare_op, 0);

	      if (GET_MODE (flags) == CCFPmode)
		{
		  fpcmp = true;
		  compare_code
		    = ix86_fp_compare_code_to_integer (compare_code);
		}

	      /* To simplify rest of code, restrict to the GEU case.  */
	      if (compare_code == LTU)
		{
		  std::swap (ct, cf);
		  compare_code = reverse_condition (compare_code);
		  code = reverse_condition (code);
		}
	      else
		{
		  if (fpcmp)
		    PUT_CODE (compare_op,
			      reverse_condition_maybe_unordered
			        (GET_CODE (compare_op)));
		  else
		    PUT_CODE (compare_op,
			      reverse_condition (GET_CODE (compare_op)));
		}
	      diff = ct - cf;

	      /* sbb would clobber a destination that overlaps the
		 compare inputs; use a fresh pseudo in that case.  */
	      if (reg_overlap_mentioned_p (out, op0)
		  || reg_overlap_mentioned_p (out, op1))
		tmp = gen_reg_rtx (mode);

	      /* tmp = carry ? -1 : 0 via sbb tmp,tmp.  */
	      if (mode == DImode)
		emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
	      else
		emit_insn (gen_x86_movsicc_0_m1	(gen_lowpart (SImode, tmp),
						 flags, compare_op));
	    }
	  else
	    {
	      /* Sign-bit case: arrange for a "< 0"-style test, then
		 materialize -1/0 with an arithmetic right shift.  */
	      if (code == GT || code == GE)
		code = reverse_condition (code);
	      else
		{
		  std::swap (ct, cf);
		  diff = ct - cf;
		}
	      tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
	    }

	  /* tmp now holds -1 (condition true) or 0; fold in the two
	     constants with the cheapest arithmetic available.  */
	  if (diff == 1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [addl dest, ct]
	       *
	       * Size 5 - 8.
	       */
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   tmp, GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (cf == -1)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * orl $ct, dest
	       *
	       * Size 8.
	       */
	      tmp = expand_simple_binop (mode, IOR,
					 tmp, GEN_INT (ct),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else if (diff == -1 && ct)
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * notl dest
	       * [addl dest, cf]
	       *
	       * Size 8 - 11.
	       */
	      tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
	      if (cf)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (cf),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }
	  else
	    {
	      /*
	       * cmpl op0,op1
	       * sbbl dest,dest
	       * [notl dest]
	       * andl cf - ct, dest
	       * [addl dest, ct]
	       *
	       * Size 8 - 11.
	       */

	      if (cf == 0)
		{
		  cf = ct;
		  ct = 0;
		  tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
		}

	      tmp = expand_simple_binop (mode, AND,
					 copy_rtx (tmp),
					 gen_int_mode (cf - ct, mode),
					 copy_rtx (tmp), 1, OPTAB_DIRECT);
	      if (ct)
		tmp = expand_simple_binop (mode, PLUS,
					   copy_rtx (tmp), GEN_INT (ct),
					   copy_rtx (tmp), 1, OPTAB_DIRECT);
	    }

	  if (!rtx_equal_p (tmp, out))
	    emit_move_insn (copy_rtx (out), copy_rtx (tmp));

	  return true;
	}

      /* Prefer diff > 0; try to reverse the condition (not always
	 possible for trapping FP comparisons).  */
      if (diff < 0)
	{
	  machine_mode cmp_mode = GET_MODE (op0);
	  enum rtx_code new_code;

	  if (SCALAR_FLOAT_MODE_P (cmp_mode))
	    {
	      gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

	      /* We may be reversing a non-trapping
		 comparison to a trapping comparison.  */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)
		    new_code = UNKNOWN;
		  else
		    new_code = reverse_condition_maybe_unordered (code);
	    }
	  else
	    new_code = ix86_reverse_condition (code, cmp_mode);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (ct, cf);
	      diff = -diff;
	      code = new_code;
	    }
	}

      /* Record when the condition is (or reduces to) a plain
	 sign-bit test against a constant.  */
      compare_code = UNKNOWN;
      if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
	  && CONST_INT_P (op1))
	{
	  if (op1 == const0_rtx
	      && (code == LT || code == GE))
	    compare_code = code;
	  else if (op1 == constm1_rtx)
	    {
	      if (code == LE)
		compare_code = LT;
	      else if (code == GT)
		compare_code = GE;
	    }
	}

      /* Optimize dest = (op0 < 0) ? -1 : cf.  */
      if (compare_code != UNKNOWN
	  && GET_MODE (op0) == GET_MODE (out)
	  && (cf == -1 || ct == -1))
	{
	  /* If lea code below could be used, only optimize
	     if it results in a 2 insn sequence.  */

	  if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
		 || diff == 3 || diff == 5 || diff == 9)
	      || (compare_code == LT && ct == -1)
	      || (compare_code == GE && cf == -1))
	    {
	      /*
	       * notl op1	(if necessary)
	       * sarl $31, op1
	       * orl cf, op1
	       */
	      if (ct != -1)
		{
		  cf = ct;
		  ct = -1;
		  code = reverse_condition (code);
		}

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);

	      out = expand_simple_binop (mode, IOR,
					 out, GEN_INT (cf),
					 out, 1, OPTAB_DIRECT);
	      if (out != operands[0])
		emit_move_insn (operands[0], out);

	      return true;
	    }
	}


      /* setcc (0/1) scaled and offset by lea: diff must be an lea
	 scale (1,2,4,8) or scale+1 (3,5,9).  */
      if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
	   || diff == 3 || diff == 5 || diff == 9)
	  && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
	  && (mode != DImode
	      || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
	{
	  /*
	   * xorl dest,dest
	   * cmpl op1,op2
	   * setcc dest
	   * lea cf(dest*(ct-cf)),dest
	   *
	   * Size 14.
	   *
	   * This also catches the degenerate setcc-only case.
	   */

	  rtx tmp;
	  int nops;

	  out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	  nops = 0;
	  /* On x86_64 the lea instruction operates on Pmode, so we need
	     to get arithmetics done in proper mode to match.  */
	  if (diff == 1)
	    tmp = copy_rtx (out);
	  else
	    {
	      rtx out1;
	      out1 = copy_rtx (out);
	      tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
	      nops++;
	      if (diff & 1)
		{
		  tmp = gen_rtx_PLUS (mode, tmp, out1);
		  nops++;
		}
	    }
	  if (cf != 0)
	    {
	      tmp = plus_constant (mode, tmp, cf);
	      nops++;
	    }
	  if (!rtx_equal_p (tmp, out))
	    {
	      if (nops == 1)
		out = force_operand (tmp, copy_rtx (out));
	      else
		emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
	    }
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}

      /*
       * General case:			Jumpful:
       *   xorl dest,dest		cmpl op1, op2
       *   cmpl op1, op2		movl ct, dest
       *   setcc dest			jcc 1f
       *   decl dest			movl cf, dest
       *   andl (cf-ct),dest		1:
       *   addl ct,dest
       *
       * Size 20.			Size 14.
       *
       * This is reasonably steep, but branch mispredict costs are
       * high on modern cpus, so consider failing only if optimizing
       * for space.
       */

      if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
	  && BRANCH_COST (optimize_insn_for_speed_p (),
		  	  false) >= 2)
	{
	  /* With cf == 0 we can reverse the condition to get cf != 0,
	     which the masking sequence below prefers.  */
	  if (cf == 0)
	    {
	      machine_mode cmp_mode = GET_MODE (op0);
	      enum rtx_code new_code;

	      if (SCALAR_FLOAT_MODE_P (cmp_mode))
		{
		  gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));

		  /* We may be reversing a non-trapping
		     comparison to a trapping comparison.  */
		  if (HONOR_NANS (cmp_mode) && flag_trapping_math
		      && code != EQ && code != NE
		      && code != ORDERED && code != UNORDERED)
		    new_code = UNKNOWN;
		  else
		    new_code = reverse_condition_maybe_unordered (code);

		}
	      else
		{
		  new_code = ix86_reverse_condition (code, cmp_mode);
		  if (compare_code != UNKNOWN && new_code != UNKNOWN)
		    compare_code = reverse_condition (compare_code);
		}

	      if (new_code != UNKNOWN)
		{
		  cf = ct;
		  ct = 0;
		  code = new_code;
		}
	    }

	  if (compare_code != UNKNOWN)
	    {
	      /* notl op1	(if needed)
		 sarl $31, op1
		 andl (cf-ct), op1
		 addl ct, op1

		 For x < 0 (resp. x <= -1) there will be no notl,
		 so if possible swap the constants to get rid of the
		 complement.
		 True/false will be -1/0 while code below (store flag
		 followed by decrement) is 0/-1, so the constants need
		 to be exchanged once more.  */

	      if (compare_code == GE || !cf)
		{
		  code = reverse_condition (code);
		  compare_code = LT;
		}
	      else
		std::swap (ct, cf);

	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
	    }
	  else
	    {
	      /* setcc gives 0/1; decrement to get -1/0 for masking.  */
	      out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);

	      out = expand_simple_binop (mode, PLUS, copy_rtx (out),
					 constm1_rtx,
					 copy_rtx (out), 1, OPTAB_DIRECT);
	    }

	  out = expand_simple_binop (mode, AND, copy_rtx (out),
				     gen_int_mode (cf - ct, mode),
				     copy_rtx (out), 1, OPTAB_DIRECT);
	  if (ct)
	    out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
				       copy_rtx (out), 1, OPTAB_DIRECT);
	  if (!rtx_equal_p (out, operands[0]))
	    emit_move_insn (operands[0], copy_rtx (out));

	  return true;
	}
    }

  if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
    {
      /* Try a few things more with specific constants and a variable.  */

      optab op;
      rtx var, orig_out, out, tmp;

      if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
	return false;

      operands[2] = op2;
      operands[3] = op3;

      /* If one of the two operands is an interesting constant, load a
	 constant with the above and mask it in with a logical operation.  */

      if (CONST_INT_P (operands[2]))
	{
	  var = operands[3];
	  if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
	    operands[3] = constm1_rtx, op = and_optab;
	  else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
	    operands[3] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else if (CONST_INT_P (operands[3]))
	{
	  var = operands[2];
	  if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
	    {
	      /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
		 "x <= 0 ? x : 0" to enable sign_bit_compare_p.  */
	      if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
		operands[1] = simplify_gen_relational (LT, VOIDmode,
						       GET_MODE (op0),
						       op0, const0_rtx);

	      operands[2] = constm1_rtx;
	      op = and_optab;
	    }
	  /* NOTE(review): the parallel branch above guards on the
	     *other* operand (operands[3] != const0_rtx); here the
	     guard re-tests operands[3], which already has INTVAL -1
	     and so can never be const0_rtx.  Harmless in practice
	     (operands[2] is known non-constant in this branch), but
	     it looks like a typo for operands[2] -- confirm against
	     upstream before changing.  */
	  else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
	    operands[2] = const0_rtx, op = ior_optab;
	  else
	    return false;
	}
      else
        return false;

      orig_out = operands[0];
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;

      /* Recurse to get the constant loaded.  */
      if (!ix86_expand_int_movcc (operands))
        return false;

      /* Mask in the interesting variable.  */
      out = expand_binop (mode, op, var, tmp, orig_out, 0,
			  OPTAB_WIDEN);
      if (!rtx_equal_p (out, orig_out))
	emit_move_insn (copy_rtx (orig_out), copy_rtx (out));

      return true;
    }

  /*
   * For comparison with above,
   *
   * movl cf,dest
   * movl ct,tmp
   * cmpl op1,op2
   * cmovcc tmp,dest
   *
   * Size 15.
   */

  /* Generic cmov fallback: both arms must be register or memory, and
     cmov cannot take two memory operands.  */
  if (! nonimmediate_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  if (! nonimmediate_operand (operands[3], mode))
    operands[3] = force_reg (mode, operands[3]);

  if (! register_operand (operands[2], VOIDmode)
      && (mode == QImode
          || ! register_operand (operands[3], VOIDmode)))
    operands[2] = force_reg (mode, operands[2]);

  if (mode == QImode
      && ! register_operand (operands[3], VOIDmode))
    operands[3] = force_reg (mode, operands[3]);

  /* Now emit the deferred comparison followed by the cmov.  */
  emit_insn (compare_seq);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode,
						compare_op, operands[2],
						operands[3])));
  return true;
}
   3683 
   3684 /* Detect conditional moves that exactly match min/max operational
   3685    semantics.  Note that this is IEEE safe, as long as we don't
   3686    interchange the operands.
   3687 
   3688    Returns FALSE if this conditional move doesn't match a MIN/MAX,
   3689    and TRUE if the operation is successful and instructions are emitted.  */
   3690 
   3691 static bool
   3692 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
   3693 			   rtx cmp_op1, rtx if_true, rtx if_false)
   3694 {
   3695   machine_mode mode;
   3696   bool is_min;
   3697   rtx tmp;
   3698 
   3699   if (code == LT)
   3700     ;
   3701   else if (code == UNGE)
   3702     std::swap (if_true, if_false);
   3703   else
   3704     return false;
   3705 
   3706   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
   3707     is_min = true;
   3708   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
   3709     is_min = false;
   3710   else
   3711     return false;
   3712 
   3713   mode = GET_MODE (dest);
   3714 
   3715   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
   3716      but MODE may be a vector mode and thus not appropriate.  */
   3717   if (!flag_finite_math_only || flag_signed_zeros)
   3718     {
   3719       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
   3720       rtvec v;
   3721 
   3722       if_true = force_reg (mode, if_true);
   3723       v = gen_rtvec (2, if_true, if_false);
   3724       tmp = gen_rtx_UNSPEC (mode, v, u);
   3725     }
   3726   else
   3727     {
   3728       code = is_min ? SMIN : SMAX;
   3729       if (MEM_P (if_true) && MEM_P (if_false))
   3730 	if_true = force_reg (mode, if_true);
   3731       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
   3732     }
   3733 
   3734   emit_insn (gen_rtx_SET (dest, tmp));
   3735   return true;
   3736 }
   3737 
   3738 /* Return true if MODE is valid for vector compare to mask register,
   3739    Same result for conditionl vector move with mask register.  */
   3740 static bool
   3741 ix86_valid_mask_cmp_mode (machine_mode mode)
   3742 {
   3743   /* XOP has its own vector conditional movement.  */
   3744   if (TARGET_XOP && !TARGET_AVX512F)
   3745     return false;
   3746 
   3747   /* HFmode only supports vcmpsh whose dest is mask register.  */
   3748   if (TARGET_AVX512FP16 && mode == HFmode)
   3749     return true;
   3750 
   3751   /* AVX512F is needed for mask operation.  */
   3752   if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
   3753     return false;
   3754 
   3755   /* AVX512BW is needed for vector QI/HImode,
   3756      AVX512VL is needed for 128/256-bit vector.  */
   3757   machine_mode inner_mode = GET_MODE_INNER (mode);
   3758   int vector_size = GET_MODE_SIZE (mode);
   3759   if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
   3760     return false;
   3761 
   3762   return vector_size == 64 || TARGET_AVX512VL;
   3763 }
   3764 
   3765 /* Return true if integer mask comparison should be used.  */
   3766 static bool
   3767 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
   3768 		     rtx op_true, rtx op_false)
   3769 {
   3770   int vector_size = GET_MODE_SIZE (mode);
   3771 
   3772   if (cmp_mode == HFmode)
   3773     return true;
   3774   else if (vector_size < 16)
   3775     return false;
   3776   else if (vector_size == 64)
   3777     return true;
   3778   else if (GET_MODE_INNER (cmp_mode) == HFmode)
   3779     return true;
   3780 
   3781   /* When op_true is NULL, op_false must be NULL, or vice versa.  */
   3782   gcc_assert (!op_true == !op_false);
   3783 
   3784   /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
   3785      vector dest is required.  */
   3786   if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
   3787     return false;
   3788 
   3789   /* Exclude those that could be optimized in ix86_expand_sse_movcc.  */
   3790   if (op_false == CONST0_RTX (mode)
   3791       || op_true == CONST0_RTX (mode)
   3792       || (INTEGRAL_MODE_P (mode)
   3793 	  && (op_true == CONSTM1_RTX (mode)
   3794 	      || op_false == CONSTM1_RTX (mode))))
   3795     return false;
   3796 
   3797   return true;
   3798 }
   3799 
/* Expand an SSE comparison.  Return the register with the result.
   DEST is a suggestion for the result register; the actual register
   returned may be a fresh pseudo (callers must use the return value).
   OP_TRUE/OP_FALSE are the prospective select arms, used only to decide
   whether a mask compare applies and whether DEST may be reused.  */

static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      /* Pick an integer mode with one bit per vector element; QImode is
	 the minimum mask width.  */
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  /* The second operand may stay in memory for vector patterns.  */
  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  /* Use a fresh pseudo when optimizing, when the mask mode differs from
     the destination mode, or when DEST overlaps either select arm (so a
     later blend doesn't clobber its own input).  */
  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      /* AVX512 path: emit a vector-to-mask compare.  */
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      /* Result type differs from destination type; convert through a
	 register.  */
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
   3858 
   3859 /* Emit x86 binary operand CODE in mode MODE for SSE vector
   3860    instructions that can be performed using GP registers.  */
   3861 
   3862 static void
   3863 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
   3864 		     rtx dst, rtx src1, rtx src2)
   3865 {
   3866   rtx tmp;
   3867 
   3868   tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
   3869 
   3870   if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
   3871       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
   3872     {
   3873       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
   3874       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
   3875     }
   3876 
   3877   emit_insn (tmp);
   3878 }
   3879 
/* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
   operations.  This is used for both scalar and vector conditional moves.
   CMP is either a vector mask (all-ones/all-zeros per element) or an
   AVX512 integer mask register value.  */

void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  /* Invert the mask so the zero arm becomes the false arm;
	     DImode kNOT needs a dedicated pattern without TARGET_64BIT.  */
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    {
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    }
	  else
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
	  cmp = x;
	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);
	}

      if (mode == HFmode)
	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
						   op_true, op_false, cmp)));
      return;
    }

  /* Special-case select arms that collapse to pure mask logic:
     cmp ? -1 : 0 is CMP itself, cmp ? x : 0 is AND, cmp ? 0 : x is
     ANDN, cmp ? -1 : x is IOR.  */
  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      emit_move_insn (dest, cmp);
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  else if (vector_all_ones_operand (op_true, mode))
    {
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }

  /* XOP's vpcmov expresses the select directly as IF_THEN_ELSE.  */
  if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));
      return;
    }

  /* Otherwise look for a blend instruction.  BLEND_MODE may differ from
     MODE when only a byte-element blend exists for the vector size.  */
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvsd;
      break;
    case E_V8QImode:
    case E_V4HImode:
    case E_V2SImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v8qi;
	  blend_mode = V8QImode;
	}
      break;
    case E_V4QImode:
    case E_V2HImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v4qi;
	  blend_mode = V4QImode;
	}
      break;
    case E_V2QImode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_pblendvb_v2qi;
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  blend_mode = V16QImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2)
	{
	  gen = gen_avx2_pblendvb;
	  blend_mode = V32QImode;
	}
      break;

    /* 512-bit modes: AVX512 masked blends, no mode change needed.  */
    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      /* When blending in a different mode, lowpart-pun all operands
	 into BLEND_MODE and copy the result back afterwards.  */
      if (blend_mode == mode)
	x = dest;
      else
	{
	  x = gen_reg_rtx (blend_mode);
	  op_false = gen_lowpart (blend_mode, op_false);
	  op_true = gen_lowpart (blend_mode, op_true);
	  cmp = gen_lowpart (blend_mode, cmp);
	}

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
	emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      /* No blend available: dest = (op_true & cmp) | (op_false & ~cmp).  */
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
    }
}
   4127 
   4128 /* Swap, force into registers, or otherwise massage the two operands
   4129    to an sse comparison with a mask result.  Thus we differ a bit from
   4130    ix86_prepare_fp_compare_args which expects to produce a flags result.
   4131 
   4132    The DEST operand exists to help determine whether to commute commutative
   4133    operators.  The POP0/POP1 operands are updated in place.  The new
   4134    comparison code is returned, or UNKNOWN if not implementable.  */
   4135 
   4136 static enum rtx_code
   4137 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
   4138 				  rtx *pop0, rtx *pop1)
   4139 {
   4140   switch (code)
   4141     {
   4142     case LTGT:
   4143     case UNEQ:
   4144       /* AVX supports all the needed comparisons.  */
   4145       if (TARGET_AVX)
   4146 	break;
   4147       /* We have no LTGT as an operator.  We could implement it with
   4148 	 NE & ORDERED, but this requires an extra temporary.  It's
   4149 	 not clear that it's worth it.  */
   4150       return UNKNOWN;
   4151 
   4152     case LT:
   4153     case LE:
   4154     case UNGT:
   4155     case UNGE:
   4156       /* These are supported directly.  */
   4157       break;
   4158 
   4159     case EQ:
   4160     case NE:
   4161     case UNORDERED:
   4162     case ORDERED:
   4163       /* AVX has 3 operand comparisons, no need to swap anything.  */
   4164       if (TARGET_AVX)
   4165 	break;
   4166       /* For commutative operators, try to canonicalize the destination
   4167 	 operand to be first in the comparison - this helps reload to
   4168 	 avoid extra moves.  */
   4169       if (!dest || !rtx_equal_p (dest, *pop1))
   4170 	break;
   4171       /* FALLTHRU */
   4172 
   4173     case GE:
   4174     case GT:
   4175     case UNLE:
   4176     case UNLT:
   4177       /* These are not supported directly before AVX, and furthermore
   4178 	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
   4179 	 comparison operands to transform into something that is
   4180 	 supported.  */
   4181       std::swap (*pop0, *pop1);
   4182       code = swap_condition (code);
   4183       break;
   4184 
   4185     default:
   4186       gcc_unreachable ();
   4187     }
   4188 
   4189   return code;
   4190 }
   4191 
/* Expand a floating-point conditional move.  Return true if successful.
   OPERANDS[0] is the destination, OPERANDS[1] the comparison rtx, and
   OPERANDS[2]/OPERANDS[3] the true/false arms.  */

bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      /* Canonicalize for SSE; UNKNOWN means the condition cannot be
	 expressed as a single SSE compare.  */
      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      /* Prefer a min/max instruction when the select matches one.  */
      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      /* Otherwise build a compare mask and blend with it.  */
      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  /* These comparison operand modes cannot feed the x87/integer path.  */
  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      /* Materialize the condition into a byte with setcc, then test
	 that byte against zero, which fcmov can handle.  */
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
   4253 
   4254 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes.  */
   4255 
   4256 static int
   4257 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
   4258 {
   4259   switch (code)
   4260     {
   4261     case EQ:
   4262       return 0;
   4263     case LT:
   4264     case LTU:
   4265       return 1;
   4266     case LE:
   4267     case LEU:
   4268       return 2;
   4269     case NE:
   4270       return 4;
   4271     case GE:
   4272     case GEU:
   4273       return 5;
   4274     case GT:
   4275     case GTU:
   4276       return 6;
   4277     default:
   4278       gcc_unreachable ();
   4279     }
   4280 }
   4281 
   4282 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes.  */
   4283 
   4284 static int
   4285 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
   4286 {
   4287   switch (code)
   4288     {
   4289     case EQ:
   4290       return 0x00;
   4291     case NE:
   4292       return 0x04;
   4293     case GT:
   4294       return 0x0e;
   4295     case LE:
   4296       return 0x02;
   4297     case GE:
   4298       return 0x0d;
   4299     case LT:
   4300       return 0x01;
   4301     case UNLE:
   4302       return 0x0a;
   4303     case UNLT:
   4304       return 0x09;
   4305     case UNGE:
   4306       return 0x05;
   4307     case UNGT:
   4308       return 0x06;
   4309     case UNEQ:
   4310       return 0x18;
   4311     case LTGT:
   4312       return 0x0c;
   4313     case ORDERED:
   4314       return 0x07;
   4315     case UNORDERED:
   4316       return 0x03;
   4317     default:
   4318       gcc_unreachable ();
   4319     }
   4320 }
   4321 
   4322 /* Return immediate value to be used in UNSPEC_PCMP
   4323    for comparison CODE in MODE.  */
   4324 
   4325 static int
   4326 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
   4327 {
   4328   if (FLOAT_MODE_P (mode))
   4329     return ix86_fp_cmp_code_to_pcmp_immediate (code);
   4330   return ix86_int_cmp_code_to_pcmp_immediate (code);
   4331 }
   4332 
   4333 /* Expand AVX-512 vector comparison.  */
   4334 
   4335 bool
   4336 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
   4337 {
   4338   machine_mode mask_mode = GET_MODE (dest);
   4339   machine_mode cmp_mode = GET_MODE (cmp_op0);
   4340   rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
   4341   int unspec_code;
   4342   rtx unspec;
   4343 
   4344   switch (code)
   4345     {
   4346     case LEU:
   4347     case GTU:
   4348     case GEU:
   4349     case LTU:
   4350       unspec_code = UNSPEC_UNSIGNED_PCMP;
   4351       break;
   4352 
   4353     default:
   4354       unspec_code = UNSPEC_PCMP;
   4355     }
   4356 
   4357   unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
   4358 			   unspec_code);
   4359   emit_insn (gen_rtx_SET (dest, unspec));
   4360 
   4361   return true;
   4362 }
   4363 
/* Expand fp vector comparison.  OPERANDS[0] is the destination,
   OPERANDS[1] the comparison rtx, OPERANDS[2]/OPERANDS[3] its operands.
   Always returns true.  */

bool
ix86_expand_fp_vec_cmp (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)
    {
      /* LTGT/UNEQ have no single-compare form here; compose them from
	 two compares combined with AND resp. IOR.  */
      rtx temp;
      switch (GET_CODE (operands[1]))
	{
	case LTGT:
	  /* LTGT == ORDERED && NE.  */
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				     operands[3], NULL, NULL);
	  code = AND;
	  break;
	case UNEQ:
	  /* UNEQ == UNORDERED || EQ.  */
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				     operands[3], NULL, NULL);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
    }
  else
    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
			       NULL, NULL);

  /* ix86_expand_sse_cmp may have chosen a different result register.  */
  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
   4408 
   4409 static rtx
   4410 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
   4411 			 rtx op_true, rtx op_false, bool *negate)
   4412 {
   4413   machine_mode data_mode = GET_MODE (dest);
   4414   machine_mode mode = GET_MODE (cop0);
   4415   rtx x;
   4416 
   4417   *negate = false;
   4418 
   4419   /* XOP supports all of the comparisons on all 128-bit vector int types.  */
   4420   if (TARGET_XOP
   4421       && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
   4422       && GET_MODE_SIZE (mode) <= 16)
   4423     ;
   4424   /* AVX512F supports all of the comparsions
   4425      on all 128/256/512-bit vector int types.  */
   4426   else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
   4427     ;
   4428   else
   4429     {
   4430       /* Canonicalize the comparison to EQ, GT, GTU.  */
   4431       switch (code)
   4432 	{
   4433 	case EQ:
   4434 	case GT:
   4435 	case GTU:
   4436 	  break;
   4437 
   4438 	case NE:
   4439 	case LE:
   4440 	case LEU:
   4441 	  code = reverse_condition (code);
   4442 	  *negate = true;
   4443 	  break;
   4444 
   4445 	case GE:
   4446 	case GEU:
   4447 	  code = reverse_condition (code);
   4448 	  *negate = true;
   4449 	  /* FALLTHRU */
   4450 
   4451 	case LT:
   4452 	case LTU:
   4453 	  std::swap (cop0, cop1);
   4454 	  code = swap_condition (code);
   4455 	  break;
   4456 
   4457 	default:
   4458 	  gcc_unreachable ();
   4459 	}
   4460 
   4461       /* Only SSE4.1/SSE4.2 supports V2DImode.  */
   4462       if (mode == V2DImode)
   4463 	{
   4464 	  switch (code)
   4465 	    {
   4466 	    case EQ:
   4467 	      /* SSE4.1 supports EQ.  */
   4468 	      if (!TARGET_SSE4_1)
   4469 		return NULL;
   4470 	      break;
   4471 
   4472 	    case GT:
   4473 	    case GTU:
   4474 	      /* SSE4.2 supports GT/GTU.  */
   4475 	      if (!TARGET_SSE4_2)
   4476 		return NULL;
   4477 	      break;
   4478 
   4479 	    default:
   4480 	      gcc_unreachable ();
   4481 	    }
   4482 	}
   4483 
   4484       rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
   4485       rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
   4486       if (*negate)
   4487 	std::swap (optrue, opfalse);
   4488 
   4489       /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
   4490 	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
   4491 	 min (x, y) == x).  While we add one instruction (the minimum),
   4492 	 we remove the need for two instructions in the negation, as the
   4493 	 result is done this way.
   4494 	 When using masks, do it for SI/DImode element types, as it is shorter
   4495 	 than the two subtractions.  */
   4496       if ((code != EQ
   4497 	   && GET_MODE_SIZE (mode) != 64
   4498 	   && vector_all_ones_operand (opfalse, data_mode)
   4499 	   && optrue == CONST0_RTX (data_mode))
   4500 	  || (code == GTU
   4501 	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
   4502 	      /* Don't do it if not using integer masks and we'd end up with
   4503 		 the right values in the registers though.  */
   4504 	      && (GET_MODE_SIZE (mode) == 64
   4505 		  || !vector_all_ones_operand (optrue, data_mode)
   4506 		  || opfalse != CONST0_RTX (data_mode))))
   4507 	{
   4508 	  rtx (*gen) (rtx, rtx, rtx) = NULL;
   4509 
   4510 	  switch (mode)
   4511 	    {
   4512 	    case E_V16SImode:
   4513 	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
   4514 	      break;
   4515 	    case E_V8DImode:
   4516 	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
   4517 	      cop0 = force_reg (mode, cop0);
   4518 	      cop1 = force_reg (mode, cop1);
   4519 	      break;
   4520 	    case E_V32QImode:
   4521 	      if (TARGET_AVX2)
   4522 		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
   4523 	      break;
   4524 	    case E_V16HImode:
   4525 	      if (TARGET_AVX2)
   4526 		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
   4527 	      break;
   4528 	    case E_V8SImode:
   4529 	      if (TARGET_AVX2)
   4530 		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
   4531 	      break;
   4532 	    case E_V4DImode:
   4533 	      if (TARGET_AVX512VL)
   4534 		{
   4535 		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
   4536 		  cop0 = force_reg (mode, cop0);
   4537 		  cop1 = force_reg (mode, cop1);
   4538 		}
   4539 	      break;
   4540 	    case E_V16QImode:
   4541 	      if (code == GTU && TARGET_SSE2)
   4542 		gen = gen_uminv16qi3;
   4543 	      else if (code == GT && TARGET_SSE4_1)
   4544 		gen = gen_sminv16qi3;
   4545 	      break;
   4546 	    case E_V8QImode:
   4547 	      if (code == GTU && TARGET_SSE2)
   4548 		gen = gen_uminv8qi3;
   4549 	      else if (code == GT && TARGET_SSE4_1)
   4550 		gen = gen_sminv8qi3;
   4551 	      break;
   4552 	    case E_V4QImode:
   4553 	      if (code == GTU && TARGET_SSE2)
   4554 		gen = gen_uminv4qi3;
   4555 	      else if (code == GT && TARGET_SSE4_1)
   4556 		gen = gen_sminv4qi3;
   4557 	      break;
   4558 	    case E_V2QImode:
   4559 	      if (code == GTU && TARGET_SSE2)
   4560 		gen = gen_uminv2qi3;
   4561 	      else if (code == GT && TARGET_SSE4_1)
   4562 		gen = gen_sminv2qi3;
   4563 	      break;
   4564 	    case E_V8HImode:
   4565 	      if (code == GTU && TARGET_SSE4_1)
   4566 		gen = gen_uminv8hi3;
   4567 	      else if (code == GT && TARGET_SSE2)
   4568 		gen = gen_sminv8hi3;
   4569 	      break;
   4570 	    case E_V4HImode:
   4571 	      if (code == GTU && TARGET_SSE4_1)
   4572 		gen = gen_uminv4hi3;
   4573 	      else if (code == GT && TARGET_SSE2)
   4574 		gen = gen_sminv4hi3;
   4575 	      break;
   4576 	    case E_V2HImode:
   4577 	      if (code == GTU && TARGET_SSE4_1)
   4578 		gen = gen_uminv2hi3;
   4579 	      else if (code == GT && TARGET_SSE2)
   4580 		gen = gen_sminv2hi3;
   4581 	      break;
   4582 	    case E_V4SImode:
   4583 	      if (TARGET_SSE4_1)
   4584 		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
   4585 	      break;
   4586 	    case E_V2SImode:
   4587 	      if (TARGET_SSE4_1)
   4588 		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
   4589 	      break;
   4590 	    case E_V2DImode:
   4591 	      if (TARGET_AVX512VL)
   4592 		{
   4593 		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
   4594 		  cop0 = force_reg (mode, cop0);
   4595 		  cop1 = force_reg (mode, cop1);
   4596 		}
   4597 	      break;
   4598 	    default:
   4599 	      break;
   4600 	    }
   4601 
   4602 	  if (gen)
   4603 	    {
   4604 	      rtx tem = gen_reg_rtx (mode);
   4605 	      if (!vector_operand (cop0, mode))
   4606 		cop0 = force_reg (mode, cop0);
   4607 	      if (!vector_operand (cop1, mode))
   4608 		cop1 = force_reg (mode, cop1);
   4609 	      *negate = !*negate;
   4610 	      emit_insn (gen (tem, cop0, cop1));
   4611 	      cop1 = tem;
   4612 	      code = EQ;
   4613 	    }
   4614 	}
   4615 
   4616       /* Unsigned parallel compare is not supported by the hardware.
   4617 	 Play some tricks to turn this into a signed comparison
   4618 	 against 0.  */
   4619       if (code == GTU)
   4620 	{
   4621 	  cop0 = force_reg (mode, cop0);
   4622 
   4623 	  switch (mode)
   4624 	    {
   4625 	    case E_V16SImode:
   4626 	    case E_V8DImode:
   4627 	    case E_V8SImode:
   4628 	    case E_V4DImode:
   4629 	    case E_V4SImode:
   4630 	    case E_V2SImode:
   4631 	    case E_V2DImode:
   4632 		{
   4633 		  rtx t1, t2, mask;
   4634 
   4635 		  /* Subtract (-(INT MAX) - 1) from both operands to make
   4636 		     them signed.  */
   4637 		  mask = ix86_build_signbit_mask (mode, true, false);
   4638 		  t1 = gen_reg_rtx (mode);
   4639 		  emit_insn (gen_sub3_insn (t1, cop0, mask));
   4640 
   4641 		  t2 = gen_reg_rtx (mode);
   4642 		  emit_insn (gen_sub3_insn (t2, cop1, mask));
   4643 
   4644 		  cop0 = t1;
   4645 		  cop1 = t2;
   4646 		  code = GT;
   4647 		}
   4648 	      break;
   4649 
   4650 	    case E_V64QImode:
   4651 	    case E_V32HImode:
   4652 	    case E_V32QImode:
   4653 	    case E_V16HImode:
   4654 	    case E_V16QImode:
   4655 	    case E_V8QImode:
   4656 	    case E_V4QImode:
   4657 	    case E_V2QImode:
   4658 	    case E_V8HImode:
   4659 	    case E_V4HImode:
   4660 	    case E_V2HImode:
   4661 	      /* Perform a parallel unsigned saturating subtraction.  */
   4662 	      x = gen_reg_rtx (mode);
   4663 	      emit_insn (gen_rtx_SET
   4664 			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
   4665 	      cop0 = x;
   4666 	      cop1 = CONST0_RTX (mode);
   4667 	      code = EQ;
   4668 	      *negate = !*negate;
   4669 	      break;
   4670 
   4671 	    default:
   4672 	      gcc_unreachable ();
   4673 	    }
   4674 	}
   4675     }
   4676 
   4677   if (*negate)
   4678     std::swap (op_true, op_false);
   4679 
   4680   /* Allow the comparison to be done in one mode, but the movcc to
   4681      happen in another mode.  */
   4682   if (data_mode == mode)
   4683     {
   4684       x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
   4685 			       op_true, op_false);
   4686     }
   4687   else
   4688     {
   4689       gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
   4690       x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
   4691 			       op_true, op_false);
   4692       if (GET_MODE (x) == mode)
   4693 	x = gen_lowpart (data_mode, x);
   4694     }
   4695 
   4696   return x;
   4697 }
   4698 
   4699 /* Expand integer vector comparison.  */
   4700 
   4701 bool
   4702 ix86_expand_int_vec_cmp (rtx operands[])
   4703 {
   4704   rtx_code code = GET_CODE (operands[1]);
   4705   bool negate = false;
   4706   rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
   4707 				     operands[3], NULL, NULL, &negate);
   4708 
   4709   if (!cmp)
   4710     return false;
   4711 
   4712   if (negate)
   4713     cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
   4714 				   CONST0_RTX (GET_MODE (cmp)),
   4715 				   NULL, NULL, &negate);
   4716 
   4717   gcc_assert (!negate);
   4718 
   4719   if (operands[0] != cmp)
   4720     emit_move_insn (operands[0], cmp);
   4721 
   4722   return true;
   4723 }
   4724 
   4725 /* Expand a floating-point vector conditional move; a vcond operation
   4726    rather than a movcc operation.  */
   4727 
bool
ix86_expand_fp_vcond (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[3]);
  rtx cmp;

  /* Canonicalize the comparison so a single SSE/AVX compare insn can
     implement it; this may swap operands[4] and operands[5].  UNKNOWN
     means no single compare exists and we must combine two.  */
  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[4], &operands[5]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[3]))
	{
	case LTGT:
	  /* a LTGT b is expanded as (a ORDERED b) AND (a NE b).  */
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = AND;
	  break;
	case UNEQ:
	  /* a UNEQ b is expanded as (a UNORDERED b) IOR (a EQ b).  */
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
				      operands[5], operands[0], operands[0]);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
				     operands[5], operands[1], operands[2]);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      /* Combine the two partial masks into the final condition mask.  */
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
      ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
      return true;
    }

  /* When the condition selects between its own comparison operands, the
     whole vcond may reduce to a single min/max instruction.  */
  if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
				 operands[5], operands[1], operands[2]))
    return true;

  /* General case: compute the mask, then blend the two arms with it.  */
  cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
			     operands[1], operands[2]);
  ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
  return true;
}
   4773 
   4774 /* Expand a signed/unsigned integral vector conditional move.  */
   4775 
bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.
     The index tricks below rely on GE being the inverse of LT: for GE
     the constant-zero arm is operands[1], for LT it is operands[2].  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      /* NEGOP is the non-zero arm; SHIFT moves the sign bit into the
	 lowest (LSHIFTRT) or all (ASHIFTRT) bit positions.  */
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  /* x < 0 ? 1 : 0 — logical shift leaves just the sign bit.  */
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  /* x < 0 ? -1 : 0 — arithmetic shift broadcasts the sign bit.
	     DImode elements are excluded from this path.  */
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  /* Legitimize the operands before handing them to the SSE compare
     expander.  */
  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  /* If the compare came back negated, swapping the two arms of the
     select yields the originally requested result.  */
  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
   4837 
/* Try to expand a two-source variable permutation with a single AVX-512
   vpermt2 instruction.  Called either from the constant permute path with
   D non-null, or from ix86_expand_vec_perm with D null and TARGET, MASK,
   OP0, OP1 supplied directly.  Returns true if an insn was emitted,
   false if no suitable vpermt2 pattern exists for the mode/ISA.  */

static bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  /* Float modes take an integer selector of the same width; MASKMODE is
     adjusted below for those cases.  */
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  /* Pick the vpermt2var pattern matching the element type/width, gated
     on the ISA extension that provides it (QI needs VBMI, HI needs BW,
     128/256-bit forms need VL).  */
  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv16sf3;
	  maskmode = V16SImode;
	}
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv8df3;
	  maskmode = V8DImode;
	}
      break;
    default:
      break;
    }

  if (gen == NULL)
    return false;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      /* 64 entries: V64QImode has the most elements of any mode above.  */
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
   4963 
   4964 /* Expand a variable vector permutation.  */
   4965 
void
ix86_expand_vec_perm (rtx operands[])
{
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  rtx mask = operands[3];
  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
  machine_mode mode = GET_MODE (op0);
  machine_mode maskmode = GET_MODE (mask);
  int w, e, i;
  bool one_operand_shuffle = rtx_equal_p (op0, op1);

  /* Number of elements in the vector.  */
  w = GET_MODE_NUNITS (mode);
  /* Size of one vector element in bytes.  */
  e = GET_MODE_UNIT_SIZE (mode);
  gcc_assert (w <= 64);

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (mode) == HFmode)
    {
      machine_mode orig_mode = mode;
      mode = mode_for_vector (HImode, w).require ();
      target = lowpart_subreg (mode, target, orig_mode);
      op0 = lowpart_subreg (mode, op0, orig_mode);
      op1 = lowpart_subreg (mode, op1, orig_mode);
    }

  /* AVX-512F provides single-insn variable permutes for one-operand
     shuffles of these 512-bit modes.  */
  if (TARGET_AVX512F && one_operand_shuffle)
    {
      rtx (*gen) (rtx, rtx, rtx) = NULL;
      switch (mode)
	{
	case E_V16SImode:
	  gen =gen_avx512f_permvarv16si;
	  break;
	case E_V16SFmode:
	  gen = gen_avx512f_permvarv16sf;
	  break;
	case E_V8DImode:
	  gen = gen_avx512f_permvarv8di;
	  break;
	case E_V8DFmode:
	  gen = gen_avx512f_permvarv8df;
	  break;
	default:
	  break;
	}
      if (gen != NULL)
	{
	  emit_insn (gen (target, op0, mask));
	  return;
	}
    }

  /* Next preference: a single two-source vpermt2 insn, if available.  */
  if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
    return;

  if (TARGET_AVX2)
    {
      if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
	{
	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	     a constant shuffle operand.  With a tiny bit of effort we can
	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
	     unfortunate but there's no avoiding it.
	     Similarly for V16HImode we don't have instructions for variable
	     shuffling, while for V32QImode we can use after preparing suitable
	     masks vpshufb; vpshufb; vpermq; vpor.  */

	  if (mode == V16HImode)
	    {
	      maskmode = mode = V32QImode;
	      w = 32;
	      e = 1;
	    }
	  else
	    {
	      maskmode = mode = V8SImode;
	      w = 8;
	      e = 4;
	    }
	  t1 = gen_reg_rtx (maskmode);

	  /* Replicate the low bits of the V4DImode mask into V8SImode:
	       mask = { A B C D }
	       t1 = { A A B B C C D D }.  */
	  for (i = 0; i < w / 2; ++i)
	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
	  vt = force_reg (maskmode, vt);
	  mask = gen_lowpart (maskmode, mask);
	  if (maskmode == V8SImode)
	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
	  else
	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));

	  /* Multiply the shuffle indices by two (via t1 + t1).  */
	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
				    OPTAB_DIRECT);

	  /* Add one to the odd shuffle indices:
		t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
	  for (i = 0; i < w / 2; ++i)
	    {
	      vec[i * 2] = const0_rtx;
	      vec[i * 2 + 1] = const1_rtx;
	    }
	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
	  vt = validize_mem (force_const_mem (maskmode, vt));
	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
				    OPTAB_DIRECT);

	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
	  operands[3] = mask = t1;
	  target = gen_reg_rtx (mode);
	  op0 = gen_lowpart (mode, op0);
	  op1 = gen_lowpart (mode, op1);
	}

      switch (mode)
	{
	case E_V8SImode:
	  /* The VPERMD and VPERMPS instructions already properly ignore
	     the high bits of the shuffle elements.  No need for us to
	     perform an AND ourselves.  */
	  if (one_operand_shuffle)
	    {
	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	    }
	  else
	    {
	      /* Shuffle each source separately, then blend at merge_two.  */
	      t1 = gen_reg_rtx (V8SImode);
	      t2 = gen_reg_rtx (V8SImode);
	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V8SFmode:
	  mask = gen_lowpart (V8SImode, mask);
	  if (one_operand_shuffle)
	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
	  else
	    {
	      t1 = gen_reg_rtx (V8SFmode);
	      t2 = gen_reg_rtx (V8SFmode);
	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

        case E_V4SImode:
	  /* By combining the two 128-bit input vectors into one 256-bit
	     input vector, we can use VPERMD and VPERMPS for the full
	     two-operand shuffle.  */
	  t1 = gen_reg_rtx (V8SImode);
	  t2 = gen_reg_rtx (V8SImode);
	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
	  return;

        case E_V4SFmode:
	  /* Same 128->256-bit concat trick as V4SImode, using VPERMPS.  */
	  t1 = gen_reg_rtx (V8SFmode);
	  t2 = gen_reg_rtx (V8SImode);
	  mask = gen_lowpart (V4SImode, mask);
	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
	  return;

	case E_V32QImode:
	  t1 = gen_reg_rtx (V32QImode);
	  t2 = gen_reg_rtx (V32QImode);
	  t3 = gen_reg_rtx (V32QImode);
	  /* vt = all bytes 0x80, vt2 = 0x80 in the low lane, 0 in the
	     high lane.  */
	  vt2 = GEN_INT (-128);
	  vt = gen_const_vec_duplicate (V32QImode, vt2);
	  vt = force_reg (V32QImode, vt);
	  for (i = 0; i < 32; i++)
	    vec[i] = i < 16 ? vt2 : const0_rtx;
	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
	  vt2 = force_reg (V32QImode, vt2);
	  /* From mask create two adjusted masks, which contain the same
	     bits as mask in the low 7 bits of each vector element.
	     The first mask will have the most significant bit clear
	     if it requests element from the same 128-bit lane
	     and MSB set if it requests element from the other 128-bit lane.
	     The second mask will have the opposite values of the MSB,
	     and additionally will have its 128-bit lanes swapped.
	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
	     stands for other 12 bytes.  */
	  /* The bit whether element is from the same lane or the other
	     lane is bit 4, so shift it up by 3 to the MSB position.  */
	  t5 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
				    GEN_INT (3)));
	  /* Clear MSB bits from the mask just in case it had them set.  */
	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
	  /* After this t1 will have MSB set for elements from other lane.  */
	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
	  /* Clear bits other than MSB.  */
	  emit_insn (gen_andv32qi3 (t1, t1, vt));
	  /* Or in the lower bits from mask into t3.  */
	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
	  /* And invert MSB bits in t1, so MSB is set for elements from the same
	     lane.  */
	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
	  /* Swap 128-bit lanes in t3.  */
	  t6 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  /* And or in the lower bits from mask into t1.  */
	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
	  if (one_operand_shuffle)
	    {
	      /* Each of these shuffles will put 0s in places where
		 element from the other 128-bit lane is needed, otherwise
		 will shuffle in the requested value.  */
	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
						gen_lowpart (V32QImode, t6)));
	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
	      /* For t3 the 128-bit lanes are swapped again.  */
	      t7 = gen_reg_rtx (V4DImode);
	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
					      const2_rtx, GEN_INT (3),
					      const0_rtx, const1_rtx));
	      /* And oring both together leads to the result.  */
	      emit_insn (gen_iorv32qi3 (target, t1,
					gen_lowpart (V32QImode, t7)));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	      return;
	    }

	  t4 = gen_reg_rtx (V32QImode);
	  /* Similarly to the above one_operand_shuffle code,
	     just repeated twice for each operand.  The merge_two:
	     code will merge the two results together.  */
	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
	  t7 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  t8 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
	  t1 = t4;
	  t2 = t3;
	  goto merge_two;

	default:
	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
	  break;
	}
    }

  if (TARGET_XOP)
    {
      /* The XOP VPPERM insn supports three inputs.  By ignoring the
	 one_operand_shuffle special case, we avoid creating another
	 set of constant vectors in memory.  */
      one_operand_shuffle = false;

      /* mask = mask & {2*w-1, ...} */
      vt = GEN_INT (2*w - 1);
    }
  else
    {
      /* mask = mask & {w-1, ...} */
      vt = GEN_INT (w - 1);
    }

  vt = gen_const_vec_duplicate (maskmode, vt);
  mask = expand_simple_binop (maskmode, AND, mask, vt,
			      NULL_RTX, 0, OPTAB_DIRECT);

  /* For non-QImode operations, convert the word permutation control
     into a byte permutation control.  */
  if (mode != V16QImode)
    {
      /* Scale element indices to byte indices: index *= e.  */
      mask = expand_simple_binop (maskmode, ASHIFT, mask,
				  GEN_INT (exact_log2 (e)),
				  NULL_RTX, 0, OPTAB_DIRECT);

      /* Convert mask to vector of chars.  */
      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));

      /* Replicate each of the input bytes into byte positions:
	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i/e * e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      if (TARGET_XOP)
	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
      else
	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));

      /* Convert it into the byte positions by doing
	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i % e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      emit_insn (gen_addv16qi3 (mask, mask, vt));
    }

  /* The actual shuffle operations all operate on V16QImode.  */
  op0 = gen_lowpart (V16QImode, op0);
  op1 = gen_lowpart (V16QImode, op1);

  if (TARGET_XOP)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_xop_pperm (target, op0, op1, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else if (one_operand_shuffle)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else
    {
      rtx xops[6];
      bool ok;

      /* Shuffle the two input vectors independently.  */
      t1 = gen_reg_rtx (V16QImode);
      t2 = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));

 merge_two:
      /* Then merge them together.  The key is whether any given control
         element contained a bit set that indicates the second word.
	 Note: entered via goto from the AVX2 cases above, where t1/t2
	 hold the per-source shuffle results and operands[3] the original
	 (element-granular) mask.  */
      mask = operands[3];
      vt = GEN_INT (w);
      if (maskmode == V2DImode && !TARGET_SSE4_1)
	{
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At which point the masking done by
	     expand_int_vcond will work as desired.  */
	  rtx t3 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
				        const0_rtx, const0_rtx,
				        const2_rtx, const2_rtx));
	  mask = t3;
	  maskmode = V4SImode;
	  e = w = 4;
	}

      vt = gen_const_vec_duplicate (maskmode, vt);
      vt = force_reg (maskmode, vt);
      /* Isolate the "which source" bit of each control element.  */
      mask = expand_simple_binop (maskmode, AND, mask, vt,
				  NULL_RTX, 0, OPTAB_DIRECT);

      /* Blend: select t2 where (mask & w) == w, else t1.  */
      if (GET_MODE (target) != mode)
	target = gen_reg_rtx (mode);
      xops[0] = target;
      xops[1] = gen_lowpart (mode, t2);
      xops[2] = gen_lowpart (mode, t1);
      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
      xops[4] = mask;
      xops[5] = vt;
      ok = ix86_expand_int_vcond (xops);
      gcc_assert (ok);
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
}
   5368 
   5369 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
   5370    true if we should do zero extension, else sign extension.  HIGH_P is
   5371    true if we want the N/2 high elements, else the low elements.  */
   5372 
   5373 void
   5374 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
   5375 {
   5376   machine_mode imode = GET_MODE (src);
   5377   rtx tmp;
   5378 
   5379   if (TARGET_SSE4_1)
   5380     {
   5381       rtx (*unpack)(rtx, rtx);
   5382       rtx (*extract)(rtx, rtx) = NULL;
   5383       machine_mode halfmode = BLKmode;
   5384 
   5385       switch (imode)
   5386 	{
   5387 	case E_V64QImode:
   5388 	  if (unsigned_p)
   5389 	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
   5390 	  else
   5391 	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
   5392 	  halfmode = V32QImode;
   5393 	  extract
   5394 	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
   5395 	  break;
   5396 	case E_V32QImode:
   5397 	  if (unsigned_p)
   5398 	    unpack = gen_avx2_zero_extendv16qiv16hi2;
   5399 	  else
   5400 	    unpack = gen_avx2_sign_extendv16qiv16hi2;
   5401 	  halfmode = V16QImode;
   5402 	  extract
   5403 	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
   5404 	  break;
   5405 	case E_V32HImode:
   5406 	  if (unsigned_p)
   5407 	    unpack = gen_avx512f_zero_extendv16hiv16si2;
   5408 	  else
   5409 	    unpack = gen_avx512f_sign_extendv16hiv16si2;
   5410 	  halfmode = V16HImode;
   5411 	  extract
   5412 	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
   5413 	  break;
   5414 	case E_V16HImode:
   5415 	  if (unsigned_p)
   5416 	    unpack = gen_avx2_zero_extendv8hiv8si2;
   5417 	  else
   5418 	    unpack = gen_avx2_sign_extendv8hiv8si2;
   5419 	  halfmode = V8HImode;
   5420 	  extract
   5421 	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
   5422 	  break;
   5423 	case E_V16SImode:
   5424 	  if (unsigned_p)
   5425 	    unpack = gen_avx512f_zero_extendv8siv8di2;
   5426 	  else
   5427 	    unpack = gen_avx512f_sign_extendv8siv8di2;
   5428 	  halfmode = V8SImode;
   5429 	  extract
   5430 	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
   5431 	  break;
   5432 	case E_V8SImode:
   5433 	  if (unsigned_p)
   5434 	    unpack = gen_avx2_zero_extendv4siv4di2;
   5435 	  else
   5436 	    unpack = gen_avx2_sign_extendv4siv4di2;
   5437 	  halfmode = V4SImode;
   5438 	  extract
   5439 	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
   5440 	  break;
   5441 	case E_V16QImode:
   5442 	  if (unsigned_p)
   5443 	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
   5444 	  else
   5445 	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
   5446 	  break;
   5447 	case E_V8HImode:
   5448 	  if (unsigned_p)
   5449 	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
   5450 	  else
   5451 	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
   5452 	  break;
   5453 	case E_V4SImode:
   5454 	  if (unsigned_p)
   5455 	    unpack = gen_sse4_1_zero_extendv2siv2di2;
   5456 	  else
   5457 	    unpack = gen_sse4_1_sign_extendv2siv2di2;
   5458 	  break;
   5459 	case E_V8QImode:
   5460 	  if (unsigned_p)
   5461 	    unpack = gen_sse4_1_zero_extendv4qiv4hi2;
   5462 	  else
   5463 	    unpack = gen_sse4_1_sign_extendv4qiv4hi2;
   5464 	  break;
   5465 	case E_V4HImode:
   5466 	  if (unsigned_p)
   5467 	    unpack = gen_sse4_1_zero_extendv2hiv2si2;
   5468 	  else
   5469 	    unpack = gen_sse4_1_sign_extendv2hiv2si2;
   5470 	  break;
   5471 	case E_V4QImode:
   5472 	  if (unsigned_p)
   5473 	    unpack = gen_sse4_1_zero_extendv2qiv2hi2;
   5474 	  else
   5475 	    unpack = gen_sse4_1_sign_extendv2qiv2hi2;
   5476 	  break;
   5477 	default:
   5478 	  gcc_unreachable ();
   5479 	}
   5480 
   5481       if (GET_MODE_SIZE (imode) >= 32)
   5482 	{
   5483 	  tmp = gen_reg_rtx (halfmode);
   5484 	  emit_insn (extract (tmp, src));
   5485 	}
   5486       else if (high_p)
   5487 	{
   5488 	  switch (GET_MODE_SIZE (imode))
   5489 	    {
   5490 	    case 16:
   5491 	      /* Shift higher 8 bytes to lower 8 bytes.  */
   5492 	      tmp = gen_reg_rtx (V1TImode);
   5493 	      emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
   5494 					     GEN_INT (64)));
   5495 	      break;
   5496 	    case 8:
   5497 	      /* Shift higher 4 bytes to lower 4 bytes.  */
   5498 	      tmp = gen_reg_rtx (V1DImode);
   5499 	      emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
   5500 					    GEN_INT (32)));
   5501 	      break;
   5502 	    case 4:
   5503 	      /* Shift higher 2 bytes to lower 2 bytes.  */
   5504 	      tmp = gen_reg_rtx (V1SImode);
   5505 	      emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
   5506 					    GEN_INT (16)));
   5507 	      break;
   5508 	    default:
   5509 	      gcc_unreachable ();
   5510 	    }
   5511 
   5512 	  tmp = gen_lowpart (imode, tmp);
   5513 	}
   5514       else
   5515 	tmp = src;
   5516 
   5517       emit_insn (unpack (dest, tmp));
   5518     }
   5519   else
   5520     {
   5521       rtx (*unpack)(rtx, rtx, rtx);
   5522 
   5523       switch (imode)
   5524 	{
   5525 	case E_V16QImode:
   5526 	  if (high_p)
   5527 	    unpack = gen_vec_interleave_highv16qi;
   5528 	  else
   5529 	    unpack = gen_vec_interleave_lowv16qi;
   5530 	  break;
   5531 	case E_V8HImode:
   5532 	  if (high_p)
   5533 	    unpack = gen_vec_interleave_highv8hi;
   5534 	  else
   5535 	    unpack = gen_vec_interleave_lowv8hi;
   5536 	  break;
   5537 	case E_V4SImode:
   5538 	  if (high_p)
   5539 	    unpack = gen_vec_interleave_highv4si;
   5540 	  else
   5541 	    unpack = gen_vec_interleave_lowv4si;
   5542 	  break;
   5543 	case E_V8QImode:
   5544 	  if (high_p)
   5545 	    unpack = gen_mmx_punpckhbw;
   5546 	  else
   5547 	    unpack = gen_mmx_punpcklbw;
   5548 	  break;
   5549 	case E_V4HImode:
   5550 	  if (high_p)
   5551 	    unpack = gen_mmx_punpckhwd;
   5552 	  else
   5553 	    unpack = gen_mmx_punpcklwd;
   5554 	  break;
   5555 	case E_V4QImode:
   5556 	  if (high_p)
   5557 	    unpack = gen_mmx_punpckhbw_low;
   5558 	  else
   5559 	    unpack = gen_mmx_punpcklbw_low;
   5560 	  break;
   5561 	default:
   5562 	  gcc_unreachable ();
   5563 	}
   5564 
   5565       if (unsigned_p)
   5566 	tmp = force_reg (imode, CONST0_RTX (imode));
   5567       else
   5568 	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
   5569 				   src, pc_rtx, pc_rtx);
   5570 
   5571       rtx tmp2 = gen_reg_rtx (imode);
   5572       emit_insn (unpack (tmp2, src, tmp));
   5573       emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
   5574     }
   5575 }
   5576 
/* Return true if MEM is a constant pool reference that contains a
   CONST_VECTOR permutation index; if so, store its elements into PERM.  */
   5579 bool
   5580 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
   5581 {
   5582   machine_mode mode = GET_MODE (mem);
   5583   int nelt = GET_MODE_NUNITS (mode);
   5584 
   5585   if (!INTEGRAL_MODE_P (mode))
   5586     return false;
   5587 
   5588     /* Needs to be constant pool.  */
   5589   if (!(MEM_P (mem))
   5590       || !SYMBOL_REF_P (XEXP (mem, 0))
   5591       || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
   5592    return false;
   5593 
   5594   rtx constant = get_pool_constant (XEXP (mem, 0));
   5595 
   5596   if (GET_CODE (constant) != CONST_VECTOR)
   5597     return false;
   5598 
   5599   /* There could be some rtx like
   5600      (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
   5601      but with "*.LC1" refer to V2DI constant vector.  */
   5602   if (GET_MODE (constant) != mode)
   5603     {
   5604       constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
   5605 
   5606       if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
   5607 	return false;
   5608     }
   5609 
   5610   for (int i = 0; i != nelt; i++)
   5611     perm[i] = UINTVAL (XVECEXP (constant, 0, i));
   5612 
   5613   return true;
   5614 }
   5615 
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating point parameters and nonoffsetable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally four parts are generated.  */
   5620 
static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  /* Number of word-sized parts: 4-byte words on 32-bit targets (the
     12-byte XFmode always counts as 3), 8-byte words on 64-bit targets
     (rounding up so the 80-bit XFmode counts as 2).  */
  if (!TARGET_64BIT)
    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsetable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      /* Every part of a push uses the same auto-modified stack address,
	 retyped to word_mode.  */
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
	 the operand may actually have a different mode now.  That's
	 ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
	{
	  int i;

	  if (REG_P (operand))
	    {
	      /* Consecutive SImode hard registers; only valid after
		 reload when register numbers are final.  */
	      gcc_assert (reload_completed);
	      for (i = 0; i < size; i++)
		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      /* SImode memory pieces at consecutive 4-byte offsets.  */
	      operand = adjust_address (operand, SImode, 0);
	      parts[0] = operand;
	      for (i = 1; i < size; i++)
		parts[i] = adjust_address (operand, SImode, 4 * i);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      /* Decompose a floating-point constant into its 32-bit
		 target words; parts[0] and parts[1] are shared by all
		 modes below.  */
	      const REAL_VALUE_TYPE *r;
	      long l[4];

	      r = CONST_DOUBLE_REAL_VALUE (operand);
	      switch (mode)
		{
		case E_TFmode:
		  real_to_target (l, r, mode);
		  parts[3] = gen_int_mode (l[3], SImode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_XFmode:
		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
		     long double may not be 80-bit.  */
		  real_to_target (l, r, mode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_DFmode:
		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
		  break;
		default:
		  gcc_unreachable ();
		}
	      parts[1] = gen_int_mode (l[1], SImode);
	      parts[0] = gen_int_mode (l[0], SImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }
  else
    {
      if (mode == TImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
	{
	  /* The upper part of XFmode is only 4 bytes (SImode); TFmode
	     has a full DImode upper half.  */
	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, DImode, 0);
	      parts[0] = operand;
	      parts[1] = adjust_address (operand, upper_mode, 8);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      long l[4];

	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

	      /* real_to_target puts 32-bit pieces in each long.  */
	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
					  << 32), DImode);

	      if (upper_mode == SImode)
	        parts[1] = gen_int_mode (l[2], SImode);
	      else
	        parts[1]
		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
				     << 32), DImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  return size;
}
   5761 
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   All required insns are emitted here; callers need no further moves.
   Operands 2 and up receive the destination parts and operands 6 and
   up the source parts, in the correct copy order.  */
   5766 
void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
	 fp moves, that force all constants to memory to allow combining.  */

      if (MEM_P (operands[1])
	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
	operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
	{
	  operands[0] = copy_rtx (operands[0]);
	  PUT_MODE (operands[0], word_mode);
	}
      else
        operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
		|| offsettable_memref_p (operands[0]));

  /* Split both source and destination into word-sized parts.  */
  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
	src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
	 automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
	part[1][i] = change_address (part[1][i],
				     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      /* Record which destination parts are mentioned in the source
	 address.  */
      for (i = 0; i < nparts; i++)
	{
	  collisionparts[i]
	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
	  if (collisionparts[i])
	    collisions++;
	}

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts [1])
	{
	  std::swap (part[0][1], part[0][2]);
	  std::swap (part[1][1], part[1][2]);
	}
      else if (collisions == 1
	       && nparts == 4
	       && (collisionparts [1] || collisionparts [2]))
	{
	  if (collisionparts [1])
	    {
	      std::swap (part[0][1], part[0][2]);
	      std::swap (part[1][1], part[1][2]);
	    }
	  else
	    {
	      std::swap (part[0][2], part[0][3]);
	      std::swap (part[1][2], part[1][3]);
	    }
	}

      /* If there are more collisions, we can't handle it by reordering.
	 Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
	{
	  rtx base, addr;

	  collisions = 1;

	  base = part[0][nparts - 1];

	  /* Handle the case when the last part isn't valid for lea.
	     Happens in 64-bit mode storing the 12-byte XFmode.  */
	  if (GET_MODE (base) != Pmode)
	    base = gen_rtx_REG (Pmode, REGNO (base));

	  addr = XEXP (part[1][0], 0);
	  if (TARGET_TLS_DIRECT_SEG_REFS)
	    {
	      struct ix86_address parts;
	      int ok = ix86_decompose_address (addr, &parts);
	      gcc_assert (ok);
	      /* It is not valid to use %gs: or %fs: in lea.  */
	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
	    }
	  /* Load the source address into BASE, then rewrite every
	     source part to be a constant offset from BASE.  */
	  emit_insn (gen_rtx_SET (base, addr));
	  part[1][0] = replace_equiv_address (part[1][0], base);
	  for (i = 1; i < nparts; i++)
	    {
	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
	      part[1][i] = replace_equiv_address (part[1][i], tmp);
	    }
	}
    }

  if (push)
    {
      /* Emit the upper parts from the highest down; the two lowest
	 parts are handled by the common moves after this block.  */
      if (!TARGET_64BIT)
	{
	  if (nparts == 3)
	    {
	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
                emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	  else if (nparts == 4)
	    {
	      emit_move_insn (part[0][3], part[1][3]);
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	}
      else
	{
	  /* In 64bit mode we don't have 32bit push available.  In case this is
	     register, it is OK - we will just use larger counterpart.  We also
	     retype memory - these comes from attempt to avoid REX prefix on
	     moving of second half of TFmode value.  */
	  if (GET_MODE (part[1][1]) == SImode)
	    {
	      switch (GET_CODE (part[1][1]))
		{
		case MEM:
		  part[1][1] = adjust_address (part[1][1], DImode, 0);
		  break;

		case REG:
		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
		  break;

		default:
		  gcc_unreachable ();
		}

	      if (GET_MODE (part[1][0]) == SImode)
		part[1][0] = part[1][1];
	    }
	}
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
	   || (nparts == 3
	       && REGNO (part[0][0]) == REGNO (part[1][2]))
	   || (nparts == 4
	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      /* Copy from the highest part down.  */
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
	{
	  operands[2 + i] = part[0][j];
	  operands[6 + i] = part[1][j];
	}
    }
  else
    {
      for (i = 0; i < nparts; i++)
	{
	  operands[2 + i] = part[0][i];
	  operands[6 + i] = part[1][i];
	}
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
	if (CONST_INT_P (operands[6 + j])
	    && operands[6 + j] != const0_rtx
	    && REG_P (operands[2 + j]))
	  for (i = j; i < nparts - 1; i++)
	    if (CONST_INT_P (operands[7 + i])
		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
	      operands[7 + i] = operands[2 + j];
    }

  /* Emit the part-wise moves in the order chosen above.  */
  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);

  return;
}
   5990 
   5991 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
   5992    left shift by a constant, either using a single shift or
   5993    a sequence of add instructions.  */
   5994 
   5995 static void
   5996 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
   5997 {
   5998   if (count == 1
   5999       || (count * ix86_cost->add <= ix86_cost->shift_const
   6000 	  && !optimize_insn_for_size_p ()))
   6001     {
   6002       while (count-- > 0)
   6003 	emit_insn (gen_add2_insn (operand, operand));
   6004     }
   6005   else
   6006     {
   6007       rtx (*insn)(rtx, rtx, rtx);
   6008 
   6009       insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
   6010       emit_insn (insn (operand, operand, GEN_INT (count)));
   6011     }
   6012 }
   6013 
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      /* Constant shift amount.  */
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* Shift of at least a half word: the low input half becomes
	     the high output half (shifted by the remainder) and the
	     low output half becomes zero.  */
	  emit_move_insn (high[0], low[1]);
	  emit_move_insn (low[0], const0_rtx);

	  if (count > half_width)
	    ix86_expand_ashl_const (high[0], count - half_width, mode);
	}
      else
	{
	  /* Small constant shift: shld into the high half, then shift
	     the low half.  */
	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
	  ix86_expand_ashl_const (low[0], count, mode);
	}
      return;
    }

  /* Variable shift amount.  */
  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode capable registers, then 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
	{
	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

	  /* Test the half_width bit of the count, then use setcc on the
	     Z flag so exactly one of the two halves holds 1 before the
	     final shifts.  */
	  ix86_expand_clear (low[0]);
	  ix86_expand_clear (high[0]);
	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

	  d = gen_lowpart (QImode, low[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));

	  d = gen_lowpart (QImode, high[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_NE (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));
	}

      /* Otherwise, we can get the same results by manually performing
	 a bit extract operation on bit 5/6, and then performing the two
	 shifts.  The two methods of getting 0/1 into low/high are exactly
	 the same size.  Avoiding the shift in the bit extract case helps
	 pentium4 a bit; no one else seems to care much either way.  */
      else
	{
	  rtx (*gen_lshr3)(rtx, rtx, rtx);
	  rtx (*gen_and3)(rtx, rtx, rtx);
	  rtx (*gen_xor3)(rtx, rtx, rtx);
	  HOST_WIDE_INT bits;
	  rtx x;

	  if (mode == DImode)
	    {
	      gen_lshr3 = gen_lshrsi3;
	      gen_and3 = gen_andsi3;
	      gen_xor3 = gen_xorsi3;
	      bits = 5;
	    }
	  else
	    {
	      gen_lshr3 = gen_lshrdi3;
	      gen_and3 = gen_anddi3;
	      gen_xor3 = gen_xordi3;
	      bits = 6;
	    }

	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
	  else
	    x = gen_lowpart (half_mode, operands[2]);
	  emit_insn (gen_rtx_SET (high[0], x));

	  /* Extract bit 5/6 of the count into high, and its complement
	     into low.  */
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
	  emit_move_insn (low[0], high[0]);
	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
	}

      /* Shift both halves by the full count; only the half holding 1
	 contributes to the result.  */
      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
	 know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
	emit_move_insn (high[0], low[0]);
      else
	emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      /* General case: shld into the high half by the variable count.  */
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  if (TARGET_CMOVE && scratch)
    {
      /* With cmov, use the adjustment pattern that fixes up counts
	 >= half_width via a zeroed scratch register.  */
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
		 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
   6156 
   6157 void
   6158 ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
   6159 {
   6160   rtx (*gen_ashr3)(rtx, rtx, rtx)
   6161     = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
   6162   rtx (*gen_shrd)(rtx, rtx, rtx);
   6163   int half_width = GET_MODE_BITSIZE (mode) >> 1;
   6164 
   6165   rtx low[2], high[2];
   6166   int count;
   6167 
   6168   if (CONST_INT_P (operands[2]))
   6169     {
   6170       split_double_mode (mode, operands, 2, low, high);
   6171       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
   6172 
   6173       if (count == GET_MODE_BITSIZE (mode) - 1)
   6174 	{
   6175 	  emit_move_insn (high[0], high[1]);
   6176 	  emit_insn (gen_ashr3 (high[0], high[0],
   6177 				GEN_INT (half_width - 1)));
   6178 	  emit_move_insn (low[0], high[0]);
   6179 
   6180 	}
   6181       else if (count >= half_width)
   6182 	{
   6183 	  emit_move_insn (low[0], high[1]);
   6184 	  emit_move_insn (high[0], low[0]);
   6185 	  emit_insn (gen_ashr3 (high[0], high[0],
   6186 				GEN_INT (half_width - 1)));
   6187 
   6188 	  if (count > half_width)
   6189 	    emit_insn (gen_ashr3 (low[0], low[0],
   6190 				  GEN_INT (count - half_width)));
   6191 	}
   6192       else
   6193 	{
   6194 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
   6195 
   6196 	  if (!rtx_equal_p (operands[0], operands[1]))
   6197 	    emit_move_insn (operands[0], operands[1]);
   6198 
   6199 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
   6200 	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
   6201 	}
   6202     }
   6203   else
   6204     {
   6205       machine_mode half_mode;
   6206 
   6207       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
   6208 
   6209      if (!rtx_equal_p (operands[0], operands[1]))
   6210 	emit_move_insn (operands[0], operands[1]);
   6211 
   6212       split_double_mode (mode, operands, 1, low, high);
   6213       half_mode = mode == DImode ? SImode : DImode;
   6214 
   6215       emit_insn (gen_shrd (low[0], high[0], operands[2]));
   6216       emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
   6217 
   6218       if (TARGET_CMOVE && scratch)
   6219 	{
   6220 	  emit_move_insn (scratch, high[0]);
   6221 	  emit_insn (gen_ashr3 (scratch, scratch,
   6222 				GEN_INT (half_width - 1)));
   6223 	  emit_insn (gen_x86_shift_adj_1
   6224 		     (half_mode, low[0], high[0], operands[2], scratch));
   6225 	}
   6226       else
   6227 	emit_insn (gen_x86_shift_adj_3
   6228 		   (half_mode, low[0], high[0], operands[2]));
   6229     }
   6230 }
   6231 
   6232 void
   6233 ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
   6234 {
   6235   rtx (*gen_lshr3)(rtx, rtx, rtx)
   6236     = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
   6237   rtx (*gen_shrd)(rtx, rtx, rtx);
   6238   int half_width = GET_MODE_BITSIZE (mode) >> 1;
   6239 
   6240   rtx low[2], high[2];
   6241   int count;
   6242 
   6243   if (CONST_INT_P (operands[2]))
   6244     {
   6245       split_double_mode (mode, operands, 2, low, high);
   6246       count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
   6247 
   6248       if (count >= half_width)
   6249 	{
   6250 	  emit_move_insn (low[0], high[1]);
   6251 	  ix86_expand_clear (high[0]);
   6252 
   6253 	  if (count > half_width)
   6254 	    emit_insn (gen_lshr3 (low[0], low[0],
   6255 				  GEN_INT (count - half_width)));
   6256 	}
   6257       else
   6258 	{
   6259 	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
   6260 
   6261 	  if (!rtx_equal_p (operands[0], operands[1]))
   6262 	    emit_move_insn (operands[0], operands[1]);
   6263 
   6264 	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
   6265 	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
   6266 	}
   6267     }
   6268   else
   6269     {
   6270       machine_mode half_mode;
   6271 
   6272       gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
   6273 
   6274       if (!rtx_equal_p (operands[0], operands[1]))
   6275 	emit_move_insn (operands[0], operands[1]);
   6276 
   6277       split_double_mode (mode, operands, 1, low, high);
   6278       half_mode = mode == DImode ? SImode : DImode;
   6279 
   6280       emit_insn (gen_shrd (low[0], high[0], operands[2]));
   6281       emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
   6282 
   6283       if (TARGET_CMOVE && scratch)
   6284 	{
   6285 	  ix86_expand_clear (scratch);
   6286 	  emit_insn (gen_x86_shift_adj_1
   6287 		     (half_mode, low[0], high[0], operands[2], scratch));
   6288 	}
   6289       else
   6290 	emit_insn (gen_x86_shift_adj_2
   6291 		   (half_mode, low[0], high[0], operands[2]));
   6292     }
   6293 }
   6294 
   6295 /* Expand move of V1TI mode register X to a new TI mode register.  */
   6296 static rtx
   6297 ix86_expand_v1ti_to_ti (rtx x)
   6298 {
   6299   rtx result = gen_reg_rtx (TImode);
   6300   if (TARGET_SSE2)
   6301     {
   6302       rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
   6303       rtx lo = gen_lowpart (DImode, result);
   6304       emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
   6305       rtx hi = gen_highpart (DImode, result);
   6306       emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
   6307     }
   6308   else
   6309     emit_move_insn (result, gen_lowpart (TImode, x));
   6310   return result;
   6311 }
   6312 
   6313 /* Expand move of TI mode register X to a new V1TI mode register.  */
   6314 static rtx
   6315 ix86_expand_ti_to_v1ti (rtx x)
   6316 {
   6317   if (TARGET_SSE2)
   6318     {
   6319       rtx lo = gen_lowpart (DImode, x);
   6320       rtx hi = gen_highpart (DImode, x);
   6321       rtx tmp = gen_reg_rtx (V2DImode);
   6322       emit_insn (gen_vec_concatv2di (tmp, lo, hi));
   6323       return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
   6324     }
   6325 
   6326   return force_reg (V1TImode, gen_lowpart (V1TImode, x));
   6327 }
   6328 
/* Expand V1TI mode shift (of rtx_code CODE) by constant.
   CODE is ASHIFT or LSHIFTRT; OPERANDS[0] is the destination,
   OPERANDS[1] the value to shift and OPERANDS[2] the shift count.  */
void
ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable shift count: convert to TImode and let the scalar
	 TImode shift expander do the work, then convert back.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*shift) (rtx, rtx, rtx)
	    = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
      emit_insn (shift (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Shift counts are reduced modulo 128.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      /* Shift by zero is a plain move.  */
      emit_move_insn (operands[0], op1);
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Whole-byte shifts map directly to a single pslldq/psrldq.  */
      rtx tmp = gen_reg_rtx (V1TImode);
      if (code == ASHIFT)
	emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
      else
	emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
      emit_move_insn (operands[0], tmp);
      return;
    }

  /* General case: shift the whole register by 64 bits (one byte-shift
     instruction), then combine 64-bit element shifts of the original
     and the pre-shifted copy.  */
  rtx tmp1 = gen_reg_rtx (V1TImode);
  if (code == ASHIFT)
    emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
  else
    emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

  /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
  rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));

  /* tmp3 will be the V2DImode result.  */
  rtx tmp3 = gen_reg_rtx (V2DImode);

  if (bits > 64)
    {
      /* For counts above 64 the 64-bit pre-shift already discarded one
	 half; finish with an element shift by the remainder.  */
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
      else
	emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    }
  else
    {
      /* tmp4 is operands[1], in V2DImode.  */
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));

      /* Shift each 64-bit element of the original by BITS ...  */
      rtx tmp5 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
      else
	emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      /* ... and recover the bits that crossed the 64-bit boundary from
	 the pre-shifted copy, shifted the opposite way by 64 - BITS.  */
      rtx tmp6 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
      else
	emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));

      emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
    }

  /* Convert the result back to V1TImode and store in operands[0].  */
  rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
  emit_move_insn (operands[0], tmp7);
}
   6409 
/* Expand V1TI mode rotate (of rtx_code CODE) by constant.
   CODE is ROTATE (left) or ROTATERT (right); OPERANDS[0] is the
   destination, OPERANDS[1] the value and OPERANDS[2] the count.  */
void
ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable rotate count: punt to the scalar TImode rotate.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*rotate) (rtx, rtx, rtx)
	    = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
      emit_insn (rotate (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Rotate counts are reduced modulo 128.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  /* Canonicalize to a left rotate: rotr by N == rotl by 128 - N.  */
  if (code == ROTATERT)
    bits = 128 - bits;

  if ((bits & 31) == 0)
    {
      /* Multiple-of-32 rotates are a single pshufd dword permute:
	 immediate 0x93 rotates left by one dword, 0x4e by two (swap
	 halves), 0x39 by three.  */
      rtx tmp2 = gen_reg_rtx (V4SImode);
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      if (bits == 32)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
      else if (bits == 64)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
      else
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Whole-byte rotates: OR together a byte shift left by BITS and
	 a byte shift right by 128 - BITS (pslldq/psrldq).  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      rtx tmp2 = gen_reg_rtx (V1TImode);
      rtx tmp3 = gen_reg_rtx (V1TImode);

      emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
      emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
      emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* General case.  View the operand as V4SI.  LOBITS is the operand
     rotated left by (BITS / 32) dwords, HIBITS by one dword more;
     per-dword shifts of these two then supply the low and high parts
     of each result dword respectively.  */
  rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));

  rtx lobits;
  rtx hibits;

  switch (bits >> 5)
    {
    case 0:
      /* Dword rotation 0: LOBITS is the operand itself.  */
      lobits = op1_v4si;
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
      break;

    case 1:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
      break;

    case 2:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
      break;

    default:
      /* Dword rotation 3: HIBITS (3 + 1 == 0 mod 4) is the operand.  */
      lobits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
      hibits = op1_v4si;
      break;
    }

  /* Combine: each result dword gets the low bits from LOBITS shifted
     left and the high bits from HIBITS shifted right.  */
  rtx tmp1 = gen_reg_rtx (V4SImode);
  rtx tmp2 = gen_reg_rtx (V4SImode);
  rtx tmp3 = gen_reg_rtx (V4SImode);

  emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
  emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
  emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));

  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
}
   6510 
/* Expand V1TI mode ashiftrt by constant.
   OPERANDS[0] is the destination, OPERANDS[1] the value to shift and
   OPERANDS[2] the shift count.  SSE has no 128-bit arithmetic right
   shift, so the expansion below synthesizes one; the cases are ordered
   from cheapest (fewest instructions) to most general, with the
   instruction count noted in each case's comment.  */
void
ix86_expand_v1ti_ashiftrt (rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable shift count: punt to the scalar TImode ashiftrt.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Shift counts are reduced modulo 128.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  if (bits == 127)
    {
      /* Two operations.  Shifting by 127 leaves just the sign bit, so
	 broadcast the top dword (pshufd 0xff) and smear its sign
	 across each element with psrad 31.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
      return;
    }

  if (bits == 64)
    {
      /* Three operations.  The low half of the result is the original
	 high half; the high half is the sign, obtained by
	 broadcast-and-smear as above and merged with punpckhqdq.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  if (bits == 96)
    {
      /* Three operations.  Per-dword sign smear, interleave the high
	 quadwords, then pshufd 0xfd to place top dword and sign.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

      rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));

      rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
      rtx tmp7 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  if (bits >= 111)
    {
      /* Three operations.  Only sign-extended copies of (parts of) the
	 top dword survive: psrad by BITS - 96, then replicate via
	 pshufhw and pshufd.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp4 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));

      rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  if (TARGET_AVX2 || TARGET_SSE4_1)
    {
      /* Three operations.  Small multiple-of-8 shifts with blend
	 support: arithmetic-shift each dword for the sign-correct top
	 part, logical byte-shift the whole register for the rest, and
	 blend the two (pblendd on AVX2, pblendw on SSE4.1).  */
      if (bits == 32)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}

      /* Three operations.  Same blend strategy for sub-dword byte
	 shifts; here psrad by BITS also supplies correct low bits for
	 the top dword, blended with the byte-shifted remainder.  */
      if (bits == 8 || bits == 16 || bits == 24)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}
    }

  if (bits > 96)
    {
      /* Four operations.  psrad the top dword by BITS - 96, smear the
	 sign into a second register, interleave the high quadwords and
	 compact with pshufd 0xfd.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
      rtx tmp8 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
      return;
    }

  if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
    {
      /* Four operations.  Broadcast-and-smear the sign, logical
	 byte-shift the operand, and pblendw the sign words over the
	 vacated top (mask selects 5 words for 48, 3 words for 80).  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
      rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
      rtx tmp7 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
				     GEN_INT (bits == 48 ? 0x1f : 0x07)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Five operations.  Whole-byte shift without blend support:
	 build the sign mask, byte-shift it left by 128 - BITS, OR it
	 with the logically byte-shifted operand.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
      return;
    }

  if (TARGET_AVX2 && bits < 32)
    {
      /* Six operations.  Synthesize a 128-bit logical right shift by
	 BITS out of per-quadword shifts (as in ix86_expand_v1ti_shift),
	 then pblendd the sign-correct top dword (psrad by BITS) over
	 the result.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
      rtx tmp10 = gen_reg_rtx (V4SImode);
      emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
      return;
    }

  if (TARGET_SSE4_1 && bits < 15)
    {
      /* Six operations.  Same structure as the AVX2 case above but
	 merging with pblendw, which limits the dword-crossing shift to
	 sub-word counts here.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
      rtx tmp11 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
      return;
    }

  if (bits == 1)
    {
      /* Eight operations.  Shift-by-one special case: do a 128-bit
	 logical shift right by one via per-quadword shifts, then OR
	 the sign bit (extracted with psrlq 63, repositioned with
	 pshufd 0xbf and psllq 31) back into the top.  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

      rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp3 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));

      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));

      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));

      rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
      rtx tmp9 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));

      rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
      rtx tmp11 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));

      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
      return;
    }

  if (bits > 64)
    {
      /* Eight operations.  General 64 < BITS < 96 case: logically
	 shift the high half down by BITS - 64 and OR in sign-fill for
	 all bits above 128 - BITS, built from the smeared sign mask.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));

      rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp8 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));

      rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp10 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));

      rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));

      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
    }
  else
    {
      /* Nine operations.  Fully general BITS < 64 case: synthesize the
	 128-bit logical right shift from per-quadword shifts, then OR
	 in the sign-fill for the top BITS bits.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));

      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));

      rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp11 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));

      rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));

      rtx tmp14 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
    }
}
   6912 
   6913 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
   6914    DImode for constant loop counts.  */
   6915 
   6916 static machine_mode
   6917 counter_mode (rtx count_exp)
   6918 {
   6919   if (GET_MODE (count_exp) != VOIDmode)
   6920     return GET_MODE (count_exp);
   6921   if (!CONST_INT_P (count_exp))
   6922     return Pmode;
   6923   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
   6924     return DImode;
   6925   return SImode;
   6926 }
   6927 
/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
   specified in bytes.  When ISSETMEM is TRUE, output the equivalent loop to set
   memory by VALUE (supposed to be in MODE).

   The size is rounded down to whole number of chunk size moved at once.
   SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.
   EXPECTED_SIZE (-1 if unknown) is used to predict the loop-back branch.  */


static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx count, machine_mode mode, int unroll,
			       int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  /* Bytes processed per loop iteration.  */
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  /* SIZE = COUNT rounded down to a multiple of the chunk size.  */
  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
			      NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      /* For byte-sized chunks guard against a zero trip count.  */
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
			       true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using single temporary.
	 Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
	{
	  /* NOTE: deliberately disabled (the "&& 0") — single-temporary
	     variant that interleaves one load with one store.  */
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		{
		  destmem = adjust_address (copy_rtx (destmem), mode,
					    GET_MODE_SIZE (mode));
		  srcmem = adjust_address (copy_rtx (srcmem), mode,
					   GET_MODE_SIZE (mode));
		}
	      emit_move_insn (destmem, srcmem);
	    }
	}
      else
	{
	  /* Load all UNROLL chunks into temporaries first, then store
	     them, so loads and stores can be scheduled apart.  */
	  rtx tmpreg[4];
	  gcc_assert (unroll <= 4);
	  for (i = 0; i < unroll; i++)
	    {
	      tmpreg[i] = gen_reg_rtx (mode);
	      if (i)
		srcmem = adjust_address (copy_rtx (srcmem), mode,
					 GET_MODE_SIZE (mode));
	      emit_move_insn (tmpreg[i], srcmem);
	    }
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		destmem = adjust_address (copy_rtx (destmem), mode,
					  GET_MODE_SIZE (mode));
	      emit_move_insn (destmem, tmpreg[i]);
	    }
	}
    }
  else
    /* memset: store VALUE into each chunk.  */
    for (i = 0; i < unroll; i++)
      {
	if (i)
	  destmem = adjust_address (copy_rtx (destmem), mode,
				    GET_MODE_SIZE (mode));
	emit_move_insn (destmem, value);
      }

  /* ITER += PIECE_SIZE; loop back while ITER < SIZE.  */
  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
			   true, top_label);
  if (expected_size != -1)
    {
      /* Predict the loop-back branch from the expected trip count.  */
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
	predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
	predict_jump (REG_BR_PROB_BASE - 1);
      else
        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
		      / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);

  /* Advance the pointers past the copied region for the caller.  */
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
				 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
	emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
   7064 
   7065 /* Divide COUNTREG by SCALE.  */
   7066 static rtx
   7067 scale_counter (rtx countreg, int scale)
   7068 {
   7069   rtx sc;
   7070 
   7071   if (scale == 1)
   7072     return countreg;
   7073   if (CONST_INT_P (countreg))
   7074     return GEN_INT (INTVAL (countreg) / scale);
   7075   gcc_assert (REG_P (countreg));
   7076 
   7077   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
   7078 			    GEN_INT (exact_log2 (scale)),
   7079 			    NULL, 1, OPTAB_DIRECT);
   7080   return sc;
   7081 }
   7082 
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have same meaning as for previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			   rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			   rtx count,
			   machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  /* Make sure DESTMEM is a BLKmode reference based on DESTPTR.  */
  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  /* The rep count is in MODE-sized units, not bytes.  */
  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  /* DESTEXP describes the final value of the destination pointer:
     DESTPTR + COUNTREG * GET_MODE_SIZE (MODE).  */
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  /* Record the number of bytes actually written when it is known,
     otherwise drop any stale size info.  */
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      /* rep stos: fill DESTMEM with VALUE narrowed to MODE.  */
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      /* rep movs: mirror the destination setup for the source side.  */
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else
	{
	  if (MEM_SIZE_KNOWN_P (srcmem))
	    clear_mem_size (srcmem);
	}
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}
   7164 
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.
   Side effect: both DESTPTR and SRCPTR pointer registers are advanced
   past the copied region.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      /* If no suitable vector move pattern exists, fall back to plain
	 word-sized moves.  */
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  /* Rewrite both MEMs to MOVE_MODE accesses through the pointer regs.  */
  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      /* Advance the pointer registers past the piece just copied.  */
      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      /* Re-derive the MEMs from the (now advanced) pointers.  */
      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
   7233 
   7234 /* Helper function for the string operations below.  Dest VARIABLE whether
   7235    it is aligned to VALUE bytes.  If true, jump to the label.  */
   7236 
   7237 static rtx_code_label *
   7238 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
   7239 {
   7240   rtx_code_label *label = gen_label_rtx ();
   7241   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
   7242   if (GET_MODE (variable) == DImode)
   7243     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
   7244   else
   7245     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
   7246   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
   7247 			   1, label);
   7248   if (epilogue)
   7249     predict_jump (REG_BR_PROB_BASE * 50 / 100);
   7250   else
   7251     predict_jump (REG_BR_PROB_BASE * 90 / 100);
   7252   return label;
   7253 }
   7254 
   7255 
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.
   DESTMEM/SRCMEM are the memory references, DESTPTR/SRCPTR the pointer
   registers, COUNT the (possibly constant) byte count.  MAX_SIZE must be a
   power of two; only the residue of COUNT modulo MAX_SIZE is copied.  */

static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  /* Constant count: emit a fixed sequence of power-of-two sized moves
     covering exactly the residue, no runtime tests needed.  */
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      /* One move per set bit of the residue, largest piece first.  */
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	}
      return;
    }
  /* Large variable residue: mask COUNT down and use a byte-copy loop.  */
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
				    count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      /* Each block: test one bit of COUNT, conditionally do a movs of the
	 corresponding size (which also advances both pointers).  */
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      /* No single stringops: keep a running OFFSET register and address
	 PTR + OFFSET instead of bumping the pointers themselves.  */
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  /* Last possible byte: no need to maintain OFFSET afterwards.  */
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}
   7369 
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   DESTPTR is advanced past the filled region as a side effect.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  /* If the promoted value is wider than what we need to store, narrow
     both the mode and the value to SIZE_TO_MOVE bytes.  */
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* Word-sized or smaller pieces can use the strset pattern, which
	 stores and advances DESTPTR in one insn.  */
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      /* Wider (vector) pieces: plain store followed by an explicit
	 pointer increment.  */
      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
   7426 /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
   7427 static void
   7428 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
   7429 				 rtx count, int max_size)
   7430 {
   7431   count = expand_simple_binop (counter_mode (count), AND, count,
   7432 			       GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
   7433   expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
   7434 				 gen_lowpart (QImode, value), count, QImode,
   7435 				 1, max_size / 2, true);
   7436 }
   7437 
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.
   VALUE is the promoted fill value; VEC_VALUE, if non-NULL, is a vector
   promotion of the same byte used for pieces wider than VALUE's mode.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  /* Constant count: emit a fixed sequence of stores covering exactly the
     residue of COUNT modulo MAX_SIZE, no runtime tests needed.  */
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    {
	      /* Use the vector value for pieces wider than VALUE's mode.  */
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	}
      return;
    }
  /* Large variable residue: fall back to a byte-set loop.  */
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  /* Below: test each bit of COUNT and conditionally store the
     corresponding number of bytes, largest piece first.  The strset
     pattern advances DESTPTR as it stores.  */
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
   7539 
   7540 /* Adjust COUNTER by the VALUE.  */
   7541 static void
   7542 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
   7543 {
   7544   emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
   7545 }
   7546 
/* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
   DESTMEM to align it to DESIRED_ALIGNMENT.  Original alignment is ALIGN.
   Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
   ignored.
   COUNT is decremented by each conditionally-processed chunk.
   Return value is updated DESTMEM.  */

static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
				  rtx destptr, rtx srcptr, rtx value,
				  rtx vec_value, rtx count, int align,
				  int desired_alignment, bool issetmem)
{
  int i;
  /* For each power of two below DESIRED_ALIGNMENT that the known ALIGN
     does not already guarantee, conditionally process I bytes so that
     DESTPTR becomes I*2-aligned.  */
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  /* Skip the chunk when DESTPTR is already aligned to 2*I.  */
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      /* Use the vector value for pieces wider than VALUE's mode.  */
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  /* Account for the bytes just processed.  */
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  /* Past this point the destination is known to be 2*I-aligned.  */
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}
   7582 
/* Test if COUNT&SIZE is nonzero and if so, expand cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.
   The trick: the first pass stores SIZE bytes from the start of the block,
   the second pass stores SIZE bytes ending at start+COUNT, so together
   they cover any length in SIZE..2*SIZE-1 (the two ranges may overlap).  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  /* LABEL is branched to (skipping this block) when COUNT & SIZE == 0.  */
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  /* First pass: store SIZE bytes starting at the beginning of the block.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  /* Rewind to PTR + COUNT - 2*SIZE; after the first pass advanced by SIZE,
     the net address is start + COUNT - SIZE.  */
  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  /* Second pass: store the last SIZE bytes of the block.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  /* The whole block is handled; skip the rest of the expansion.  */
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
   7661 
/* Handle small memcpy (up to SIZE that is supposed to be small power of 2.
   and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
   bytes and last SIZE bytes adjusting DESTPTR/SRCPTR/COUNT in a way we can
   proceed with a loop copying SIZE bytes at once. Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence. The label is created
   on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.

   In pseudocode we do:

   if (COUNT < SIZE)
     {
       Assume that SIZE is 4. Bigger sizes are handled analogously
       if (COUNT & 4)
	 {
	    copy 4 bytes from SRCPTR to DESTPTR
	    copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
	    goto done_label
	 }
       if (!COUNT)
	 goto done_label;
       copy 1 byte from SRCPTR to DESTPTR
       if (COUNT & 2)
	 {
	    copy 2 bytes from SRCPTR to DESTPTR
	    copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
	 }
     }
   else
     {
       copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
       copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE

       OLD_DESTPTR = DESTPTR;
       Align DESTPTR up to DESIRED_ALIGN
       SRCPTR += DESTPTR - OLD_DESTPTR
       COUNT -= DESTPTR - OLD_DESTPTR
       if (DYNAMIC_CHECK)
	 Round COUNT down to multiple of SIZE
       << optional caller supplied zero size guard is here >>
       << optional caller supplied dynamic check is here >>
       << caller supplied main copy loop is here >>
     }
   done_label:
  */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose proper value to copy.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      /* COUNT >= SIZE: skip the small-block handling entirely.  */
      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3.  */
      /* For each power of two down to 4, handle blocks of SIZE2..2*SIZE2-1
	 bytes with two possibly-overlapping moves and jump to DONE_LABEL.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
          srcmem = change_address (srcmem, QImode, *srcptr);
          emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3.  */
      /* Conditionally store 2 bytes ending at PTR + COUNT; combined with
	 the unconditional byte above this covers lengths 2 and 3.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
        emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
       emit_label (loop_label);
       LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  /* Emit enough MODE-sized chunks to cover the worst-case misalignment
     (DESIRED_ALIGN - ALIGN bytes).  */
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
        emit_move_insn (destmem, mode_value)&#59;
      else
	{
          emit_move_insn (destmem, srcmem);
          srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }


  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  /* Remaining chunks of the SIZE-byte tail.  */
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
          srcmem = offset_address (srcmem, modesize, 1);
          emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      /* Preserve pointer-ness on the new register.  */
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      /* SAVEDDEST ends up holding old DESTPTR minus the aligned DESTPTR.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
         library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
   7902 
   7903 
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.
   ALIGN_BYTES is the exact number of bytes to process before the destination
   reaches DESIRED_ALIGN; each set bit of it selects one power-of-two piece.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					   rtx srcreg, rtx value, rtx vec_value,
					   int desired_align, int align_bytes,
					   bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  /* Emit one set/copy per set bit of ALIGN_BYTES, smallest piece first,
     so the destination pointer ends up DESIRED_ALIGN-aligned.  */
  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      /* Use the vector value for pieces wider than VALUE's mode.  */
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  /* Record the alignment and size now known for the destination.  */
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      /* Derive what alignment the source gained from the same advance:
	 find the largest power of two on which SRC's and DST's offsets
	 agree modulo that power.  */
      int src_align_bytes = get_mem_align_offset (src, desired_align
						       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		   == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
   7977 
   7978 /* Return true if ALG can be used in current context.
   7979    Assume we expand memset if MEMSET is true.  */
   7980 static bool
   7981 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
   7982 {
   7983   if (alg == no_stringop)
   7984     return false;
   7985   if (alg == vector_loop)
   7986     return TARGET_SSE || TARGET_AVX;
   7987   /* Algorithms using the rep prefix want at least edi and ecx;
   7988      additionally, memset wants eax and memcpy wants esi.  Don't
   7989      consider such algorithms if the user has appropriated those
   7990      registers for their own purposes, or if we have a non-default
   7991      address space, since some string insns cannot override the segment.  */
   7992   if (alg == rep_prefix_1_byte
   7993       || alg == rep_prefix_4_byte
   7994       || alg == rep_prefix_8_byte)
   7995     {
   7996       if (have_as)
   7997 	return false;
   7998       if (fixed_regs[CX_REG]
   7999 	  || fixed_regs[DI_REG]
   8000 	  || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
   8001 	return false;
   8002     }
   8003   return true;
   8004 }
   8005 
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.
   COUNT is the known byte count (0 if unknown), EXPECTED_SIZE a
   profile-based estimate (-1 if unknown) and MIN_SIZE/MAX_SIZE bounds on
   the size.  MEMSET is true when expanding memset rather than memcpy,
   ZERO_MEMSET when the memset value is known zero, HAVE_AS when a
   non-default address space is involved.  On return *DYNAMIC_CHECK holds
   the size threshold for a runtime inline-vs-libcall dispatch (-1 for
   none) and *NOALIGN says the alignment prologue may be omitted.  RECUR
   is true inside the recursive call used when inlining is forced.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, bool have_as,
	    int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
 	  && (max_size < 256
              || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  /* Pick the per-CPU cost table matching speed-vs-size and the operation;
     each table has separate 32-bit and 64-bit entries.  */
  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      /* Byte variant is needed when the count is unknown, not a multiple
	 of 4, or the memset value is not known to be zero.  */
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     setup.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      /* Scan the cost table for the first entry covering EXPECTED_SIZE.  */
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, have_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, have_as)
		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
			    && candidate == rep_prefix_1_byte
			    /* NB: If min_size != max_size, size is
			       unknown.  */
			    && min_size != max_size))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      /* Retry with a concrete expected size; RECUR=true guarantees the
	 recursion terminates.  */
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
			zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }
  /* Fall back to the table's choice for unknown sizes, or a libcall.  */
  return (alg_usable_p (algs->unknown_size, memset, have_as)
	  ? algs->unknown_size : libcall);
}
   8162 
   8163 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
   8164    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
   8165 static int
   8166 decide_alignment (int align,
   8167 		  enum stringop_alg alg,
   8168 		  int expected_size,
   8169 		  machine_mode move_mode)
   8170 {
   8171   int desired_align = 0;
   8172 
   8173   gcc_assert (alg != no_stringop);
   8174 
   8175   if (alg == libcall)
   8176     return 0;
   8177   if (move_mode == VOIDmode)
   8178     return 0;
   8179 
   8180   desired_align = GET_MODE_SIZE (move_mode);
   8181   /* PentiumPro has special logic triggering for 8 byte aligned blocks.
   8182      copying whole cacheline at once.  */
   8183   if (TARGET_CPU_P (PENTIUMPRO)
   8184       && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
   8185     desired_align = 8;
   8186 
   8187   if (optimize_size)
   8188     desired_align = 1;
   8189   if (desired_align < align)
   8190     desired_align = align;
   8191   if (expected_size != -1 && expected_size < 4)
   8192     desired_align = align;
   8193 
   8194   return desired_align;
   8195 }
   8196 
   8197 
/* Helper function for memcpy.  For QImode value 0xXY produce
   0xXYXYXYXY of wide specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  /* Number of shift+or doubling steps for the hand-unrolled sequence;
     one step can be saved via gen_insv_1 when partial register stalls
     are not a concern.  */
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  /* Zero duplicates to zero: just load the zero constant.  */
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      /* For a compile-time constant, compute the replicated value
	 directly.  */
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
        v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  /* Normalize VAL to a QImode byte before widening it.  */
  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;
  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  /* Compare the cost of a multiply by 0x01010101 (or the DImode
     equivalent) against the cost of NOPS shift+or steps; pick the
     cheaper expansion.  */
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
          + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      /* Multiply the zero-extended byte by a register holding the
	 replicated 1 pattern (computed recursively; const1_rtx takes the
	 CONST_INT_P fast path above).  */
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      /* First doubling: byte -> 16 bits.  Without partial register
	 stalls an insert of the register into itself is cheaper.  */
      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      /* Second doubling: 16 -> 32 bits.  */
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
			         NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      /* Third doubling (DImode only): 32 -> 64 bits.  */
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
   8266 
   8267 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
   8268    be needed by main loop copying SIZE_NEEDED chunks and prologue getting
   8269    alignment from ALIGN to DESIRED_ALIGN.  */
   8270 static rtx
   8271 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
   8272 				int align)
   8273 {
   8274   rtx promoted_val;
   8275 
   8276   if (TARGET_64BIT
   8277       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
   8278     promoted_val = promote_duplicated_reg (DImode, val);
   8279   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
   8280     promoted_val = promote_duplicated_reg (SImode, val);
   8281   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
   8282     promoted_val = promote_duplicated_reg (HImode, val);
   8283   else
   8284     promoted_val = val;
   8285 
   8286   return promoted_val;
   8287 }
   8288 
   8289 /* Copy the address to a Pmode register.  This is used for x32 to
   8290    truncate DImode TLS address to a SImode register. */
   8291 
   8292 static rtx
   8293 ix86_copy_addr_to_reg (rtx addr)
   8294 {
   8295   rtx reg;
   8296   if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
   8297     {
   8298       reg = copy_addr_to_reg (addr);
   8299       REG_POINTER (reg) = 1;
   8300       return reg;
   8301     }
   8302   else
   8303     {
   8304       gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
   8305       reg = copy_to_mode_reg (DImode, addr);
   8306       REG_POINTER (reg) = 1;
   8307       return gen_rtx_SUBREG (SImode, reg, 0);
   8308     }
   8309 }
   8310 
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
   8312    operations when profitable.  The code depends upon architecture, block size
   8313    and alignment, but always has one of the following overall structures:
   8314 
   8315    Aligned move sequence:
   8316 
   8317      1) Prologue guard: Conditional that jumps up to epilogues for small
   8318 	blocks that can be handled by epilogue alone.  This is faster
	but also needed for correctness, since the prologue assumes the block
   8320 	is larger than the desired alignment.
   8321 
   8322 	Optional dynamic check for size and libcall for large
   8323 	blocks is emitted here too, with -minline-stringops-dynamically.
   8324 
   8325      2) Prologue: copy first few bytes in order to get destination
   8326 	aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less
   8327 	than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
   8328 	copied.  We emit either a jump tree on power of two sized
   8329 	blocks, or a byte loop.
   8330 
   8331      3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
   8332 	with specified algorithm.
   8333 
   8334      4) Epilogue: code copying tail of the block that is too small to be
   8335 	handled by main body (or up to size guarded by prologue guard).
   8336 
   8337   Misaligned move sequence
   8338 
     1) misaligned move prologue/epilogue containing:
   8340         a) Prologue handling small memory blocks and jumping to done_label
   8341 	   (skipped if blocks are known to be large enough)
	b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
   8343            needed by single possibly misaligned move
   8344 	   (skipped if alignment is not needed)
   8345         c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
   8346 
   8347      2) Zero size guard dispatching to done_label, if needed
   8348 
     3) dispatch to library call, if needed,

     4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
   8352 	with specified algorithm.  */
   8353 bool
   8354 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
   8355 			   rtx align_exp, rtx expected_align_exp,
   8356 			   rtx expected_size_exp, rtx min_size_exp,
   8357 			   rtx max_size_exp, rtx probable_max_size_exp,
   8358 			   bool issetmem)
   8359 {
   8360   rtx destreg;
   8361   rtx srcreg = NULL;
   8362   rtx_code_label *label = NULL;
   8363   rtx tmp;
   8364   rtx_code_label *jump_around_label = NULL;
   8365   HOST_WIDE_INT align = 1;
   8366   unsigned HOST_WIDE_INT count = 0;
   8367   HOST_WIDE_INT expected_size = -1;
   8368   int size_needed = 0, epilogue_size_needed;
   8369   int desired_align = 0, align_bytes = 0;
   8370   enum stringop_alg alg;
   8371   rtx promoted_val = NULL;
   8372   rtx vec_promoted_val = NULL;
   8373   bool force_loopy_epilogue = false;
   8374   int dynamic_check;
   8375   bool need_zero_guard = false;
   8376   bool noalign;
   8377   machine_mode move_mode = VOIDmode;
   8378   machine_mode wider_mode;
   8379   int unroll_factor = 1;
   8380   /* TODO: Once value ranges are available, fill in proper data.  */
   8381   unsigned HOST_WIDE_INT min_size = 0;
   8382   unsigned HOST_WIDE_INT max_size = -1;
   8383   unsigned HOST_WIDE_INT probable_max_size = -1;
   8384   bool misaligned_prologue_used = false;
   8385   bool have_as;
   8386 
   8387   if (CONST_INT_P (align_exp))
   8388     align = INTVAL (align_exp);
   8389   /* i386 can do misaligned access on reasonably increased cost.  */
   8390   if (CONST_INT_P (expected_align_exp)
   8391       && INTVAL (expected_align_exp) > align)
   8392     align = INTVAL (expected_align_exp);
   8393   /* ALIGN is the minimum of destination and source alignment, but we care here
   8394      just about destination alignment.  */
   8395   else if (!issetmem
   8396 	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
   8397     align = MEM_ALIGN (dst) / BITS_PER_UNIT;
   8398 
   8399   if (CONST_INT_P (count_exp))
   8400     {
   8401       min_size = max_size = probable_max_size = count = expected_size
   8402 	= INTVAL (count_exp);
   8403       /* When COUNT is 0, there is nothing to do.  */
   8404       if (!count)
   8405 	return true;
   8406     }
   8407   else
   8408     {
   8409       if (min_size_exp)
   8410 	min_size = INTVAL (min_size_exp);
   8411       if (max_size_exp)
   8412 	max_size = INTVAL (max_size_exp);
   8413       if (probable_max_size_exp)
   8414 	probable_max_size = INTVAL (probable_max_size_exp);
   8415       if (CONST_INT_P (expected_size_exp))
   8416 	expected_size = INTVAL (expected_size_exp);
   8417      }
   8418 
   8419   /* Make sure we don't need to care about overflow later on.  */
   8420   if (count > (HOST_WIDE_INT_1U << 30))
   8421     return false;
   8422 
   8423   have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
   8424   if (!issetmem)
   8425     have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
   8426 
   8427   /* Step 0: Decide on preferred algorithm, desired alignment and
   8428      size of chunks to be copied by main loop.  */
   8429   alg = decide_alg (count, expected_size, min_size, probable_max_size,
   8430 		    issetmem,
   8431 		    issetmem && val_exp == const0_rtx, have_as,
   8432 		    &dynamic_check, &noalign, false);
   8433 
   8434   if (dump_file)
   8435     fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
   8436 	     stringop_alg_names[alg]);
   8437 
   8438   if (alg == libcall)
   8439     return false;
   8440   gcc_assert (alg != no_stringop);
   8441 
   8442   /* For now vector-version of memset is generated only for memory zeroing, as
   8443      creating of promoted vector value is very cheap in this case.  */
   8444   if (issetmem && alg == vector_loop && val_exp != const0_rtx)
   8445     alg = unrolled_loop;
   8446 
   8447   if (!count)
   8448     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
   8449   destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
   8450   if (!issetmem)
   8451     srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
   8452 
   8453   unroll_factor = 1;
   8454   move_mode = word_mode;
   8455   switch (alg)
   8456     {
   8457     case libcall:
   8458     case no_stringop:
   8459     case last_alg:
   8460       gcc_unreachable ();
   8461     case loop_1_byte:
   8462       need_zero_guard = true;
   8463       move_mode = QImode;
   8464       break;
   8465     case loop:
   8466       need_zero_guard = true;
   8467       break;
   8468     case unrolled_loop:
   8469       need_zero_guard = true;
   8470       unroll_factor = (TARGET_64BIT ? 4 : 2);
   8471       break;
   8472     case vector_loop:
   8473       need_zero_guard = true;
   8474       unroll_factor = 4;
   8475       /* Find the widest supported mode.  */
   8476       move_mode = word_mode;
   8477       while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
   8478 	     && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
   8479 	move_mode = wider_mode;
   8480 
   8481       if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
   8482 	move_mode = TImode;
   8483       if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
   8484 	move_mode = OImode;
   8485 
   8486       /* Find the corresponding vector mode with the same size as MOVE_MODE.
   8487 	 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
   8488       if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
   8489 	{
   8490 	  int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
   8491 	  if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
   8492 	      || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
   8493 	    move_mode = word_mode;
   8494 	}
   8495       gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
   8496       break;
   8497     case rep_prefix_8_byte:
   8498       move_mode = DImode;
   8499       break;
   8500     case rep_prefix_4_byte:
   8501       move_mode = SImode;
   8502       break;
   8503     case rep_prefix_1_byte:
   8504       move_mode = QImode;
   8505       break;
   8506     }
   8507   size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
   8508   epilogue_size_needed = size_needed;
   8509 
   8510   /* If we are going to call any library calls conditionally, make sure any
   8511      pending stack adjustment happen before the first conditional branch,
   8512      otherwise they will be emitted before the library call only and won't
   8513      happen from the other branches.  */
   8514   if (dynamic_check != -1)
   8515     do_pending_stack_adjust ();
   8516 
   8517   desired_align = decide_alignment (align, alg, expected_size, move_mode);
   8518   if (!TARGET_ALIGN_STRINGOPS || noalign)
   8519     align = desired_align;
   8520 
   8521   /* Step 1: Prologue guard.  */
   8522 
   8523   /* Alignment code needs count to be in register.  */
   8524   if (CONST_INT_P (count_exp) && desired_align > align)
   8525     {
   8526       if (INTVAL (count_exp) > desired_align
   8527 	  && INTVAL (count_exp) > size_needed)
   8528 	{
   8529 	  align_bytes
   8530 	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
   8531 	  if (align_bytes <= 0)
   8532 	    align_bytes = 0;
   8533 	  else
   8534 	    align_bytes = desired_align - align_bytes;
   8535 	}
   8536       if (align_bytes == 0)
   8537 	count_exp = force_reg (counter_mode (count_exp), count_exp);
   8538     }
   8539   gcc_assert (desired_align >= 1 && align >= 1);
   8540 
   8541   /* Misaligned move sequences handle both prologue and epilogue at once.
   8542      Default code generation results in a smaller code for large alignments
   8543      and also avoids redundant job when sizes are known precisely.  */
   8544   misaligned_prologue_used
   8545     = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
   8546        && MAX (desired_align, epilogue_size_needed) <= 32
   8547        && desired_align <= epilogue_size_needed
   8548        && ((desired_align > align && !align_bytes)
   8549 	   || (!count && epilogue_size_needed > 1)));
   8550 
   8551   /* Do the cheap promotion to allow better CSE across the
   8552      main loop and epilogue (ie one load of the big constant in the
   8553      front of all code.
   8554      For now the misaligned move sequences do not have fast path
   8555      without broadcasting.  */
   8556   if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
   8557     {
   8558       if (alg == vector_loop)
   8559 	{
   8560 	  gcc_assert (val_exp == const0_rtx);
   8561 	  vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
   8562 	  promoted_val = promote_duplicated_reg_to_size (val_exp,
   8563 							 GET_MODE_SIZE (word_mode),
   8564 							 desired_align, align);
   8565 	}
   8566       else
   8567 	{
   8568 	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
   8569 							 desired_align, align);
   8570 	}
   8571     }
   8572   /* Misaligned move sequences handles both prologues and epilogues at once.
   8573      Default code generation results in smaller code for large alignments and
   8574      also avoids redundant job when sizes are known precisely.  */
   8575   if (misaligned_prologue_used)
   8576     {
   8577       /* Misaligned move prologue handled small blocks by itself.  */
   8578       expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
   8579 	   (dst, src, &destreg, &srcreg,
   8580 	    move_mode, promoted_val, vec_promoted_val,
   8581 	    &count_exp,
   8582 	    &jump_around_label,
   8583             desired_align < align
   8584 	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
   8585 	    desired_align, align, &min_size, dynamic_check, issetmem);
   8586       if (!issetmem)
   8587         src = change_address (src, BLKmode, srcreg);
   8588       dst = change_address (dst, BLKmode, destreg);
   8589       set_mem_align (dst, desired_align * BITS_PER_UNIT);
   8590       epilogue_size_needed = 0;
   8591       if (need_zero_guard
   8592 	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
   8593 	{
   8594 	  /* It is possible that we copied enough so the main loop will not
   8595 	     execute.  */
   8596 	  gcc_assert (size_needed > 1);
   8597 	  if (jump_around_label == NULL_RTX)
   8598 	    jump_around_label = gen_label_rtx ();
   8599 	  emit_cmp_and_jump_insns (count_exp,
   8600 				   GEN_INT (size_needed),
   8601 				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
   8602 	  if (expected_size == -1
   8603 	      || expected_size < (desired_align - align) / 2 + size_needed)
   8604 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
   8605 	  else
   8606 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
   8607 	}
   8608     }
   8609   /* Ensure that alignment prologue won't copy past end of block.  */
   8610   else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
   8611     {
   8612       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
   8613       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
   8614 	 Make sure it is power of 2.  */
   8615       epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
   8616 
   8617       /* To improve performance of small blocks, we jump around the VAL
   8618 	 promoting mode.  This mean that if the promoted VAL is not constant,
   8619 	 we might not use it in the epilogue and have to use byte
   8620 	 loop variant.  */
   8621       if (issetmem && epilogue_size_needed > 2 && !promoted_val)
   8622 	force_loopy_epilogue = true;
   8623       if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
   8624 	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
   8625 	{
   8626 	  /* If main algorithm works on QImode, no epilogue is needed.
   8627 	     For small sizes just don't align anything.  */
   8628 	  if (size_needed == 1)
   8629 	    desired_align = align;
   8630 	  else
   8631 	    goto epilogue;
   8632 	}
   8633       else if (!count
   8634 	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
   8635 	{
   8636 	  label = gen_label_rtx ();
   8637 	  emit_cmp_and_jump_insns (count_exp,
   8638 				   GEN_INT (epilogue_size_needed),
   8639 				   LTU, 0, counter_mode (count_exp), 1, label);
   8640 	  if (expected_size == -1 || expected_size < epilogue_size_needed)
   8641 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
   8642 	  else
   8643 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
   8644 	}
   8645     }
   8646 
   8647   /* Emit code to decide on runtime whether library call or inline should be
   8648      used.  */
   8649   if (dynamic_check != -1)
   8650     {
   8651       if (!issetmem && CONST_INT_P (count_exp))
   8652 	{
   8653 	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
   8654 	    {
   8655 	      emit_block_copy_via_libcall (dst, src, count_exp);
   8656 	      count_exp = const0_rtx;
   8657 	      goto epilogue;
   8658 	    }
   8659 	}
   8660       else
   8661 	{
   8662 	  rtx_code_label *hot_label = gen_label_rtx ();
   8663 	  if (jump_around_label == NULL_RTX)
   8664 	    jump_around_label = gen_label_rtx ();
   8665 	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
   8666 				   LEU, 0, counter_mode (count_exp),
   8667 				   1, hot_label);
   8668 	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
   8669 	  if (issetmem)
   8670 	    set_storage_via_libcall (dst, count_exp, val_exp);
   8671 	  else
   8672 	    emit_block_copy_via_libcall (dst, src, count_exp);
   8673 	  emit_jump (jump_around_label);
   8674 	  emit_label (hot_label);
   8675 	}
   8676     }
   8677 
   8678   /* Step 2: Alignment prologue.  */
   8679   /* Do the expensive promotion once we branched off the small blocks.  */
   8680   if (issetmem && !promoted_val)
   8681     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
   8682 						   desired_align, align);
   8683 
   8684   if (desired_align > align && !misaligned_prologue_used)
   8685     {
   8686       if (align_bytes == 0)
   8687 	{
   8688 	  /* Except for the first move in prologue, we no longer know
   8689 	     constant offset in aliasing info.  It don't seems to worth
   8690 	     the pain to maintain it for the first move, so throw away
   8691 	     the info early.  */
   8692 	  dst = change_address (dst, BLKmode, destreg);
   8693 	  if (!issetmem)
   8694 	    src = change_address (src, BLKmode, srcreg);
   8695 	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
   8696 					    promoted_val, vec_promoted_val,
   8697 					    count_exp, align, desired_align,
   8698 					    issetmem);
   8699 	  /* At most desired_align - align bytes are copied.  */
   8700 	  if (min_size < (unsigned)(desired_align - align))
   8701 	    min_size = 0;
   8702 	  else
   8703 	    min_size -= desired_align - align;
   8704 	}
   8705       else
   8706 	{
   8707 	  /* If we know how many bytes need to be stored before dst is
   8708 	     sufficiently aligned, maintain aliasing info accurately.  */
   8709 	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
   8710 							   srcreg,
   8711 							   promoted_val,
   8712 							   vec_promoted_val,
   8713 							   desired_align,
   8714 							   align_bytes,
   8715 							   issetmem);
   8716 
   8717 	  count_exp = plus_constant (counter_mode (count_exp),
   8718 				     count_exp, -align_bytes);
   8719 	  count -= align_bytes;
   8720 	  min_size -= align_bytes;
   8721 	  max_size -= align_bytes;
   8722 	}
   8723       if (need_zero_guard
   8724 	  && min_size < (unsigned HOST_WIDE_INT) size_needed
   8725 	  && (count < (unsigned HOST_WIDE_INT) size_needed
   8726 	      || (align_bytes == 0
   8727 		  && count < ((unsigned HOST_WIDE_INT) size_needed
   8728 			      + desired_align - align))))
   8729 	{
   8730 	  /* It is possible that we copied enough so the main loop will not
   8731 	     execute.  */
   8732 	  gcc_assert (size_needed > 1);
   8733 	  if (label == NULL_RTX)
   8734 	    label = gen_label_rtx ();
   8735 	  emit_cmp_and_jump_insns (count_exp,
   8736 				   GEN_INT (size_needed),
   8737 				   LTU, 0, counter_mode (count_exp), 1, label);
   8738 	  if (expected_size == -1
   8739 	      || expected_size < (desired_align - align) / 2 + size_needed)
   8740 	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
   8741 	  else
   8742 	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
   8743 	}
   8744     }
   8745   if (label && size_needed == 1)
   8746     {
   8747       emit_label (label);
   8748       LABEL_NUSES (label) = 1;
   8749       label = NULL;
   8750       epilogue_size_needed = 1;
   8751       if (issetmem)
   8752 	promoted_val = val_exp;
   8753     }
   8754   else if (label == NULL_RTX && !misaligned_prologue_used)
   8755     epilogue_size_needed = size_needed;
   8756 
   8757   /* Step 3: Main loop.  */
   8758 
   8759   switch (alg)
   8760     {
   8761     case libcall:
   8762     case no_stringop:
   8763     case last_alg:
   8764       gcc_unreachable ();
   8765     case loop_1_byte:
   8766     case loop:
   8767     case unrolled_loop:
   8768       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
   8769 				     count_exp, move_mode, unroll_factor,
   8770 				     expected_size, issetmem);
   8771       break;
   8772     case vector_loop:
   8773       expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
   8774 				     vec_promoted_val, count_exp, move_mode,
   8775 				     unroll_factor, expected_size, issetmem);
   8776       break;
   8777     case rep_prefix_8_byte:
   8778     case rep_prefix_4_byte:
   8779     case rep_prefix_1_byte:
   8780       expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
   8781 				       val_exp, count_exp, move_mode, issetmem);
   8782       break;
   8783     }
   8784   /* Adjust properly the offset of src and dest memory for aliasing.  */
   8785   if (CONST_INT_P (count_exp))
   8786     {
   8787       if (!issetmem)
   8788 	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
   8789 					    (count / size_needed) * size_needed);
   8790       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
   8791 					  (count / size_needed) * size_needed);
   8792     }
   8793   else
   8794     {
   8795       if (!issetmem)
   8796 	src = change_address (src, BLKmode, srcreg);
   8797       dst = change_address (dst, BLKmode, destreg);
   8798     }
   8799 
   8800   /* Step 4: Epilogue to copy the remaining bytes.  */
   8801  epilogue:
   8802   if (label)
   8803     {
   8804       /* When the main loop is done, COUNT_EXP might hold original count,
   8805 	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
   8806 	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
   8807 	 bytes. Compensate if needed.  */
   8808 
   8809       if (size_needed < epilogue_size_needed)
   8810 	{
   8811 	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
   8812 				     GEN_INT (size_needed - 1), count_exp, 1,
   8813 				     OPTAB_DIRECT);
   8814 	  if (tmp != count_exp)
   8815 	    emit_move_insn (count_exp, tmp);
   8816 	}
   8817       emit_label (label);
   8818       LABEL_NUSES (label) = 1;
   8819     }
   8820 
   8821   if (count_exp != const0_rtx && epilogue_size_needed > 1)
   8822     {
   8823       if (force_loopy_epilogue)
   8824 	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
   8825 					 epilogue_size_needed);
   8826       else
   8827 	{
   8828 	  if (issetmem)
   8829 	    expand_setmem_epilogue (dst, destreg, promoted_val,
   8830 				    vec_promoted_val, count_exp,
   8831 				    epilogue_size_needed);
   8832 	  else
   8833 	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
   8834 				    epilogue_size_needed);
   8835 	}
   8836     }
   8837   if (jump_around_label)
   8838     emit_label (jump_around_label);
   8839   return true;
   8840 }
   8841 
   8842 /* Expand cmpstrn or memcmp.  */
   8843 
/* RESULT receives the (sign-extended) comparison outcome, SRC1/SRC2
   are the two memory blocks, LENGTH is the byte count (for strncmp,
   the maximum count), ALIGN the known alignment, and IS_CMPSTRN
   distinguishes strncmp (true) from memcmp (false).  Returns true if
   an inline expansion was emitted, false to fall back to a libcall.  */

bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
			       rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
	 than actual string lengths.  We can expand the cmpstrn pattern
	 to "repz cmpsb" only if one of the strings is a constant so
	 that expand_builtin_strncmp() can write the length argument to
	 be the minimum of the const string length and the actual length
	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
		 == STRING_CST))
	    || (t2 && TREE_CODE (t2) == MEM_REF
		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
		    == STRING_CST))))
	return false;
    }

  /* Force both block addresses into registers and rewrite the MEMs to
     use them, so the cmpstrn patterns see plain register addresses.  */
  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      if (length == const0_rtx)
	{
	  /* memcmp/strncmp with a literal zero length compares equal.  */
	  emit_move_insn (result, const0_rtx);
	  return true;
	}
      /* Length is a nonzero constant (the zero case returned above),
	 so use the _nz pattern variant.  */
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
				     src1, src2));
    }
  else
    {
      /* NOTE(review): the self-compare appears to pre-set "equal"
	 flags for the case where the runtime count is zero, since
	 "repz cmpsb" then leaves the flags untouched — confirm against
	 the cmpstrnqi_1 pattern in i386.md.  */
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
				  src1, src2));
    }

  /* Materialize the flag result as a QImode value (see cmpintqi in
     i386.md) and sign-extend it into RESULT.  */
  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
   8921 
   8922 /* Expand the appropriate insns for doing strlen if not just doing
   8923    repnz; scasb
   8924 
   8925    out = result, initialized with the start address
   8926    align_rtx = alignment of the address.
   scratch = scratch register, initialized with the start address when
	not aligned, otherwise undefined
   8929 
   8930    This is just the body. It needs the initializations mentioned above and
   8931    some address computing at the end.  These things are done in i386.md.  */
   8932 
static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  /* Labels for the byte-at-a-time alignment prologue.  */
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  /* Jumped to as soon as a zero byte has been found; OUT then points
     at (or near) that byte.  */
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
	{
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);

	  /* Dispatch on the misalignment: 0 -> main loop, 2 -> check
	     two bytes, 1 or 3 -> check up to three bytes.  */
	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	}
      else
        {
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if is aligned to 4 - byte.  */

	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
        }

      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned byte on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
	{
	  emit_label (align_2_label);

	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
				   end_0_label);

	  emit_insn (gen_add2_insn (out, const1_rtx));

	  emit_label (align_3_label);
	}

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
			       end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.
     Classic "has a zero byte" bit trick:
       tmpreg = (word - 0x01010101) & ~word & 0x80808080.  */

  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
			   align_4_label);

  /* A zero byte was found in the word; narrow it down to the exact
     byte.  With cmov this is done branchlessly, otherwise with one
     conditional jump.  */
  if (TARGET_CMOVE)
    {
       rtx reg = gen_reg_rtx (SImode);
       rtx reg2 = gen_reg_rtx (Pmode);
       emit_move_insn (reg, tmpreg);
       emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

       /* If zero is not in the first two bytes, move two bytes forward.  */
       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
       emit_insn (gen_rtx_SET (tmpreg,
			       gen_rtx_IF_THEN_ELSE (SImode, tmp,
						     reg,
						     tmpreg)));
       /* Emit lea manually to avoid clobbering of flags.  */
       emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
       emit_insn (gen_rtx_SET (out,
			       gen_rtx_IF_THEN_ELSE (Pmode, tmp,
						     reg2,
						     out)));
    }
  else
    {
       rtx_code_label *end_2_label = gen_label_rtx ();
       /* Is zero in the first two bytes? */

       emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
       tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
       tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
                            gen_rtx_LABEL_REF (VOIDmode, end_2_label),
                            pc_rtx);
       tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
       JUMP_LABEL (tmp) = end_2_label;

       /* Not in the first two.  Move two bytes forward.  */
       emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
       emit_insn (gen_add2_insn (out, const2_rtx));

       emit_label (end_2_label);

    }

  /* Avoid branch in fixing the byte.  OUT was advanced past the whole
     word; subtract 3 minus the carry from the low-byte test to land on
     the zero byte.  NOTE(review): relies on the carry semantics of
     addqi3_cconly_overflow — confirm against i386.md.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
   9093 
   9094 /* Expand strlen.  */
   9095 
   9096 bool
   9097 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
   9098 {
   9099 if (TARGET_UNROLL_STRLEN
   9100 	   && TARGET_INLINE_ALL_STRINGOPS
   9101 	   && eoschar == const0_rtx
   9102 	   && optimize > 1)
   9103     {
   9104       /* The generic case of strlen expander is long.  Avoid it's
   9105 	 expanding unless TARGET_INLINE_ALL_STRINGOPS.  */
   9106       rtx addr = force_reg (Pmode, XEXP (src, 0));
   9107       /* Well it seems that some optimizer does not combine a call like
   9108 	 foo(strlen(bar), strlen(bar));
   9109 	 when the move and the subtraction is done here.  It does calculate
   9110 	 the length just once when these instructions are done inside of
   9111 	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
   9112 	 often used and I use one fewer register for the lifetime of
   9113 	 output_strlen_unroll() this is better.  */
   9114 
   9115       emit_move_insn (out, addr);
   9116 
   9117       ix86_expand_strlensi_unroll_1 (out, src, align);
   9118 
   9119       /* strlensi_unroll_1 returns the address of the zero at the end of
   9120 	 the string, like memchr(), so compute the length by subtracting
   9121 	 the start address.  */
   9122       emit_insn (gen_sub2_insn (out, addr));
   9123       return true;
   9124     }
   9125   else
   9126     return false;
   9127 }
   9128 
/* For a given symbol (function) construct code to compute the address of
   its PLT entry in the large x86-64 PIC model.  */
   9131 
   9132 static rtx
   9133 construct_plt_address (rtx symbol)
   9134 {
   9135   rtx tmp, unspec;
   9136 
   9137   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
   9138   gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
   9139   gcc_assert (Pmode == DImode);
   9140 
   9141   tmp = gen_reg_rtx (Pmode);
   9142   unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
   9143 
   9144   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
   9145   emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
   9146   return tmp;
   9147 }
   9148 
/* Additional registers that are clobbered by SysV calls: call-clobbered
   in the SysV ABI but call-saved in the MS ABI.  */
   9150 
static int const x86_64_ms_sysv_extra_clobbered_registers
		 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  /* Integer argument registers.  */
  SI_REG, DI_REG,
  /* Vector registers XMM6-XMM15.  */
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
   9159 
/* Expand a (possibly sibling) call.  RETVAL is where to store the
   callee's return value, or NULL for a call in void context.  FNADDR
   is a (mem (addr)) wrapping the function address.  CALLARG1 becomes
   the second operand of the CALL rtx.  CALLARG2 is consulted on
   64-bit targets to set up AL before the call (assumed: the number of
   vector registers used by a varargs call — confirm with the caller)
   and the special value -2 suppresses the extra MS->SysV clobbers.
   POP is the number of bytes the callee pops on return (NULL or
   const0_rtx if none).  SIBCALL is true for a sibling call.  Returns
   the emitted call insn.  */

rtx_insn *
ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
		  rtx callarg2,
		  rtx pop, bool sibcall)
{
  rtx vec[3];
  rtx use = NULL, call;
  unsigned int vec_len = 0;
  tree fndecl;

  /* Diagnose a direct call to an interrupt service routine: those use
     a special ABI and cannot be called like ordinary functions.  */
  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
    {
      fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
      if (fndecl
	  && (lookup_attribute ("interrupt",
				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
	error ("interrupt service routine cannot be called directly");
    }
  else
    fndecl = NULL_TREE;

  /* Callee-pop of zero bytes is the same as no callee-pop; 64-bit
     ABIs never use callee-pop.  */
  if (pop == const0_rtx)
    pop = NULL;
  gcc_assert (!TARGET_64BIT || !pop);

  rtx addr = XEXP (fnaddr, 0);
  if (TARGET_MACHO && !TARGET_64BIT)
    {
#if TARGET_MACHO
      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
	fnaddr = machopic_indirect_call_target (fnaddr);
#endif
    }
  else
    {
      /* Static functions and indirect calls don't need the pic register.  Also,
	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
	 it an indirect call.  */
      if (flag_pic
	  && GET_CODE (addr) == SYMBOL_REF
	  && ix86_call_use_plt_p (addr))
	{
	  if (flag_plt
	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
		  || !lookup_attribute ("noplt",
					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
	    {
	      /* PLT call: make the PIC register live at the call, and
		 materialize it into the hard register when a pseudo PIC
		 register is in use.  */
	      if (!TARGET_64BIT
		  || (ix86_cmodel == CM_LARGE_PIC
		      && DEFAULT_ABI != MS_ABI))
		{
		  use_reg (&use, gen_rtx_REG (Pmode,
					      REAL_PIC_OFFSET_TABLE_REGNUM));
		  if (ix86_use_pseudo_pic_reg ())
		    emit_move_insn (gen_rtx_REG (Pmode,
						 REAL_PIC_OFFSET_TABLE_REGNUM),
				    pic_offset_table_rtx);
		}
	    }
	  else if (!TARGET_PECOFF && !TARGET_MACHO)
	    {
	      /* PLT is avoided: load the function address from the GOT
		 and emit an indirect call through it.  */
	      if (TARGET_64BIT
		  && ix86_cmodel == CM_LARGE_PIC
		  && DEFAULT_ABI != MS_ABI)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = force_reg (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
		}
	      else if (TARGET_64BIT)
		{
		  /* Small/medium model: a RIP-relative GOT slot.  */
		  fnaddr = gen_rtx_UNSPEC (Pmode,
					   gen_rtvec (1, addr),
					   UNSPEC_GOTPCREL);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		}
	      else
		{
		  /* 32-bit: GOT slot addressed off the PIC register.  */
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
					 fnaddr);
		}
	      fnaddr = gen_const_mem (Pmode, fnaddr);
	      /* Pmode may not be the same as word_mode for x32, which
		 doesn't support indirect branch via 32-bit memory slot.
		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
		 indirect branch via x32 GOT slot is OK.  */
	      if (GET_MODE (fnaddr) != word_mode)
		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
	    }
	}
    }

  /* Skip setting up RAX register for -mskip-rax-setup when there are no
     parameters passed in vector registers.  */
  if (TARGET_64BIT
      && (INTVAL (callarg2) > 0
	  || (INTVAL (callarg2) == 0
	      && (TARGET_SSE || !flag_skip_rax_setup))))
    {
      rtx al = gen_rtx_REG (QImode, AX_REG);
      emit_move_insn (al, callarg2);
      use_reg (&use, al);
    }

  /* In the large PIC model a non-local symbol is called through its
     PLT slot, whose address must be computed at runtime.  */
  if (ix86_cmodel == CM_LARGE_PIC
      && !TARGET_PECOFF
      && MEM_P (fnaddr)
      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
  /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
     branch via x32 GOT slot is OK.  */
  else if (!(TARGET_X32
	     && MEM_P (fnaddr)
	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
	   && (sibcall
	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
    {
      /* The address doesn't match the insn predicate: force it into a
	 register so the (sib)call pattern accepts it.  */
      fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
      fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
    }

  /* Build the CALL rtx; wrap it in a SET when the value is used, and
     below in a PARALLEL when the callee pops its arguments.  */
  call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);

  if (retval)
    call = gen_rtx_SET (retval, call);
  vec[vec_len++] = call;

  if (pop)
    {
      pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
      pop = gen_rtx_SET (stack_pointer_rtx, pop);
      vec[vec_len++] = pop;
    }

  if (cfun->machine->no_caller_saved_registers
      && (!fndecl
	  || (!TREE_THIS_VOLATILE (fndecl)
	      && !lookup_attribute ("no_caller_saved_registers",
				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
    {
      static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
      bool is_64bit_ms_abi = (TARGET_64BIT
			      && ix86_function_abi (fndecl) == MS_ABI);
      char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);

      /* If there are no caller-saved registers, add all registers
	 that are clobbered by the call which returns.  */
      for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	if (!fixed_regs[i]
	    && (ix86_call_used_regs[i] == 1
		|| (ix86_call_used_regs[i] & c_mask))
	    && !STACK_REGNO_P (i)
	    && !MMX_REGNO_P (i))
	  clobber_reg (&use,
		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
    }
  else if (TARGET_64BIT_MS_ABI
	   && (!callarg2 || INTVAL (callarg2) != -2))
    {
      /* NOTE(review): these extra clobbers appear to cover calling a
	 SysV-ABI function from MS-ABI code (see the table's comment);
	 CALLARG2 == -2 suppresses them.  */
      unsigned i;

      for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
	{
	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;

	  clobber_reg (&use, gen_rtx_REG (mode, regno));
	}

      /* Set here, but it may get cleared later.  */
      if (TARGET_CALL_MS2SYSV_XLOGUES)
	{
	  if (!TARGET_SSE)
	    ;

	  /* Don't break hot-patched functions.  */
	  else if (ix86_function_ms_hook_prologue (current_function_decl))
	    ;

	  /* TODO: Cases not yet examined.  */
	  else if (flag_split_stack)
	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");

	  else
	    {
	      gcc_assert (!reload_completed);
	      cfun->machine->call_ms2sysv = true;
	    }
	}
    }

  if (TARGET_MACHO && TARGET_64BIT && !sibcall
      && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
	  || !fndecl || TREE_PUBLIC (fndecl)))
    {
      /* We allow public functions defined in a TU to bind locally for PIC
	 code (the default) on 64bit Mach-O.
	 If such functions are not inlined, we cannot tell at compile-time if
	 they will be called via the lazy symbol resolver (this can depend on
	 options given at link-time).  Therefore, we must assume that the lazy
	 resolver could be used which clobbers R11 and R10.  */
      clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
      clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
    }

  /* Emit the call and attach the accumulated USE/CLOBBER list.  */
  if (vec_len > 1)
    call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
  rtx_insn *call_insn = emit_call_insn (call);
  if (use)
    CALL_INSN_FUNCTION_USAGE (call_insn) = use;

  return call_insn;
}
   9382 
/* Split a simple return that pops POPC bytes from the stack into an
   indirect branch with a stack adjustment.  */
   9385 
void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  /* Pop the return address into ECX and track the frame state.  */
  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  /* Record, for unwind info, the CFA adjustment and that ECX now
     holds the return address.  */
  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Release the POPC bytes of stack arguments, again with an unwind
     note for the CFA adjustment.  */
  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
   9415 
   9416 /* Errors in the source file can cause expand_expr to return const0_rtx
   9417    where we expect a vector.  To avoid crashing, use one of the vector
   9418    clear instructions.  */
   9419 
   9420 static rtx
   9421 safe_vector_operand (rtx x, machine_mode mode)
   9422 {
   9423   if (x == const0_rtx)
   9424     x = CONST0_RTX (mode);
   9425   return x;
   9426 }
   9427 
   9428 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
   9429 
   9430 static rtx
   9431 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
   9432 {
   9433   rtx pat;
   9434   tree arg0 = CALL_EXPR_ARG (exp, 0);
   9435   tree arg1 = CALL_EXPR_ARG (exp, 1);
   9436   rtx op0 = expand_normal (arg0);
   9437   rtx op1 = expand_normal (arg1);
   9438   machine_mode tmode = insn_data[icode].operand[0].mode;
   9439   machine_mode mode0 = insn_data[icode].operand[1].mode;
   9440   machine_mode mode1 = insn_data[icode].operand[2].mode;
   9441 
   9442   if (VECTOR_MODE_P (mode0))
   9443     op0 = safe_vector_operand (op0, mode0);
   9444   if (VECTOR_MODE_P (mode1))
   9445     op1 = safe_vector_operand (op1, mode1);
   9446 
   9447   if (optimize || !target
   9448       || GET_MODE (target) != tmode
   9449       || !insn_data[icode].operand[0].predicate (target, tmode))
   9450     target = gen_reg_rtx (tmode);
   9451 
   9452   if (GET_MODE (op1) == SImode && mode1 == TImode)
   9453     {
   9454       rtx x = gen_reg_rtx (V4SImode);
   9455       emit_insn (gen_sse2_loadd (x, op1));
   9456       op1 = gen_lowpart (TImode, x);
   9457     }
   9458 
   9459   if (!insn_data[icode].operand[1].predicate (op0, mode0))
   9460     op0 = copy_to_mode_reg (mode0, op0);
   9461   if (!insn_data[icode].operand[2].predicate (op1, mode1))
   9462     op1 = copy_to_mode_reg (mode1, op1);
   9463 
   9464   pat = GEN_FCN (icode) (target, op0, op1);
   9465   if (! pat)
   9466     return 0;
   9467 
   9468   emit_insn (pat);
   9469 
   9470   return target;
   9471 }
   9472 
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.  */

static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
			       enum ix86_builtin_func_type m_type,
			       enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  /* When true, insn operand 1 is a comparison rtx built from SUB_CODE,
     so the builtin arguments fill insn operands 2.. instead of 1...  */
  bool comparison_p = false;
  /* When true, SUB_CODE is passed to the insn as a trailing CONST_INT.  */
  bool tf_p = false;
  /* When true, the last builtin argument must satisfy the insn's
     (immediate) operand predicate.  */
  bool last_arg_constant = false;
  /* Count of memory operands accepted so far; when not optimizing only
     one memory operand is allowed in total (see below).  */
  int num_memory = 0;
  rtx xops[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  /* Derive the operand count and the special operand-handling flags
     from the builtin's function type.  */
  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  /* Use a fresh pseudo for the result unless the caller-provided TARGET
     already has the right mode and satisfies operand 0's predicate.  */
  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      /* For comparison insns operand 1 is the comparison rtx, so each
	 builtin argument maps to an insn operand shifted by one more.  */
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
	{
	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
	    {
	      /* The supposedly-constant last argument failed its
		 predicate.  For the XOP rotates we can still recover;
		 anything else is a hard error.  */
	      enum insn_code new_icode = icode;
	      switch (icode)
		{
		case CODE_FOR_xop_vpermil2v2df3:
		case CODE_FOR_xop_vpermil2v4sf3:
		case CODE_FOR_xop_vpermil2v4df3:
		case CODE_FOR_xop_vpermil2v8sf3:
		  error ("the last argument must be a 2-bit immediate");
		  return gen_reg_rtx (tmode);
		case CODE_FOR_xop_rotlv2di3:
		  new_icode = CODE_FOR_rotlv2di3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv4si3:
		  new_icode = CODE_FOR_rotlv4si3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv8hi3:
		  new_icode = CODE_FOR_rotlv8hi3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv16qi3:
		  new_icode = CODE_FOR_rotlv16qi3;
		xop_rotl:
		  if (CONST_INT_P (op))
		    {
		      /* An out-of-range constant rotate count can simply
			 be reduced modulo the element bit width.  */
		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
		      op = GEN_INT (INTVAL (op) & mask);
		      gcc_checking_assert
			(insn_data[icode].operand[i + 1].predicate (op, mode));
		    }
		  else
		    {
		      /* A variable rotate count: fall back to the generic
			 rotate pattern, which accepts non-constant counts.
			 The assert verifies the two patterns agree on
			 operand modes and predicates before swapping.  */
		      gcc_checking_assert
			(nargs == 2
			 && insn_data[new_icode].operand[0].mode == tmode
			 && insn_data[new_icode].operand[1].mode == tmode
			 && insn_data[new_icode].operand[2].mode == mode
			 && insn_data[new_icode].operand[0].predicate
			    == insn_data[icode].operand[0].predicate
			 && insn_data[new_icode].operand[1].predicate
			    == insn_data[icode].operand[1].predicate);
		      icode = new_icode;
		      goto non_constant;
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    }
	}
      else
	{
	non_constant:
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

	  if (optimize
	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
	      || num_memory > 1)
	    op = force_reg (mode, op);
	}

      xops[i] = op;
    }

  /* Emit the insn; 2-argument forms may need the comparison rtx or the
     SUB_CODE immediate appended, as decoded above.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;

    case 2:
      if (tf_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1],
			       GEN_INT ((int)sub_code));
      else if (! comparison_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      else
	{
	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
				       xops[0], xops[1]);

	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
	}
      break;

    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
   9702 
   9703 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
   9704    insns with vec_merge.  */
   9705 
   9706 static rtx
   9707 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
   9708 				    rtx target)
   9709 {
   9710   rtx pat;
   9711   tree arg0 = CALL_EXPR_ARG (exp, 0);
   9712   rtx op1, op0 = expand_normal (arg0);
   9713   machine_mode tmode = insn_data[icode].operand[0].mode;
   9714   machine_mode mode0 = insn_data[icode].operand[1].mode;
   9715 
   9716   if (optimize || !target
   9717       || GET_MODE (target) != tmode
   9718       || !insn_data[icode].operand[0].predicate (target, tmode))
   9719     target = gen_reg_rtx (tmode);
   9720 
   9721   if (VECTOR_MODE_P (mode0))
   9722     op0 = safe_vector_operand (op0, mode0);
   9723 
   9724   if ((optimize && !register_operand (op0, mode0))
   9725       || !insn_data[icode].operand[1].predicate (op0, mode0))
   9726     op0 = copy_to_mode_reg (mode0, op0);
   9727 
   9728   op1 = op0;
   9729   if (!insn_data[icode].operand[2].predicate (op1, mode0))
   9730     op1 = copy_to_mode_reg (mode0, op1);
   9731 
   9732   pat = GEN_FCN (icode) (target, op0, op1);
   9733   if (! pat)
   9734     return 0;
   9735   emit_insn (pat);
   9736   return target;
   9737 }
   9738 
   9739 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
   9740 
   9741 static rtx
   9742 ix86_expand_sse_compare (const struct builtin_description *d,
   9743 			 tree exp, rtx target, bool swap)
   9744 {
   9745   rtx pat;
   9746   tree arg0 = CALL_EXPR_ARG (exp, 0);
   9747   tree arg1 = CALL_EXPR_ARG (exp, 1);
   9748   rtx op0 = expand_normal (arg0);
   9749   rtx op1 = expand_normal (arg1);
   9750   rtx op2;
   9751   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   9752   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   9753   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
   9754   enum rtx_code comparison = d->comparison;
   9755 
   9756   if (VECTOR_MODE_P (mode0))
   9757     op0 = safe_vector_operand (op0, mode0);
   9758   if (VECTOR_MODE_P (mode1))
   9759     op1 = safe_vector_operand (op1, mode1);
   9760 
   9761   /* Swap operands if we have a comparison that isn't available in
   9762      hardware.  */
   9763   if (swap)
   9764     std::swap (op0, op1);
   9765 
   9766   if (optimize || !target
   9767       || GET_MODE (target) != tmode
   9768       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   9769     target = gen_reg_rtx (tmode);
   9770 
   9771   if ((optimize && !register_operand (op0, mode0))
   9772       || !insn_data[d->icode].operand[1].predicate (op0, mode0))
   9773     op0 = copy_to_mode_reg (mode0, op0);
   9774   if ((optimize && !register_operand (op1, mode1))
   9775       || !insn_data[d->icode].operand[2].predicate (op1, mode1))
   9776     op1 = copy_to_mode_reg (mode1, op1);
   9777 
   9778   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
   9779   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
   9780   if (! pat)
   9781     return 0;
   9782   emit_insn (pat);
   9783   return target;
   9784 }
   9785 
   9786 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
   9787 
   9788 static rtx
   9789 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
   9790 		      rtx target)
   9791 {
   9792   rtx pat;
   9793   tree arg0 = CALL_EXPR_ARG (exp, 0);
   9794   tree arg1 = CALL_EXPR_ARG (exp, 1);
   9795   rtx op0 = expand_normal (arg0);
   9796   rtx op1 = expand_normal (arg1);
   9797   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
   9798   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
   9799   enum rtx_code comparison = d->comparison;
   9800 
   9801   if (VECTOR_MODE_P (mode0))
   9802     op0 = safe_vector_operand (op0, mode0);
   9803   if (VECTOR_MODE_P (mode1))
   9804     op1 = safe_vector_operand (op1, mode1);
   9805 
   9806   target = gen_reg_rtx (SImode);
   9807   emit_move_insn (target, const0_rtx);
   9808   target = gen_rtx_SUBREG (QImode, target, 0);
   9809 
   9810   if ((optimize && !register_operand (op0, mode0))
   9811       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   9812     op0 = copy_to_mode_reg (mode0, op0);
   9813   if ((optimize && !register_operand (op1, mode1))
   9814       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
   9815     op1 = copy_to_mode_reg (mode1, op1);
   9816 
   9817   pat = GEN_FCN (d->icode) (op0, op1);
   9818   if (! pat)
   9819     return 0;
   9820   emit_insn (pat);
   9821   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   9822 			  gen_rtx_fmt_ee (comparison, QImode,
   9823 					  SET_DEST (pat),
   9824 					  const0_rtx)));
   9825 
   9826   return SUBREG_REG (target);
   9827 }
   9828 
   9829 /* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
   9830 
   9831 static rtx
   9832 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
   9833 		       rtx target)
   9834 {
   9835   rtx pat;
   9836   tree arg0 = CALL_EXPR_ARG (exp, 0);
   9837   rtx op1, op0 = expand_normal (arg0);
   9838   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   9839   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   9840 
   9841   if (optimize || target == 0
   9842       || GET_MODE (target) != tmode
   9843       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   9844     target = gen_reg_rtx (tmode);
   9845 
   9846   if (VECTOR_MODE_P (mode0))
   9847     op0 = safe_vector_operand (op0, mode0);
   9848 
   9849   if ((optimize && !register_operand (op0, mode0))
   9850       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   9851     op0 = copy_to_mode_reg (mode0, op0);
   9852 
   9853   op1 = GEN_INT (d->comparison);
   9854 
   9855   pat = GEN_FCN (d->icode) (target, op0, op1);
   9856   if (! pat)
   9857     return 0;
   9858   emit_insn (pat);
   9859   return target;
   9860 }
   9861 
   9862 static rtx
   9863 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
   9864 				     tree exp, rtx target)
   9865 {
   9866   rtx pat;
   9867   tree arg0 = CALL_EXPR_ARG (exp, 0);
   9868   tree arg1 = CALL_EXPR_ARG (exp, 1);
   9869   rtx op0 = expand_normal (arg0);
   9870   rtx op1 = expand_normal (arg1);
   9871   rtx op2;
   9872   machine_mode tmode = insn_data[d->icode].operand[0].mode;
   9873   machine_mode mode0 = insn_data[d->icode].operand[1].mode;
   9874   machine_mode mode1 = insn_data[d->icode].operand[2].mode;
   9875 
   9876   if (optimize || target == 0
   9877       || GET_MODE (target) != tmode
   9878       || !insn_data[d->icode].operand[0].predicate (target, tmode))
   9879     target = gen_reg_rtx (tmode);
   9880 
   9881   op0 = safe_vector_operand (op0, mode0);
   9882   op1 = safe_vector_operand (op1, mode1);
   9883 
   9884   if ((optimize && !register_operand (op0, mode0))
   9885       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   9886     op0 = copy_to_mode_reg (mode0, op0);
   9887   if ((optimize && !register_operand (op1, mode1))
   9888       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
   9889     op1 = copy_to_mode_reg (mode1, op1);
   9890 
   9891   op2 = GEN_INT (d->comparison);
   9892 
   9893   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
   9894   if (! pat)
   9895     return 0;
   9896   emit_insn (pat);
   9897   return target;
   9898 }
   9899 
   9900 /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
   9901 
   9902 static rtx
   9903 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
   9904 		       rtx target)
   9905 {
   9906   rtx pat;
   9907   tree arg0 = CALL_EXPR_ARG (exp, 0);
   9908   tree arg1 = CALL_EXPR_ARG (exp, 1);
   9909   rtx op0 = expand_normal (arg0);
   9910   rtx op1 = expand_normal (arg1);
   9911   machine_mode mode0 = insn_data[d->icode].operand[0].mode;
   9912   machine_mode mode1 = insn_data[d->icode].operand[1].mode;
   9913   enum rtx_code comparison = d->comparison;
   9914 
   9915   if (VECTOR_MODE_P (mode0))
   9916     op0 = safe_vector_operand (op0, mode0);
   9917   if (VECTOR_MODE_P (mode1))
   9918     op1 = safe_vector_operand (op1, mode1);
   9919 
   9920   target = gen_reg_rtx (SImode);
   9921   emit_move_insn (target, const0_rtx);
   9922   target = gen_rtx_SUBREG (QImode, target, 0);
   9923 
   9924   if ((optimize && !register_operand (op0, mode0))
   9925       || !insn_data[d->icode].operand[0].predicate (op0, mode0))
   9926     op0 = copy_to_mode_reg (mode0, op0);
   9927   if ((optimize && !register_operand (op1, mode1))
   9928       || !insn_data[d->icode].operand[1].predicate (op1, mode1))
   9929     op1 = copy_to_mode_reg (mode1, op1);
   9930 
   9931   pat = GEN_FCN (d->icode) (op0, op1);
   9932   if (! pat)
   9933     return 0;
   9934   emit_insn (pat);
   9935   emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   9936 			  gen_rtx_fmt_ee (comparison, QImode,
   9937 					  SET_DEST (pat),
   9938 					  const0_rtx)));
   9939 
   9940   return SUBREG_REG (target);
   9941 }
   9942 
/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.  */

static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  /* The pcmpestr pattern has two outputs — operand 0 (index result)
     and operand 1 (mask result) — followed by vector/length pairs in
     operands 2-5 and the mode immediate in operand 6.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  /* Legitimize each input against the predicate of the insn operand
     it will fill.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  /* The mode operand must be a compile-time constant.  */
  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      /* pcmpestri: caller wants the index result; the mask output is
	 a throw-away scratch.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      /* pcmpestrm: caller wants the mask result; the index output is
	 a throw-away scratch.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      /* Flag-extracting variant: both value outputs are scratches and
	 the interesting result is a condition flag (d->flag holds the
	 flags-register mode to test).  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Materialize the requested flag as 0/1: zero an SImode pseudo,
	 then store the EQ test of FLAGS_REG into its low byte.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
   10046 
   10047 
/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.  */

static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  /* The pcmpistr pattern has two outputs — operand 0 (index result)
     and operand 1 (mask result) — followed by the two vectors in
     operands 2-3 and the mode immediate in operand 4.  Unlike
     pcmpestr, there are no explicit length operands.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  /* Legitimize each input against the predicate of the insn operand
     it will fill.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  /* The mode operand must be a compile-time constant.  */
  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      /* pcmpistri: caller wants the index result; the mask output is
	 a throw-away scratch.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      /* pcmpistrm: caller wants the mask result; the index output is
	 a throw-away scratch.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      /* Flag-extracting variant: both value outputs are scratches and
	 the interesting result is a condition flag (d->flag holds the
	 flags-register mode to test).  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Materialize the requested flag as 0/1: zero an SImode pseudo,
	 then store the EQ test of FLAGS_REG into its low byte.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
   10141 
   10142 /* Fixup modeless constants to fit required mode.  */
   10143 
   10144 static rtx
   10145 fixup_modeless_constant (rtx x, machine_mode mode)
   10146 {
   10147   if (GET_MODE (x) == VOIDmode)
   10148     x = convert_to_mode (mode, x, 1);
   10149   return x;
   10150 }
   10151 
   10152 /* Subroutine of ix86_expand_builtin to take care of insns with
   10153    variable number of operands.  */
   10154 
   10155 static rtx
   10156 ix86_expand_args_builtin (const struct builtin_description *d,
   10157 			  tree exp, rtx target)
   10158 {
   10159   rtx pat, real_target;
   10160   unsigned int i, nargs;
   10161   unsigned int nargs_constant = 0;
   10162   unsigned int mask_pos = 0;
   10163   int num_memory = 0;
   10164   rtx xops[6];
   10165   bool second_arg_count = false;
   10166   enum insn_code icode = d->icode;
   10167   const struct insn_data_d *insn_p = &insn_data[icode];
   10168   machine_mode tmode = insn_p->operand[0].mode;
   10169   machine_mode rmode = VOIDmode;
   10170   bool swap = false;
   10171   enum rtx_code comparison = d->comparison;
   10172 
   10173   switch ((enum ix86_builtin_func_type) d->flag)
   10174     {
   10175     case V2DF_FTYPE_V2DF_ROUND:
   10176     case V4DF_FTYPE_V4DF_ROUND:
   10177     case V8DF_FTYPE_V8DF_ROUND:
   10178     case V4SF_FTYPE_V4SF_ROUND:
   10179     case V8SF_FTYPE_V8SF_ROUND:
   10180     case V16SF_FTYPE_V16SF_ROUND:
   10181     case V8HF_FTYPE_V8HF_ROUND:
   10182     case V16HF_FTYPE_V16HF_ROUND:
   10183     case V32HF_FTYPE_V32HF_ROUND:
   10184     case V4SI_FTYPE_V4SF_ROUND:
   10185     case V8SI_FTYPE_V8SF_ROUND:
   10186     case V16SI_FTYPE_V16SF_ROUND:
   10187       return ix86_expand_sse_round (d, exp, target);
   10188     case V4SI_FTYPE_V2DF_V2DF_ROUND:
   10189     case V8SI_FTYPE_V4DF_V4DF_ROUND:
   10190     case V16SI_FTYPE_V8DF_V8DF_ROUND:
   10191       return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
   10192     case INT_FTYPE_V8SF_V8SF_PTEST:
   10193     case INT_FTYPE_V4DI_V4DI_PTEST:
   10194     case INT_FTYPE_V4DF_V4DF_PTEST:
   10195     case INT_FTYPE_V4SF_V4SF_PTEST:
   10196     case INT_FTYPE_V2DI_V2DI_PTEST:
   10197     case INT_FTYPE_V2DF_V2DF_PTEST:
   10198       return ix86_expand_sse_ptest (d, exp, target);
   10199     case FLOAT128_FTYPE_FLOAT128:
   10200     case FLOAT_FTYPE_FLOAT:
   10201     case INT_FTYPE_INT:
   10202     case UINT_FTYPE_UINT:
   10203     case UINT16_FTYPE_UINT16:
   10204     case UINT64_FTYPE_INT:
   10205     case UINT64_FTYPE_UINT64:
   10206     case INT64_FTYPE_INT64:
   10207     case INT64_FTYPE_V4SF:
   10208     case INT64_FTYPE_V2DF:
   10209     case INT_FTYPE_V16QI:
   10210     case INT_FTYPE_V8QI:
   10211     case INT_FTYPE_V8SF:
   10212     case INT_FTYPE_V4DF:
   10213     case INT_FTYPE_V4SF:
   10214     case INT_FTYPE_V2DF:
   10215     case INT_FTYPE_V32QI:
   10216     case V16QI_FTYPE_V16QI:
   10217     case V8SI_FTYPE_V8SF:
   10218     case V8SI_FTYPE_V4SI:
   10219     case V8HI_FTYPE_V8HI:
   10220     case V8HI_FTYPE_V16QI:
   10221     case V8QI_FTYPE_V8QI:
   10222     case V8SF_FTYPE_V8SF:
   10223     case V8SF_FTYPE_V8SI:
   10224     case V8SF_FTYPE_V4SF:
   10225     case V8SF_FTYPE_V8HI:
   10226     case V4SI_FTYPE_V4SI:
   10227     case V4SI_FTYPE_V16QI:
   10228     case V4SI_FTYPE_V4SF:
   10229     case V4SI_FTYPE_V8SI:
   10230     case V4SI_FTYPE_V8HI:
   10231     case V4SI_FTYPE_V4DF:
   10232     case V4SI_FTYPE_V2DF:
   10233     case V4HI_FTYPE_V4HI:
   10234     case V4DF_FTYPE_V4DF:
   10235     case V4DF_FTYPE_V4SI:
   10236     case V4DF_FTYPE_V4SF:
   10237     case V4DF_FTYPE_V2DF:
   10238     case V4SF_FTYPE_V4SF:
   10239     case V4SF_FTYPE_V4SI:
   10240     case V4SF_FTYPE_V8SF:
   10241     case V4SF_FTYPE_V4DF:
   10242     case V4SF_FTYPE_V8HI:
   10243     case V4SF_FTYPE_V2DF:
   10244     case V2DI_FTYPE_V2DI:
   10245     case V2DI_FTYPE_V16QI:
   10246     case V2DI_FTYPE_V8HI:
   10247     case V2DI_FTYPE_V4SI:
   10248     case V2DF_FTYPE_V2DF:
   10249     case V2DF_FTYPE_V4SI:
   10250     case V2DF_FTYPE_V4DF:
   10251     case V2DF_FTYPE_V4SF:
   10252     case V2DF_FTYPE_V2SI:
   10253     case V2SI_FTYPE_V2SI:
   10254     case V2SI_FTYPE_V4SF:
   10255     case V2SI_FTYPE_V2SF:
   10256     case V2SI_FTYPE_V2DF:
   10257     case V2SF_FTYPE_V2SF:
   10258     case V2SF_FTYPE_V2SI:
   10259     case V32QI_FTYPE_V32QI:
   10260     case V32QI_FTYPE_V16QI:
   10261     case V16HI_FTYPE_V16HI:
   10262     case V16HI_FTYPE_V8HI:
   10263     case V8SI_FTYPE_V8SI:
   10264     case V16HI_FTYPE_V16QI:
   10265     case V8SI_FTYPE_V16QI:
   10266     case V4DI_FTYPE_V16QI:
   10267     case V8SI_FTYPE_V8HI:
   10268     case V4DI_FTYPE_V8HI:
   10269     case V4DI_FTYPE_V4SI:
   10270     case V4DI_FTYPE_V2DI:
   10271     case UQI_FTYPE_UQI:
   10272     case UHI_FTYPE_UHI:
   10273     case USI_FTYPE_USI:
   10274     case USI_FTYPE_UQI:
   10275     case USI_FTYPE_UHI:
   10276     case UDI_FTYPE_UDI:
   10277     case UHI_FTYPE_V16QI:
   10278     case USI_FTYPE_V32QI:
   10279     case UDI_FTYPE_V64QI:
   10280     case V16QI_FTYPE_UHI:
   10281     case V32QI_FTYPE_USI:
   10282     case V64QI_FTYPE_UDI:
   10283     case V8HI_FTYPE_UQI:
   10284     case V16HI_FTYPE_UHI:
   10285     case V32HI_FTYPE_USI:
   10286     case V4SI_FTYPE_UQI:
   10287     case V8SI_FTYPE_UQI:
   10288     case V4SI_FTYPE_UHI:
   10289     case V8SI_FTYPE_UHI:
   10290     case UQI_FTYPE_V8HI:
   10291     case UHI_FTYPE_V16HI:
   10292     case USI_FTYPE_V32HI:
   10293     case UQI_FTYPE_V4SI:
   10294     case UQI_FTYPE_V8SI:
   10295     case UHI_FTYPE_V16SI:
   10296     case UQI_FTYPE_V2DI:
   10297     case UQI_FTYPE_V4DI:
   10298     case UQI_FTYPE_V8DI:
   10299     case V16SI_FTYPE_UHI:
   10300     case V2DI_FTYPE_UQI:
   10301     case V4DI_FTYPE_UQI:
   10302     case V16SI_FTYPE_INT:
   10303     case V16SF_FTYPE_V8SF:
   10304     case V16SI_FTYPE_V8SI:
   10305     case V16SF_FTYPE_V4SF:
   10306     case V16SI_FTYPE_V4SI:
   10307     case V16SI_FTYPE_V16SF:
   10308     case V16SI_FTYPE_V16SI:
   10309     case V64QI_FTYPE_V64QI:
   10310     case V32HI_FTYPE_V32HI:
   10311     case V16SF_FTYPE_V16SF:
   10312     case V8DI_FTYPE_UQI:
   10313     case V8DI_FTYPE_V8DI:
   10314     case V8DF_FTYPE_V4DF:
   10315     case V8DF_FTYPE_V2DF:
   10316     case V8DF_FTYPE_V8DF:
   10317     case V4DI_FTYPE_V4DI:
   10318     case V16HI_FTYPE_V16SF:
   10319     case V8HI_FTYPE_V8SF:
   10320     case V8HI_FTYPE_V4SF:
   10321       nargs = 1;
   10322       break;
   10323     case V4SF_FTYPE_V4SF_VEC_MERGE:
   10324     case V2DF_FTYPE_V2DF_VEC_MERGE:
   10325       return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
   10326     case FLOAT128_FTYPE_FLOAT128_FLOAT128:
   10327     case V16QI_FTYPE_V16QI_V16QI:
   10328     case V16QI_FTYPE_V8HI_V8HI:
   10329     case V16HF_FTYPE_V16HF_V16HF:
   10330     case V16SF_FTYPE_V16SF_V16SF:
   10331     case V8QI_FTYPE_V8QI_V8QI:
   10332     case V8QI_FTYPE_V4HI_V4HI:
   10333     case V8HI_FTYPE_V8HI_V8HI:
   10334     case V8HI_FTYPE_V16QI_V16QI:
   10335     case V8HI_FTYPE_V4SI_V4SI:
   10336     case V8HF_FTYPE_V8HF_V8HF:
   10337     case V8SF_FTYPE_V8SF_V8SF:
   10338     case V8SF_FTYPE_V8SF_V8SI:
   10339     case V8DF_FTYPE_V8DF_V8DF:
   10340     case V4SI_FTYPE_V4SI_V4SI:
   10341     case V4SI_FTYPE_V8HI_V8HI:
   10342     case V4SI_FTYPE_V2DF_V2DF:
   10343     case V4HI_FTYPE_V4HI_V4HI:
   10344     case V4HI_FTYPE_V8QI_V8QI:
   10345     case V4HI_FTYPE_V2SI_V2SI:
   10346     case V4DF_FTYPE_V4DF_V4DF:
   10347     case V4DF_FTYPE_V4DF_V4DI:
   10348     case V4SF_FTYPE_V4SF_V4SF:
   10349     case V4SF_FTYPE_V4SF_V4SI:
   10350     case V4SF_FTYPE_V4SF_V2SI:
   10351     case V4SF_FTYPE_V4SF_V2DF:
   10352     case V4SF_FTYPE_V4SF_UINT:
   10353     case V4SF_FTYPE_V4SF_DI:
   10354     case V4SF_FTYPE_V4SF_SI:
   10355     case V2DI_FTYPE_V2DI_V2DI:
   10356     case V2DI_FTYPE_V16QI_V16QI:
   10357     case V2DI_FTYPE_V4SI_V4SI:
   10358     case V2DI_FTYPE_V2DI_V16QI:
   10359     case V2SI_FTYPE_V2SI_V2SI:
   10360     case V2SI_FTYPE_V4HI_V4HI:
   10361     case V2SI_FTYPE_V2SF_V2SF:
   10362     case V2DF_FTYPE_V2DF_V2DF:
   10363     case V2DF_FTYPE_V2DF_V4SF:
   10364     case V2DF_FTYPE_V2DF_V2DI:
   10365     case V2DF_FTYPE_V2DF_DI:
   10366     case V2DF_FTYPE_V2DF_SI:
   10367     case V2DF_FTYPE_V2DF_UINT:
   10368     case V2SF_FTYPE_V2SF_V2SF:
   10369     case V1DI_FTYPE_V1DI_V1DI:
   10370     case V1DI_FTYPE_V8QI_V8QI:
   10371     case V1DI_FTYPE_V2SI_V2SI:
   10372     case V32QI_FTYPE_V16HI_V16HI:
   10373     case V16HI_FTYPE_V8SI_V8SI:
   10374     case V64QI_FTYPE_V64QI_V64QI:
   10375     case V32QI_FTYPE_V32QI_V32QI:
   10376     case V16HI_FTYPE_V32QI_V32QI:
   10377     case V16HI_FTYPE_V16HI_V16HI:
   10378     case V8SI_FTYPE_V4DF_V4DF:
   10379     case V8SI_FTYPE_V8SI_V8SI:
   10380     case V8SI_FTYPE_V16HI_V16HI:
   10381     case V4DI_FTYPE_V4DI_V4DI:
   10382     case V4DI_FTYPE_V8SI_V8SI:
   10383     case V8DI_FTYPE_V64QI_V64QI:
   10384       if (comparison == UNKNOWN)
   10385 	return ix86_expand_binop_builtin (icode, exp, target);
   10386       nargs = 2;
   10387       break;
   10388     case V4SF_FTYPE_V4SF_V4SF_SWAP:
   10389     case V2DF_FTYPE_V2DF_V2DF_SWAP:
   10390       gcc_assert (comparison != UNKNOWN);
   10391       nargs = 2;
   10392       swap = true;
   10393       break;
   10394     case V16HI_FTYPE_V16HI_V8HI_COUNT:
   10395     case V16HI_FTYPE_V16HI_SI_COUNT:
   10396     case V8SI_FTYPE_V8SI_V4SI_COUNT:
   10397     case V8SI_FTYPE_V8SI_SI_COUNT:
   10398     case V4DI_FTYPE_V4DI_V2DI_COUNT:
   10399     case V4DI_FTYPE_V4DI_INT_COUNT:
   10400     case V8HI_FTYPE_V8HI_V8HI_COUNT:
   10401     case V8HI_FTYPE_V8HI_SI_COUNT:
   10402     case V4SI_FTYPE_V4SI_V4SI_COUNT:
   10403     case V4SI_FTYPE_V4SI_SI_COUNT:
   10404     case V4HI_FTYPE_V4HI_V4HI_COUNT:
   10405     case V4HI_FTYPE_V4HI_SI_COUNT:
   10406     case V2DI_FTYPE_V2DI_V2DI_COUNT:
   10407     case V2DI_FTYPE_V2DI_SI_COUNT:
   10408     case V2SI_FTYPE_V2SI_V2SI_COUNT:
   10409     case V2SI_FTYPE_V2SI_SI_COUNT:
   10410     case V1DI_FTYPE_V1DI_V1DI_COUNT:
   10411     case V1DI_FTYPE_V1DI_SI_COUNT:
   10412       nargs = 2;
   10413       second_arg_count = true;
   10414       break;
   10415     case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
   10416     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
   10417     case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
   10418     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
   10419     case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
   10420     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
   10421     case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
   10422     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
   10423     case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
   10424     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
   10425     case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
   10426     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
   10427     case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
   10428     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
   10429     case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
   10430     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
   10431     case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
   10432     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
   10433       nargs = 4;
   10434       second_arg_count = true;
   10435       break;
   10436     case UINT64_FTYPE_UINT64_UINT64:
   10437     case UINT_FTYPE_UINT_UINT:
   10438     case UINT_FTYPE_UINT_USHORT:
   10439     case UINT_FTYPE_UINT_UCHAR:
   10440     case UINT16_FTYPE_UINT16_INT:
   10441     case UINT8_FTYPE_UINT8_INT:
   10442     case UQI_FTYPE_UQI_UQI:
   10443     case UHI_FTYPE_UHI_UHI:
   10444     case USI_FTYPE_USI_USI:
   10445     case UDI_FTYPE_UDI_UDI:
   10446     case V16SI_FTYPE_V8DF_V8DF:
   10447     case V32HI_FTYPE_V16SF_V16SF:
   10448     case V16HI_FTYPE_V8SF_V8SF:
   10449     case V8HI_FTYPE_V4SF_V4SF:
   10450     case V16HI_FTYPE_V16SF_UHI:
   10451     case V8HI_FTYPE_V8SF_UQI:
   10452     case V8HI_FTYPE_V4SF_UQI:
   10453       nargs = 2;
   10454       break;
   10455     case V2DI_FTYPE_V2DI_INT_CONVERT:
   10456       nargs = 2;
   10457       rmode = V1TImode;
   10458       nargs_constant = 1;
   10459       break;
   10460     case V4DI_FTYPE_V4DI_INT_CONVERT:
   10461       nargs = 2;
   10462       rmode = V2TImode;
   10463       nargs_constant = 1;
   10464       break;
   10465     case V8DI_FTYPE_V8DI_INT_CONVERT:
   10466       nargs = 2;
   10467       rmode = V4TImode;
   10468       nargs_constant = 1;
   10469       break;
   10470     case V8HI_FTYPE_V8HI_INT:
   10471     case V8HI_FTYPE_V8SF_INT:
   10472     case V16HI_FTYPE_V16SF_INT:
   10473     case V8HI_FTYPE_V4SF_INT:
   10474     case V8SF_FTYPE_V8SF_INT:
   10475     case V4SF_FTYPE_V16SF_INT:
   10476     case V16SF_FTYPE_V16SF_INT:
   10477     case V4SI_FTYPE_V4SI_INT:
   10478     case V4SI_FTYPE_V8SI_INT:
   10479     case V4HI_FTYPE_V4HI_INT:
   10480     case V4DF_FTYPE_V4DF_INT:
   10481     case V4DF_FTYPE_V8DF_INT:
   10482     case V4SF_FTYPE_V4SF_INT:
   10483     case V4SF_FTYPE_V8SF_INT:
   10484     case V2DI_FTYPE_V2DI_INT:
   10485     case V2DF_FTYPE_V2DF_INT:
   10486     case V2DF_FTYPE_V4DF_INT:
   10487     case V16HI_FTYPE_V16HI_INT:
   10488     case V8SI_FTYPE_V8SI_INT:
   10489     case V16SI_FTYPE_V16SI_INT:
   10490     case V4SI_FTYPE_V16SI_INT:
   10491     case V4DI_FTYPE_V4DI_INT:
   10492     case V2DI_FTYPE_V4DI_INT:
   10493     case V4DI_FTYPE_V8DI_INT:
   10494     case UQI_FTYPE_UQI_UQI_CONST:
   10495     case UHI_FTYPE_UHI_UQI:
   10496     case USI_FTYPE_USI_UQI:
   10497     case UDI_FTYPE_UDI_UQI:
   10498       nargs = 2;
   10499       nargs_constant = 1;
   10500       break;
   10501     case V16QI_FTYPE_V16QI_V16QI_V16QI:
   10502     case V8SF_FTYPE_V8SF_V8SF_V8SF:
   10503     case V4DF_FTYPE_V4DF_V4DF_V4DF:
   10504     case V4SF_FTYPE_V4SF_V4SF_V4SF:
   10505     case V2DF_FTYPE_V2DF_V2DF_V2DF:
   10506     case V32QI_FTYPE_V32QI_V32QI_V32QI:
   10507     case UHI_FTYPE_V16SI_V16SI_UHI:
   10508     case UQI_FTYPE_V8DI_V8DI_UQI:
   10509     case V16HI_FTYPE_V16SI_V16HI_UHI:
   10510     case V16QI_FTYPE_V16SI_V16QI_UHI:
   10511     case V16QI_FTYPE_V8DI_V16QI_UQI:
   10512     case V32HF_FTYPE_V32HF_V32HF_USI:
   10513     case V16SF_FTYPE_V16SF_V16SF_UHI:
   10514     case V16SF_FTYPE_V4SF_V16SF_UHI:
   10515     case V16SI_FTYPE_SI_V16SI_UHI:
   10516     case V16SI_FTYPE_V16HI_V16SI_UHI:
   10517     case V16SI_FTYPE_V16QI_V16SI_UHI:
   10518     case V8SF_FTYPE_V4SF_V8SF_UQI:
   10519     case V4DF_FTYPE_V2DF_V4DF_UQI:
   10520     case V8SI_FTYPE_V4SI_V8SI_UQI:
   10521     case V8SI_FTYPE_SI_V8SI_UQI:
   10522     case V4SI_FTYPE_V4SI_V4SI_UQI:
   10523     case V4SI_FTYPE_SI_V4SI_UQI:
   10524     case V4DI_FTYPE_V2DI_V4DI_UQI:
   10525     case V4DI_FTYPE_DI_V4DI_UQI:
   10526     case V2DI_FTYPE_V2DI_V2DI_UQI:
   10527     case V2DI_FTYPE_DI_V2DI_UQI:
   10528     case V64QI_FTYPE_V64QI_V64QI_UDI:
   10529     case V64QI_FTYPE_V16QI_V64QI_UDI:
   10530     case V64QI_FTYPE_QI_V64QI_UDI:
   10531     case V32QI_FTYPE_V32QI_V32QI_USI:
   10532     case V32QI_FTYPE_V16QI_V32QI_USI:
   10533     case V32QI_FTYPE_QI_V32QI_USI:
   10534     case V16QI_FTYPE_V16QI_V16QI_UHI:
   10535     case V16QI_FTYPE_QI_V16QI_UHI:
   10536     case V32HI_FTYPE_V8HI_V32HI_USI:
   10537     case V32HI_FTYPE_HI_V32HI_USI:
   10538     case V16HI_FTYPE_V8HI_V16HI_UHI:
   10539     case V16HI_FTYPE_HI_V16HI_UHI:
   10540     case V8HI_FTYPE_V8HI_V8HI_UQI:
   10541     case V8HI_FTYPE_HI_V8HI_UQI:
   10542     case V16HF_FTYPE_V16HF_V16HF_UHI:
   10543     case V8SF_FTYPE_V8HI_V8SF_UQI:
   10544     case V4SF_FTYPE_V8HI_V4SF_UQI:
   10545     case V8SI_FTYPE_V8HF_V8SI_UQI:
   10546     case V8SF_FTYPE_V8HF_V8SF_UQI:
   10547     case V8SI_FTYPE_V8SF_V8SI_UQI:
   10548     case V4SI_FTYPE_V4SF_V4SI_UQI:
   10549     case V4SI_FTYPE_V8HF_V4SI_UQI:
   10550     case V4SF_FTYPE_V8HF_V4SF_UQI:
   10551     case V4DI_FTYPE_V8HF_V4DI_UQI:
   10552     case V4DI_FTYPE_V4SF_V4DI_UQI:
   10553     case V2DI_FTYPE_V8HF_V2DI_UQI:
   10554     case V2DI_FTYPE_V4SF_V2DI_UQI:
   10555     case V8HF_FTYPE_V8HF_V8HF_UQI:
   10556     case V8HF_FTYPE_V8HF_V8HF_V8HF:
   10557     case V8HF_FTYPE_V8HI_V8HF_UQI:
   10558     case V8HF_FTYPE_V8SI_V8HF_UQI:
   10559     case V8HF_FTYPE_V8SF_V8HF_UQI:
   10560     case V8HF_FTYPE_V4SI_V8HF_UQI:
   10561     case V8HF_FTYPE_V4SF_V8HF_UQI:
   10562     case V8HF_FTYPE_V4DI_V8HF_UQI:
   10563     case V8HF_FTYPE_V4DF_V8HF_UQI:
   10564     case V8HF_FTYPE_V2DI_V8HF_UQI:
   10565     case V8HF_FTYPE_V2DF_V8HF_UQI:
   10566     case V4SF_FTYPE_V4DI_V4SF_UQI:
   10567     case V4SF_FTYPE_V2DI_V4SF_UQI:
   10568     case V4DF_FTYPE_V4DI_V4DF_UQI:
   10569     case V4DF_FTYPE_V8HF_V4DF_UQI:
   10570     case V2DF_FTYPE_V8HF_V2DF_UQI:
   10571     case V2DF_FTYPE_V2DI_V2DF_UQI:
   10572     case V16QI_FTYPE_V8HI_V16QI_UQI:
   10573     case V16QI_FTYPE_V16HI_V16QI_UHI:
   10574     case V16QI_FTYPE_V4SI_V16QI_UQI:
   10575     case V16QI_FTYPE_V8SI_V16QI_UQI:
   10576     case V8HI_FTYPE_V8HF_V8HI_UQI:
   10577     case V8HI_FTYPE_V4SI_V8HI_UQI:
   10578     case V8HI_FTYPE_V8SI_V8HI_UQI:
   10579     case V16QI_FTYPE_V2DI_V16QI_UQI:
   10580     case V16QI_FTYPE_V4DI_V16QI_UQI:
   10581     case V8HI_FTYPE_V2DI_V8HI_UQI:
   10582     case V8HI_FTYPE_V4DI_V8HI_UQI:
   10583     case V4SI_FTYPE_V2DI_V4SI_UQI:
   10584     case V4SI_FTYPE_V4DI_V4SI_UQI:
   10585     case V32QI_FTYPE_V32HI_V32QI_USI:
   10586     case UHI_FTYPE_V16QI_V16QI_UHI:
   10587     case USI_FTYPE_V32QI_V32QI_USI:
   10588     case UDI_FTYPE_V64QI_V64QI_UDI:
   10589     case UQI_FTYPE_V8HI_V8HI_UQI:
   10590     case UHI_FTYPE_V16HI_V16HI_UHI:
   10591     case USI_FTYPE_V32HI_V32HI_USI:
   10592     case UQI_FTYPE_V4SI_V4SI_UQI:
   10593     case UQI_FTYPE_V8SI_V8SI_UQI:
   10594     case UQI_FTYPE_V2DI_V2DI_UQI:
   10595     case UQI_FTYPE_V4DI_V4DI_UQI:
   10596     case V4SF_FTYPE_V2DF_V4SF_UQI:
   10597     case V4SF_FTYPE_V4DF_V4SF_UQI:
   10598     case V16SI_FTYPE_V16SI_V16SI_UHI:
   10599     case V16SI_FTYPE_V4SI_V16SI_UHI:
   10600     case V2DI_FTYPE_V4SI_V2DI_UQI:
   10601     case V2DI_FTYPE_V8HI_V2DI_UQI:
   10602     case V2DI_FTYPE_V16QI_V2DI_UQI:
   10603     case V4DI_FTYPE_V4DI_V4DI_UQI:
   10604     case V4DI_FTYPE_V4SI_V4DI_UQI:
   10605     case V4DI_FTYPE_V8HI_V4DI_UQI:
   10606     case V4DI_FTYPE_V16QI_V4DI_UQI:
   10607     case V4DI_FTYPE_V4DF_V4DI_UQI:
   10608     case V2DI_FTYPE_V2DF_V2DI_UQI:
   10609     case V4SI_FTYPE_V4DF_V4SI_UQI:
   10610     case V4SI_FTYPE_V2DF_V4SI_UQI:
   10611     case V4SI_FTYPE_V8HI_V4SI_UQI:
   10612     case V4SI_FTYPE_V16QI_V4SI_UQI:
   10613     case V4DI_FTYPE_V4DI_V4DI_V4DI:
   10614     case V8DF_FTYPE_V2DF_V8DF_UQI:
   10615     case V8DF_FTYPE_V4DF_V8DF_UQI:
   10616     case V8DF_FTYPE_V8DF_V8DF_UQI:
   10617     case V8SF_FTYPE_V8SF_V8SF_UQI:
   10618     case V8SF_FTYPE_V8SI_V8SF_UQI:
   10619     case V4DF_FTYPE_V4DF_V4DF_UQI:
   10620     case V4SF_FTYPE_V4SF_V4SF_UQI:
   10621     case V2DF_FTYPE_V2DF_V2DF_UQI:
   10622     case V2DF_FTYPE_V4SF_V2DF_UQI:
   10623     case V2DF_FTYPE_V4SI_V2DF_UQI:
   10624     case V4SF_FTYPE_V4SI_V4SF_UQI:
   10625     case V4DF_FTYPE_V4SF_V4DF_UQI:
   10626     case V4DF_FTYPE_V4SI_V4DF_UQI:
   10627     case V8SI_FTYPE_V8SI_V8SI_UQI:
   10628     case V8SI_FTYPE_V8HI_V8SI_UQI:
   10629     case V8SI_FTYPE_V16QI_V8SI_UQI:
   10630     case V8DF_FTYPE_V8SI_V8DF_UQI:
   10631     case V8DI_FTYPE_DI_V8DI_UQI:
   10632     case V16SF_FTYPE_V8SF_V16SF_UHI:
   10633     case V16SI_FTYPE_V8SI_V16SI_UHI:
   10634     case V16HF_FTYPE_V16HI_V16HF_UHI:
   10635     case V16HF_FTYPE_V16HF_V16HF_V16HF:
   10636     case V16HI_FTYPE_V16HF_V16HI_UHI:
   10637     case V16HI_FTYPE_V16HI_V16HI_UHI:
   10638     case V8HI_FTYPE_V16QI_V8HI_UQI:
   10639     case V16HI_FTYPE_V16QI_V16HI_UHI:
   10640     case V32HI_FTYPE_V32HI_V32HI_USI:
   10641     case V32HI_FTYPE_V32QI_V32HI_USI:
   10642     case V8DI_FTYPE_V16QI_V8DI_UQI:
   10643     case V8DI_FTYPE_V2DI_V8DI_UQI:
   10644     case V8DI_FTYPE_V4DI_V8DI_UQI:
   10645     case V8DI_FTYPE_V8DI_V8DI_UQI:
   10646     case V8DI_FTYPE_V8HI_V8DI_UQI:
   10647     case V8DI_FTYPE_V8SI_V8DI_UQI:
   10648     case V8HI_FTYPE_V8DI_V8HI_UQI:
   10649     case V8SI_FTYPE_V8DI_V8SI_UQI:
   10650     case V4SI_FTYPE_V4SI_V4SI_V4SI:
   10651     case V16SI_FTYPE_V16SI_V16SI_V16SI:
   10652     case V8DI_FTYPE_V8DI_V8DI_V8DI:
   10653     case V32HI_FTYPE_V32HI_V32HI_V32HI:
   10654     case V2DI_FTYPE_V2DI_V2DI_V2DI:
   10655     case V16HI_FTYPE_V16HI_V16HI_V16HI:
   10656     case V8SI_FTYPE_V8SI_V8SI_V8SI:
   10657     case V8HI_FTYPE_V8HI_V8HI_V8HI:
   10658     case V32HI_FTYPE_V16SF_V16SF_USI:
   10659     case V16HI_FTYPE_V8SF_V8SF_UHI:
   10660     case V8HI_FTYPE_V4SF_V4SF_UQI:
   10661     case V16HI_FTYPE_V16SF_V16HI_UHI:
   10662     case V8HI_FTYPE_V8SF_V8HI_UQI:
   10663     case V8HI_FTYPE_V4SF_V8HI_UQI:
   10664     case V16SF_FTYPE_V16SF_V32HI_V32HI:
   10665     case V8SF_FTYPE_V8SF_V16HI_V16HI:
   10666     case V4SF_FTYPE_V4SF_V8HI_V8HI:
   10667       nargs = 3;
   10668       break;
   10669     case V32QI_FTYPE_V32QI_V32QI_INT:
   10670     case V16HI_FTYPE_V16HI_V16HI_INT:
   10671     case V16QI_FTYPE_V16QI_V16QI_INT:
   10672     case V4DI_FTYPE_V4DI_V4DI_INT:
   10673     case V8HI_FTYPE_V8HI_V8HI_INT:
   10674     case V8SI_FTYPE_V8SI_V8SI_INT:
   10675     case V8SI_FTYPE_V8SI_V4SI_INT:
   10676     case V8SF_FTYPE_V8SF_V8SF_INT:
   10677     case V8SF_FTYPE_V8SF_V4SF_INT:
   10678     case V4SI_FTYPE_V4SI_V4SI_INT:
   10679     case V4DF_FTYPE_V4DF_V4DF_INT:
   10680     case V16SF_FTYPE_V16SF_V16SF_INT:
   10681     case V16SF_FTYPE_V16SF_V4SF_INT:
   10682     case V16SI_FTYPE_V16SI_V4SI_INT:
   10683     case V4DF_FTYPE_V4DF_V2DF_INT:
   10684     case V4SF_FTYPE_V4SF_V4SF_INT:
   10685     case V2DI_FTYPE_V2DI_V2DI_INT:
   10686     case V4DI_FTYPE_V4DI_V2DI_INT:
   10687     case V2DF_FTYPE_V2DF_V2DF_INT:
   10688     case UQI_FTYPE_V8DI_V8UDI_INT:
   10689     case UQI_FTYPE_V8DF_V8DF_INT:
   10690     case UQI_FTYPE_V2DF_V2DF_INT:
   10691     case UQI_FTYPE_V4SF_V4SF_INT:
   10692     case UHI_FTYPE_V16SI_V16SI_INT:
   10693     case UHI_FTYPE_V16SF_V16SF_INT:
   10694     case V64QI_FTYPE_V64QI_V64QI_INT:
   10695     case V32HI_FTYPE_V32HI_V32HI_INT:
   10696     case V16SI_FTYPE_V16SI_V16SI_INT:
   10697     case V8DI_FTYPE_V8DI_V8DI_INT:
   10698       nargs = 3;
   10699       nargs_constant = 1;
   10700       break;
   10701     case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
   10702       nargs = 3;
   10703       rmode = V4DImode;
   10704       nargs_constant = 1;
   10705       break;
   10706     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
   10707       nargs = 3;
   10708       rmode = V2DImode;
   10709       nargs_constant = 1;
   10710       break;
   10711     case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
   10712       nargs = 3;
   10713       rmode = DImode;
   10714       nargs_constant = 1;
   10715       break;
   10716     case V2DI_FTYPE_V2DI_UINT_UINT:
   10717       nargs = 3;
   10718       nargs_constant = 2;
   10719       break;
   10720     case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
   10721       nargs = 3;
   10722       rmode = V8DImode;
   10723       nargs_constant = 1;
   10724       break;
   10725     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
   10726       nargs = 5;
   10727       rmode = V8DImode;
   10728       mask_pos = 2;
   10729       nargs_constant = 1;
   10730       break;
   10731     case QI_FTYPE_V8DF_INT_UQI:
   10732     case QI_FTYPE_V4DF_INT_UQI:
   10733     case QI_FTYPE_V2DF_INT_UQI:
   10734     case HI_FTYPE_V16SF_INT_UHI:
   10735     case QI_FTYPE_V8SF_INT_UQI:
   10736     case QI_FTYPE_V4SF_INT_UQI:
   10737     case QI_FTYPE_V8HF_INT_UQI:
   10738     case HI_FTYPE_V16HF_INT_UHI:
   10739     case SI_FTYPE_V32HF_INT_USI:
   10740     case V4SI_FTYPE_V4SI_V4SI_UHI:
   10741     case V8SI_FTYPE_V8SI_V8SI_UHI:
   10742       nargs = 3;
   10743       mask_pos = 1;
   10744       nargs_constant = 1;
   10745       break;
   10746     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
   10747       nargs = 5;
   10748       rmode = V4DImode;
   10749       mask_pos = 2;
   10750       nargs_constant = 1;
   10751       break;
   10752     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
   10753       nargs = 5;
   10754       rmode = V2DImode;
   10755       mask_pos = 2;
   10756       nargs_constant = 1;
   10757       break;
   10758     case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
   10759     case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
   10760     case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
   10761     case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
   10762     case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
   10763     case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
   10764     case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
   10765     case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
   10766     case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
   10767     case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
   10768     case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
   10769     case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
   10770     case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
   10771     case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
   10772     case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
   10773     case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
   10774     case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
   10775     case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
   10776     case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
   10777     case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
   10778     case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
   10779     case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
   10780     case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
   10781     case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
   10782     case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
   10783     case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
   10784     case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
   10785     case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
   10786     case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
   10787     case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
   10788     case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
   10789     case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
   10790     case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
   10791     case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
   10792     case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
   10793     case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
   10794     case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
   10795     case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
   10796     case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
   10797     case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
   10798     case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
   10799     case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
   10800     case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
   10801     case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
   10802     case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
   10803     case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
   10804     case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
   10805     case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
   10806     case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
   10807     case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
   10808     case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
   10809     case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
   10810     case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
   10811     case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
   10812     case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
   10813     case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
   10814     case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
   10815     case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
   10816       nargs = 4;
   10817       break;
   10818     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
   10819     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
   10820     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
   10821     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
   10822     case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
   10823       nargs = 4;
   10824       nargs_constant = 1;
   10825       break;
   10826     case UQI_FTYPE_V4DI_V4DI_INT_UQI:
   10827     case UQI_FTYPE_V8SI_V8SI_INT_UQI:
   10828     case QI_FTYPE_V4DF_V4DF_INT_UQI:
   10829     case QI_FTYPE_V8SF_V8SF_INT_UQI:
   10830     case UHI_FTYPE_V16HF_V16HF_INT_UHI:
   10831     case UQI_FTYPE_V2DI_V2DI_INT_UQI:
   10832     case UQI_FTYPE_V4SI_V4SI_INT_UQI:
   10833     case UQI_FTYPE_V2DF_V2DF_INT_UQI:
   10834     case UQI_FTYPE_V4SF_V4SF_INT_UQI:
   10835     case UQI_FTYPE_V8HF_V8HF_INT_UQI:
   10836     case UDI_FTYPE_V64QI_V64QI_INT_UDI:
   10837     case USI_FTYPE_V32QI_V32QI_INT_USI:
   10838     case UHI_FTYPE_V16QI_V16QI_INT_UHI:
   10839     case USI_FTYPE_V32HI_V32HI_INT_USI:
   10840     case USI_FTYPE_V32HF_V32HF_INT_USI:
   10841     case UHI_FTYPE_V16HI_V16HI_INT_UHI:
   10842     case UQI_FTYPE_V8HI_V8HI_INT_UQI:
   10843       nargs = 4;
   10844       mask_pos = 1;
   10845       nargs_constant = 1;
   10846       break;
   10847     case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
   10848       nargs = 4;
   10849       nargs_constant = 2;
   10850       break;
   10851     case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
   10852     case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
   10853     case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
   10854     case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
   10855     case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
   10856       nargs = 4;
   10857       break;
   10858     case UQI_FTYPE_V8DI_V8DI_INT_UQI:
   10859     case UHI_FTYPE_V16SI_V16SI_INT_UHI:
   10860       mask_pos = 1;
   10861       nargs = 4;
   10862       nargs_constant = 1;
   10863       break;
   10864     case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
   10865     case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
   10866     case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
   10867     case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
   10868     case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
   10869     case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
   10870     case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
   10871     case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
   10872     case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
   10873     case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
   10874     case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
   10875     case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
   10876     case V32HI_FTYPE_V32HI_INT_V32HI_USI:
   10877     case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
   10878     case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
   10879     case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
   10880     case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
   10881     case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
   10882     case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
   10883     case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
   10884     case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
   10885     case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
   10886     case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
   10887     case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
   10888     case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
   10889     case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
   10890     case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
   10891     case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
   10892     case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
   10893     case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
   10894     case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
   10895     case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
   10896       nargs = 4;
   10897       mask_pos = 2;
   10898       nargs_constant = 1;
   10899       break;
   10900     case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
   10901     case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
   10902     case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
   10903     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
   10904     case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
   10905     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
   10906     case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
   10907     case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
   10908     case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
   10909     case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
   10910     case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
   10911     case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
   10912     case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
   10913     case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
   10914     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
   10915     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
   10916     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
   10917     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
   10918     case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
   10919     case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
   10920     case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
   10921     case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
   10922     case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
   10923     case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
   10924     case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
   10925     case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
   10926     case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
   10927       nargs = 5;
   10928       mask_pos = 2;
   10929       nargs_constant = 1;
   10930       break;
   10931     case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
   10932     case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
   10933     case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
   10934     case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
   10935     case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
   10936     case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
   10937     case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
   10938     case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
   10939     case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
   10940     case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
   10941       nargs = 5;
   10942       mask_pos = 1;
   10943       nargs_constant = 1;
   10944       break;
   10945     case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
   10946     case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
   10947     case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
   10948     case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
   10949     case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
   10950     case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
   10951     case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
   10952     case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
   10953     case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
   10954     case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
   10955     case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
   10956     case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
   10957       nargs = 5;
   10958       mask_pos = 1;
   10959       nargs_constant = 2;
   10960       break;
   10961 
   10962     default:
   10963       gcc_unreachable ();
   10964     }
   10965 
   10966   gcc_assert (nargs <= ARRAY_SIZE (xops));
   10967 
   10968   if (comparison != UNKNOWN)
   10969     {
   10970       gcc_assert (nargs == 2);
   10971       return ix86_expand_sse_compare (d, exp, target, swap);
   10972     }
   10973 
   10974   if (rmode == VOIDmode || rmode == tmode)
   10975     {
   10976       if (optimize
   10977 	  || target == 0
   10978 	  || GET_MODE (target) != tmode
   10979 	  || !insn_p->operand[0].predicate (target, tmode))
   10980 	target = gen_reg_rtx (tmode);
   10981       else if (memory_operand (target, tmode))
   10982 	num_memory++;
   10983       real_target = target;
   10984     }
   10985   else
   10986     {
   10987       real_target = gen_reg_rtx (tmode);
   10988       target = lowpart_subreg (rmode, real_target, tmode);
   10989     }
   10990 
   10991   for (i = 0; i < nargs; i++)
   10992     {
   10993       tree arg = CALL_EXPR_ARG (exp, i);
   10994       rtx op = expand_normal (arg);
   10995       machine_mode mode = insn_p->operand[i + 1].mode;
   10996       bool match = insn_p->operand[i + 1].predicate (op, mode);
   10997 
   10998       if (second_arg_count && i == 1)
   10999 	{
   11000 	  /* SIMD shift insns take either an 8-bit immediate or
   11001 	     register as count.  But builtin functions take int as
   11002 	     count.  If count doesn't match, we put it in register.
   11003 	     The instructions are using 64-bit count, if op is just
   11004 	     32-bit, zero-extend it, as negative shift counts
   11005 	     are undefined behavior and zero-extension is more
   11006 	     efficient.  */
   11007 	  if (!match)
   11008 	    {
   11009 	      if (SCALAR_INT_MODE_P (GET_MODE (op)))
   11010 		op = convert_modes (mode, GET_MODE (op), op, 1);
   11011 	      else
   11012 		op = lowpart_subreg (mode, op, GET_MODE (op));
   11013 	      if (!insn_p->operand[i + 1].predicate (op, mode))
   11014 		op = copy_to_reg (op);
   11015 	    }
   11016 	}
   11017       else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
   11018 	       (!mask_pos && (nargs - i) <= nargs_constant))
   11019 	{
   11020 	  if (!match)
   11021 	    switch (icode)
   11022 	      {
   11023 	      case CODE_FOR_avx_vinsertf128v4di:
   11024 	      case CODE_FOR_avx_vextractf128v4di:
   11025 		error ("the last argument must be an 1-bit immediate");
   11026 		return const0_rtx;
   11027 
   11028 	      case CODE_FOR_avx512f_cmpv8di3_mask:
   11029 	      case CODE_FOR_avx512f_cmpv16si3_mask:
   11030 	      case CODE_FOR_avx512f_ucmpv8di3_mask:
   11031 	      case CODE_FOR_avx512f_ucmpv16si3_mask:
   11032 	      case CODE_FOR_avx512vl_cmpv4di3_mask:
   11033 	      case CODE_FOR_avx512vl_cmpv8si3_mask:
   11034 	      case CODE_FOR_avx512vl_ucmpv4di3_mask:
   11035 	      case CODE_FOR_avx512vl_ucmpv8si3_mask:
   11036 	      case CODE_FOR_avx512vl_cmpv2di3_mask:
   11037 	      case CODE_FOR_avx512vl_cmpv4si3_mask:
   11038 	      case CODE_FOR_avx512vl_ucmpv2di3_mask:
   11039 	      case CODE_FOR_avx512vl_ucmpv4si3_mask:
   11040 		error ("the last argument must be a 3-bit immediate");
   11041 		return const0_rtx;
   11042 
   11043 	      case CODE_FOR_sse4_1_roundsd:
   11044 	      case CODE_FOR_sse4_1_roundss:
   11045 
   11046 	      case CODE_FOR_sse4_1_roundpd:
   11047 	      case CODE_FOR_sse4_1_roundps:
   11048 	      case CODE_FOR_avx_roundpd256:
   11049 	      case CODE_FOR_avx_roundps256:
   11050 
   11051 	      case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
   11052 	      case CODE_FOR_sse4_1_roundps_sfix:
   11053 	      case CODE_FOR_avx_roundpd_vec_pack_sfix256:
   11054 	      case CODE_FOR_avx_roundps_sfix256:
   11055 
   11056 	      case CODE_FOR_sse4_1_blendps:
   11057 	      case CODE_FOR_avx_blendpd256:
   11058 	      case CODE_FOR_avx_vpermilv4df:
   11059 	      case CODE_FOR_avx_vpermilv4df_mask:
   11060 	      case CODE_FOR_avx512f_getmantv8df_mask:
   11061 	      case CODE_FOR_avx512f_getmantv16sf_mask:
   11062 	      case CODE_FOR_avx512vl_getmantv16hf_mask:
   11063 	      case CODE_FOR_avx512vl_getmantv8sf_mask:
   11064 	      case CODE_FOR_avx512vl_getmantv4df_mask:
   11065 	      case CODE_FOR_avx512fp16_getmantv8hf_mask:
   11066 	      case CODE_FOR_avx512vl_getmantv4sf_mask:
   11067 	      case CODE_FOR_avx512vl_getmantv2df_mask:
   11068 	      case CODE_FOR_avx512dq_rangepv8df_mask_round:
   11069 	      case CODE_FOR_avx512dq_rangepv16sf_mask_round:
   11070 	      case CODE_FOR_avx512dq_rangepv4df_mask:
   11071 	      case CODE_FOR_avx512dq_rangepv8sf_mask:
   11072 	      case CODE_FOR_avx512dq_rangepv2df_mask:
   11073 	      case CODE_FOR_avx512dq_rangepv4sf_mask:
   11074 	      case CODE_FOR_avx_shufpd256_mask:
   11075 		error ("the last argument must be a 4-bit immediate");
   11076 		return const0_rtx;
   11077 
   11078 	      case CODE_FOR_sha1rnds4:
   11079 	      case CODE_FOR_sse4_1_blendpd:
   11080 	      case CODE_FOR_avx_vpermilv2df:
   11081 	      case CODE_FOR_avx_vpermilv2df_mask:
   11082 	      case CODE_FOR_xop_vpermil2v2df3:
   11083 	      case CODE_FOR_xop_vpermil2v4sf3:
   11084 	      case CODE_FOR_xop_vpermil2v4df3:
   11085 	      case CODE_FOR_xop_vpermil2v8sf3:
   11086 	      case CODE_FOR_avx512f_vinsertf32x4_mask:
   11087 	      case CODE_FOR_avx512f_vinserti32x4_mask:
   11088 	      case CODE_FOR_avx512f_vextractf32x4_mask:
   11089 	      case CODE_FOR_avx512f_vextracti32x4_mask:
   11090 	      case CODE_FOR_sse2_shufpd:
   11091 	      case CODE_FOR_sse2_shufpd_mask:
   11092 	      case CODE_FOR_avx512dq_shuf_f64x2_mask:
   11093 	      case CODE_FOR_avx512dq_shuf_i64x2_mask:
   11094 	      case CODE_FOR_avx512vl_shuf_i32x4_mask:
   11095 	      case CODE_FOR_avx512vl_shuf_f32x4_mask:
   11096 		error ("the last argument must be a 2-bit immediate");
   11097 		return const0_rtx;
   11098 
   11099 	      case CODE_FOR_avx_vextractf128v4df:
   11100 	      case CODE_FOR_avx_vextractf128v8sf:
   11101 	      case CODE_FOR_avx_vextractf128v8si:
   11102 	      case CODE_FOR_avx_vinsertf128v4df:
   11103 	      case CODE_FOR_avx_vinsertf128v8sf:
   11104 	      case CODE_FOR_avx_vinsertf128v8si:
   11105 	      case CODE_FOR_avx512f_vinsertf64x4_mask:
   11106 	      case CODE_FOR_avx512f_vinserti64x4_mask:
   11107 	      case CODE_FOR_avx512f_vextractf64x4_mask:
   11108 	      case CODE_FOR_avx512f_vextracti64x4_mask:
   11109 	      case CODE_FOR_avx512dq_vinsertf32x8_mask:
   11110 	      case CODE_FOR_avx512dq_vinserti32x8_mask:
   11111 	      case CODE_FOR_avx512vl_vinsertv4df:
   11112 	      case CODE_FOR_avx512vl_vinsertv4di:
   11113 	      case CODE_FOR_avx512vl_vinsertv8sf:
   11114 	      case CODE_FOR_avx512vl_vinsertv8si:
   11115 		error ("the last argument must be a 1-bit immediate");
   11116 		return const0_rtx;
   11117 
   11118 	      case CODE_FOR_avx_vmcmpv2df3:
   11119 	      case CODE_FOR_avx_vmcmpv4sf3:
   11120 	      case CODE_FOR_avx_cmpv2df3:
   11121 	      case CODE_FOR_avx_cmpv4sf3:
   11122 	      case CODE_FOR_avx_cmpv4df3:
   11123 	      case CODE_FOR_avx_cmpv8sf3:
   11124 	      case CODE_FOR_avx512f_cmpv8df3_mask:
   11125 	      case CODE_FOR_avx512f_cmpv16sf3_mask:
   11126 	      case CODE_FOR_avx512f_vmcmpv2df3_mask:
   11127 	      case CODE_FOR_avx512f_vmcmpv4sf3_mask:
   11128 	      case CODE_FOR_avx512bw_cmpv32hf3_mask:
   11129 	      case CODE_FOR_avx512vl_cmpv16hf3_mask:
   11130 	      case CODE_FOR_avx512fp16_cmpv8hf3_mask:
   11131 		error ("the last argument must be a 5-bit immediate");
   11132 		return const0_rtx;
   11133 
   11134 	      default:
   11135 		switch (nargs_constant)
   11136 		  {
   11137 		  case 2:
   11138 		    if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
   11139 			(!mask_pos && (nargs - i) == nargs_constant))
   11140 		      {
   11141 			error ("the next to last argument must be an 8-bit immediate");
   11142 			break;
   11143 		      }
   11144 		    /* FALLTHRU */
   11145 		  case 1:
   11146 		    error ("the last argument must be an 8-bit immediate");
   11147 		    break;
   11148 		  default:
   11149 		    gcc_unreachable ();
   11150 		  }
   11151 		return const0_rtx;
   11152 	      }
   11153 	}
   11154       else
   11155 	{
   11156 	  if (VECTOR_MODE_P (mode))
   11157 	    op = safe_vector_operand (op, mode);
   11158 
   11159 	  /* If we aren't optimizing, only allow one memory operand to
   11160 	     be generated.  */
   11161 	  if (memory_operand (op, mode))
   11162 	    num_memory++;
   11163 
   11164 	  op = fixup_modeless_constant (op, mode);
   11165 
   11166 	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   11167 	    {
   11168 	      if (optimize || !match || num_memory > 1)
   11169 		op = copy_to_mode_reg (mode, op);
   11170 	    }
   11171 	  else
   11172 	    {
   11173 	      op = copy_to_reg (op);
   11174 	      op = lowpart_subreg (mode, op, GET_MODE (op));
   11175 	    }
   11176 	}
   11177 
   11178       xops[i] = op;
   11179     }
   11180 
   11181   switch (nargs)
   11182     {
   11183     case 1:
   11184       pat = GEN_FCN (icode) (real_target, xops[0]);
   11185       break;
   11186     case 2:
   11187       pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
   11188       break;
   11189     case 3:
   11190       pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
   11191       break;
   11192     case 4:
   11193       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   11194 			     xops[2], xops[3]);
   11195       break;
   11196     case 5:
   11197       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   11198 			     xops[2], xops[3], xops[4]);
   11199       break;
   11200     case 6:
   11201       pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
   11202 			     xops[2], xops[3], xops[4], xops[5]);
   11203       break;
   11204     default:
   11205       gcc_unreachable ();
   11206     }
   11207 
   11208   if (! pat)
   11209     return 0;
   11210 
   11211   emit_insn (pat);
   11212   return target;
   11213 }
   11214 
   11215 /* Transform pattern of following layout:
   11216      (set A
   11217        (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
   11218      )
   11219    into:
   11220      (set (A B)) */
   11221 
   11222 static rtx
   11223 ix86_erase_embedded_rounding (rtx pat)
   11224 {
   11225   if (GET_CODE (pat) == INSN)
   11226     pat = PATTERN (pat);
   11227 
   11228   gcc_assert (GET_CODE (pat) == SET);
   11229   rtx src = SET_SRC (pat);
   11230   gcc_assert (XVECLEN (src, 0) == 2);
   11231   rtx p0 = XVECEXP (src, 0, 0);
   11232   gcc_assert (GET_CODE (src) == UNSPEC
   11233 	      && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
   11234   rtx res = gen_rtx_SET (SET_DEST (pat), p0);
   11235   return res;
   11236 }
   11237 
   11238 /* Subroutine of ix86_expand_round_builtin to take care of comi insns
   11239    with rounding.  */
static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  /* Operands: the two scalar-FP vector operands, the _CMP_* predicate
     immediate and the rounding-control immediate.  */
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  The three tables are indexed by the
     _CMP_* predicate immediate (0..31): the RTL comparison code, whether
     the predicate is an ordered one, and whether it is the quiet
     (non-signaling) variant.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false,
      true,  true,  true,  false, false, false, false, true,
      false, false, false, true,  true,  true,  true,  false
    };
  static const bool non_signalings[32] =
    {
      true,  false, false, true,  true,  false, false, true,
      true,  false, false, true,  true,  false, false, true,
      false, true,  true,  false, false, true,  true,  false,
      false, true,  true,  false, false, true,  true,  false
    };

  /* The predicate must be a constant in [0, 32) so it can index the
     tables above.  */
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  /* Remap the requested comparison onto a CC mode / comparison code the
     COMI/UCOMI flag outputs can express, possibly swapping the operands
     for mirrored predicates.  */
  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case ORDERED:
      if (!ordered)
	{
	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCSmode;
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
	  if (non_signaling)
	    ordered = false;
	  mode = CCPmode;
	}
      comparison = NE;
      break;
    case UNORDERED:
      if (ordered)
	{
	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
	  if (non_signaling)
	    ordered = false;
	  mode = CCSmode;
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCPmode;
	}
      comparison = EQ;
      break;

    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
    case UNGE:	/* -> UNLE  */
    case UNGT:	/* -> UNLT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
    case UNEQ:
    case UNLT:
    case UNLE:
    case LTGT:
      /* These are supported by CCFPmode.  NB: Use ordered/signaling
	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
	 with NAN operands.  */
      if (ordered == non_signaling)
	ordered = !ordered;
      break;
    case EQ:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
      check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
      gcc_assert (!ordered);
      check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  /* The result lives in a fresh SImode pseudo accessed as QImode;
     it is preset to CONST_VAL, the value to return when the unordered
     bypass branch below is taken.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /*
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.
   */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
	     ? CODE_FOR_sse_ucomi_round
	     : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      /* Without explicit rounding, strip the embedded-rounding UNSPEC
	 so the normal (non-rounding) pattern is used.  */
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
	return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      /* Jump over the flag test when PF signals unordered, leaving the
	 preset CONST_VAL in TARGET.  */
      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  /* Materialize the comparison result into the low byte of TARGET.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  /* Return the full SImode pseudo, not the QImode subreg.  */
  return SUBREG_REG (target);
}
   11462 
/* Subroutine of ix86_expand_builtin to expand builtins whose last
   argument is a rounding-control immediate.  D describes the builtin,
   EXP is the CALL_EXPR and TARGET a suggested place for the result.
   When the rounding argument is NO_ROUND the embedded-rounding UNSPEC
   is (usually) erased so the normal pattern form is emitted.  */

static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
			   tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  rtx xops[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  /* Number of trailing arguments (before the rounding immediate) that
     must be immediates; 0 means only the rounding operand is constant.  */
  unsigned int nargs_constant = 0;
  unsigned int redundant_embed_rnd = 0;

  /* Derive the argument count and immediate positions from the builtin's
     function type.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT64_FTYPE_V8HF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case UINT_FTYPE_V8HF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT64_FTYPE_V8HF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:
    case INT_FTYPE_V8HF_INT:
      nargs = 2;
      break;
    case V32HF_FTYPE_V32HF_V32HF_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT:
    case V8HF_FTYPE_V8HF_INT_INT:
    case V8HF_FTYPE_V8HF_UINT_INT:
    case V8HF_FTYPE_V8HF_INT64_INT:
    case V8HF_FTYPE_V8HF_UINT64_INT:
    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:
      nargs = 3;
      break;
    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V32HI_FTYPE_V32HF_V32HI_USI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
    case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
    case V32HF_FTYPE_V32HI_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
    case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
    case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
    case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
    case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
      nargs = 4;
      break;
    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;
      nargs = 4;
      break;
    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      /* COMI-style comparisons are handled separately.  */
      return ix86_expand_sse_comi_round (d, exp, target);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
    case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
      nargs = 5;
      break;
    case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
      nargs_constant = 4;
      nargs = 5;
      break;
    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
    case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
      nargs_constant = 3;
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
      nargs = 6;
      nargs_constant = 4;
      break;
    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs = 6;
      nargs_constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  /* Expand each call argument, validating the immediate operands and
     the trailing rounding operand.  */
  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
	{
	  /* First constant operand: on predicate failure, report the
	     immediate width this icode requires.  */
	  if (!match)
	    {
	      switch (icode)
		{
		case CODE_FOR_avx512f_getmantv8df_mask_round:
		case CODE_FOR_avx512f_getmantv16sf_mask_round:
		case CODE_FOR_avx512bw_getmantv32hf_mask_round:
		case CODE_FOR_avx512f_vgetmantv2df_round:
		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
		case CODE_FOR_avx512f_vgetmantv4sf_round:
		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
		case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
		  error ("the immediate argument must be a 4-bit immediate");
		  return const0_rtx;
		case CODE_FOR_avx512f_cmpv8df3_mask_round:
		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
		case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
		  error ("the immediate argument must be a 5-bit immediate");
		  return const0_rtx;
		default:
		  error ("the immediate argument must be an 8-bit immediate");
		  return const0_rtx;
		}
	    }
	}
      else if (i == nargs-1)
	{
	  /* Last operand: the rounding-control immediate.  */
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("incorrect rounding operand");
	      return const0_rtx;
	    }

	  /* If there is no rounding use normal version of the pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    {
	      /* Skip erasing embedded rounding for the expanders below,
		 which generate multiple insns.  In ix86_erase_embedded_rounding
		 the pattern would be transformed to a single set, and emit_insn
		 appends the set instead of inserting it into the chain, so the
		 insns emitted inside the define_expand would be ignored.  */
	      switch (icode)
		{
		case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
		  redundant_embed_rnd = 0;
		  break;
		default:
		  redundant_embed_rnd = 1;
		  break;
		}
	    }
	}
      else
	{
	  /* Ordinary operand: force it into a register of the expected
	     mode when the predicate does not already accept it.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }

  /* Emit the pattern with the number of operands this icode takes.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  /* Strip the now-redundant embedded-rounding UNSPEC when the builtin
     was invoked with NO_ROUND.  */
  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}
   11738 
   11739 /* Subroutine of ix86_expand_builtin to take care of special insns
   11740    with variable number of operands.  */
   11741 
   11742 static rtx
   11743 ix86_expand_special_args_builtin (const struct builtin_description *d,
   11744 				  tree exp, rtx target)
   11745 {
   11746   tree arg;
   11747   rtx pat, op;
   11748   unsigned int i, nargs, arg_adjust, memory;
   11749   bool aligned_mem = false;
   11750   rtx xops[3];
   11751   enum insn_code icode = d->icode;
   11752   const struct insn_data_d *insn_p = &insn_data[icode];
   11753   machine_mode tmode = insn_p->operand[0].mode;
   11754   enum { load, store } klass;
   11755 
   11756   switch ((enum ix86_builtin_func_type) d->flag)
   11757     {
   11758     case VOID_FTYPE_VOID:
   11759       emit_insn (GEN_FCN (icode) (target));
   11760       return 0;
   11761     case VOID_FTYPE_UINT64:
   11762     case VOID_FTYPE_UNSIGNED:
   11763       nargs = 0;
   11764       klass = store;
   11765       memory = 0;
   11766       break;
   11767 
   11768     case INT_FTYPE_VOID:
   11769     case USHORT_FTYPE_VOID:
   11770     case UINT64_FTYPE_VOID:
   11771     case UINT_FTYPE_VOID:
   11772     case UINT8_FTYPE_VOID:
   11773     case UNSIGNED_FTYPE_VOID:
   11774       nargs = 0;
   11775       klass = load;
   11776       memory = 0;
   11777       break;
   11778     case UINT64_FTYPE_PUNSIGNED:
   11779     case V2DI_FTYPE_PV2DI:
   11780     case V4DI_FTYPE_PV4DI:
   11781     case V32QI_FTYPE_PCCHAR:
   11782     case V16QI_FTYPE_PCCHAR:
   11783     case V8SF_FTYPE_PCV4SF:
   11784     case V8SF_FTYPE_PCFLOAT:
   11785     case V4SF_FTYPE_PCFLOAT:
   11786     case V4DF_FTYPE_PCV2DF:
   11787     case V4DF_FTYPE_PCDOUBLE:
   11788     case V2DF_FTYPE_PCDOUBLE:
   11789     case VOID_FTYPE_PVOID:
   11790     case V8DI_FTYPE_PV8DI:
   11791       nargs = 1;
   11792       klass = load;
   11793       memory = 0;
   11794       switch (icode)
   11795 	{
   11796 	case CODE_FOR_sse4_1_movntdqa:
   11797 	case CODE_FOR_avx2_movntdqa:
   11798 	case CODE_FOR_avx512f_movntdqa:
   11799 	  aligned_mem = true;
   11800 	  break;
   11801 	default:
   11802 	  break;
   11803 	}
   11804       break;
   11805     case VOID_FTYPE_PV2SF_V4SF:
   11806     case VOID_FTYPE_PV8DI_V8DI:
   11807     case VOID_FTYPE_PV4DI_V4DI:
   11808     case VOID_FTYPE_PV2DI_V2DI:
   11809     case VOID_FTYPE_PCHAR_V32QI:
   11810     case VOID_FTYPE_PCHAR_V16QI:
   11811     case VOID_FTYPE_PFLOAT_V16SF:
   11812     case VOID_FTYPE_PFLOAT_V8SF:
   11813     case VOID_FTYPE_PFLOAT_V4SF:
   11814     case VOID_FTYPE_PDOUBLE_V8DF:
   11815     case VOID_FTYPE_PDOUBLE_V4DF:
   11816     case VOID_FTYPE_PDOUBLE_V2DF:
   11817     case VOID_FTYPE_PLONGLONG_LONGLONG:
   11818     case VOID_FTYPE_PULONGLONG_ULONGLONG:
   11819     case VOID_FTYPE_PUNSIGNED_UNSIGNED:
   11820     case VOID_FTYPE_PINT_INT:
   11821       nargs = 1;
   11822       klass = store;
   11823       /* Reserve memory operand for target.  */
   11824       memory = ARRAY_SIZE (xops);
   11825       switch (icode)
   11826 	{
   11827 	/* These builtins and instructions require the memory
   11828 	   to be properly aligned.  */
   11829 	case CODE_FOR_avx_movntv4di:
   11830 	case CODE_FOR_sse2_movntv2di:
   11831 	case CODE_FOR_avx_movntv8sf:
   11832 	case CODE_FOR_sse_movntv4sf:
   11833 	case CODE_FOR_sse4a_vmmovntv4sf:
   11834 	case CODE_FOR_avx_movntv4df:
   11835 	case CODE_FOR_sse2_movntv2df:
   11836 	case CODE_FOR_sse4a_vmmovntv2df:
   11837 	case CODE_FOR_sse2_movntidi:
   11838 	case CODE_FOR_sse_movntq:
   11839 	case CODE_FOR_sse2_movntisi:
   11840 	case CODE_FOR_avx512f_movntv16sf:
   11841 	case CODE_FOR_avx512f_movntv8df:
   11842 	case CODE_FOR_avx512f_movntv8di:
   11843 	  aligned_mem = true;
   11844 	  break;
   11845 	default:
   11846 	  break;
   11847 	}
   11848       break;
   11849     case VOID_FTYPE_PVOID_PCVOID:
   11850 	nargs = 1;
   11851 	klass = store;
   11852 	memory = 0;
   11853 
   11854 	break;
   11855     case V4SF_FTYPE_V4SF_PCV2SF:
   11856     case V2DF_FTYPE_V2DF_PCDOUBLE:
   11857       nargs = 2;
   11858       klass = load;
   11859       memory = 1;
   11860       break;
   11861     case V8SF_FTYPE_PCV8SF_V8SI:
   11862     case V4DF_FTYPE_PCV4DF_V4DI:
   11863     case V4SF_FTYPE_PCV4SF_V4SI:
   11864     case V2DF_FTYPE_PCV2DF_V2DI:
   11865     case V8SI_FTYPE_PCV8SI_V8SI:
   11866     case V4DI_FTYPE_PCV4DI_V4DI:
   11867     case V4SI_FTYPE_PCV4SI_V4SI:
   11868     case V2DI_FTYPE_PCV2DI_V2DI:
   11869     case VOID_FTYPE_INT_INT64:
   11870       nargs = 2;
   11871       klass = load;
   11872       memory = 0;
   11873       break;
   11874     case VOID_FTYPE_PV8DF_V8DF_UQI:
   11875     case VOID_FTYPE_PV4DF_V4DF_UQI:
   11876     case VOID_FTYPE_PV2DF_V2DF_UQI:
   11877     case VOID_FTYPE_PV16SF_V16SF_UHI:
   11878     case VOID_FTYPE_PV8SF_V8SF_UQI:
   11879     case VOID_FTYPE_PV4SF_V4SF_UQI:
   11880     case VOID_FTYPE_PV8DI_V8DI_UQI:
   11881     case VOID_FTYPE_PV4DI_V4DI_UQI:
   11882     case VOID_FTYPE_PV2DI_V2DI_UQI:
   11883     case VOID_FTYPE_PV16SI_V16SI_UHI:
   11884     case VOID_FTYPE_PV8SI_V8SI_UQI:
   11885     case VOID_FTYPE_PV4SI_V4SI_UQI:
   11886     case VOID_FTYPE_PV64QI_V64QI_UDI:
   11887     case VOID_FTYPE_PV32HI_V32HI_USI:
   11888     case VOID_FTYPE_PV32QI_V32QI_USI:
   11889     case VOID_FTYPE_PV16QI_V16QI_UHI:
   11890     case VOID_FTYPE_PV16HI_V16HI_UHI:
   11891     case VOID_FTYPE_PV8HI_V8HI_UQI:
   11892       switch (icode)
   11893 	{
   11894 	/* These builtins and instructions require the memory
   11895 	   to be properly aligned.  */
   11896 	case CODE_FOR_avx512f_storev16sf_mask:
   11897 	case CODE_FOR_avx512f_storev16si_mask:
   11898 	case CODE_FOR_avx512f_storev8df_mask:
   11899 	case CODE_FOR_avx512f_storev8di_mask:
   11900 	case CODE_FOR_avx512vl_storev8sf_mask:
   11901 	case CODE_FOR_avx512vl_storev8si_mask:
   11902 	case CODE_FOR_avx512vl_storev4df_mask:
   11903 	case CODE_FOR_avx512vl_storev4di_mask:
   11904 	case CODE_FOR_avx512vl_storev4sf_mask:
   11905 	case CODE_FOR_avx512vl_storev4si_mask:
   11906 	case CODE_FOR_avx512vl_storev2df_mask:
   11907 	case CODE_FOR_avx512vl_storev2di_mask:
   11908 	  aligned_mem = true;
   11909 	  break;
   11910 	default:
   11911 	  break;
   11912 	}
   11913       /* FALLTHRU */
   11914     case VOID_FTYPE_PV8SF_V8SI_V8SF:
   11915     case VOID_FTYPE_PV4DF_V4DI_V4DF:
   11916     case VOID_FTYPE_PV4SF_V4SI_V4SF:
   11917     case VOID_FTYPE_PV2DF_V2DI_V2DF:
   11918     case VOID_FTYPE_PV8SI_V8SI_V8SI:
   11919     case VOID_FTYPE_PV4DI_V4DI_V4DI:
   11920     case VOID_FTYPE_PV4SI_V4SI_V4SI:
   11921     case VOID_FTYPE_PV2DI_V2DI_V2DI:
   11922     case VOID_FTYPE_PV8SI_V8DI_UQI:
   11923     case VOID_FTYPE_PV8HI_V8DI_UQI:
   11924     case VOID_FTYPE_PV16HI_V16SI_UHI:
   11925     case VOID_FTYPE_PUDI_V8DI_UQI:
   11926     case VOID_FTYPE_PV16QI_V16SI_UHI:
   11927     case VOID_FTYPE_PV4SI_V4DI_UQI:
   11928     case VOID_FTYPE_PUDI_V2DI_UQI:
   11929     case VOID_FTYPE_PUDI_V4DI_UQI:
   11930     case VOID_FTYPE_PUSI_V2DI_UQI:
   11931     case VOID_FTYPE_PV8HI_V8SI_UQI:
   11932     case VOID_FTYPE_PUDI_V4SI_UQI:
   11933     case VOID_FTYPE_PUSI_V4DI_UQI:
   11934     case VOID_FTYPE_PUHI_V2DI_UQI:
   11935     case VOID_FTYPE_PUDI_V8SI_UQI:
   11936     case VOID_FTYPE_PUSI_V4SI_UQI:
   11937     case VOID_FTYPE_PCHAR_V64QI_UDI:
   11938     case VOID_FTYPE_PCHAR_V32QI_USI:
   11939     case VOID_FTYPE_PCHAR_V16QI_UHI:
   11940     case VOID_FTYPE_PSHORT_V32HI_USI:
   11941     case VOID_FTYPE_PSHORT_V16HI_UHI:
   11942     case VOID_FTYPE_PSHORT_V8HI_UQI:
   11943     case VOID_FTYPE_PINT_V16SI_UHI:
   11944     case VOID_FTYPE_PINT_V8SI_UQI:
   11945     case VOID_FTYPE_PINT_V4SI_UQI:
   11946     case VOID_FTYPE_PINT64_V8DI_UQI:
   11947     case VOID_FTYPE_PINT64_V4DI_UQI:
   11948     case VOID_FTYPE_PINT64_V2DI_UQI:
   11949     case VOID_FTYPE_PDOUBLE_V8DF_UQI:
   11950     case VOID_FTYPE_PDOUBLE_V4DF_UQI:
   11951     case VOID_FTYPE_PDOUBLE_V2DF_UQI:
   11952     case VOID_FTYPE_PFLOAT_V16SF_UHI:
   11953     case VOID_FTYPE_PFLOAT_V8SF_UQI:
   11954     case VOID_FTYPE_PFLOAT_V4SF_UQI:
   11955     case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
   11956     case VOID_FTYPE_PV32QI_V32HI_USI:
   11957     case VOID_FTYPE_PV16QI_V16HI_UHI:
   11958     case VOID_FTYPE_PUDI_V8HI_UQI:
   11959       nargs = 2;
   11960       klass = store;
   11961       /* Reserve memory operand for target.  */
   11962       memory = ARRAY_SIZE (xops);
   11963       break;
   11964     case V4SF_FTYPE_PCV4SF_V4SF_UQI:
   11965     case V8SF_FTYPE_PCV8SF_V8SF_UQI:
   11966     case V16SF_FTYPE_PCV16SF_V16SF_UHI:
   11967     case V4SI_FTYPE_PCV4SI_V4SI_UQI:
   11968     case V8SI_FTYPE_PCV8SI_V8SI_UQI:
   11969     case V16SI_FTYPE_PCV16SI_V16SI_UHI:
   11970     case V2DF_FTYPE_PCV2DF_V2DF_UQI:
   11971     case V4DF_FTYPE_PCV4DF_V4DF_UQI:
   11972     case V8DF_FTYPE_PCV8DF_V8DF_UQI:
   11973     case V2DI_FTYPE_PCV2DI_V2DI_UQI:
   11974     case V4DI_FTYPE_PCV4DI_V4DI_UQI:
   11975     case V8DI_FTYPE_PCV8DI_V8DI_UQI:
   11976     case V64QI_FTYPE_PCV64QI_V64QI_UDI:
   11977     case V32HI_FTYPE_PCV32HI_V32HI_USI:
   11978     case V32QI_FTYPE_PCV32QI_V32QI_USI:
   11979     case V16QI_FTYPE_PCV16QI_V16QI_UHI:
   11980     case V16HI_FTYPE_PCV16HI_V16HI_UHI:
   11981     case V8HI_FTYPE_PCV8HI_V8HI_UQI:
   11982       switch (icode)
   11983 	{
   11984 	/* These builtins and instructions require the memory
   11985 	   to be properly aligned.  */
   11986 	case CODE_FOR_avx512f_loadv16sf_mask:
   11987 	case CODE_FOR_avx512f_loadv16si_mask:
   11988 	case CODE_FOR_avx512f_loadv8df_mask:
   11989 	case CODE_FOR_avx512f_loadv8di_mask:
   11990 	case CODE_FOR_avx512vl_loadv8sf_mask:
   11991 	case CODE_FOR_avx512vl_loadv8si_mask:
   11992 	case CODE_FOR_avx512vl_loadv4df_mask:
   11993 	case CODE_FOR_avx512vl_loadv4di_mask:
   11994 	case CODE_FOR_avx512vl_loadv4sf_mask:
   11995 	case CODE_FOR_avx512vl_loadv4si_mask:
   11996 	case CODE_FOR_avx512vl_loadv2df_mask:
   11997 	case CODE_FOR_avx512vl_loadv2di_mask:
   11998 	case CODE_FOR_avx512bw_loadv64qi_mask:
   11999 	case CODE_FOR_avx512vl_loadv32qi_mask:
   12000 	case CODE_FOR_avx512vl_loadv16qi_mask:
   12001 	case CODE_FOR_avx512bw_loadv32hi_mask:
   12002 	case CODE_FOR_avx512vl_loadv16hi_mask:
   12003 	case CODE_FOR_avx512vl_loadv8hi_mask:
   12004 	  aligned_mem = true;
   12005 	  break;
   12006 	default:
   12007 	  break;
   12008 	}
   12009       /* FALLTHRU */
   12010     case V64QI_FTYPE_PCCHAR_V64QI_UDI:
   12011     case V32QI_FTYPE_PCCHAR_V32QI_USI:
   12012     case V16QI_FTYPE_PCCHAR_V16QI_UHI:
   12013     case V32HI_FTYPE_PCSHORT_V32HI_USI:
   12014     case V16HI_FTYPE_PCSHORT_V16HI_UHI:
   12015     case V8HI_FTYPE_PCSHORT_V8HI_UQI:
   12016     case V16SI_FTYPE_PCINT_V16SI_UHI:
   12017     case V8SI_FTYPE_PCINT_V8SI_UQI:
   12018     case V4SI_FTYPE_PCINT_V4SI_UQI:
   12019     case V8DI_FTYPE_PCINT64_V8DI_UQI:
   12020     case V4DI_FTYPE_PCINT64_V4DI_UQI:
   12021     case V2DI_FTYPE_PCINT64_V2DI_UQI:
   12022     case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
   12023     case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
   12024     case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
   12025     case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
   12026     case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
   12027     case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
   12028     case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
   12029       nargs = 3;
   12030       klass = load;
   12031       memory = 0;
   12032       break;
   12033     default:
   12034       gcc_unreachable ();
   12035     }
   12036 
   12037   gcc_assert (nargs <= ARRAY_SIZE (xops));
   12038 
   12039   if (klass == store)
   12040     {
   12041       arg = CALL_EXPR_ARG (exp, 0);
   12042       op = expand_normal (arg);
   12043       gcc_assert (target == 0);
   12044       if (memory)
   12045 	{
   12046 	  op = ix86_zero_extend_to_Pmode (op);
   12047 	  target = gen_rtx_MEM (tmode, op);
   12048 	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
   12049 	     on it.  Try to improve it using get_pointer_alignment,
   12050 	     and if the special builtin is one that requires strict
   12051 	     mode alignment, also from it's GET_MODE_ALIGNMENT.
   12052 	     Failure to do so could lead to ix86_legitimate_combined_insn
   12053 	     rejecting all changes to such insns.  */
   12054 	  unsigned int align = get_pointer_alignment (arg);
   12055 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
   12056 	    align = GET_MODE_ALIGNMENT (tmode);
   12057 	  if (MEM_ALIGN (target) < align)
   12058 	    set_mem_align (target, align);
   12059 	}
   12060       else
   12061 	target = force_reg (tmode, op);
   12062       arg_adjust = 1;
   12063     }
   12064   else
   12065     {
   12066       arg_adjust = 0;
   12067       if (optimize
   12068 	  || target == 0
   12069 	  || !register_operand (target, tmode)
   12070 	  || GET_MODE (target) != tmode)
   12071 	target = gen_reg_rtx (tmode);
   12072     }
   12073 
   12074   for (i = 0; i < nargs; i++)
   12075     {
   12076       machine_mode mode = insn_p->operand[i + 1].mode;
   12077 
   12078       arg = CALL_EXPR_ARG (exp, i + arg_adjust);
   12079       op = expand_normal (arg);
   12080 
   12081       if (i == memory)
   12082 	{
   12083 	  /* This must be the memory operand.  */
   12084 	  op = ix86_zero_extend_to_Pmode (op);
   12085 	  op = gen_rtx_MEM (mode, op);
   12086 	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
   12087 	     on it.  Try to improve it using get_pointer_alignment,
   12088 	     and if the special builtin is one that requires strict
   12089 	     mode alignment, also from it's GET_MODE_ALIGNMENT.
   12090 	     Failure to do so could lead to ix86_legitimate_combined_insn
   12091 	     rejecting all changes to such insns.  */
   12092 	  unsigned int align = get_pointer_alignment (arg);
   12093 	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
   12094 	    align = GET_MODE_ALIGNMENT (mode);
   12095 	  if (MEM_ALIGN (op) < align)
   12096 	    set_mem_align (op, align);
   12097 	}
   12098       else
   12099 	{
   12100 	  /* This must be register.  */
   12101 	  if (VECTOR_MODE_P (mode))
   12102 	    op = safe_vector_operand (op, mode);
   12103 
   12104 	  op = fixup_modeless_constant (op, mode);
   12105 
   12106 	  /* NB: 3-operands load implied it's a mask load or v{p}expand*,
   12107 	     and that mask operand shoud be at the end.
   12108 	     Keep all-ones mask which would be simplified by the expander.  */
   12109 	  if (nargs == 3 && i == 2 && klass == load
   12110 	      && constm1_operand (op, mode)
   12111 	      && insn_p->operand[i].predicate (op, mode))
   12112 	    ;
   12113 	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
   12114 	    op = copy_to_mode_reg (mode, op);
   12115 	  else
   12116 	    {
   12117 	      op = copy_to_reg (op);
   12118 	      op = lowpart_subreg (mode, op, GET_MODE (op));
   12119 	    }
   12120 	}
   12121 
   12122       xops[i]= op;
   12123     }
   12124 
   12125   switch (nargs)
   12126     {
   12127     case 0:
   12128       pat = GEN_FCN (icode) (target);
   12129       break;
   12130     case 1:
   12131       pat = GEN_FCN (icode) (target, xops[0]);
   12132       break;
   12133     case 2:
   12134       pat = GEN_FCN (icode) (target, xops[0], xops[1]);
   12135       break;
   12136     case 3:
   12137       pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
   12138       break;
   12139     default:
   12140       gcc_unreachable ();
   12141     }
   12142 
   12143   if (! pat)
   12144     return 0;
   12145 
   12146   emit_insn (pat);
   12147   return klass == store ? 0 : target;
   12148 }
   12149 
   12150 /* Return the integer constant in ARG.  Constrain it to be in the range
   12151    of the subparts of VEC_TYPE; issue an error if not.  */
   12152 
   12153 static int
   12154 get_element_number (tree vec_type, tree arg)
   12155 {
   12156   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
   12157 
   12158   if (!tree_fits_uhwi_p (arg)
   12159       || (elt = tree_to_uhwi (arg), elt > max))
   12160     {
   12161       error ("selector must be an integer constant in the range "
   12162 	     "[0, %wi]", max);
   12163       return 0;
   12164     }
   12165 
   12166   return elt;
   12167 }
   12168 
   12169 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   12170    ix86_expand_vector_init.  We DO have language-level syntax for this, in
   12171    the form of  (type){ init-list }.  Except that since we can't place emms
   12172    instructions from inside the compiler, we can't allow the use of MMX
   12173    registers unless the user explicitly asks for it.  So we do *not* define
   12174    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
   12175    we have builtins invoked by mmintrin.h that gives us license to emit
   12176    these sorts of instructions.  */
   12177 
   12178 static rtx
   12179 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
   12180 {
   12181   machine_mode tmode = TYPE_MODE (type);
   12182   machine_mode inner_mode = GET_MODE_INNER (tmode);
   12183   int i, n_elt = GET_MODE_NUNITS (tmode);
   12184   rtvec v = rtvec_alloc (n_elt);
   12185 
   12186   gcc_assert (VECTOR_MODE_P (tmode));
   12187   gcc_assert (call_expr_nargs (exp) == n_elt);
   12188 
   12189   for (i = 0; i < n_elt; ++i)
   12190     {
   12191       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
   12192       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
   12193     }
   12194 
   12195   if (!target || !register_operand (target, tmode))
   12196     target = gen_reg_rtx (tmode);
   12197 
   12198   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
   12199   return target;
   12200 }
   12201 
   12202 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   12203    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
   12204    had a language-level syntax for referencing vector elements.  */
   12205 
   12206 static rtx
   12207 ix86_expand_vec_ext_builtin (tree exp, rtx target)
   12208 {
   12209   machine_mode tmode, mode0;
   12210   tree arg0, arg1;
   12211   int elt;
   12212   rtx op0;
   12213 
   12214   arg0 = CALL_EXPR_ARG (exp, 0);
   12215   arg1 = CALL_EXPR_ARG (exp, 1);
   12216 
   12217   op0 = expand_normal (arg0);
   12218   elt = get_element_number (TREE_TYPE (arg0), arg1);
   12219 
   12220   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
   12221   mode0 = TYPE_MODE (TREE_TYPE (arg0));
   12222   gcc_assert (VECTOR_MODE_P (mode0));
   12223 
   12224   op0 = force_reg (mode0, op0);
   12225 
   12226   if (optimize || !target || !register_operand (target, tmode))
   12227     target = gen_reg_rtx (tmode);
   12228 
   12229   ix86_expand_vector_extract (true, target, op0, elt);
   12230 
   12231   return target;
   12232 }
   12233 
   12234 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
   12235    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
   12236    a language-level syntax for referencing vector elements.  */
   12237 
   12238 static rtx
   12239 ix86_expand_vec_set_builtin (tree exp)
   12240 {
   12241   machine_mode tmode, mode1;
   12242   tree arg0, arg1, arg2;
   12243   int elt;
   12244   rtx op0, op1, target;
   12245 
   12246   arg0 = CALL_EXPR_ARG (exp, 0);
   12247   arg1 = CALL_EXPR_ARG (exp, 1);
   12248   arg2 = CALL_EXPR_ARG (exp, 2);
   12249 
   12250   tmode = TYPE_MODE (TREE_TYPE (arg0));
   12251   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
   12252   gcc_assert (VECTOR_MODE_P (tmode));
   12253 
   12254   op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
   12255   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
   12256   elt = get_element_number (TREE_TYPE (arg0), arg2);
   12257 
   12258   if (GET_MODE (op1) != mode1)
   12259     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
   12260 
   12261   op0 = force_reg (tmode, op0);
   12262   op1 = force_reg (mode1, op1);
   12263 
   12264   /* OP0 is the source of these builtin functions and shouldn't be
   12265      modified.  Create a copy, use it and return it as target.  */
   12266   target = gen_reg_rtx (tmode);
   12267   emit_move_insn (target, op0);
   12268   ix86_expand_vector_set (true, target, op1, elt);
   12269 
   12270   return target;
   12271 }
   12272 
   12273 /* Return true if the necessary isa options for this builtin exist,
   12274    else false.
   12275    fcode = DECL_MD_FUNCTION_CODE (fndecl);  */
   12276 bool
   12277 ix86_check_builtin_isa_match (unsigned int fcode,
   12278 			      HOST_WIDE_INT* pbisa,
   12279 			      HOST_WIDE_INT* pbisa2)
   12280 {
   12281   HOST_WIDE_INT isa = ix86_isa_flags;
   12282   HOST_WIDE_INT isa2 = ix86_isa_flags2;
   12283   HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
   12284   HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
   12285   /* The general case is we require all the ISAs specified in bisa{,2}
   12286      to be enabled.
   12287      The exceptions are:
   12288      OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
   12289      OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
   12290      OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
   12291      (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
   12292        OPTION_MASK_ISA2_AVXVNNI
   12293      where for each such pair it is sufficient if either of the ISAs is
   12294      enabled, plus if it is ored with other options also those others.
   12295      OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */
   12296   if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
   12297        == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
   12298       && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
   12299     isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
   12300 
   12301   if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
   12302        == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
   12303       && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
   12304     isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
   12305 
   12306   if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
   12307        == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
   12308       && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
   12309     isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
   12310 
   12311   if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
   12312 	== (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
   12313        || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
   12314       && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
   12315 	   == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
   12316 	  || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
   12317     {
   12318       isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
   12319       isa2 |= OPTION_MASK_ISA2_AVXVNNI;
   12320     }
   12321 
   12322   if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
   12323       /* __builtin_ia32_maskmovq requires MMX registers.  */
   12324       && fcode != IX86_BUILTIN_MASKMOVQ)
   12325     {
   12326       bisa &= ~OPTION_MASK_ISA_MMX;
   12327       bisa |= OPTION_MASK_ISA_SSE2;
   12328     }
   12329 
   12330   if (pbisa)
   12331     *pbisa = bisa;
   12332   if (pbisa2)
   12333     *pbisa2 = bisa2;
   12334 
   12335   return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
   12336 }
   12337 
   12338 /* Expand an expression EXP that calls a built-in function,
   12339    with result going to TARGET if that's convenient
   12340    (and in mode MODE if that's convenient).
   12341    SUBTARGET may be used as the target for computing one of EXP's operands.
   12342    IGNORE is nonzero if the value is to be ignored.  */
   12343 
   12344 rtx
   12345 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
   12346 		     machine_mode mode, int ignore)
   12347 {
   12348   size_t i;
   12349   enum insn_code icode, icode2;
   12350   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   12351   tree arg0, arg1, arg2, arg3, arg4;
   12352   rtx op0, op1, op2, op3, op4, pat, pat2, insn;
   12353   machine_mode mode0, mode1, mode2, mode3, mode4;
   12354   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
   12355   HOST_WIDE_INT bisa, bisa2;
   12356 
   12357   /* For CPU builtins that can be folded, fold first and expand the fold.  */
   12358   switch (fcode)
   12359     {
   12360     case IX86_BUILTIN_CPU_INIT:
   12361       {
   12362 	/* Make it call __cpu_indicator_init in libgcc.  */
   12363 	tree call_expr, fndecl, type;
   12364 	type = build_function_type_list (integer_type_node, NULL_TREE);
   12365 	fndecl = build_fn_decl ("__cpu_indicator_init", type);
   12366 	call_expr = build_call_expr (fndecl, 0);
   12367 	return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
   12368       }
   12369     case IX86_BUILTIN_CPU_IS:
   12370     case IX86_BUILTIN_CPU_SUPPORTS:
   12371       {
   12372 	tree arg0 = CALL_EXPR_ARG (exp, 0);
   12373 	tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
   12374 	gcc_assert (fold_expr != NULL_TREE);
   12375 	return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
   12376       }
   12377     }
   12378 
   12379   if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
   12380     {
   12381       bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
   12382       if (TARGET_ABI_X32)
   12383 	bisa |= OPTION_MASK_ABI_X32;
   12384       else
   12385 	bisa |= OPTION_MASK_ABI_64;
   12386       char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
   12387 				       (enum fpmath_unit) 0,
   12388 				       (enum prefer_vector_width) 0,
   12389 				       PVW_NONE, PVW_NONE,
   12390 				       false, add_abi_p);
   12391       if (!opts)
   12392 	error ("%qE needs unknown isa option", fndecl);
   12393       else
   12394 	{
   12395 	  gcc_assert (opts != NULL);
   12396 	  error ("%qE needs isa option %s", fndecl, opts);
   12397 	  free (opts);
   12398 	}
   12399       return expand_call (exp, target, ignore);
   12400     }
   12401 
   12402   switch (fcode)
   12403     {
   12404     case IX86_BUILTIN_MASKMOVQ:
   12405     case IX86_BUILTIN_MASKMOVDQU:
   12406       icode = (fcode == IX86_BUILTIN_MASKMOVQ
   12407 	       ? CODE_FOR_mmx_maskmovq
   12408 	       : CODE_FOR_sse2_maskmovdqu);
   12409       /* Note the arg order is different from the operand order.  */
   12410       arg1 = CALL_EXPR_ARG (exp, 0);
   12411       arg2 = CALL_EXPR_ARG (exp, 1);
   12412       arg0 = CALL_EXPR_ARG (exp, 2);
   12413       op0 = expand_normal (arg0);
   12414       op1 = expand_normal (arg1);
   12415       op2 = expand_normal (arg2);
   12416       mode0 = insn_data[icode].operand[0].mode;
   12417       mode1 = insn_data[icode].operand[1].mode;
   12418       mode2 = insn_data[icode].operand[2].mode;
   12419 
   12420       op0 = ix86_zero_extend_to_Pmode (op0);
   12421       op0 = gen_rtx_MEM (mode1, op0);
   12422 
   12423       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   12424 	op0 = copy_to_mode_reg (mode0, op0);
   12425       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   12426 	op1 = copy_to_mode_reg (mode1, op1);
   12427       if (!insn_data[icode].operand[2].predicate (op2, mode2))
   12428 	op2 = copy_to_mode_reg (mode2, op2);
   12429       pat = GEN_FCN (icode) (op0, op1, op2);
   12430       if (! pat)
   12431 	return 0;
   12432       emit_insn (pat);
   12433       return 0;
   12434 
   12435     case IX86_BUILTIN_LDMXCSR:
   12436       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
   12437       target = assign_386_stack_local (SImode, SLOT_TEMP);
   12438       emit_move_insn (target, op0);
   12439       emit_insn (gen_sse_ldmxcsr (target));
   12440       return 0;
   12441 
   12442     case IX86_BUILTIN_STMXCSR:
   12443       target = assign_386_stack_local (SImode, SLOT_TEMP);
   12444       emit_insn (gen_sse_stmxcsr (target));
   12445       return copy_to_mode_reg (SImode, target);
   12446 
   12447     case IX86_BUILTIN_CLFLUSH:
   12448 	arg0 = CALL_EXPR_ARG (exp, 0);
   12449 	op0 = expand_normal (arg0);
   12450 	icode = CODE_FOR_sse2_clflush;
   12451 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   12452 	  op0 = ix86_zero_extend_to_Pmode (op0);
   12453 
   12454 	emit_insn (gen_sse2_clflush (op0));
   12455 	return 0;
   12456 
   12457     case IX86_BUILTIN_CLWB:
   12458 	arg0 = CALL_EXPR_ARG (exp, 0);
   12459 	op0 = expand_normal (arg0);
   12460 	icode = CODE_FOR_clwb;
   12461 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   12462 	  op0 = ix86_zero_extend_to_Pmode (op0);
   12463 
   12464 	emit_insn (gen_clwb (op0));
   12465 	return 0;
   12466 
   12467     case IX86_BUILTIN_CLFLUSHOPT:
   12468 	arg0 = CALL_EXPR_ARG (exp, 0);
   12469 	op0 = expand_normal (arg0);
   12470 	icode = CODE_FOR_clflushopt;
   12471 	if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   12472 	  op0 = ix86_zero_extend_to_Pmode (op0);
   12473 
   12474 	emit_insn (gen_clflushopt (op0));
   12475 	return 0;
   12476 
   12477     case IX86_BUILTIN_MONITOR:
   12478     case IX86_BUILTIN_MONITORX:
   12479       arg0 = CALL_EXPR_ARG (exp, 0);
   12480       arg1 = CALL_EXPR_ARG (exp, 1);
   12481       arg2 = CALL_EXPR_ARG (exp, 2);
   12482       op0 = expand_normal (arg0);
   12483       op1 = expand_normal (arg1);
   12484       op2 = expand_normal (arg2);
   12485       if (!REG_P (op0))
   12486 	op0 = ix86_zero_extend_to_Pmode (op0);
   12487       if (!REG_P (op1))
   12488 	op1 = copy_to_mode_reg (SImode, op1);
   12489       if (!REG_P (op2))
   12490 	op2 = copy_to_mode_reg (SImode, op2);
   12491 
   12492       emit_insn (fcode == IX86_BUILTIN_MONITOR
   12493 		 ? gen_sse3_monitor (Pmode, op0, op1, op2)
   12494 		 : gen_monitorx (Pmode, op0, op1, op2));
   12495       return 0;
   12496 
   12497     case IX86_BUILTIN_MWAIT:
   12498       arg0 = CALL_EXPR_ARG (exp, 0);
   12499       arg1 = CALL_EXPR_ARG (exp, 1);
   12500       op0 = expand_normal (arg0);
   12501       op1 = expand_normal (arg1);
   12502       if (!REG_P (op0))
   12503 	op0 = copy_to_mode_reg (SImode, op0);
   12504       if (!REG_P (op1))
   12505 	op1 = copy_to_mode_reg (SImode, op1);
   12506       emit_insn (gen_sse3_mwait (op0, op1));
   12507       return 0;
   12508 
   12509     case IX86_BUILTIN_MWAITX:
   12510       arg0 = CALL_EXPR_ARG (exp, 0);
   12511       arg1 = CALL_EXPR_ARG (exp, 1);
   12512       arg2 = CALL_EXPR_ARG (exp, 2);
   12513       op0 = expand_normal (arg0);
   12514       op1 = expand_normal (arg1);
   12515       op2 = expand_normal (arg2);
   12516       if (!REG_P (op0))
   12517 	op0 = copy_to_mode_reg (SImode, op0);
   12518       if (!REG_P (op1))
   12519 	op1 = copy_to_mode_reg (SImode, op1);
   12520       if (!REG_P (op2))
   12521 	op2 = copy_to_mode_reg (SImode, op2);
   12522       emit_insn (gen_mwaitx (op0, op1, op2));
   12523       return 0;
   12524 
   12525     case IX86_BUILTIN_UMONITOR:
   12526       arg0 = CALL_EXPR_ARG (exp, 0);
   12527       op0 = expand_normal (arg0);
   12528 
   12529       op0 = ix86_zero_extend_to_Pmode (op0);
   12530       emit_insn (gen_umonitor (Pmode, op0));
   12531       return 0;
   12532 
   12533     case IX86_BUILTIN_UMWAIT:
   12534     case IX86_BUILTIN_TPAUSE:
   12535       arg0 = CALL_EXPR_ARG (exp, 0);
   12536       arg1 = CALL_EXPR_ARG (exp, 1);
   12537       op0 = expand_normal (arg0);
   12538       op1 = expand_normal (arg1);
   12539 
   12540       if (!REG_P (op0))
   12541 	op0 = copy_to_mode_reg (SImode, op0);
   12542 
   12543       op1 = force_reg (DImode, op1);
   12544 
   12545       if (TARGET_64BIT)
   12546 	{
   12547 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   12548 				     NULL, 1, OPTAB_DIRECT);
   12549 	  switch (fcode)
   12550 	    {
   12551 	    case IX86_BUILTIN_UMWAIT:
   12552 	      icode = CODE_FOR_umwait_rex64;
   12553 	      break;
   12554 	    case IX86_BUILTIN_TPAUSE:
   12555 	      icode = CODE_FOR_tpause_rex64;
   12556 	      break;
   12557 	    default:
   12558 	      gcc_unreachable ();
   12559 	    }
   12560 
   12561 	  op2 = gen_lowpart (SImode, op2);
   12562 	  op1 = gen_lowpart (SImode, op1);
   12563 	  pat = GEN_FCN (icode) (op0, op1, op2);
   12564 	}
   12565       else
   12566 	{
   12567 	  switch (fcode)
   12568 	    {
   12569 	    case IX86_BUILTIN_UMWAIT:
   12570 	      icode = CODE_FOR_umwait;
   12571 	      break;
   12572 	    case IX86_BUILTIN_TPAUSE:
   12573 	      icode = CODE_FOR_tpause;
   12574 	      break;
   12575 	    default:
   12576 	      gcc_unreachable ();
   12577 	    }
   12578 	  pat = GEN_FCN (icode) (op0, op1);
   12579 	}
   12580 
   12581       if (!pat)
   12582 	return 0;
   12583 
   12584       emit_insn (pat);
   12585 
   12586       if (target == 0
   12587 	  || !register_operand (target, QImode))
   12588 	target = gen_reg_rtx (QImode);
   12589 
   12590       pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   12591 			const0_rtx);
   12592       emit_insn (gen_rtx_SET (target, pat));
   12593 
   12594       return target;
   12595 
   12596     case IX86_BUILTIN_TESTUI:
   12597       emit_insn (gen_testui ());
   12598 
   12599       if (target == 0
   12600 	  || !register_operand (target, QImode))
   12601 	target = gen_reg_rtx (QImode);
   12602 
   12603       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   12604 			 const0_rtx);
   12605       emit_insn (gen_rtx_SET (target, pat));
   12606 
   12607       return target;
   12608 
   12609     case IX86_BUILTIN_CLZERO:
   12610       arg0 = CALL_EXPR_ARG (exp, 0);
   12611       op0 = expand_normal (arg0);
   12612       if (!REG_P (op0))
   12613 	op0 = ix86_zero_extend_to_Pmode (op0);
   12614       emit_insn (gen_clzero (Pmode, op0));
   12615       return 0;
   12616 
   12617     case IX86_BUILTIN_CLDEMOTE:
   12618       arg0 = CALL_EXPR_ARG (exp, 0);
   12619       op0 = expand_normal (arg0);
   12620       icode = CODE_FOR_cldemote;
   12621       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
   12622 	op0 = ix86_zero_extend_to_Pmode (op0);
   12623 
   12624       emit_insn (gen_cldemote (op0));
   12625       return 0;
   12626 
   12627     case IX86_BUILTIN_LOADIWKEY:
   12628       {
   12629 	arg0 = CALL_EXPR_ARG (exp, 0);
   12630 	arg1 = CALL_EXPR_ARG (exp, 1);
   12631 	arg2 = CALL_EXPR_ARG (exp, 2);
   12632 	arg3 = CALL_EXPR_ARG (exp, 3);
   12633 
   12634 	op0 = expand_normal (arg0);
   12635 	op1 = expand_normal (arg1);
   12636 	op2 = expand_normal (arg2);
   12637 	op3 = expand_normal (arg3);
   12638 
   12639 	if (!REG_P (op0))
   12640 	  op0 = copy_to_mode_reg (V2DImode, op0);
   12641 	if (!REG_P (op1))
   12642 	  op1 = copy_to_mode_reg (V2DImode, op1);
   12643 	if (!REG_P (op2))
   12644 	  op2 = copy_to_mode_reg (V2DImode, op2);
   12645 	if (!REG_P (op3))
   12646 	  op3 = copy_to_mode_reg (SImode, op3);
   12647 
   12648 	emit_insn (gen_loadiwkey (op0, op1, op2, op3));
   12649 
   12650 	return 0;
   12651       }
   12652 
   12653     case IX86_BUILTIN_AESDEC128KLU8:
   12654       icode = CODE_FOR_aesdec128klu8;
   12655       goto aesdecenc_expand;
   12656 
   12657     case IX86_BUILTIN_AESDEC256KLU8:
   12658       icode = CODE_FOR_aesdec256klu8;
   12659       goto aesdecenc_expand;
   12660 
   12661     case IX86_BUILTIN_AESENC128KLU8:
   12662       icode = CODE_FOR_aesenc128klu8;
   12663       goto aesdecenc_expand;
   12664 
   12665     case IX86_BUILTIN_AESENC256KLU8:
   12666       icode = CODE_FOR_aesenc256klu8;
   12667 
   12668     aesdecenc_expand:
   12669 
   12670       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
   12671       arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
   12672       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
   12673 
   12674       op0 = expand_normal (arg0);
   12675       op1 = expand_normal (arg1);
   12676       op2 = expand_normal (arg2);
   12677 
   12678       if (!address_operand (op0, V2DImode))
   12679 	{
   12680 	  op0 = convert_memory_address (Pmode, op0);
   12681 	  op0 = copy_addr_to_reg (op0);
   12682 	}
   12683       op0 = gen_rtx_MEM (V2DImode, op0);
   12684 
   12685       if (!REG_P (op1))
   12686 	op1 = copy_to_mode_reg (V2DImode, op1);
   12687 
   12688       if (!address_operand (op2, VOIDmode))
   12689 	{
   12690 	  op2 = convert_memory_address (Pmode, op2);
   12691 	  op2 = copy_addr_to_reg (op2);
   12692 	}
   12693       op2 = gen_rtx_MEM (BLKmode, op2);
   12694 
   12695       emit_insn (GEN_FCN (icode) (op1, op1, op2));
   12696 
   12697       if (target == 0)
   12698 	target = gen_reg_rtx (QImode);
   12699 
   12700       /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
   12701 	 error occurs. Then the output should be cleared for safety. */
   12702       rtx_code_label *ok_label;
   12703       rtx tmp;
   12704 
   12705       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
   12706       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
   12707       ok_label = gen_label_rtx ();
   12708       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
   12709 			       true, ok_label);
   12710       /* Usually the runtime error seldom occur, so predict OK path as
   12711 	 hotspot to optimize it as fallthrough block. */
   12712       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   12713 
   12714       emit_insn (gen_rtx_SET (op1, const0_rtx));
   12715 
   12716       emit_label (ok_label);
   12717       emit_insn (gen_rtx_SET (target, pat));
   12718       emit_insn (gen_rtx_SET (op0, op1));
   12719 
   12720       return target;
   12721 
   12722     case IX86_BUILTIN_AESDECWIDE128KLU8:
   12723       icode = CODE_FOR_aesdecwide128klu8;
   12724       goto wideaesdecenc_expand;
   12725 
   12726     case IX86_BUILTIN_AESDECWIDE256KLU8:
   12727       icode = CODE_FOR_aesdecwide256klu8;
   12728       goto wideaesdecenc_expand;
   12729 
   12730     case IX86_BUILTIN_AESENCWIDE128KLU8:
   12731       icode = CODE_FOR_aesencwide128klu8;
   12732       goto wideaesdecenc_expand;
   12733 
   12734     case IX86_BUILTIN_AESENCWIDE256KLU8:
   12735       icode = CODE_FOR_aesencwide256klu8;
   12736 
   12737     wideaesdecenc_expand:
   12738 
   12739       rtx xmm_regs[8];
   12740       rtx op;
   12741 
   12742       arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
   12743       arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
   12744       arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
   12745 
   12746       op0 = expand_normal (arg0);
   12747       op1 = expand_normal (arg1);
   12748       op2 = expand_normal (arg2);
   12749 
   12750       if (GET_MODE (op1) != Pmode)
   12751 	op1 = convert_to_mode (Pmode, op1, 1);
   12752 
   12753       if (!address_operand (op2, VOIDmode))
   12754 	{
   12755 	  op2 = convert_memory_address (Pmode, op2);
   12756 	  op2 = copy_addr_to_reg (op2);
   12757 	}
   12758       op2 = gen_rtx_MEM (BLKmode, op2);
   12759 
   12760       for (i = 0; i < 8; i++)
   12761 	{
   12762 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   12763 
   12764 	  op = gen_rtx_MEM (V2DImode,
   12765 			    plus_constant (Pmode, op1, (i * 16)));
   12766 
   12767 	  emit_move_insn (xmm_regs[i], op);
   12768 	}
   12769 
   12770       emit_insn (GEN_FCN (icode) (op2));
   12771 
   12772       if (target == 0)
   12773 	target = gen_reg_rtx (QImode);
   12774 
   12775       tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
   12776       pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
   12777       ok_label = gen_label_rtx ();
   12778       emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
   12779 			       true, ok_label);
   12780       predict_jump (REG_BR_PROB_BASE * 90 / 100);
   12781 
   12782       for (i = 0; i < 8; i++)
   12783 	emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
   12784 
   12785       emit_label (ok_label);
   12786       emit_insn (gen_rtx_SET (target, pat));
   12787 
   12788       if (GET_MODE (op0) != Pmode)
   12789 	op0 = convert_to_mode (Pmode, op0, 1);
   12790 
   12791       for (i = 0; i < 8; i++)
   12792 	{
   12793 	  op = gen_rtx_MEM (V2DImode,
   12794 			    plus_constant (Pmode, op0, (i * 16)));
   12795 	  emit_move_insn (op, xmm_regs[i]);
   12796 	}
   12797 
   12798       return target;
   12799 
   12800     case IX86_BUILTIN_ENCODEKEY128U32:
   12801       {
	/* Expand the ENCODEKEY128 key-locker builtin: the 128-bit key is
	   forced into xmm0 (the insn's hard-register operand), the insn is
	   emitted, and the three 16-byte result registers xmm0..xmm2 are
	   stored to the output buffer h.  Returns the SImode status.  */
   12802 	rtx op, xmm_regs[7];
   12803 
   12804 	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
   12805 	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
   12806 	arg2 = CALL_EXPR_ARG (exp, 2); // void *h
   12807 
   12808 	op0 = expand_normal (arg0);
   12809 	op1 = expand_normal (arg1);
   12810 	op2 = expand_normal (arg2);
   12811 
   12812 	if (!REG_P (op0))
   12813 	  op0 = copy_to_mode_reg (SImode, op0);
   12814 
   12815 	if (GET_MODE (op2) != Pmode)
   12816 	  op2 = convert_to_mode (Pmode, op2, 1);
   12817 
	/* Force the key into xmm0.  */
   12818 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
   12819 	emit_move_insn (op, op1);
   12820 
   12821 	for (i = 0; i < 3; i++)
   12822 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   12823 
   12824 	if (target == 0)
   12825 	  target = gen_reg_rtx (SImode);
   12826 
   12827 	emit_insn (gen_encodekey128u32 (target, op0));
   12828 
	/* Spill the three result registers to h (3 * 16 bytes).  */
   12829 	for (i = 0; i < 3; i++)
   12830 	  {
   12831 	    op = gen_rtx_MEM (V2DImode,
   12832 			      plus_constant (Pmode, op2, (i * 16)));
   12833 	    emit_move_insn (op, xmm_regs[i]);
   12834 	  }
   12835 
   12836 	return target;
   12837       }
   12838     case IX86_BUILTIN_ENCODEKEY256U32:
   12839       {
	/* Same as ENCODEKEY128U32 above, but the 256-bit key arrives as two
	   __m128i halves (forced into xmm0/xmm1) and four result registers
	   xmm0..xmm3 are stored to h.  */
   12840 	rtx op, xmm_regs[7];
   12841 
   12842 	arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
   12843 	arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
   12844 	arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
   12845 	arg3 = CALL_EXPR_ARG (exp, 3); // void *h
   12846 
   12847 	op0 = expand_normal (arg0);
   12848 	op1 = expand_normal (arg1);
   12849 	op2 = expand_normal (arg2);
   12850 	op3 = expand_normal (arg3);
   12851 
   12852 	if (!REG_P (op0))
   12853 	  op0 = copy_to_mode_reg (SImode, op0);
   12854 
   12855 	if (GET_MODE (op3) != Pmode)
   12856 	  op3 = convert_to_mode (Pmode, op3, 1);
   12857 
   12858 	/* Force to use xmm0, xmm1 for keylow, keyhi*/
   12859 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
   12860 	emit_move_insn (op, op1);
   12861 	op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
   12862 	emit_move_insn (op, op2);
   12863 
   12864 	for (i = 0; i < 4; i++)
   12865 	  xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
   12866 
   12867 	if (target == 0)
   12868 	  target = gen_reg_rtx (SImode);
   12869 
   12870 	emit_insn (gen_encodekey256u32 (target, op0));
   12871 
	/* Spill the four result registers to h (4 * 16 bytes).  */
   12872 	for (i = 0; i < 4; i++)
   12873 	  {
   12874 	    op = gen_rtx_MEM (V2DImode,
   12875 			      plus_constant (Pmode, op3, (i * 16)));
   12876 	    emit_move_insn (op, xmm_regs[i]);
   12877 	  }
   12878 
   12879 	return target;
   12880       }
   12881 
   12882     case IX86_BUILTIN_VEC_INIT_V2SI:
   12883     case IX86_BUILTIN_VEC_INIT_V4HI:
   12884     case IX86_BUILTIN_VEC_INIT_V8QI:
	/* Vector init/extract/set builtins just delegate to helpers.  */
   12885       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
   12886 
   12887     case IX86_BUILTIN_VEC_EXT_V2DF:
   12888     case IX86_BUILTIN_VEC_EXT_V2DI:
   12889     case IX86_BUILTIN_VEC_EXT_V4SF:
   12890     case IX86_BUILTIN_VEC_EXT_V4SI:
   12891     case IX86_BUILTIN_VEC_EXT_V8HI:
   12892     case IX86_BUILTIN_VEC_EXT_V2SI:
   12893     case IX86_BUILTIN_VEC_EXT_V4HI:
   12894     case IX86_BUILTIN_VEC_EXT_V16QI:
   12895       return ix86_expand_vec_ext_builtin (exp, target);
   12896 
   12897     case IX86_BUILTIN_VEC_SET_V2DI:
   12898     case IX86_BUILTIN_VEC_SET_V4SF:
   12899     case IX86_BUILTIN_VEC_SET_V4SI:
   12900     case IX86_BUILTIN_VEC_SET_V8HI:
   12901     case IX86_BUILTIN_VEC_SET_V4HI:
   12902     case IX86_BUILTIN_VEC_SET_V16QI:
   12903       return ix86_expand_vec_set_builtin (exp);
   12904 
   12905     case IX86_BUILTIN_NANQ:
   12906     case IX86_BUILTIN_NANSQ:
	/* __float128 NaN builtins expand as an ordinary library call.  */
   12907       return expand_call (exp, target, ignore);
   12908 
   12909     case IX86_BUILTIN_RDPID:
   12910 
	/* RDPID reads into a word_mode register; on 64-bit the result is
	   then narrowed to the SImode value the builtin returns.  */
   12911       op0 = gen_reg_rtx (word_mode);
   12912 
   12913       if (TARGET_64BIT)
   12914 	{
   12915 	  insn = gen_rdpid_rex64 (op0);
   12916 	  op0 = convert_to_mode (SImode, op0, 1);
   12917 	}
   12918       else
   12919 	insn = gen_rdpid (op0);
   12920 
   12921       emit_insn (insn);
   12922 
   12923       if (target == 0
   12924 	  || !register_operand (target, SImode))
   12925 	target = gen_reg_rtx (SImode);
   12926 
   12927       emit_move_insn (target, op0);
   12928       return target;
   12929 
   12930     case IX86_BUILTIN_2INTERSECTD512:
   12931     case IX86_BUILTIN_2INTERSECTQ512:
   12932     case IX86_BUILTIN_2INTERSECTD256:
   12933     case IX86_BUILTIN_2INTERSECTQ256:
   12934     case IX86_BUILTIN_2INTERSECTD128:
   12935     case IX86_BUILTIN_2INTERSECTQ128:
	/* AVX512-VP2INTERSECT: args are two mask-output pointers plus the
	   two source vectors.  The insn produces a register pair
	   (P2HImode/P2QImode); its low and high halves are stored through
	   the two pointers.  */
   12936       arg0 = CALL_EXPR_ARG (exp, 0);
   12937       arg1 = CALL_EXPR_ARG (exp, 1);
   12938       arg2 = CALL_EXPR_ARG (exp, 2);
   12939       arg3 = CALL_EXPR_ARG (exp, 3);
   12940       op0 = expand_normal (arg0);
   12941       op1 = expand_normal (arg1);
   12942       op2 = expand_normal (arg2);
   12943       op3 = expand_normal (arg3);
   12944 
   12945       if (!address_operand (op0, VOIDmode))
   12946 	{
   12947 	  op0 = convert_memory_address (Pmode, op0);
   12948 	  op0 = copy_addr_to_reg (op0);
   12949 	}
   12950       if (!address_operand (op1, VOIDmode))
   12951 	{
   12952 	  op1 = convert_memory_address (Pmode, op1);
   12953 	  op1 = copy_addr_to_reg (op1);
   12954 	}
   12955 
	/* Pick the pair mode (16-bit masks only for the 512-bit D form)
	   and the matching insn.  */
   12956       switch (fcode)
   12957 	{
   12958 	case IX86_BUILTIN_2INTERSECTD512:
   12959 	  mode4 = P2HImode;
   12960 	  icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
   12961 	  break;
   12962 	case IX86_BUILTIN_2INTERSECTQ512:
   12963 	  mode4 = P2QImode;
   12964 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
   12965 	  break;
   12966 	case IX86_BUILTIN_2INTERSECTD256:
   12967 	  mode4 = P2QImode;
   12968 	  icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
   12969 	  break;
   12970 	case IX86_BUILTIN_2INTERSECTQ256:
   12971 	  mode4 = P2QImode;
   12972 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
   12973 	  break;
   12974 	case IX86_BUILTIN_2INTERSECTD128:
   12975 	  mode4 = P2QImode;
   12976 	  icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
   12977 	  break;
   12978 	case IX86_BUILTIN_2INTERSECTQ128:
   12979 	  mode4 = P2QImode;
   12980 	  icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
   12981 	  break;
   12982 	default:
   12983 	  gcc_unreachable ();
   12984 	}
   12985 
   12986       mode2 = insn_data[icode].operand[1].mode;
   12987       mode3 = insn_data[icode].operand[2].mode;
   12988       if (!insn_data[icode].operand[1].predicate (op2, mode2))
   12989 	op2 = copy_to_mode_reg (mode2, op2);
   12990       if (!insn_data[icode].operand[2].predicate (op3, mode3))
   12991 	op3 = copy_to_mode_reg (mode3, op3);
   12992 
   12993       op4 = gen_reg_rtx (mode4);
   12994       emit_insn (GEN_FCN (icode) (op4, op2, op3));
	/* Store the two result masks through the user pointers.  */
   12995       mode0 = mode4 == P2HImode ? HImode : QImode;
   12996       emit_move_insn (gen_rtx_MEM (mode0, op0),
   12997 		      gen_lowpart (mode0, op4));
   12998       emit_move_insn (gen_rtx_MEM (mode0, op1),
   12999 		      gen_highpart (mode0, op4));
   13000 
   13001       return 0;
   13002 
   13003     case IX86_BUILTIN_RDPMC:
   13004     case IX86_BUILTIN_RDTSC:
   13005     case IX86_BUILTIN_RDTSCP:
   13006     case IX86_BUILTIN_XGETBV:
	/* These all return a 64-bit value delivered in two halves (op0/op1);
	   on 64-bit targets the halves are recombined below with a
	   shift-by-32 and IOR.  RDPMC and XGETBV additionally take a
	   selector argument; RDTSCP stores its auxiliary value through
	   the pointer argument.  */
   13007 
   13008       op0 = gen_reg_rtx (DImode);
   13009       op1 = gen_reg_rtx (DImode);
   13010 
   13011       if (fcode == IX86_BUILTIN_RDPMC)
   13012 	{
   13013 	  arg0 = CALL_EXPR_ARG (exp, 0);
   13014 	  op2 = expand_normal (arg0);
   13015 	  if (!register_operand (op2, SImode))
   13016 	    op2 = copy_to_mode_reg (SImode, op2);
   13017 
   13018 	  insn = (TARGET_64BIT
   13019 		  ? gen_rdpmc_rex64 (op0, op1, op2)
   13020 		  : gen_rdpmc (op0, op2));
   13021 	  emit_insn (insn);
   13022 	}
   13023       else if (fcode == IX86_BUILTIN_XGETBV)
   13024 	{
   13025 	  arg0 = CALL_EXPR_ARG (exp, 0);
   13026 	  op2 = expand_normal (arg0);
   13027 	  if (!register_operand (op2, SImode))
   13028 	    op2 = copy_to_mode_reg (SImode, op2);
   13029 
   13030 	  insn = (TARGET_64BIT
   13031 		  ? gen_xgetbv_rex64 (op0, op1, op2)
   13032 		  : gen_xgetbv (op0, op2));
   13033 	  emit_insn (insn);
   13034 	}
   13035       else if (fcode == IX86_BUILTIN_RDTSC)
   13036 	{
   13037 	  insn = (TARGET_64BIT
   13038 		  ? gen_rdtsc_rex64 (op0, op1)
   13039 		  : gen_rdtsc (op0));
   13040 	  emit_insn (insn);
   13041 	}
   13042       else
   13043 	{
	  /* RDTSCP: also store the TSC_AUX value to *arg0.  */
   13044 	  op2 = gen_reg_rtx (SImode);
   13045 
   13046 	  insn = (TARGET_64BIT
   13047 		  ? gen_rdtscp_rex64 (op0, op1, op2)
   13048 		  : gen_rdtscp (op0, op2));
   13049 	  emit_insn (insn);
   13050 
   13051 	  arg0 = CALL_EXPR_ARG (exp, 0);
   13052 	  op4 = expand_normal (arg0)
   13053 	  if (!address_operand (op4, VOIDmode))
   13054 	    {
   13055 	      op4 = convert_memory_address (Pmode, op4);
   13056 	      op4 = copy_addr_to_reg (op4);
   13057 	    }
   13058 	  emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
   13059 	}
   13060 
   13061       if (target == 0
   13062 	  || !register_operand (target, DImode))
   13063         target = gen_reg_rtx (DImode);
   13064 
	/* On 64-bit, combine the high half into the low: op0 |= op1 << 32.  */
   13065       if (TARGET_64BIT)
   13066 	{
   13067 	  op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
   13068 				     op1, 1, OPTAB_DIRECT);
   13069 	  op0 = expand_simple_binop (DImode, IOR, op0, op1,
   13070 				     op0, 1, OPTAB_DIRECT);
   13071 	}
   13072 
   13073       emit_move_insn (target, op0);
   13074       return target;
   13075 
   13076     case IX86_BUILTIN_ENQCMD:
   13077     case IX86_BUILTIN_ENQCMDS:
   13078     case IX86_BUILTIN_MOVDIR64B:
	/* All three take a destination pointer and a 64-byte (XImode)
	   memory source.  MOVDIR64B returns nothing; ENQCMD/ENQCMDS
	   return the zero flag as a status byte.  */
   13079 
   13080       arg0 = CALL_EXPR_ARG (exp, 0);
   13081       arg1 = CALL_EXPR_ARG (exp, 1);
   13082       op0 = expand_normal (arg0);
   13083       op1 = expand_normal (arg1);
   13084 
   13085       op0 = ix86_zero_extend_to_Pmode (op0);
   13086       if (!address_operand (op1, VOIDmode))
   13087       {
   13088 	op1 = convert_memory_address (Pmode, op1);
   13089 	op1 = copy_addr_to_reg (op1);
   13090       }
   13091       op1 = gen_rtx_MEM (XImode, op1);
   13092 
   13093       if (fcode == IX86_BUILTIN_MOVDIR64B)
   13094 	{
   13095 	  emit_insn (gen_movdir64b (Pmode, op0, op1));
   13096 	  return 0;
   13097 	}
   13098       else
   13099 	{
   13100 	  if (target == 0
   13101 	      || !register_operand (target, SImode))
   13102 	    target = gen_reg_rtx (SImode);
   13103 
	  /* Zero the full SImode result, then set only its low byte from
	     ZF via a strict-low-part QImode setcc.  */
   13104 	  emit_move_insn (target, const0_rtx);
   13105 	  target = gen_rtx_SUBREG (QImode, target, 0);
   13106 
   13107 	  int unspecv = (fcode == IX86_BUILTIN_ENQCMD
   13108 			 ? UNSPECV_ENQCMD
   13109 			 : UNSPECV_ENQCMDS);
   13110 	  icode = code_for_enqcmd (unspecv, Pmode);
   13111 	  emit_insn (GEN_FCN (icode) (op0, op1));
   13112 
   13113 	  emit_insn
   13114 	    (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
   13115 			  gen_rtx_fmt_ee (EQ, QImode,
   13116 					  gen_rtx_REG (CCZmode, FLAGS_REG),
   13117 					  const0_rtx)));
   13118 	  return SUBREG_REG (target);
   13119 	}
   13120 
   13120 
   13121     case IX86_BUILTIN_FXSAVE:
   13122     case IX86_BUILTIN_FXRSTOR:
   13123     case IX86_BUILTIN_FXSAVE64:
   13124     case IX86_BUILTIN_FXRSTOR64:
   13125     case IX86_BUILTIN_FNSTENV:
   13126     case IX86_BUILTIN_FLDENV:
	/* FP state save/restore builtins: a single pointer argument,
	   wrapped as a BLKmode memory operand for the insn.  */
   13127       mode0 = BLKmode;
   13128       switch (fcode)
   13129 	{
   13130 	case IX86_BUILTIN_FXSAVE:
   13131 	  icode = CODE_FOR_fxsave;
   13132 	  break;
   13133 	case IX86_BUILTIN_FXRSTOR:
   13134 	  icode = CODE_FOR_fxrstor;
   13135 	  break;
   13136 	case IX86_BUILTIN_FXSAVE64:
   13137 	  icode = CODE_FOR_fxsave64;
   13138 	  break;
   13139 	case IX86_BUILTIN_FXRSTOR64:
   13140 	  icode = CODE_FOR_fxrstor64;
   13141 	  break;
   13142 	case IX86_BUILTIN_FNSTENV:
   13143 	  icode = CODE_FOR_fnstenv;
   13144 	  break;
   13145 	case IX86_BUILTIN_FLDENV:
   13146 	  icode = CODE_FOR_fldenv;
   13147 	  break;
   13148 	default:
   13149 	  gcc_unreachable ();
   13150 	}
   13151 
   13152       arg0 = CALL_EXPR_ARG (exp, 0);
   13153       op0 = expand_normal (arg0);
   13154 
   13155       if (!address_operand (op0, VOIDmode))
   13156 	{
   13157 	  op0 = convert_memory_address (Pmode, op0);
   13158 	  op0 = copy_addr_to_reg (op0);
   13159 	}
   13160       op0 = gen_rtx_MEM (mode0, op0);
   13161 
   13162       pat = GEN_FCN (icode) (op0);
   13163       if (pat)
   13164 	emit_insn (pat);
   13165       return 0;
   13166 
   13167     case IX86_BUILTIN_XSETBV:
	/* XSETBV (xcr, value): the 64-bit value is split into low/high
	   SImode halves (EDX:EAX) on 64-bit targets.  */
   13168       arg0 = CALL_EXPR_ARG (exp, 0);
   13169       arg1 = CALL_EXPR_ARG (exp, 1);
   13170       op0 = expand_normal (arg0);
   13171       op1 = expand_normal (arg1);
   13172 
   13173       if (!REG_P (op0))
   13174 	op0 = copy_to_mode_reg (SImode, op0);
   13175 
   13176       op1 = force_reg (DImode, op1);
   13177 
   13178       if (TARGET_64BIT)
   13179 	{
   13180 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   13181 				     NULL, 1, OPTAB_DIRECT);
   13182 
   13183 	  icode = CODE_FOR_xsetbv_rex64;
   13184 
   13185 	  op2 = gen_lowpart (SImode, op2);
   13186 	  op1 = gen_lowpart (SImode, op1);
   13187 	  pat = GEN_FCN (icode) (op0, op1, op2);
   13188 	}
   13189       else
   13190 	{
   13191 	  icode = CODE_FOR_xsetbv;
   13192 
   13193 	  pat = GEN_FCN (icode) (op0, op1);
   13194 	}
   13195       if (pat)
   13196 	emit_insn (pat);
   13197       return 0;
   13198 
   13199     case IX86_BUILTIN_XSAVE:
   13200     case IX86_BUILTIN_XRSTOR:
   13201     case IX86_BUILTIN_XSAVE64:
   13202     case IX86_BUILTIN_XRSTOR64:
   13203     case IX86_BUILTIN_XSAVEOPT:
   13204     case IX86_BUILTIN_XSAVEOPT64:
   13205     case IX86_BUILTIN_XSAVES:
   13206     case IX86_BUILTIN_XRSTORS:
   13207     case IX86_BUILTIN_XSAVES64:
   13208     case IX86_BUILTIN_XRSTORS64:
   13209     case IX86_BUILTIN_XSAVEC:
   13210     case IX86_BUILTIN_XSAVEC64:
	/* XSAVE-family builtins: (void *area, unsigned long long mask).
	   The area becomes a BLKmode memory operand; the 64-bit mask is
	   split into EDX:EAX halves on 64-bit targets, exactly as for
	   XSETBV above.  */
   13211       arg0 = CALL_EXPR_ARG (exp, 0);
   13212       arg1 = CALL_EXPR_ARG (exp, 1);
   13213       op0 = expand_normal (arg0);
   13214       op1 = expand_normal (arg1);
   13215 
   13216       if (!address_operand (op0, VOIDmode))
   13217 	{
   13218 	  op0 = convert_memory_address (Pmode, op0);
   13219 	  op0 = copy_addr_to_reg (op0);
   13220 	}
   13221       op0 = gen_rtx_MEM (BLKmode, op0);
   13222 
   13223       op1 = force_reg (DImode, op1);
   13224 
   13225       if (TARGET_64BIT)
   13226 	{
   13227 	  op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
   13228 				     NULL, 1, OPTAB_DIRECT);
	  /* Select the three-operand _rex64 (or 64-bit-only) pattern.  */
   13229 	  switch (fcode)
   13230 	    {
   13231 	    case IX86_BUILTIN_XSAVE:
   13232 	      icode = CODE_FOR_xsave_rex64;
   13233 	      break;
   13234 	    case IX86_BUILTIN_XRSTOR:
   13235 	      icode = CODE_FOR_xrstor_rex64;
   13236 	      break;
   13237 	    case IX86_BUILTIN_XSAVE64:
   13238 	      icode = CODE_FOR_xsave64;
   13239 	      break;
   13240 	    case IX86_BUILTIN_XRSTOR64:
   13241 	      icode = CODE_FOR_xrstor64;
   13242 	      break;
   13243 	    case IX86_BUILTIN_XSAVEOPT:
   13244 	      icode = CODE_FOR_xsaveopt_rex64;
   13245 	      break;
   13246 	    case IX86_BUILTIN_XSAVEOPT64:
   13247 	      icode = CODE_FOR_xsaveopt64;
   13248 	      break;
   13249 	    case IX86_BUILTIN_XSAVES:
   13250 	      icode = CODE_FOR_xsaves_rex64;
   13251 	      break;
   13252 	    case IX86_BUILTIN_XRSTORS:
   13253 	      icode = CODE_FOR_xrstors_rex64;
   13254 	      break;
   13255 	    case IX86_BUILTIN_XSAVES64:
   13256 	      icode = CODE_FOR_xsaves64;
   13257 	      break;
   13258 	    case IX86_BUILTIN_XRSTORS64:
   13259 	      icode = CODE_FOR_xrstors64;
   13260 	      break;
   13261 	    case IX86_BUILTIN_XSAVEC:
   13262 	      icode = CODE_FOR_xsavec_rex64;
   13263 	      break;
   13264 	    case IX86_BUILTIN_XSAVEC64:
   13265 	      icode = CODE_FOR_xsavec64;
   13266 	      break;
   13267 	    default:
   13268 	      gcc_unreachable ();
   13269 	    }
   13270 
   13271 	  op2 = gen_lowpart (SImode, op2);
   13272 	  op1 = gen_lowpart (SImode, op1);
   13273 	  pat = GEN_FCN (icode) (op0, op1, op2);
   13274 	}
   13275       else
   13276 	{
	  /* 32-bit: two-operand patterns; the 64/S64/C64 variants are
	     not reachable here.  */
   13277 	  switch (fcode)
   13278 	    {
   13279 	    case IX86_BUILTIN_XSAVE:
   13280 	      icode = CODE_FOR_xsave;
   13281 	      break;
   13282 	    case IX86_BUILTIN_XRSTOR:
   13283 	      icode = CODE_FOR_xrstor;
   13284 	      break;
   13285 	    case IX86_BUILTIN_XSAVEOPT:
   13286 	      icode = CODE_FOR_xsaveopt;
   13287 	      break;
   13288 	    case IX86_BUILTIN_XSAVES:
   13289 	      icode = CODE_FOR_xsaves;
   13290 	      break;
   13291 	    case IX86_BUILTIN_XRSTORS:
   13292 	      icode = CODE_FOR_xrstors;
   13293 	      break;
   13294 	    case IX86_BUILTIN_XSAVEC:
   13295 	      icode = CODE_FOR_xsavec;
   13296 	      break;
   13297 	    default:
   13298 	      gcc_unreachable ();
   13299 	    }
   13300 	  pat = GEN_FCN (icode) (op0, op1);
   13301 	}
   13302 
   13303       if (pat)
   13304 	emit_insn (pat);
   13305       return 0;
   13306 
   13307     case IX86_BUILTIN_LDTILECFG:
   13308     case IX86_BUILTIN_STTILECFG:
	/* AMX tile-config load/store: single pointer argument, used as a
	   64-byte (XImode) memory operand.  */
   13309       arg0 = CALL_EXPR_ARG (exp, 0);
   13310       op0 = expand_normal (arg0);
   13311 
   13312       if (!address_operand (op0, VOIDmode))
   13313 	{
   13314 	  op0 = convert_memory_address (Pmode, op0);
   13315 	  op0 = copy_addr_to_reg (op0);
   13316 	}
   13317       op0 = gen_rtx_MEM (XImode, op0);
   13318       if (fcode == IX86_BUILTIN_LDTILECFG)
   13319 	icode = CODE_FOR_ldtilecfg;
   13320       else
   13321 	icode = CODE_FOR_sttilecfg;
   13322       pat = GEN_FCN (icode) (op0);
   13323       emit_insn (pat);
   13324       return 0;
   13325 
   13326     case IX86_BUILTIN_LLWPCB:
	/* LWP: set the lightweight-profiling control-block pointer.  */
   13327       arg0 = CALL_EXPR_ARG (exp, 0);
   13328       op0 = expand_normal (arg0);
   13329 
   13330       if (!register_operand (op0, Pmode))
   13331 	op0 = ix86_zero_extend_to_Pmode (op0);
   13332       emit_insn (gen_lwp_llwpcb (Pmode, op0));
   13333       return 0;
   13334 
   13335     case IX86_BUILTIN_SLWPCB:
	/* LWP: read back the control-block pointer.  */
   13336       if (!target
   13337 	  || !register_operand (target, Pmode))
   13338 	target = gen_reg_rtx (Pmode);
   13339       emit_insn (gen_lwp_slwpcb (Pmode, target));
   13340       return target;
   13341 
   13342     case IX86_BUILTIN_LWPVAL32:
   13343     case IX86_BUILTIN_LWPVAL64:
   13344     case IX86_BUILTIN_LWPINS32:
   13345     case IX86_BUILTIN_LWPINS64:
	/* LWPVAL/LWPINS (val, data2, flags-immediate).  The third argument
	   must fold to a compile-time constant; LWPINS returns the carry
	   flag as a success indicator, LWPVAL returns nothing.  */
   13346       mode = ((fcode == IX86_BUILTIN_LWPVAL32
   13347 	       || fcode == IX86_BUILTIN_LWPINS32)
   13348 	      ? SImode : DImode);
   13349 
   13350       if (fcode == IX86_BUILTIN_LWPVAL32
   13351 	  || fcode == IX86_BUILTIN_LWPVAL64)
   13352 	icode = code_for_lwp_lwpval (mode);
   13353       else
   13354 	icode = code_for_lwp_lwpins (mode);
   13355 
   13356       arg0 = CALL_EXPR_ARG (exp, 0);
   13357       arg1 = CALL_EXPR_ARG (exp, 1);
   13358       arg2 = CALL_EXPR_ARG (exp, 2);
   13359       op0 = expand_normal (arg0);
   13360       op1 = expand_normal (arg1);
   13361       op2 = expand_normal (arg2);
   13362       mode0 = insn_data[icode].operand[0].mode;
   13363 
   13364       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   13365 	op0 = copy_to_mode_reg (mode0, op0);
   13366       if (!insn_data[icode].operand[1].predicate (op1, SImode))
   13367 	op1 = copy_to_mode_reg (SImode, op1);
   13368 
   13369       if (!CONST_INT_P (op2))
   13370 	{
   13371 	  error ("the last argument must be a 32-bit immediate");
   13372 	  return const0_rtx;
   13373 	}
   13374 
   13375       emit_insn (GEN_FCN (icode) (op0, op1, op2));
   13376 
   13377       if (fcode == IX86_BUILTIN_LWPINS32
   13378 	  || fcode == IX86_BUILTIN_LWPINS64)
   13379 	{
   13380 	  if (target == 0
   13381 	      || !nonimmediate_operand (target, QImode))
   13382 	    target = gen_reg_rtx (QImode);
   13383 
	  /* Return CF == 0 as a QImode boolean.  */
   13384 	  pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   13385 			    const0_rtx);
   13386 	  emit_insn (gen_rtx_SET (target, pat));
   13387 
   13388 	  return target;
   13389 	}
   13390       else
   13391 	return 0;
   13392 
   13393     case IX86_BUILTIN_BEXTRI32:
   13394     case IX86_BUILTIN_BEXTRI64:
	/* TBM BEXTRI: bits [7:0] of the immediate are the start bit,
	   bits [15:8] the field length.  Degenerate selectors are folded
	   to a constant 0 at expand time; an over-long field is clamped
	   to the operand width.  */
   13395       mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
   13396 
   13397       arg0 = CALL_EXPR_ARG (exp, 0);
   13398       arg1 = CALL_EXPR_ARG (exp, 1);
   13399       op0 = expand_normal (arg0);
   13400       op1 = expand_normal (arg1);
   13401 
   13402       if (!CONST_INT_P (op1))
   13403 	{
   13404 	  error ("last argument must be an immediate");
   13405 	  return const0_rtx;
   13406 	}
   13407       else
   13408 	{
   13409 	  unsigned char lsb_index = UINTVAL (op1);
   13410 	  unsigned char length = UINTVAL (op1) >> 8;
   13411 
   13412 	  unsigned char bitsize = GET_MODE_BITSIZE (mode);
   13413 
   13414 	  icode = code_for_tbm_bextri (mode);
   13415 
   13416 	  mode1 = insn_data[icode].operand[1].mode;
   13417 	  if (!insn_data[icode].operand[1].predicate (op0, mode1))
   13418 	    op0 = copy_to_mode_reg (mode1, op0);
   13419 
   13420 	  mode0 = insn_data[icode].operand[0].mode;
   13421 	  if (target == 0
   13422 	      || !register_operand (target, mode0))
   13423 	    target = gen_reg_rtx (mode0);
   13424 
	  /* Empty field or start past the operand width: result is 0.  */
   13425 	  if (length == 0 || lsb_index >= bitsize)
   13426 	    {
   13427 	      emit_move_insn (target, const0_rtx);
   13428 	      return target;
   13429 	    }
   13430 
   13431 	  if (length + lsb_index > bitsize)
   13432 	    length = bitsize - lsb_index;
   13433 
   13434 	  op1 = GEN_INT (length);
   13435 	  op2 = GEN_INT (lsb_index);
   13436 
   13437 	  emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
   13438 	  return target;
   13439 	}
   13440 
   13440 
   13441     case IX86_BUILTIN_RDRAND16_STEP:
   13442       mode = HImode;
   13443       goto rdrand_step;
   13444 
   13445     case IX86_BUILTIN_RDRAND32_STEP:
   13446       mode = SImode;
   13447       goto rdrand_step;
   13448 
   13449     case IX86_BUILTIN_RDRAND64_STEP:
   13450       mode = DImode;
   13451 
   13452 rdrand_step:
	/* _rdrandNN_step (ptr): store the random value to *ptr and return
	   a nonzero status on success.  The status is produced without a
	   setcc: a conditional move on CF selects between the (widened)
	   destination value and the constant 1 — NOTE(review): this
	   relies on RDRAND leaving zero in the destination when CF is
	   clear, per the ISA; not visible from this code alone.  */
   13453       arg0 = CALL_EXPR_ARG (exp, 0);
   13454       op1 = expand_normal (arg0);
   13455       if (!address_operand (op1, VOIDmode))
   13456 	{
   13457 	  op1 = convert_memory_address (Pmode, op1);
   13458 	  op1 = copy_addr_to_reg (op1);
   13459 	}
   13460 
   13461       op0 = gen_reg_rtx (mode);
   13462       emit_insn (gen_rdrand (mode, op0));
   13463 
   13464       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
   13465 
   13466       op1 = force_reg (SImode, const1_rtx);
   13467 
   13468       /* Emit SImode conditional move.  */
   13469       if (mode == HImode)
   13470 	{
   13471 	  if (TARGET_ZERO_EXTEND_WITH_AND
   13472 	      && optimize_function_for_speed_p (cfun)
   13473 	    {
   13474 	      op2 = force_reg (SImode, const0_rtx);
   13475 
   13476 	      emit_insn (gen_movstricthi
   13477 			 (gen_lowpart (HImode, op2), op0));
   13478 	    }
   13479 	  else
   13480 	    {
   13481 	      op2 = gen_reg_rtx (SImode);
   13482 
   13483 	      emit_insn (gen_zero_extendhisi2 (op2, op0));
   13484 	    }
   13485 	}
   13486       else if (mode == SImode)
   13487 	op2 = op0;
   13488       else
   13489 	op2 = gen_rtx_SUBREG (SImode, op0, 0);
   13490 
   13491       if (target == 0
   13492 	  || !register_operand (target, SImode))
   13493 	target = gen_reg_rtx (SImode);
   13494 
   13495       pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
   13496 			 const0_rtx);
   13497       emit_insn (gen_rtx_SET (target,
   13498 			      gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
   13499       return target;
   13500 
   13501     case IX86_BUILTIN_RDSEED16_STEP:
   13502       mode = HImode;
   13503       goto rdseed_step;
   13504 
   13505     case IX86_BUILTIN_RDSEED32_STEP:
   13506       mode = SImode;
   13507       goto rdseed_step;
   13508 
   13509     case IX86_BUILTIN_RDSEED64_STEP:
   13510       mode = DImode;
   13511 
   13512 rdseed_step:
	/* _rdseedNN_step (ptr): store the value to *ptr, then return the
	   carry flag (CF set on success) as a zero-extended 0/1.  */
   13513       arg0 = CALL_EXPR_ARG (exp, 0);
   13514       op1 = expand_normal (arg0);
   13515       if (!address_operand (op1, VOIDmode))
   13516 	{
   13517 	  op1 = convert_memory_address (Pmode, op1);
   13518 	  op1 = copy_addr_to_reg (op1);
   13519 	}
   13520 
   13521       op0 = gen_reg_rtx (mode);
   13522       emit_insn (gen_rdseed (mode, op0));
   13523 
   13524       emit_move_insn (gen_rtx_MEM (mode, op1), op0);
   13525 
   13526       op2 = gen_reg_rtx (QImode);
   13527 
   13528       pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
   13529                          const0_rtx);
   13530       emit_insn (gen_rtx_SET (op2, pat));
   13531 
   13532       if (target == 0
   13533 	  || !register_operand (target, SImode))
   13534         target = gen_reg_rtx (SImode);
   13535 
   13536       emit_insn (gen_zero_extendqisi2 (target, op2));
   13537       return target;
   13538 
   13539     case IX86_BUILTIN_SBB32:
   13540       icode = CODE_FOR_subborrowsi;
   13541       icode2 = CODE_FOR_subborrowsi_0;
   13542       mode0 = SImode;
   13543       mode1 = DImode;
   13544       mode2 = CCmode;
   13545       goto handlecarry;
   13546 
   13547     case IX86_BUILTIN_SBB64:
   13548       icode = CODE_FOR_subborrowdi;
   13549       icode2 = CODE_FOR_subborrowdi_0;
   13550       mode0 = DImode;
   13551       mode1 = TImode;
   13552       mode2 = CCmode;
   13553       goto handlecarry;
   13554 
   13555     case IX86_BUILTIN_ADDCARRYX32:
   13556       icode = CODE_FOR_addcarrysi;
   13557       icode2 = CODE_FOR_addcarrysi_0;
   13558       mode0 = SImode;
   13559       mode1 = DImode;
   13560       mode2 = CCCmode;
   13561       goto handlecarry;
   13562 
   13563     case IX86_BUILTIN_ADDCARRYX64:
   13564       icode = CODE_FOR_addcarrydi;
   13565       icode2 = CODE_FOR_addcarrydi_0;
   13566       mode0 = DImode;
   13567       mode1 = TImode;
   13568       mode2 = CCCmode;
   13569 
   13570     handlecarry:
	/* Shared expansion for _addcarryx/_subborrow: icode is the
	   carry-consuming pattern, icode2 the plain add/sub used when the
	   incoming carry is a literal 0.  Returns the carry-out byte and
	   stores the sum through *sum_out.  */
   13571       arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
   13572       arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
   13573       arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
   13574       arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */
   13575 
   13576       op1 = expand_normal (arg0);
   13577       if (!integer_zerop (arg0))
   13578 	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
   13579 
   13580       op2 = expand_normal (arg1);
   13581       if (!register_operand (op2, mode0))
   13582 	op2 = copy_to_mode_reg (mode0, op2);
   13583 
   13584       op3 = expand_normal (arg2);
   13585       if (!register_operand (op3, mode0))
   13586 	op3 = copy_to_mode_reg (mode0, op3);
   13587 
   13588       op4 = expand_normal (arg3);
   13589       if (!address_operand (op4, VOIDmode))
   13590 	{
   13591 	  op4 = convert_memory_address (Pmode, op4);
   13592 	  op4 = copy_addr_to_reg (op4);
   13593 	}
   13594 
   13595       op0 = gen_reg_rtx (mode0);
   13596       if (integer_zerop (arg0))
   13597 	{
   13598 	  /* If arg0 is 0, optimize right away into add or sub
   13599 	     instruction that sets CCCmode flags.  */
   13600 	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
   13601 	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
   13602 	}
   13603       else
   13604 	{
   13605 	  /* Generate CF from input operand.  */
   13606 	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));
   13607 
   13608 	  /* Generate instruction that consumes CF.  */
   13609 	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
   13610 	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
   13611 	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
   13612 	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
   13613 	}
   13614 
   13615       /* Return current CF value.  */
   13616       if (target == 0)
   13617         target = gen_reg_rtx (QImode);
   13618 
   13619       pat = gen_rtx_LTU (QImode, op1, const0_rtx);
   13620       emit_insn (gen_rtx_SET (target, pat));
   13621 
   13622       /* Store the result.  */
   13623       emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
   13624 
   13625       return target;
   13626 
   13627     case IX86_BUILTIN_READ_FLAGS:
	/* __builtin_ia32_readeflags: pushf, then pop into the target.  */
   13628       if (ignore)
   13629 	return const0_rtx;
   13630 
   13631       emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));
   13632 
   13633       if (optimize
   13634 	  || target == NULL_RTX
   13635 	  || !nonimmediate_operand (target, word_mode)
   13636 	  || GET_MODE (target) != word_mode)
   13637 	target = gen_reg_rtx (word_mode);
   13638 
   13639       emit_insn (gen_pop (target));
   13640       return target;
   13641 
   13642     case IX86_BUILTIN_WRITE_FLAGS:
	/* __builtin_ia32_writeeflags: push the value, pop into EFLAGS.  */
   13643 
   13644       arg0 = CALL_EXPR_ARG (exp, 0);
   13645       op0 = expand_normal (arg0);
   13646       if (!general_no_elim_operand (op0, word_mode))
   13647 	op0 = copy_to_mode_reg (word_mode, op0);
   13648 
   13649       emit_insn (gen_push (op0));
   13650       emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
   13651       return 0;
   13652 
   13653     case IX86_BUILTIN_KTESTC8:
   13654       icode = CODE_FOR_ktestqi;
   13655       mode3 = CCCmode;
   13656       goto kortest;
   13657 
   13658     case IX86_BUILTIN_KTESTZ8:
   13659       icode = CODE_FOR_ktestqi;
   13660       mode3 = CCZmode;
   13661       goto kortest;
   13662 
   13663     case IX86_BUILTIN_KTESTC16:
   13664       icode = CODE_FOR_ktesthi;
   13665       mode3 = CCCmode;
   13666       goto kortest;
   13667 
   13668     case IX86_BUILTIN_KTESTZ16:
   13669       icode = CODE_FOR_ktesthi;
   13670       mode3 = CCZmode;
   13671       goto kortest;
   13672 
   13673     case IX86_BUILTIN_KTESTC32:
   13674       icode = CODE_FOR_ktestsi;
   13675       mode3 = CCCmode;
   13676       goto kortest;
   13677 
   13678     case IX86_BUILTIN_KTESTZ32:
   13679       icode = CODE_FOR_ktestsi;
   13680       mode3 = CCZmode;
   13681       goto kortest;
   13682 
   13683     case IX86_BUILTIN_KTESTC64:
   13684       icode = CODE_FOR_ktestdi;
   13685       mode3 = CCCmode;
   13686       goto kortest;
   13687 
   13688     case IX86_BUILTIN_KTESTZ64:
   13689       icode = CODE_FOR_ktestdi;
   13690       mode3 = CCZmode;
   13691       goto kortest;
   13692 
   13693     case IX86_BUILTIN_KORTESTC8:
   13694       icode = CODE_FOR_kortestqi;
   13695       mode3 = CCCmode;
   13696       goto kortest;
   13697 
   13698     case IX86_BUILTIN_KORTESTZ8:
   13699       icode = CODE_FOR_kortestqi;
   13700       mode3 = CCZmode;
   13701       goto kortest;
   13702 
   13703     case IX86_BUILTIN_KORTESTC16:
   13704       icode = CODE_FOR_kortesthi;
   13705       mode3 = CCCmode;
   13706       goto kortest;
   13707 
   13708     case IX86_BUILTIN_KORTESTZ16:
   13709       icode = CODE_FOR_kortesthi;
   13710       mode3 = CCZmode;
   13711       goto kortest;
   13712 
   13713     case IX86_BUILTIN_KORTESTC32:
   13714       icode = CODE_FOR_kortestsi;
   13715       mode3 = CCCmode;
   13716       goto kortest;
   13717 
   13718     case IX86_BUILTIN_KORTESTZ32:
   13719       icode = CODE_FOR_kortestsi;
   13720       mode3 = CCZmode;
   13721       goto kortest;
   13722 
   13723     case IX86_BUILTIN_KORTESTC64:
   13724       icode = CODE_FOR_kortestdi;
   13725       mode3 = CCCmode;
   13726       goto kortest;
   13727 
   13728     case IX86_BUILTIN_KORTESTZ64:
   13729       icode = CODE_FOR_kortestdi;
   13730       mode3 = CCZmode;
   13731 
   13732     kortest:
	/* Shared tail for all KTEST/KORTEST variants: icode selects the
	   width/insn, mode3 selects which flag (CCC for the *C builtins,
	   CCZ for the *Z ones) is returned via setcc.  */
   13733       arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
   13734       arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
   13735       op0 = expand_normal (arg0);
   13736       op1 = expand_normal (arg1);
   13737 
   13738       mode0 = insn_data[icode].operand[0].mode;
   13739       mode1 = insn_data[icode].operand[1].mode;
   13740 
	/* Narrow each operand to the insn's mask mode via lowpart.  */
   13741       if (GET_MODE (op0) != VOIDmode)
   13742 	op0 = force_reg (GET_MODE (op0), op0);
   13743 
   13744       op0 = gen_lowpart (mode0, op0);
   13745 
   13746       if (!insn_data[icode].operand[0].predicate (op0, mode0))
   13747 	op0 = copy_to_mode_reg (mode0, op0);
   13748 
   13749       if (GET_MODE (op1) != VOIDmode)
   13750 	op1 = force_reg (GET_MODE (op1), op1);
   13751 
   13752       op1 = gen_lowpart (mode1, op1);
   13753 
   13754       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   13755 	op1 = copy_to_mode_reg (mode1, op1);
   13756 
   13757       target = gen_reg_rtx (QImode);
   13758 
   13759       /* Emit kortest.  */
   13760       emit_insn (GEN_FCN (icode) (op0, op1))
   13761       /* And use setcc to return result from flags.  */
   13762       ix86_expand_setcc (target, EQ,
   13763 			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
   13764       return target;
   13765 
      /* Gather, scatter and gather/scatter-prefetch builtins.  Each case
	 only records the insn code for the builtin's element type, index
	 width and vector length, then branches to the shared expander
	 (gather_gen / scatter_gen / vec_prefetch_gen).  The ...ALT...
	 variants map onto the same insns as their non-ALT counterparts;
	 the shared expanders fix up their mismatched index/source operand
	 widths.  */
    13766     case IX86_BUILTIN_GATHERSIV2DF:
    13767       icode = CODE_FOR_avx2_gathersiv2df;
    13768       goto gather_gen;
    13769     case IX86_BUILTIN_GATHERSIV4DF:
    13770       icode = CODE_FOR_avx2_gathersiv4df;
    13771       goto gather_gen;
    13772     case IX86_BUILTIN_GATHERDIV2DF:
    13773       icode = CODE_FOR_avx2_gatherdiv2df;
    13774       goto gather_gen;
    13775     case IX86_BUILTIN_GATHERDIV4DF:
    13776       icode = CODE_FOR_avx2_gatherdiv4df;
    13777       goto gather_gen;
    13778     case IX86_BUILTIN_GATHERSIV4SF:
    13779       icode = CODE_FOR_avx2_gathersiv4sf;
    13780       goto gather_gen;
    13781     case IX86_BUILTIN_GATHERSIV8SF:
    13782       icode = CODE_FOR_avx2_gathersiv8sf;
    13783       goto gather_gen;
    13784     case IX86_BUILTIN_GATHERDIV4SF:
    13785       icode = CODE_FOR_avx2_gatherdiv4sf;
    13786       goto gather_gen;
    13787     case IX86_BUILTIN_GATHERDIV8SF:
    13788       icode = CODE_FOR_avx2_gatherdiv8sf;
    13789       goto gather_gen;
    13790     case IX86_BUILTIN_GATHERSIV2DI:
    13791       icode = CODE_FOR_avx2_gathersiv2di;
    13792       goto gather_gen;
    13793     case IX86_BUILTIN_GATHERSIV4DI:
    13794       icode = CODE_FOR_avx2_gathersiv4di;
    13795       goto gather_gen;
    13796     case IX86_BUILTIN_GATHERDIV2DI:
    13797       icode = CODE_FOR_avx2_gatherdiv2di;
    13798       goto gather_gen;
    13799     case IX86_BUILTIN_GATHERDIV4DI:
    13800       icode = CODE_FOR_avx2_gatherdiv4di;
    13801       goto gather_gen;
    13802     case IX86_BUILTIN_GATHERSIV4SI:
    13803       icode = CODE_FOR_avx2_gathersiv4si;
    13804       goto gather_gen;
    13805     case IX86_BUILTIN_GATHERSIV8SI:
    13806       icode = CODE_FOR_avx2_gathersiv8si;
    13807       goto gather_gen;
    13808     case IX86_BUILTIN_GATHERDIV4SI:
    13809       icode = CODE_FOR_avx2_gatherdiv4si;
    13810       goto gather_gen;
    13811     case IX86_BUILTIN_GATHERDIV8SI:
    13812       icode = CODE_FOR_avx2_gatherdiv8si;
    13813       goto gather_gen;
    13814     case IX86_BUILTIN_GATHERALTSIV4DF:
    13815       icode = CODE_FOR_avx2_gathersiv4df;
    13816       goto gather_gen;
    13817     case IX86_BUILTIN_GATHERALTDIV8SF:
    13818       icode = CODE_FOR_avx2_gatherdiv8sf;
    13819       goto gather_gen;
    13820     case IX86_BUILTIN_GATHERALTSIV4DI:
    13821       icode = CODE_FOR_avx2_gathersiv4di;
    13822       goto gather_gen;
    13823     case IX86_BUILTIN_GATHERALTDIV8SI:
    13824       icode = CODE_FOR_avx2_gatherdiv8si;
    13825       goto gather_gen;
    13826     case IX86_BUILTIN_GATHER3SIV16SF:
    13827       icode = CODE_FOR_avx512f_gathersiv16sf;
    13828       goto gather_gen;
    13829     case IX86_BUILTIN_GATHER3SIV8DF:
    13830       icode = CODE_FOR_avx512f_gathersiv8df;
    13831       goto gather_gen;
    13832     case IX86_BUILTIN_GATHER3DIV16SF:
    13833       icode = CODE_FOR_avx512f_gatherdiv16sf;
    13834       goto gather_gen;
    13835     case IX86_BUILTIN_GATHER3DIV8DF:
    13836       icode = CODE_FOR_avx512f_gatherdiv8df;
    13837       goto gather_gen;
    13838     case IX86_BUILTIN_GATHER3SIV16SI:
    13839       icode = CODE_FOR_avx512f_gathersiv16si;
    13840       goto gather_gen;
    13841     case IX86_BUILTIN_GATHER3SIV8DI:
    13842       icode = CODE_FOR_avx512f_gathersiv8di;
    13843       goto gather_gen;
    13844     case IX86_BUILTIN_GATHER3DIV16SI:
    13845       icode = CODE_FOR_avx512f_gatherdiv16si;
    13846       goto gather_gen;
    13847     case IX86_BUILTIN_GATHER3DIV8DI:
    13848       icode = CODE_FOR_avx512f_gatherdiv8di;
    13849       goto gather_gen;
    13850     case IX86_BUILTIN_GATHER3ALTSIV8DF:
    13851       icode = CODE_FOR_avx512f_gathersiv8df;
    13852       goto gather_gen;
    13853     case IX86_BUILTIN_GATHER3ALTDIV16SF:
    13854       icode = CODE_FOR_avx512f_gatherdiv16sf;
    13855       goto gather_gen;
    13856     case IX86_BUILTIN_GATHER3ALTSIV8DI:
    13857       icode = CODE_FOR_avx512f_gathersiv8di;
    13858       goto gather_gen;
    13859     case IX86_BUILTIN_GATHER3ALTDIV16SI:
    13860       icode = CODE_FOR_avx512f_gatherdiv16si;
    13861       goto gather_gen;
    13862     case IX86_BUILTIN_GATHER3SIV2DF:
    13863       icode = CODE_FOR_avx512vl_gathersiv2df;
    13864       goto gather_gen;
    13865     case IX86_BUILTIN_GATHER3SIV4DF:
    13866       icode = CODE_FOR_avx512vl_gathersiv4df;
    13867       goto gather_gen;
    13868     case IX86_BUILTIN_GATHER3DIV2DF:
    13869       icode = CODE_FOR_avx512vl_gatherdiv2df;
    13870       goto gather_gen;
    13871     case IX86_BUILTIN_GATHER3DIV4DF:
    13872       icode = CODE_FOR_avx512vl_gatherdiv4df;
    13873       goto gather_gen;
    13874     case IX86_BUILTIN_GATHER3SIV4SF:
    13875       icode = CODE_FOR_avx512vl_gathersiv4sf;
    13876       goto gather_gen;
    13877     case IX86_BUILTIN_GATHER3SIV8SF:
    13878       icode = CODE_FOR_avx512vl_gathersiv8sf;
    13879       goto gather_gen;
    13880     case IX86_BUILTIN_GATHER3DIV4SF:
    13881       icode = CODE_FOR_avx512vl_gatherdiv4sf;
    13882       goto gather_gen;
    13883     case IX86_BUILTIN_GATHER3DIV8SF:
    13884       icode = CODE_FOR_avx512vl_gatherdiv8sf;
    13885       goto gather_gen;
    13886     case IX86_BUILTIN_GATHER3SIV2DI:
    13887       icode = CODE_FOR_avx512vl_gathersiv2di;
    13888       goto gather_gen;
    13889     case IX86_BUILTIN_GATHER3SIV4DI:
    13890       icode = CODE_FOR_avx512vl_gathersiv4di;
    13891       goto gather_gen;
    13892     case IX86_BUILTIN_GATHER3DIV2DI:
    13893       icode = CODE_FOR_avx512vl_gatherdiv2di;
    13894       goto gather_gen;
    13895     case IX86_BUILTIN_GATHER3DIV4DI:
    13896       icode = CODE_FOR_avx512vl_gatherdiv4di;
    13897       goto gather_gen;
    13898     case IX86_BUILTIN_GATHER3SIV4SI:
    13899       icode = CODE_FOR_avx512vl_gathersiv4si;
    13900       goto gather_gen;
    13901     case IX86_BUILTIN_GATHER3SIV8SI:
    13902       icode = CODE_FOR_avx512vl_gathersiv8si;
    13903       goto gather_gen;
    13904     case IX86_BUILTIN_GATHER3DIV4SI:
    13905       icode = CODE_FOR_avx512vl_gatherdiv4si;
    13906       goto gather_gen;
    13907     case IX86_BUILTIN_GATHER3DIV8SI:
    13908       icode = CODE_FOR_avx512vl_gatherdiv8si;
    13909       goto gather_gen;
    13910     case IX86_BUILTIN_GATHER3ALTSIV4DF:
    13911       icode = CODE_FOR_avx512vl_gathersiv4df;
    13912       goto gather_gen;
    13913     case IX86_BUILTIN_GATHER3ALTDIV8SF:
    13914       icode = CODE_FOR_avx512vl_gatherdiv8sf;
    13915       goto gather_gen;
    13916     case IX86_BUILTIN_GATHER3ALTSIV4DI:
    13917       icode = CODE_FOR_avx512vl_gathersiv4di;
    13918       goto gather_gen;
    13919     case IX86_BUILTIN_GATHER3ALTDIV8SI:
    13920       icode = CODE_FOR_avx512vl_gatherdiv8si;
    13921       goto gather_gen;
    13922     case IX86_BUILTIN_SCATTERSIV16SF:
    13923       icode = CODE_FOR_avx512f_scattersiv16sf;
    13924       goto scatter_gen;
    13925     case IX86_BUILTIN_SCATTERSIV8DF:
    13926       icode = CODE_FOR_avx512f_scattersiv8df;
    13927       goto scatter_gen;
    13928     case IX86_BUILTIN_SCATTERDIV16SF:
    13929       icode = CODE_FOR_avx512f_scatterdiv16sf;
    13930       goto scatter_gen;
    13931     case IX86_BUILTIN_SCATTERDIV8DF:
    13932       icode = CODE_FOR_avx512f_scatterdiv8df;
    13933       goto scatter_gen;
    13934     case IX86_BUILTIN_SCATTERSIV16SI:
    13935       icode = CODE_FOR_avx512f_scattersiv16si;
    13936       goto scatter_gen;
    13937     case IX86_BUILTIN_SCATTERSIV8DI:
    13938       icode = CODE_FOR_avx512f_scattersiv8di;
    13939       goto scatter_gen;
    13940     case IX86_BUILTIN_SCATTERDIV16SI:
    13941       icode = CODE_FOR_avx512f_scatterdiv16si;
    13942       goto scatter_gen;
    13943     case IX86_BUILTIN_SCATTERDIV8DI:
    13944       icode = CODE_FOR_avx512f_scatterdiv8di;
    13945       goto scatter_gen;
    13946     case IX86_BUILTIN_SCATTERSIV8SF:
    13947       icode = CODE_FOR_avx512vl_scattersiv8sf;
    13948       goto scatter_gen;
    13949     case IX86_BUILTIN_SCATTERSIV4SF:
    13950       icode = CODE_FOR_avx512vl_scattersiv4sf;
    13951       goto scatter_gen;
    13952     case IX86_BUILTIN_SCATTERSIV4DF:
    13953       icode = CODE_FOR_avx512vl_scattersiv4df;
    13954       goto scatter_gen;
    13955     case IX86_BUILTIN_SCATTERSIV2DF:
    13956       icode = CODE_FOR_avx512vl_scattersiv2df;
    13957       goto scatter_gen;
    13958     case IX86_BUILTIN_SCATTERDIV8SF:
    13959       icode = CODE_FOR_avx512vl_scatterdiv8sf;
    13960       goto scatter_gen;
    13961     case IX86_BUILTIN_SCATTERDIV4SF:
    13962       icode = CODE_FOR_avx512vl_scatterdiv4sf;
    13963       goto scatter_gen;
    13964     case IX86_BUILTIN_SCATTERDIV4DF:
    13965       icode = CODE_FOR_avx512vl_scatterdiv4df;
    13966       goto scatter_gen;
    13967     case IX86_BUILTIN_SCATTERDIV2DF:
    13968       icode = CODE_FOR_avx512vl_scatterdiv2df;
    13969       goto scatter_gen;
    13970     case IX86_BUILTIN_SCATTERSIV8SI:
    13971       icode = CODE_FOR_avx512vl_scattersiv8si;
    13972       goto scatter_gen;
    13973     case IX86_BUILTIN_SCATTERSIV4SI:
    13974       icode = CODE_FOR_avx512vl_scattersiv4si;
    13975       goto scatter_gen;
    13976     case IX86_BUILTIN_SCATTERSIV4DI:
    13977       icode = CODE_FOR_avx512vl_scattersiv4di;
    13978       goto scatter_gen;
    13979     case IX86_BUILTIN_SCATTERSIV2DI:
    13980       icode = CODE_FOR_avx512vl_scattersiv2di;
    13981       goto scatter_gen;
    13982     case IX86_BUILTIN_SCATTERDIV8SI:
    13983       icode = CODE_FOR_avx512vl_scatterdiv8si;
    13984       goto scatter_gen;
    13985     case IX86_BUILTIN_SCATTERDIV4SI:
    13986       icode = CODE_FOR_avx512vl_scatterdiv4si;
    13987       goto scatter_gen;
    13988     case IX86_BUILTIN_SCATTERDIV4DI:
    13989       icode = CODE_FOR_avx512vl_scatterdiv4di;
    13990       goto scatter_gen;
    13991     case IX86_BUILTIN_SCATTERDIV2DI:
    13992       icode = CODE_FOR_avx512vl_scatterdiv2di;
    13993       goto scatter_gen;
      /* NOTE(review): this prefetch case sits in the middle of the scatter
	 cases rather than with the other GATHERPF/SCATTERPF cases below;
	 harmless, but it could be moved there for readability.  */
    13994     case IX86_BUILTIN_GATHERPFDPD:
    13995       icode = CODE_FOR_avx512pf_gatherpfv8sidf;
    13996       goto vec_prefetch_gen;
    13997     case IX86_BUILTIN_SCATTERALTSIV8DF:
    13998       icode = CODE_FOR_avx512f_scattersiv8df;
    13999       goto scatter_gen;
    14000     case IX86_BUILTIN_SCATTERALTDIV16SF:
    14001       icode = CODE_FOR_avx512f_scatterdiv16sf;
    14002       goto scatter_gen;
    14003     case IX86_BUILTIN_SCATTERALTSIV8DI:
    14004       icode = CODE_FOR_avx512f_scattersiv8di;
    14005       goto scatter_gen;
    14006     case IX86_BUILTIN_SCATTERALTDIV16SI:
    14007       icode = CODE_FOR_avx512f_scatterdiv16si;
    14008       goto scatter_gen;
    14009     case IX86_BUILTIN_SCATTERALTSIV4DF:
    14010       icode = CODE_FOR_avx512vl_scattersiv4df;
    14011       goto scatter_gen;
    14012     case IX86_BUILTIN_SCATTERALTDIV8SF:
    14013       icode = CODE_FOR_avx512vl_scatterdiv8sf;
    14014       goto scatter_gen;
    14015     case IX86_BUILTIN_SCATTERALTSIV4DI:
    14016       icode = CODE_FOR_avx512vl_scattersiv4di;
    14017       goto scatter_gen;
    14018     case IX86_BUILTIN_SCATTERALTDIV8SI:
    14019       icode = CODE_FOR_avx512vl_scatterdiv8si;
    14020       goto scatter_gen;
    14021     case IX86_BUILTIN_SCATTERALTSIV2DF:
    14022       icode = CODE_FOR_avx512vl_scattersiv2df;
    14023       goto scatter_gen;
    14024     case IX86_BUILTIN_SCATTERALTDIV4SF:
    14025       icode = CODE_FOR_avx512vl_scatterdiv4sf;
    14026       goto scatter_gen;
    14027     case IX86_BUILTIN_SCATTERALTSIV2DI:
    14028       icode = CODE_FOR_avx512vl_scattersiv2di;
    14029       goto scatter_gen;
    14030     case IX86_BUILTIN_SCATTERALTDIV4SI:
    14031       icode = CODE_FOR_avx512vl_scatterdiv4si;
    14032       goto scatter_gen;
    14033     case IX86_BUILTIN_GATHERPFDPS:
    14034       icode = CODE_FOR_avx512pf_gatherpfv16sisf;
    14035       goto vec_prefetch_gen;
    14036     case IX86_BUILTIN_GATHERPFQPD:
    14037       icode = CODE_FOR_avx512pf_gatherpfv8didf;
    14038       goto vec_prefetch_gen;
    14039     case IX86_BUILTIN_GATHERPFQPS:
    14040       icode = CODE_FOR_avx512pf_gatherpfv8disf;
    14041       goto vec_prefetch_gen;
    14042     case IX86_BUILTIN_SCATTERPFDPD:
    14043       icode = CODE_FOR_avx512pf_scatterpfv8sidf;
    14044       goto vec_prefetch_gen;
    14045     case IX86_BUILTIN_SCATTERPFDPS:
    14046       icode = CODE_FOR_avx512pf_scatterpfv16sisf;
    14047       goto vec_prefetch_gen;
    14048     case IX86_BUILTIN_SCATTERPFQPD:
    14049       icode = CODE_FOR_avx512pf_scatterpfv8didf;
    14050       goto vec_prefetch_gen;
    14051     case IX86_BUILTIN_SCATTERPFQPS:
    14052       icode = CODE_FOR_avx512pf_scatterpfv8disf;
    14053       goto vec_prefetch_gen;
   14054 
    14055     gather_gen:
      /* Shared expander for all gather builtins.  Call arguments are
	 (src/merge vector, base pointer, index vector, mask, scale); the
	 insn operands are (dst, src, base, index, mask, scale), hence the
	 arg/operand index shift below.  Returns the gathered vector.  */
    14056       rtx half;
    14057       rtx (*gen) (rtx, rtx);
    14058 
    14059       arg0 = CALL_EXPR_ARG (exp, 0);
    14060       arg1 = CALL_EXPR_ARG (exp, 1);
    14061       arg2 = CALL_EXPR_ARG (exp, 2);
    14062       arg3 = CALL_EXPR_ARG (exp, 3);
    14063       arg4 = CALL_EXPR_ARG (exp, 4);
    14064       op0 = expand_normal (arg0);
    14065       op1 = expand_normal (arg1);
    14066       op2 = expand_normal (arg2);
    14067       op3 = expand_normal (arg3);
    14068       op4 = expand_normal (arg4);
    14069       /* Note the arg order is different from the operand order.  */
    14070       mode0 = insn_data[icode].operand[1].mode;
    14071       mode2 = insn_data[icode].operand[3].mode;
    14072       mode3 = insn_data[icode].operand[4].mode;
    14073       mode4 = insn_data[icode].operand[5].mode;
    14074 
      /* Expand into TARGET when it is a usable destination, otherwise
	 into a fresh pseudo of the insn's destination mode.  */
    14075       if (target == NULL_RTX
    14076 	  || GET_MODE (target) != insn_data[icode].operand[0].mode
    14077 	  || !insn_data[icode].operand[0].predicate (target,
    14078 						     GET_MODE (target)))
    14079 	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode)
    14080       else
    14081 	subtarget = target;
    14082 
      /* The ...ALT... variants pass an index vector (or source/mask) with
	 twice as many elements as the underlying insn uses; extract the
	 low half of the over-wide operand(s) before emitting.  */
    14083       switch (fcode)
    14084 	{
    14085 	case IX86_BUILTIN_GATHER3ALTSIV8DF:
    14086 	case IX86_BUILTIN_GATHER3ALTSIV8DI:
    14087 	  half = gen_reg_rtx (V8SImode);
    14088 	  if (!nonimmediate_operand (op2, V16SImode))
    14089 	    op2 = copy_to_mode_reg (V16SImode, op2);
    14090 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
    14091 	  op2 = half;
    14092 	  break;
    14093 	case IX86_BUILTIN_GATHER3ALTSIV4DF:
    14094 	case IX86_BUILTIN_GATHER3ALTSIV4DI:
    14095 	case IX86_BUILTIN_GATHERALTSIV4DF:
    14096 	case IX86_BUILTIN_GATHERALTSIV4DI:
    14097 	  half = gen_reg_rtx (V4SImode);
    14098 	  if (!nonimmediate_operand (op2, V8SImode))
    14099 	    op2 = copy_to_mode_reg (V8SImode, op2);
    14100 	  emit_insn (gen_vec_extract_lo_v8si (half, op2))
    14101 	  op2 = half;
    14102 	  break;
    14103 	case IX86_BUILTIN_GATHER3ALTDIV16SF:
    14104 	case IX86_BUILTIN_GATHER3ALTDIV16SI:
    14105 	  half = gen_reg_rtx (mode0);
    14106 	  if (mode0 == V8SFmode)
    14107 	    gen = gen_vec_extract_lo_v16sf;
    14108 	  else
    14109 	    gen = gen_vec_extract_lo_v16si;
    14110 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
    14111 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
    14112 	  emit_insn (gen (half, op0));
    14113 	  op0 = half;
      /* Narrow the 16-bit mask to the 8-element QImode mask the insn
	 expects.  */
    14114 	  op3 = lowpart_subreg (QImode, op3, HImode);
    14115 	  break;
    14116 	case IX86_BUILTIN_GATHER3ALTDIV8SF:
    14117 	case IX86_BUILTIN_GATHER3ALTDIV8SI:
    14118 	case IX86_BUILTIN_GATHERALTDIV8SF:
    14119 	case IX86_BUILTIN_GATHERALTDIV8SI:
    14120 	  half = gen_reg_rtx (mode0);
    14121 	  if (mode0 == V4SFmode)
    14122 	    gen = gen_vec_extract_lo_v8sf;
    14123 	  else
    14124 	    gen = gen_vec_extract_lo_v8si;
    14125 	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
    14126 	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
    14127 	  emit_insn (gen (half, op0));
    14128 	  op0 = half;
      /* A vector mask (AVX2 style) must be halved like the source; a
	 scalar mask is handled by the generic code below.  */
    14129 	  if (VECTOR_MODE_P (GET_MODE (op3)))
    14130 	    {
    14131 	      half = gen_reg_rtx (mode0);
    14132 	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
    14133 		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
    14134 	      emit_insn (gen (half, op3));
    14135 	      op3 = half;
    14136 	    }
    14137 	  break;
    14138 	default:
    14139 	  break;
    14140 	}
    14141 
    14142       /* Force memory operand only with base register here.  But we
    14143 	 don't want to do it on memory operand for other builtin
    14144 	 functions.  */
    14145       op1 = ix86_zero_extend_to_Pmode (op1);
    14146 
    14147       if (!insn_data[icode].operand[1].predicate (op0, mode0))
    14148 	op0 = copy_to_mode_reg (mode0, op0);
    14149       if (!insn_data[icode].operand[2].predicate (op1, Pmode))
    14150 	op1 = copy_to_mode_reg (Pmode, op1);
    14151       if (!insn_data[icode].operand[3].predicate (op2, mode2))
    14152 	op2 = copy_to_mode_reg (mode2, op2);
    14153 
      /* The mask may arrive as a mode-less constant or in a different
	 mode than the insn's mask operand; coerce it into mode3, taking
	 the low part when the modes differ.  */
    14154       op3 = fixup_modeless_constant (op3, mode3);
    14155 
    14156       if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
    14157 	{
    14158 	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
    14159 	    op3 = copy_to_mode_reg (mode3, op3);
    14160 	}
    14161       else
    14162 	{
    14163 	  op3 = copy_to_reg (op3);
    14164 	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
    14165 	}
    14166       if (!insn_data[icode].operand[5].predicate (op4, mode4))
    14167 	{
    14168           error ("the last argument must be scale 1, 2, 4, 8");
    14169           return const0_rtx;
    14170 	}
    14171 
    14172       /* Optimize.  If mask is known to have all high bits set,
    14173 	 replace op0 with pc_rtx to signal that the instruction
    14174 	 overwrites the whole destination and doesn't use its
    14175 	 previous contents.  */
    14176       if (optimize)
    14177 	{
    14178 	  if (TREE_CODE (arg3) == INTEGER_CST)
    14179 	    {
    14180 	      if (integer_all_onesp (arg3))
    14181 		op0 = pc_rtx;
    14182 	    }
    14183 	  else if (TREE_CODE (arg3) == VECTOR_CST)
    14184 	    {
      /* An all-negative constant vector mask means every element's sign
	 bit is set, i.e. all lanes are active.  */
    14185 	      unsigned int negative = 0;
    14186 	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
    14187 		{
    14188 		  tree cst = VECTOR_CST_ELT (arg3, i);
    14189 		  if (TREE_CODE (cst) == INTEGER_CST
    14190 		      && tree_int_cst_sign_bit (cst))
    14191 		    negative++;
    14192 		  else if (TREE_CODE (cst) == REAL_CST
    14193 			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
    14194 		    negative++;
    14195 		}
    14196 	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
    14197 		op0 = pc_rtx;
    14198 	    }
    14199 	  else if (TREE_CODE (arg3) == SSA_NAME
    14200 		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
    14201 	    {
    14202 	      /* Recognize also when mask is like:
    14203 		 __v2df src = _mm_setzero_pd ();
    14204 		 __v2df mask = _mm_cmpeq_pd (src, src);
    14205 		 or
    14206 		 __v8sf src = _mm256_setzero_ps ();
    14207 		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
    14208 		 as that is a cheaper way to load all ones into
    14209 		 a register than having to load a constant from
    14210 		 memory.  */
    14211 	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
    14212 	      if (is_gimple_call (def_stmt))
    14213 		{
    14214 		  tree fndecl = gimple_call_fndecl (def_stmt);
    14215 		  if (fndecl
    14216 		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
    14217 		    switch (DECL_MD_FUNCTION_CODE (fndecl))
    14218 		      {
    14219 		      case IX86_BUILTIN_CMPPD:
    14220 		      case IX86_BUILTIN_CMPPS:
    14221 		      case IX86_BUILTIN_CMPPD256:
    14222 		      case IX86_BUILTIN_CMPPS256:
      /* For the generic cmp builtins only _CMP_EQ_OQ (predicate 0)
	 qualifies.  */
    14223 			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
    14224 			  break;
    14225 			/* FALLTHRU */
    14226 		      case IX86_BUILTIN_CMPEQPD:
    14227 		      case IX86_BUILTIN_CMPEQPS:
    14228 			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
    14229 			    && initializer_zerop (gimple_call_arg (def_stmt,
    14230 								   1)))
    14231 			  op0 = pc_rtx;
    14232 			break;
    14233 		      default:
    14234 			break;
    14235 		      }
    14236 		}
    14237 	    }
    14238 	}
    14239 
    14240       pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
    14241       if (! pat)
    14242 	return const0_rtx;
    14243       emit_insn (pat);
    14244 
      /* For these DImode-index variants the insn's destination is wider
	 than the builtin's result; return only its low half.  */
    14245       switch (fcode)
    14246 	{
    14247 	case IX86_BUILTIN_GATHER3DIV16SF:
    14248 	  if (target == NULL_RTX)
    14249 	    target = gen_reg_rtx (V8SFmode);
    14250 	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
    14251 	  break;
    14252 	case IX86_BUILTIN_GATHER3DIV16SI:
    14253 	  if (target == NULL_RTX)
    14254 	    target = gen_reg_rtx (V8SImode);
    14255 	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
    14256 	  break;
    14257 	case IX86_BUILTIN_GATHER3DIV8SF:
    14258 	case IX86_BUILTIN_GATHERDIV8SF:
    14259 	  if (target == NULL_RTX)
    14260 	    target = gen_reg_rtx (V4SFmode);
    14261 	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
    14262 	  break;
    14263 	case IX86_BUILTIN_GATHER3DIV8SI:
    14264 	case IX86_BUILTIN_GATHERDIV8SI:
    14265 	  if (target == NULL_RTX)
    14266 	    target = gen_reg_rtx (V4SImode);
    14267 	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
    14268 	  break;
    14269 	default:
    14270 	  target = subtarget;
    14271 	  break;
    14272 	}
    14273       return target;
   14274 
    14275     scatter_gen:
      /* Shared expander for all scatter builtins.  Call arguments are
	 (base pointer, writemask, index vector, source vector, scale);
	 operand indices match the argument order here.  No value is
	 produced; returns 0.  */
    14276       arg0 = CALL_EXPR_ARG (exp, 0);
    14277       arg1 = CALL_EXPR_ARG (exp, 1);
    14278       arg2 = CALL_EXPR_ARG (exp, 2);
    14279       arg3 = CALL_EXPR_ARG (exp, 3);
    14280       arg4 = CALL_EXPR_ARG (exp, 4);
    14281       op0 = expand_normal (arg0);
    14282       op1 = expand_normal (arg1);
    14283       op2 = expand_normal (arg2);
    14284       op3 = expand_normal (arg3);
    14285       op4 = expand_normal (arg4);
    14286       mode1 = insn_data[icode].operand[1].mode;
    14287       mode2 = insn_data[icode].operand[2].mode;
    14288       mode3 = insn_data[icode].operand[3].mode;
    14289       mode4 = insn_data[icode].operand[4].mode;
    14290 
    14291       /* Scatter instruction stores operand op3 to memory with
    14292 	 indices from op2 and scale from op4 under writemask op1.
    14293 	 If index operand op2 has more elements then source operand
    14294 	 op3 one need to use only its low half. And vice versa.  */
    14295       switch (fcode)
    14296 	{
    14297 	case IX86_BUILTIN_SCATTERALTSIV8DF:
    14298 	case IX86_BUILTIN_SCATTERALTSIV8DI:
    14299 	  half = gen_reg_rtx (V8SImode);
    14300 	  if (!nonimmediate_operand (op2, V16SImode))
    14301 	    op2 = copy_to_mode_reg (V16SImode, op2);
    14302 	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
    14303 	  op2 = half;
    14304 	  break;
    14305 	case IX86_BUILTIN_SCATTERALTDIV16SF:
    14306 	case IX86_BUILTIN_SCATTERALTDIV16SI:
    14307 	  half = gen_reg_rtx (mode3);
    14308 	  if (mode3 == V8SFmode)
    14309 	    gen = gen_vec_extract_lo_v16sf;
    14310 	  else
    14311 	    gen = gen_vec_extract_lo_v16si;
    14312 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
    14313 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
    14314 	  emit_insn (gen (half, op3));
    14315 	  op3 = half;
    14316 	  break;
    14317 	case IX86_BUILTIN_SCATTERALTSIV4DF:
    14318 	case IX86_BUILTIN_SCATTERALTSIV4DI:
    14319 	  half = gen_reg_rtx (V4SImode);
    14320 	  if (!nonimmediate_operand (op2, V8SImode))
    14321 	    op2 = copy_to_mode_reg (V8SImode, op2);
    14322 	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
    14323 	  op2 = half;
    14324 	  break;
    14325 	case IX86_BUILTIN_SCATTERALTDIV8SF:
    14326 	case IX86_BUILTIN_SCATTERALTDIV8SI:
    14327 	  half = gen_reg_rtx (mode3);
    14328 	  if (mode3 == V4SFmode)
    14329 	    gen = gen_vec_extract_lo_v8sf;
    14330 	  else
    14331 	    gen = gen_vec_extract_lo_v8si;
    14332 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
    14333 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
    14334 	  emit_insn (gen (half, op3));
    14335 	  op3 = half;
    14336 	  break;
      /* For the V2 variants no extraction is needed; just ensure the
	 operands satisfy nonimmediate_operand.  */
    14337 	case IX86_BUILTIN_SCATTERALTSIV2DF:
    14338 	case IX86_BUILTIN_SCATTERALTSIV2DI:
    14339 	  if (!nonimmediate_operand (op2, V4SImode))
    14340 	    op2 = copy_to_mode_reg (V4SImode, op2);
    14341 	  break;
    14342 	case IX86_BUILTIN_SCATTERALTDIV4SF:
    14343 	case IX86_BUILTIN_SCATTERALTDIV4SI:
    14344 	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
    14345 	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
    14346 	  break;
    14347 	default:
    14348 	  break;
    14349 	}
    14350 
    14351       /* Force memory operand only with base register here.  But we
    14352 	 don't want to do it on memory operand for other builtin
    14353 	 functions.  */
    14354       op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
    14355 
    14356       if (!insn_data[icode].operand[0].predicate (op0, Pmode))
    14357 	op0 = copy_to_mode_reg (Pmode, op0);
    14358 
      /* Coerce the writemask into mode1, mirroring the mask handling in
	 gather_gen above.  */
    14359       op1 = fixup_modeless_constant (op1, mode1);
    14360 
    14361       if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
    14362 	{
    14363 	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
    14364 	    op1 = copy_to_mode_reg (mode1, op1);
    14365 	}
    14366       else
    14367 	{
    14368 	  op1 = copy_to_reg (op1);
    14369 	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
    14370 	}
    14371 
    14372       if (!insn_data[icode].operand[2].predicate (op2, mode2))
    14373 	op2 = copy_to_mode_reg (mode2, op2);
    14374 
    14375       if (!insn_data[icode].operand[3].predicate (op3, mode3))
    14376 	op3 = copy_to_mode_reg (mode3, op3);
    14377 
    14378       if (!insn_data[icode].operand[4].predicate (op4, mode4))
    14379 	{
    14380 	  error ("the last argument must be scale 1, 2, 4, 8");
    14381 	  return const0_rtx;
    14382 	}
    14383 
    14384       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
    14385       if (! pat)
    14386 	return const0_rtx;
    14387 
    14388       emit_insn (pat);
    14389       return 0;
   14390 
   14391     vec_prefetch_gen:
   14392       arg0 = CALL_EXPR_ARG (exp, 0);
   14393       arg1 = CALL_EXPR_ARG (exp, 1);
   14394       arg2 = CALL_EXPR_ARG (exp, 2);
   14395       arg3 = CALL_EXPR_ARG (exp, 3);
   14396       arg4 = CALL_EXPR_ARG (exp, 4);
   14397       op0 = expand_normal (arg0);
   14398       op1 = expand_normal (arg1);
   14399       op2 = expand_normal (arg2);
   14400       op3 = expand_normal (arg3);
   14401       op4 = expand_normal (arg4);
   14402       mode0 = insn_data[icode].operand[0].mode;
   14403       mode1 = insn_data[icode].operand[1].mode;
   14404       mode3 = insn_data[icode].operand[3].mode;
   14405       mode4 = insn_data[icode].operand[4].mode;
   14406 
   14407       op0 = fixup_modeless_constant (op0, mode0);
   14408 
   14409       if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
   14410 	{
   14411 	  if (!insn_data[icode].operand[0].predicate (op0, mode0))
   14412 	    op0 = copy_to_mode_reg (mode0, op0);
   14413 	}
   14414       else
   14415 	{
   14416 	  op0 = copy_to_reg (op0);
   14417 	  op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
   14418 	}
   14419 
   14420       if (!insn_data[icode].operand[1].predicate (op1, mode1))
   14421 	op1 = copy_to_mode_reg (mode1, op1);
   14422 
   14423       /* Force memory operand only with base register here.  But we
   14424 	 don't want to do it on memory operand for other builtin
   14425 	 functions.  */
   14426       op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
   14427 
   14428       if (!insn_data[icode].operand[2].predicate (op2, Pmode))
   14429 	op2 = copy_to_mode_reg (Pmode, op2);
   14430 
   14431       if (!insn_data[icode].operand[3].predicate (op3, mode3))
   14432 	{
   14433 	  error ("the forth argument must be scale 1, 2, 4, 8");
   14434 	  return const0_rtx;
   14435 	}
   14436 
   14437       if (!insn_data[icode].operand[4].predicate (op4, mode4))
   14438 	{
   14439 	  error ("incorrect hint operand");
   14440 	  return const0_rtx;
   14441 	}
   14442 
   14443       pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
   14444       if (! pat)
   14445 	return const0_rtx;
   14446 
   14447       emit_insn (pat);
   14448 
   14449       return 0;
   14450 
      /* RTM abort: the single argument must be an 8-bit immediate,
	 enforced via the insn's operand predicate.  */
    14451     case IX86_BUILTIN_XABORT:
    14452       icode = CODE_FOR_xabort;
    14453       arg0 = CALL_EXPR_ARG (exp, 0);
    14454       op0 = expand_normal (arg0);
    14455       mode0 = insn_data[icode].operand[0].mode;
    14456       if (!insn_data[icode].operand[0].predicate (op0, mode0))
    14457 	{
    14458 	  error ("the argument to %<xabort%> intrinsic must "
    14459 		 "be an 8-bit immediate")
    14460 	  return const0_rtx;
    14461 	}
    14462       emit_insn (gen_xabort (op0));
    14463       return 0;
    14464 
      /* Shadow-stack pointer read (32- or 64-bit variant).  The rdssp
	 insn takes a source register, which is initialized to zero
	 here.  */
    14465     case IX86_BUILTIN_RDSSPD:
    14466     case IX86_BUILTIN_RDSSPQ:
    14467       mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
    14468 
    14469       if (target == 0
    14470 	  || !register_operand (target, mode))
    14471 	target = gen_reg_rtx (mode);
    14472 
    14473       op0 = force_reg (mode, const0_rtx);
    14474 
    14475       emit_insn (gen_rdssp (mode, target, op0));
    14476       return target;
    14477 
      /* Shadow-stack pointer increment; the argument is forced into a
	 register of the variant's mode.  No result.  */
    14478     case IX86_BUILTIN_INCSSPD:
    14479     case IX86_BUILTIN_INCSSPQ:
    14480       mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
    14481 
    14482       arg0 = CALL_EXPR_ARG (exp, 0);
    14483       op0 = expand_normal (arg0);
    14484 
    14485       op0 = force_reg (mode, op0);
    14486 
    14487       emit_insn (gen_incssp (mode, op0));
    14488       return 0;
    14489 
    14490     case IX86_BUILTIN_HRESET:
    14491       icode = CODE_FOR_hreset;
    14492       arg0 = CALL_EXPR_ARG (exp, 0);
    14493       op0 = expand_normal (arg0);
    14494       op0 = force_reg (SImode, op0);
    14495       emit_insn (gen_hreset (op0));
    14496       return 0;
    14497 
      /* Both take a pointer argument; wrap it in a DImode MEM, forcing a
	 valid address into a register first if needed.  */
    14498     case IX86_BUILTIN_RSTORSSP:
    14499     case IX86_BUILTIN_CLRSSBSY:
    14500       arg0 = CALL_EXPR_ARG (exp, 0);
    14501       op0 = expand_normal (arg0);
    14502       icode = (fcode == IX86_BUILTIN_RSTORSSP
    14503 	       ? CODE_FOR_rstorssp
    14504 	       : CODE_FOR_clrssbsy);
    14505 
    14506       if (!address_operand (op0, VOIDmode))
    14507 	{
    14508 	  op0 = convert_memory_address (Pmode, op0);
    14509 	  op0 = copy_addr_to_reg (op0);
    14510 	}
    14511       emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
    14512       return 0;
    14513 
      /* Shadow-stack writes: arg0 is the value (register), arg1 the
	 destination address (wrapped in a MEM of the variant's mode).
	 The D variants are SImode, the Q variants DImode; WRSS vs WRUSS
	 selects the insn.  */
    14514     case IX86_BUILTIN_WRSSD:
    14515     case IX86_BUILTIN_WRSSQ:
    14516     case IX86_BUILTIN_WRUSSD:
    14517     case IX86_BUILTIN_WRUSSQ:
    14518       mode = ((fcode == IX86_BUILTIN_WRSSD
    14519 	       || fcode == IX86_BUILTIN_WRUSSD)
    14520 	      ? SImode : DImode);
    14521 
    14522       arg0 = CALL_EXPR_ARG (exp, 0);
    14523       op0 = expand_normal (arg0);
    14524       arg1 = CALL_EXPR_ARG (exp, 1);
    14525       op1 = expand_normal (arg1);
    14526 
    14527       op0 = force_reg (mode, op0);
    14528 
    14529       if (!address_operand (op1, VOIDmode))
    14530 	{
    14531 	  op1 = convert_memory_address (Pmode, op1);
    14532 	  op1 = copy_addr_to_reg (op1);
    14533 	}
    14534       op1 = gen_rtx_MEM (mode, op1);
    14535 
    14536       icode = ((fcode == IX86_BUILTIN_WRSSD
    14537 		|| fcode == IX86_BUILTIN_WRSSQ)
    14538 	       ? code_for_wrss (mode)
    14539 	       : code_for_wruss (mode));
    14540       emit_insn (GEN_FCN (icode) (op0, op1));
    14541 
    14542       return 0;
    14543 
      /* Not one of the specially-expanded builtins; fall through to the
	 table-driven dispatch below.  */
    14544     default:
    14545       break;
    14546     }
   14547 
   14548   if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
   14549       && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
   14550     {
   14551       i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
   14552       return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
   14553 					       target);
   14554     }
   14555 
   14556   if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
   14557       && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
   14558     {
   14559       i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
   14560       return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
   14561 					       target);
   14562     }
   14563 
   14564   if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
   14565       && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
   14566     {
   14567       i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
   14568       rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
   14569       rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
   14570       rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
   14571       int masked = 1;
   14572       machine_mode mode, wide_mode, nar_mode;
   14573 
   14574       nar_mode  = V4SFmode;
   14575       mode      = V16SFmode;
   14576       wide_mode = V64SFmode;
   14577       fcn_mask  = gen_avx5124fmaddps_4fmaddps_mask;
   14578       fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
   14579 
   14580       switch (fcode)
   14581 	{
   14582 	case IX86_BUILTIN_4FMAPS:
   14583 	  fcn = gen_avx5124fmaddps_4fmaddps;
   14584 	  masked = 0;
   14585 	  goto v4fma_expand;
   14586 
   14587 	case IX86_BUILTIN_4DPWSSD:
   14588 	  nar_mode  = V4SImode;
   14589 	  mode      = V16SImode;
   14590 	  wide_mode = V64SImode;
   14591 	  fcn = gen_avx5124vnniw_vp4dpwssd;
   14592 	  masked = 0;
   14593 	  goto v4fma_expand;
   14594 
   14595 	case IX86_BUILTIN_4DPWSSDS:
   14596 	  nar_mode  = V4SImode;
   14597 	  mode      = V16SImode;
   14598 	  wide_mode = V64SImode;
   14599 	  fcn = gen_avx5124vnniw_vp4dpwssds;
   14600 	  masked = 0;
   14601 	  goto v4fma_expand;
   14602 
   14603 	case IX86_BUILTIN_4FNMAPS:
   14604 	  fcn = gen_avx5124fmaddps_4fnmaddps;
   14605 	  masked = 0;
   14606 	  goto v4fma_expand;
   14607 
   14608 	case IX86_BUILTIN_4FNMAPS_MASK:
   14609 	  fcn_mask  = gen_avx5124fmaddps_4fnmaddps_mask;
   14610 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
   14611 	  goto v4fma_expand;
   14612 
   14613 	case IX86_BUILTIN_4DPWSSD_MASK:
   14614 	  nar_mode  = V4SImode;
   14615 	  mode      = V16SImode;
   14616 	  wide_mode = V64SImode;
   14617 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssd_mask;
   14618 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
   14619 	  goto v4fma_expand;
   14620 
   14621 	case IX86_BUILTIN_4DPWSSDS_MASK:
   14622 	  nar_mode  = V4SImode;
   14623 	  mode      = V16SImode;
   14624 	  wide_mode = V64SImode;
   14625 	  fcn_mask  = gen_avx5124vnniw_vp4dpwssds_mask;
   14626 	  fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
   14627 	  goto v4fma_expand;
   14628 
   14629 	case IX86_BUILTIN_4FMAPS_MASK:
   14630 	  {
   14631 	    tree args[4];
   14632 	    rtx ops[4];
   14633 	    rtx wide_reg;
   14634 	    rtx accum;
   14635 	    rtx addr;
   14636 	    rtx mem;
   14637 
   14638 v4fma_expand:
   14639 	    wide_reg = gen_reg_rtx (wide_mode);
   14640 	    for (i = 0; i < 4; i++)
   14641 	      {
   14642 		args[i] = CALL_EXPR_ARG (exp, i);
   14643 		ops[i] = expand_normal (args[i]);
   14644 
   14645 		emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
   14646 				ops[i]);
   14647 	      }
   14648 
   14649 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
   14650 	    accum = force_reg (mode, accum);
   14651 
   14652 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
   14653 	    addr = force_reg (Pmode, addr);
   14654 
   14655 	    mem = gen_rtx_MEM (nar_mode, addr);
   14656 
   14657 	    target = gen_reg_rtx (mode);
   14658 
   14659 	    emit_move_insn (target, accum);
   14660 
   14661 	    if (! masked)
   14662 	      emit_insn (fcn (target, accum, wide_reg, mem));
   14663 	    else
   14664 	      {
   14665 		rtx merge, mask;
   14666 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
   14667 
   14668 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
   14669 
   14670 		if (CONST_INT_P (mask))
   14671 		  mask = fixup_modeless_constant (mask, HImode);
   14672 
   14673 		mask = force_reg (HImode, mask);
   14674 
   14675 		if (GET_MODE (mask) != HImode)
   14676 		  mask = gen_rtx_SUBREG (HImode, mask, 0);
   14677 
   14678 		/* If merge is 0 then we're about to emit z-masked variant.  */
   14679 		if (const0_operand (merge, mode))
   14680 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
   14681 		/* If merge is the same as accum then emit merge-masked variant.  */
   14682 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
   14683 		  {
   14684 		    merge = force_reg (mode, merge);
   14685 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
   14686 		  }
   14687 		/* Merge with something unknown might happen if we z-mask w/ -O0.  */
   14688 		else
   14689 		  {
   14690 		    target = gen_reg_rtx (mode);
   14691 		    emit_move_insn (target, merge);
   14692 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
   14693 		  }
   14694 	      }
   14695 	    return target;
   14696 	  }
   14697 
   14698 	case IX86_BUILTIN_4FNMASS:
   14699 	  fcn = gen_avx5124fmaddps_4fnmaddss;
   14700 	  masked = 0;
   14701 	  goto s4fma_expand;
   14702 
   14703 	case IX86_BUILTIN_4FMASS:
   14704 	  fcn = gen_avx5124fmaddps_4fmaddss;
   14705 	  masked = 0;
   14706 	  goto s4fma_expand;
   14707 
   14708 	case IX86_BUILTIN_4FNMASS_MASK:
   14709 	  fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
   14710 	  fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
   14711 	  goto s4fma_expand;
   14712 
   14713 	case IX86_BUILTIN_4FMASS_MASK:
   14714 	  {
   14715 	    tree args[4];
   14716 	    rtx ops[4];
   14717 	    rtx wide_reg;
   14718 	    rtx accum;
   14719 	    rtx addr;
   14720 	    rtx mem;
   14721 
   14722 	    fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
   14723 	    fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
   14724 
   14725 s4fma_expand:
   14726 	    mode = V4SFmode;
   14727 	    wide_reg = gen_reg_rtx (V64SFmode);
   14728 	    for (i = 0; i < 4; i++)
   14729 	      {
   14730 		rtx tmp;
   14731 		args[i] = CALL_EXPR_ARG (exp, i);
   14732 		ops[i] = expand_normal (args[i]);
   14733 
   14734 		tmp = gen_reg_rtx (SFmode);
   14735 		emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
   14736 
   14737 		emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
   14738 				gen_rtx_SUBREG (V16SFmode, tmp, 0));
   14739 	      }
   14740 
   14741 	    accum = expand_normal (CALL_EXPR_ARG (exp, 4));
   14742 	    accum = force_reg (V4SFmode, accum);
   14743 
   14744 	    addr = expand_normal (CALL_EXPR_ARG (exp, 5));
   14745 	    addr = force_reg (Pmode, addr);
   14746 
   14747 	    mem = gen_rtx_MEM (V4SFmode, addr);
   14748 
   14749 	    target = gen_reg_rtx (V4SFmode);
   14750 
   14751 	    emit_move_insn (target, accum);
   14752 
   14753 	    if (! masked)
   14754 	      emit_insn (fcn (target, accum, wide_reg, mem));
   14755 	    else
   14756 	      {
   14757 		rtx merge, mask;
   14758 		merge = expand_normal (CALL_EXPR_ARG (exp, 6));
   14759 
   14760 		mask = expand_normal (CALL_EXPR_ARG (exp, 7));
   14761 
   14762 		if (CONST_INT_P (mask))
   14763 		  mask = fixup_modeless_constant (mask, QImode);
   14764 
   14765 		mask = force_reg (QImode, mask);
   14766 
   14767 		if (GET_MODE (mask) != QImode)
   14768 		  mask = gen_rtx_SUBREG (QImode, mask, 0);
   14769 
   14770 		/* If merge is 0 then we're about to emit z-masked variant.  */
   14771 		if (const0_operand (merge, mode))
   14772 		  emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
   14773 		/* If merge is the same as accum then emit merge-masked
   14774 		   variant.  */
   14775 		else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
   14776 		  {
   14777 		    merge = force_reg (mode, merge);
   14778 		    emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
   14779 		  }
   14780 		/* Merge with something unknown might happen if we z-mask
   14781 		   w/ -O0.  */
   14782 		else
   14783 		  {
   14784 		    target = gen_reg_rtx (mode);
   14785 		    emit_move_insn (target, merge);
   14786 		    emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
   14787 		  }
   14788 		}
   14789 	      return target;
   14790 	    }
   14791 	  case IX86_BUILTIN_RDPID:
   14792 	    return ix86_expand_special_args_builtin (bdesc_args + i, exp,
   14793 						     target);
   14794 	  case IX86_BUILTIN_FABSQ:
   14795 	  case IX86_BUILTIN_COPYSIGNQ:
   14796 	    if (!TARGET_SSE)
   14797 	      /* Emit a normal call if SSE isn't available.  */
   14798 	      return expand_call (exp, target, ignore);
   14799 	    /* FALLTHRU */
   14800 	  default:
   14801 	    return ix86_expand_args_builtin (bdesc_args + i, exp, target);
   14802 	  }
   14803     }
   14804 
   14805   if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
   14806       && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
   14807     {
   14808       i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
   14809       return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
   14810     }
   14811 
   14812   if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
   14813       && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
   14814     {
   14815       i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
   14816       return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
   14817     }
   14818 
   14819   if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
   14820       && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
   14821     {
   14822       i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
   14823       return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
   14824     }
   14825 
   14826   if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
   14827       && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
   14828     {
   14829       i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
   14830       return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
   14831     }
   14832 
   14833   if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
   14834       && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
   14835     {
   14836       i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
   14837       const struct builtin_description *d = bdesc_multi_arg + i;
   14838       return ix86_expand_multi_arg_builtin (d->icode, exp, target,
   14839 					    (enum ix86_builtin_func_type)
   14840 					    d->flag, d->comparison);
   14841     }
   14842 
   14843   if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
   14844       && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
   14845     {
   14846       i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
   14847       return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
   14848 					       target);
   14849     }
   14850 
   14851   gcc_unreachable ();
   14852 }
   14853 
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      /* force_reg may hand back a register in a different (wider) mode;
	 take the low part so the duplicate's element mode matches MODE.  */
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      /* Patch the already-emitted insn in place rather than emitting a
	 new one, then splice the register-setup sequence in front of it.  */
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      /* With the operand in a register the pattern must now match.  */
      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  return true;
}
   14890 
   14891 /* Get a vector mode of the same size as the original but with elements
   14892    twice as wide.  This is only guaranteed to apply to integral vectors.  */
   14893 
   14894 static machine_mode
   14895 get_mode_wider_vector (machine_mode o)
   14896 {
   14897   /* ??? Rely on the ordering that genmodes.cc gives to vectors.  */
   14898   machine_mode n = GET_MODE_WIDER_MODE (o).require ();
   14899   gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
   14900   gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
   14901   return n;
   14902 }
   14903 
   14904 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
   14905 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
   14906 
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  MMX_OK permits the use of MMX-only
   modes.  Return true if successful.  */

bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    /* These modes are handled by a direct vec_duplicate pattern.  */
    case E_V4DFmode:
    case E_V4DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V16SFmode:
    case E_V8DFmode:
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  /* Duplicate the low HImode half of VAL:
	     (vec_duplicate (truncate:HI (subreg:SI val))).  */
	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V2HImode:
      if (TARGET_SSE2)
	{
	  rtx x;

	  /* Same truncate-and-duplicate scheme as V4HImode above.  */
	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V8QImode:
    case E_V4QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
    case E_V8HFmode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	  /* Place VAL in element 0 of a zeroed vector, then broadcast
	     element 0 to all lanes with a one-operand permutation.
	     Entered via goto from the V16QImode case as well.  */
	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  if (mode == V8HFmode)
	    {
	      tmp1 = force_reg (HFmode, val);
	      tmp2 = gen_reg_rtx (mode);
	      emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }
	  else
	    {
	      /* Extend to SImode using a paradoxical SUBREG.  */
	      tmp1 = gen_reg_rtx (SImode);
	      emit_move_insn (tmp1, gen_lowpart (SImode, val));

	      /* Insert the SImode value as
		 low element of a V4SImode vector.  */
	      tmp2 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }

	  emit_move_insn (dperm.op0, tmp1);
	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);

	if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
	  /* Duplicate the byte into the high half with a partial-register
	     insert rather than shift+or.  */
	  emit_insn (gen_insv_1 (wsmode, val, val));
	else
	  {
	    /* val |= val << bitsize(smode): pack two copies of the
	       narrow value into one wider scalar element.  */
	    x = expand_simple_binop (wsmode, ASHIFT, val,
				     GEN_INT (GET_MODE_BITSIZE (smode)),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
				       OPTAB_LIB_WIDEN);
	  }

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
      }

    case E_V16HImode:
    case E_V16HFmode:
    case E_V32QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Without AVX2, build the half-width duplicate and
	     concatenate it with itself.  */
	  machine_mode hvmode = (mode == V16HImode ? V8HImode
				 : mode == V16HFmode ? V8HFmode
				 : V16QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V32HImode:
    case E_V32HFmode:
    case E_V64QImode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Same half-and-concat strategy as the 256-bit case above.  */
	  machine_mode hvmode = (mode == V32HImode ? V16HImode
				 : mode == V32HFmode ? V16HFmode
				 : V32QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
   15100 
   15101 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   15102    whose ONE_VAR element is VAR, and other elements are zero.  Return true
   15103    if successful.  */
   15104 
   15105 static bool
   15106 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
   15107 				     rtx target, rtx var, int one_var)
   15108 {
   15109   machine_mode vsimode;
   15110   rtx new_target;
   15111   rtx x, tmp;
   15112   bool use_vector_set = false;
   15113   rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
   15114 
   15115   switch (mode)
   15116     {
   15117     case E_V2DImode:
   15118       /* For SSE4.1, we normally use vector set.  But if the second
   15119 	 element is zero and inter-unit moves are OK, we use movq
   15120 	 instead.  */
   15121       use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
   15122 			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
   15123 			     && one_var == 0));
   15124       break;
   15125     case E_V16QImode:
   15126     case E_V4SImode:
   15127     case E_V4SFmode:
   15128       use_vector_set = TARGET_SSE4_1;
   15129       break;
   15130     case E_V8HImode:
   15131       use_vector_set = TARGET_SSE2;
   15132       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
   15133 	? gen_vec_setv8hi_0 : NULL;
   15134       break;
   15135     case E_V8QImode:
   15136       use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   15137       break;
   15138     case E_V4HImode:
   15139       use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
   15140       break;
   15141     case E_V4QImode:
   15142       use_vector_set = TARGET_SSE4_1;
   15143       break;
   15144     case E_V32QImode:
   15145       use_vector_set = TARGET_AVX;
   15146       break;
   15147     case E_V16HImode:
   15148       use_vector_set = TARGET_AVX;
   15149       gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
   15150 	? gen_vec_setv16hi_0 : NULL;
   15151       break;
   15152     case E_V8SImode:
   15153       use_vector_set = TARGET_AVX;
   15154       gen_vec_set_0 = gen_vec_setv8si_0;
   15155       break;
   15156     case E_V8SFmode:
   15157       use_vector_set = TARGET_AVX;
   15158       gen_vec_set_0 = gen_vec_setv8sf_0;
   15159       break;
   15160     case E_V4DFmode:
   15161       use_vector_set = TARGET_AVX;
   15162       gen_vec_set_0 = gen_vec_setv4df_0;
   15163       break;
   15164     case E_V4DImode:
   15165       /* Use ix86_expand_vector_set in 64bit mode only.  */
   15166       use_vector_set = TARGET_AVX && TARGET_64BIT;
   15167       gen_vec_set_0 = gen_vec_setv4di_0;
   15168       break;
   15169     case E_V16SImode:
   15170       use_vector_set = TARGET_AVX512F && one_var == 0;
   15171       gen_vec_set_0 = gen_vec_setv16si_0;
   15172       break;
   15173     case E_V16SFmode:
   15174       use_vector_set = TARGET_AVX512F && one_var == 0;
   15175       gen_vec_set_0 = gen_vec_setv16sf_0;
   15176       break;
   15177     case E_V8DFmode:
   15178       use_vector_set = TARGET_AVX512F && one_var == 0;
   15179       gen_vec_set_0 = gen_vec_setv8df_0;
   15180       break;
   15181     case E_V8DImode:
   15182       /* Use ix86_expand_vector_set in 64bit mode only.  */
   15183       use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
   15184       gen_vec_set_0 = gen_vec_setv8di_0;
   15185       break;
   15186     case E_V8HFmode:
   15187       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   15188       gen_vec_set_0 = gen_vec_setv8hf_0;
   15189       break;
   15190     case E_V16HFmode:
   15191       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   15192       gen_vec_set_0 = gen_vec_setv16hf_0;
   15193       break;
   15194     case E_V32HFmode:
   15195       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   15196       gen_vec_set_0 = gen_vec_setv32hf_0;
   15197       break;
   15198     case E_V32HImode:
   15199       use_vector_set = TARGET_AVX512FP16 && one_var == 0;
   15200       gen_vec_set_0 = gen_vec_setv32hi_0;
   15201     default:
   15202       break;
   15203     }
   15204 
   15205   if (use_vector_set)
   15206     {
   15207       if (gen_vec_set_0 && one_var == 0)
   15208 	{
   15209 	  var = force_reg (GET_MODE_INNER (mode), var);
   15210 	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
   15211 	  return true;
   15212 	}
   15213       emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
   15214       var = force_reg (GET_MODE_INNER (mode), var);
   15215       ix86_expand_vector_set (mmx_ok, target, var, one_var);
   15216       return true;
   15217     }
   15218 
   15219   switch (mode)
   15220     {
   15221     case E_V2SFmode:
   15222     case E_V2SImode:
   15223       if (!mmx_ok)
   15224 	return false;
   15225       /* FALLTHRU */
   15226 
   15227     case E_V2DFmode:
   15228     case E_V2DImode:
   15229       if (one_var != 0)
   15230 	return false;
   15231       var = force_reg (GET_MODE_INNER (mode), var);
   15232       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
   15233       emit_insn (gen_rtx_SET (target, x));
   15234       return true;
   15235 
   15236     case E_V4SFmode:
   15237     case E_V4SImode:
   15238       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
   15239 	new_target = gen_reg_rtx (mode);
   15240       else
   15241 	new_target = target;
   15242       var = force_reg (GET_MODE_INNER (mode), var);
   15243       x = gen_rtx_VEC_DUPLICATE (mode, var);
   15244       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
   15245       emit_insn (gen_rtx_SET (new_target, x));
   15246       if (one_var != 0)
   15247 	{
   15248 	  /* We need to shuffle the value to the correct position, so
   15249 	     create a new pseudo to store the intermediate result.  */
   15250 
   15251 	  /* With SSE2, we can use the integer shuffle insns.  */
   15252 	  if (mode != V4SFmode && TARGET_SSE2)
   15253 	    {
   15254 	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
   15255 					    const1_rtx,
   15256 					    GEN_INT (one_var == 1 ? 0 : 1),
   15257 					    GEN_INT (one_var == 2 ? 0 : 1),
   15258 					    GEN_INT (one_var == 3 ? 0 : 1)));
   15259 	      if (target != new_target)
   15260 		emit_move_insn (target, new_target);
   15261 	      return true;
   15262 	    }
   15263 
   15264 	  /* Otherwise convert the intermediate result to V4SFmode and
   15265 	     use the SSE1 shuffle instructions.  */
   15266 	  if (mode != V4SFmode)
   15267 	    {
   15268 	      tmp = gen_reg_rtx (V4SFmode);
   15269 	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
   15270 	    }
   15271 	  else
   15272 	    tmp = new_target;
   15273 
   15274 	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
   15275 				       const1_rtx,
   15276 				       GEN_INT (one_var == 1 ? 0 : 1),
   15277 				       GEN_INT (one_var == 2 ? 0+4 : 1+4),
   15278 				       GEN_INT (one_var == 3 ? 0+4 : 1+4)));
   15279 
   15280 	  if (mode != V4SFmode)
   15281 	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
   15282 	  else if (tmp != target)
   15283 	    emit_move_insn (target, tmp);
   15284 	}
   15285       else if (target != new_target)
   15286 	emit_move_insn (target, new_target);
   15287       return true;
   15288 
   15289     case E_V8HImode:
   15290     case E_V16QImode:
   15291       vsimode = V4SImode;
   15292       goto widen;
   15293     case E_V4HImode:
   15294     case E_V8QImode:
   15295       if (!mmx_ok)
   15296 	return false;
   15297       vsimode = V2SImode;
   15298       goto widen;
   15299     widen:
   15300       if (one_var != 0)
   15301 	return false;
   15302 
   15303       /* Zero extend the variable element to SImode and recurse.  */
   15304       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
   15305 
   15306       x = gen_reg_rtx (vsimode);
   15307       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
   15308 						var, one_var))
   15309 	gcc_unreachable ();
   15310 
   15311       emit_move_insn (target, gen_lowpart (mode, x));
   15312       return true;
   15313 
   15314     default:
   15315       return false;
   15316     }
   15317 }
   15318 
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  /* Build a constant vector from VALS with the variable slot zeroed;
     it is loaded first and the variable element is inserted after.  */
  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V8HFmode:
    case E_V16HFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
      /* These modes take the generic load-constant-then-set path below.  */
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;
    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;
    case E_V4QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V2HImode;
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      /* X is the constant byte paired with VAR in the same HImode slot.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  /* VAR is the high byte: shift it up and keep the low
	     constant byte in X.  */
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  /* VAR is the low byte: shift the constant into the high byte.  */
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      /* OR the constant half into VAR unless it is zero.  */
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      /* Recast the constant vector to the wider element mode and set
	 the combined HImode element at the halved index.  */
      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  /* Generic path: load the zero-patched constant vector, then insert
     the variable element in place.  */
  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
   15413 
   15414 /* A subroutine of ix86_expand_vector_init_general.  Use vector
   15415    concatenate to handle the most general case: all values variable,
   15416    and none identical.  */
   15417 
static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  /* HALF_MODE is the mode holding half the elements of MODE; for N == 2
     it is the mode of each input operand directly.  */
  machine_mode half_mode = VOIDmode;
  rtx half[2];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      /* Two inputs: find the mode of one half of MODE and emit a single
	 VEC_CONCAT of the two (register-forced) operands.  */
      switch (mode)
	{
	case E_V32HFmode:
	  half_mode = V16HFmode;
	  break;
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V16HFmode:
	  half_mode = V8HFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	case E_V2DImode:
	  half_mode = DImode;
	  break;
	case E_V2SImode:
	  half_mode = SImode;
	  break;
	case E_V2DFmode:
	  half_mode = DFmode;
	  break;
	case E_V2SFmode:
	  half_mode = SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (!register_operand (ops[1], half_mode))
	ops[1] = force_reg (half_mode, ops[1]);
      if (!register_operand (ops[0], half_mode))
	ops[0] = force_reg (half_mode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      /* More than two inputs: build each half recursively, then
	 concatenate the two halves (shared code at the HALF label).  */
      switch (mode)
	{
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

half:
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      /* J indexes the high half (1) then the low half (0); each half is
	 initialized from its N/2 operands via ix86_expand_vector_init.  */
      for (j = 1; j != -1; j--)
	{
	  half[j] = gen_reg_rtx (half_mode);
	  switch (n >> 1)
	    {
	    case 2:
	      v = gen_rtvec (2, ops[i-1], ops[i]);
	      i -= 2;
	      break;
	    case 4:
	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 4;
	      break;
	    case 8:
	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 8;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  ix86_expand_vector_init (false, half[j],
				   gen_rtx_PARALLEL (half_mode, v));
	}

      /* Recurse to concatenate the two initialized halves.  */
      ix86_expand_vector_init_concat (mode, target, half, 2);
      break;

    default:
      gcc_unreachable ();
    }
}
   15581 
   15582 /* A subroutine of ix86_expand_vector_init_general.  Use vector
   15583    interleave to handle the most general case: all values variable,
   15584    and none identical.  */
   15585 
static void
ix86_expand_vector_init_interleave (machine_mode mode,
				    rtx target, rtx *ops, int n)
{
  /* FIRST_IMODE/SECOND_IMODE/THIRD_IMODE are the successively wider
     integer vector modes used by each round of low-interleaves.  */
  machine_mode first_imode, second_imode, third_imode, inner_mode;
  int i, j;
  rtx op, op0, op1;
  rtx (*gen_load_even) (rtx, rtx, rtx);
  rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
  rtx (*gen_interleave_second_low) (rtx, rtx, rtx);

  /* Select the element-pairing insn and the interleave insns for MODE.  */
  switch (mode)
    {
    case E_V8HFmode:
      gen_load_even = gen_vec_interleave_lowv8hf;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HFmode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V8HImode:
      gen_load_even = gen_vec_setv8hi;
      gen_interleave_first_low = gen_vec_interleave_lowv4si;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      inner_mode = HImode;
      first_imode = V4SImode;
      second_imode = V2DImode;
      third_imode = VOIDmode;
      break;
    case E_V16QImode:
      gen_load_even = gen_vec_setv16qi;
      gen_interleave_first_low = gen_vec_interleave_lowv8hi;
      gen_interleave_second_low = gen_vec_interleave_lowv4si;
      inner_mode = QImode;
      first_imode = V8HImode;
      second_imode = V4SImode;
      third_imode = V2DImode;
      break;
    default:
      gcc_unreachable ();
    }

  /* Pair up adjacent scalar operands ops[2i] and ops[2i+1] into one
     vector each; the result is re-used as ops[i] in FIRST_IMODE.  */
  for (i = 0; i < n; i++)
    {
      op = ops [i + i];
      if (inner_mode == HFmode)
	{
	  rtx even, odd;
	  /* Use vpuncklwd to pack 2 HFmode.  */
	  op0 = gen_reg_rtx (V8HFmode);
	  even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
	  odd = lowpart_subreg (V8HFmode,
				force_reg (HFmode, ops[i + i + 1]),
				HFmode);
	  emit_insn (gen_load_even (op0, even, odd));
	}
      else
	{
	  /* Extend the odd element to SImode using a paradoxical SUBREG.  */
	  op0 = gen_reg_rtx (SImode);
	  emit_move_insn (op0, gen_lowpart (SImode, op));

	  /* Insert the SImode value as low element of V4SImode vector.  */
	  op1 = gen_reg_rtx (V4SImode);
	  op0 = gen_rtx_VEC_MERGE (V4SImode,
				   gen_rtx_VEC_DUPLICATE (V4SImode,
							  op0),
				   CONST0_RTX (V4SImode),
				   const1_rtx);
	  emit_insn (gen_rtx_SET (op1, op0));

	  /* Cast the V4SImode vector back to a vector in original mode.  */
	  op0 = gen_reg_rtx (mode);
	  emit_move_insn (op0, gen_lowpart (mode, op1));

	  /* Load even elements into the second position.  */
	  emit_insn (gen_load_even (op0,
				    force_reg (inner_mode,
					       ops[i + i + 1]),
				    const1_rtx));
	}

      /* Cast vector to FIRST_IMODE vector.  */
      ops[i] = gen_reg_rtx (first_imode);
      emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
    }

  /* Interleave low FIRST_IMODE vectors.  */
  for (i = j = 0; i < n; i += 2, j++)
    {
      op0 = gen_reg_rtx (first_imode);
      emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));

      /* Cast FIRST_IMODE vector to SECOND_IMODE vector.  */
      ops[j] = gen_reg_rtx (second_imode);
      emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
    }

  /* Interleave low SECOND_IMODE vectors.  */
  switch (second_imode)
    {
    case E_V4SImode:
      /* One more interleave round is needed (V16QI case); its results
	 feed the final V2DI interleave via fallthrough.  */
      for (i = j = 0; i < n / 2; i += 2, j++)
	{
	  op0 = gen_reg_rtx (second_imode);
	  emit_insn (gen_interleave_second_low (op0, ops[i],
						ops[i + 1]));

	  /* Cast the SECOND_IMODE vector to the THIRD_IMODE
	     vector.  */
	  ops[j] = gen_reg_rtx (third_imode);
	  emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
	}
      second_imode = V2DImode;
      gen_interleave_second_low = gen_vec_interleave_lowv2di;
      /* FALLTHRU */

    case E_V2DImode:
      op0 = gen_reg_rtx (second_imode);
      emit_insn (gen_interleave_second_low (op0, ops[0],
					    ops[1]));

      /* Cast the SECOND_IMODE vector back to a vector on original
	 mode.  */
      emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
      break;

    default:
      gcc_unreachable ();
    }
}
   15719 
   15720 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   15721    all values variable, and none identical.  */
   15722 
static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      /* MMX-sized vectors fall through to the concat path only when an
	 MMX or SSE register is actually available; otherwise use the
	 word-building fallback at the bottom.  */
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      /* Element size >= 32 bits: build the vector by recursive
	 concatenation of halves.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      /* View the TImode elements as V2DI pairs and concat in V4DImode.  */
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      /* Likewise, but via two V4DImode halves concatenated to V8DImode.  */
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    case E_V16HFmode:
      half_mode = V8HFmode;
      goto half;

half:
      /* 256-bit narrow-element vectors: build each 128-bit half by
	 interleaving, then concat the halves.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    case E_V32HFmode:
      quarter_mode = V8HFmode;
      half_mode = V16HFmode;
      goto quarter;

quarter:
      /* 512-bit narrow-element vectors: interleave four 128-bit
	 quarters, concat into two 256-bit halves, then into TARGET.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;
      /* FALLTHRU */

    case E_V8HFmode:

      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HImode:
    case E_V8QImode:

    case E_V2HImode:
    case E_V4QImode:
      break;

    default:
      gcc_unreachable ();
    }

    {
      /* Fallback: assemble the vector in general-purpose registers,
	 packing N_ELT_PER_WORD elements into each word with shifts and
	 ORs, then move the word(s) into the vector register.  */
      int i, j, n_elts, n_words, n_elt_per_word;
      machine_mode tmp_mode, inner_mode;
      rtx words[4], shift;

      tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;

      inner_mode = GET_MODE_INNER (mode);
      n_elts = GET_MODE_NUNITS (mode);
      n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
      n_elt_per_word = n_elts / n_words;
      shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

      for (i = 0; i < n_words; ++i)
	{
	  rtx word = NULL_RTX;

	  /* Elements are combined from highest to lowest within the
	     word, so each new element is OR'd into the low bits.  */
	  for (j = 0; j < n_elt_per_word; ++j)
	    {
	      rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	      elt = convert_modes (tmp_mode, inner_mode, elt, true);

	      if (j == 0)
		word = elt;
	      else
		{
		  word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
		  word = expand_simple_binop (tmp_mode, IOR, word, elt,
					      NULL_RTX, 1, OPTAB_LIB_WIDEN);
		}
	    }

	  words[i] = word;
	}

      if (n_words == 1)
	emit_move_insn (target, gen_lowpart (mode, words[0]));
      else if (n_words == 2)
	{
	  /* The clobber avoids a false dependency on the old value of
	     TMP when writing it in two halves.  */
	  rtx tmp = gen_reg_rtx (mode);
	  emit_clobber (tmp);
	  emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
	  emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
	  emit_move_insn (target, tmp);
	}
      else if (n_words == 4)
	{
	  /* Recurse: treat the four words as a V4SImode init.  */
	  rtx tmp = gen_reg_rtx (V4SImode);
	  gcc_assert (tmp_mode == SImode);
	  vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	  ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	  emit_move_insn (target, gen_lowpart (mode, tmp));
	}
      else
	gcc_unreachable ();
    }
}
   15933 
   15934 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
   15935    instructions unless MMX_OK is true.  */
   15936 
void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      /* VALS holds vectors, not scalars; only the two-half case is
	 supported, handled by concatenation.  */
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  if (inner_mode == QImode
	      || inner_mode == HImode
	      || inner_mode == TImode
	      || inner_mode == HFmode)
	    {
	      /* Narrow (or TImode) elements: recast the concat into an
		 equivalent SImode/DImode vector mode first.  */
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
	      n_bits /= GET_MODE_SIZE (elt_mode);
	      mode = mode_for_vector (elt_mode, n_bits).require ();
	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  /* Classify the elements: count variables (remembering the last one's
     index) and note whether all are equal or all constant zero.  */
  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
	return;
    }

  /* No special case applied; fall back to the general expander.  */
  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
   16021 
   16022 /* Implemented as
   16023    V setg (V v, int idx, T val)
   16024    {
   16025      V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
   16026      V valv = (V){val, val, val, val, val, val, val, val};
   16027      V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
   16028      v = (v & ~mask) | (valv & mask);
   16029      return v;
   16030    }.  */
void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  /* VEC doubles as the constant-vector element buffer and as the
     operand array for ix86_expand_binary_operator / vcond below.  */
  rtx vec[64];
  machine_mode mode = GET_MODE (target);
  /* CMP_MODE is the integer vector mode used for the index compare;
     it differs from MODE only for float element types.  */
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv,idxv,constv,idx_tmp;
  bool ok = false;

  /* 512-bits vector byte/word broadcast and comparison only available
     under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
     when without TARGET_AVX512BW.  */
  if ((mode == V32HImode || mode == V32HFmode || mode == V64QImode)
      && !TARGET_AVX512BW)
    {
      gcc_assert (TARGET_AVX512F);
      rtx vhi, vlo, idx_hi;
      machine_mode half_mode;
      rtx (*extract_hi)(rtx, rtx);
      rtx (*extract_lo)(rtx, rtx);

      if (mode == V32HImode)
	{
	  half_mode = V16HImode;
	  extract_hi = gen_vec_extract_hi_v32hi;
	  extract_lo = gen_vec_extract_lo_v32hi;
	}
      else if (mode == V32HFmode)
	{
	  half_mode = V16HFmode;
	  extract_hi = gen_vec_extract_hi_v32hf;
	  extract_lo = gen_vec_extract_lo_v32hf;
	}
      else
	{
	  half_mode = V32QImode;
	  extract_hi = gen_vec_extract_hi_v64qi;
	  extract_lo = gen_vec_extract_lo_v64qi;
	}

      /* Recurse on each 256-bit half; the high half uses IDX minus
	 half the element count.  Only the half whose index is in
	 range is actually modified by the masked blend.  */
      vhi = gen_reg_rtx (half_mode);
      vlo = gen_reg_rtx (half_mode);
      idx_hi = gen_reg_rtx (GET_MODE (idx));
      emit_insn (extract_hi (vhi, target));
      emit_insn (extract_lo (vlo, target));
      vec[0] = idx_hi;
      vec[1] = idx;
      vec[2] = GEN_INT (n_elts/2);
      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
      ix86_expand_vector_set_var (vhi, val, idx_hi);
      ix86_expand_vector_set_var (vlo, val, idx);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
      return;
    }

  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
    {
      switch (mode)
	{
	case E_V2DFmode:
	  cmp_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  cmp_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  cmp_mode = V8DImode;
	  break;
	case E_V2SFmode:
	  cmp_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  cmp_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  cmp_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  cmp_mode = V16SImode;
	  break;
	case E_V8HFmode:
	  cmp_mode = V8HImode;
	  break;
	case E_V16HFmode:
	  cmp_mode = V16HImode;
	  break;
	case E_V32HFmode:
	  cmp_mode = V32HImode;
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  /* Build CONSTV = {0, 1, ..., n_elts-1}, broadcast VAL and IDX, then
     blend VALV into TARGET where IDXV == CONSTV (see function comment).  */
  for (int i = 0; i != n_elts; i++)
    vec[i] = GEN_INT (i);
  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
  valv = gen_reg_rtx (mode);
  idxv = gen_reg_rtx (cmp_mode);
  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);

  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  mode, valv, val);
  gcc_assert (ok);
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  cmp_mode, idxv, idx_tmp);
  gcc_assert (ok);
  vec[0] = target;
  vec[1] = valv;
  vec[2] = target;
  vec[3] = gen_rtx_EQ (mode, idxv, constv);
  vec[4] = idxv;
  vec[5] = constv;
  ok = ix86_expand_int_vcond (vec);
  gcc_assert (ok);
}
   16148 
   16149 void
   16150 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
   16151 {
   16152   machine_mode mode = GET_MODE (target);
   16153   machine_mode inner_mode = GET_MODE_INNER (mode);
   16154   machine_mode half_mode;
   16155   bool use_vec_merge = false;
   16156   bool blendm_const = false;
   16157   rtx tmp;
   16158   static rtx (*gen_extract[7][2]) (rtx, rtx)
   16159     = {
   16160 	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
   16161 	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
   16162 	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
   16163 	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
   16164 	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
   16165 	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
   16166 	{ gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf }
   16167       };
   16168   static rtx (*gen_insert[7][2]) (rtx, rtx, rtx)
   16169     = {
   16170 	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
   16171 	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
   16172 	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
   16173 	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
   16174 	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
   16175 	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
   16176 	{ gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
   16177       };
   16178   int i, j, n;
   16179   machine_mode mmode = VOIDmode;
   16180   rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
   16181 
   16182   switch (mode)
   16183     {
   16184     case E_V2SImode:
   16185       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   16186       if (use_vec_merge)
   16187 	break;
   16188       /* FALLTHRU */
   16189 
   16190     case E_V2SFmode:
   16191       if (mmx_ok)
   16192 	{
   16193 	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
   16194 	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
   16195 	  if (elt == 0)
   16196 	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
   16197 	  else
   16198 	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
   16199 	  emit_insn (gen_rtx_SET (target, tmp));
   16200 	  return;
   16201 	}
   16202       break;
   16203 
   16204     case E_V2DImode:
   16205       use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
   16206       if (use_vec_merge)
   16207 	break;
   16208 
   16209       tmp = gen_reg_rtx (GET_MODE_INNER (mode));
   16210       ix86_expand_vector_extract (false, tmp, target, 1 - elt);
   16211       if (elt == 0)
   16212 	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
   16213       else
   16214 	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
   16215       emit_insn (gen_rtx_SET (target, tmp));
   16216       return;
   16217 
   16218     case E_V2DFmode:
   16219       /* NB: For ELT == 0, use standard scalar operation patterns which
   16220 	 preserve the rest of the vector for combiner:
   16221 
   16222 	 (vec_merge:V2DF
   16223 	   (vec_duplicate:V2DF (reg:DF))
   16224 	   (reg:V2DF)
   16225 	   (const_int 1))
   16226        */
   16227       if (elt == 0)
   16228 	goto do_vec_merge;
   16229 
   16230       {
   16231 	rtx op0, op1;
   16232 
   16233 	/* For the two element vectors, we implement a VEC_CONCAT with
   16234 	   the extraction of the other element.  */
   16235 
   16236 	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
   16237 	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
   16238 
   16239 	if (elt == 0)
   16240 	  op0 = val, op1 = tmp;
   16241 	else
   16242 	  op0 = tmp, op1 = val;
   16243 
   16244 	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
   16245 	emit_insn (gen_rtx_SET (target, tmp));
   16246       }
   16247       return;
   16248 
   16249     case E_V4SFmode:
   16250       use_vec_merge = TARGET_SSE4_1;
   16251       if (use_vec_merge)
   16252 	break;
   16253 
   16254       switch (elt)
   16255 	{
   16256 	case 0:
   16257 	  use_vec_merge = true;
   16258 	  break;
   16259 
   16260 	case 1:
   16261 	  /* tmp = target = A B C D */
   16262 	  tmp = copy_to_reg (target);
   16263 	  /* target = A A B B */
   16264 	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
   16265 	  /* target = X A B B */
   16266 	  ix86_expand_vector_set (false, target, val, 0);
   16267 	  /* target = A X C D  */
   16268 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
   16269 					  const1_rtx, const0_rtx,
   16270 					  GEN_INT (2+4), GEN_INT (3+4)));
   16271 	  return;
   16272 
   16273 	case 2:
   16274 	  /* tmp = target = A B C D */
   16275 	  tmp = copy_to_reg (target);
   16276 	  /* tmp = X B C D */
   16277 	  ix86_expand_vector_set (false, tmp, val, 0);
   16278 	  /* target = A B X D */
   16279 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
   16280 					  const0_rtx, const1_rtx,
   16281 					  GEN_INT (0+4), GEN_INT (3+4)));
   16282 	  return;
   16283 
   16284 	case 3:
   16285 	  /* tmp = target = A B C D */
   16286 	  tmp = copy_to_reg (target);
   16287 	  /* tmp = X B C D */
   16288 	  ix86_expand_vector_set (false, tmp, val, 0);
   16289 	  /* target = A B X D */
   16290 	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
   16291 					  const0_rtx, const1_rtx,
   16292 					  GEN_INT (2+4), GEN_INT (0+4)));
   16293 	  return;
   16294 
   16295 	default:
   16296 	  gcc_unreachable ();
   16297 	}
   16298       break;
   16299 
   16300     case E_V4SImode:
   16301       use_vec_merge = TARGET_SSE4_1;
   16302       if (use_vec_merge)
   16303 	break;
   16304 
   16305       /* Element 0 handled by vec_merge below.  */
   16306       if (elt == 0)
   16307 	{
   16308 	  use_vec_merge = true;
   16309 	  break;
   16310 	}
   16311 
   16312       if (TARGET_SSE2)
   16313 	{
   16314 	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
   16315 	     store into element 0, then shuffle them back.  */
   16316 
   16317 	  rtx order[4];
   16318 
   16319 	  order[0] = GEN_INT (elt);
   16320 	  order[1] = const1_rtx;
   16321 	  order[2] = const2_rtx;
   16322 	  order[3] = GEN_INT (3);
   16323 	  order[elt] = const0_rtx;
   16324 
   16325 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
   16326 					order[1], order[2], order[3]));
   16327 
   16328 	  ix86_expand_vector_set (false, target, val, 0);
   16329 
   16330 	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
   16331 					order[1], order[2], order[3]));
   16332 	}
   16333       else
   16334 	{
   16335 	  /* For SSE1, we have to reuse the V4SF code.  */
   16336 	  rtx t = gen_reg_rtx (V4SFmode);
   16337 	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
   16338 	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
   16339 	  emit_move_insn (target, gen_lowpart (mode, t));
   16340 	}
   16341       return;
   16342 
   16343     case E_V8HImode:
   16344     case E_V8HFmode:
   16345     case E_V2HImode:
   16346       use_vec_merge = TARGET_SSE2;
   16347       break;
   16348     case E_V4HImode:
   16349       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
   16350       break;
   16351 
   16352     case E_V16QImode:
   16353     case E_V4QImode:
   16354       use_vec_merge = TARGET_SSE4_1;
   16355       break;
   16356 
   16357     case E_V8QImode:
   16358       use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
   16359       break;
   16360 
   16361     case E_V32QImode:
   16362       half_mode = V16QImode;
   16363       j = 0;
   16364       n = 16;
   16365       goto half;
   16366 
   16367     case E_V16HFmode:
   16368       /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
   16369       if (TARGET_AVX2 && elt != 0)
   16370 	{
   16371 	  mmode = SImode;
   16372 	  gen_blendm = gen_avx2_pblendph_1;
   16373 	  blendm_const = true;
   16374 	  break;
   16375 	}
   16376       else
   16377 	{
   16378 	  half_mode = V8HFmode;
   16379 	  j = 6;
   16380 	  n = 8;
   16381 	  goto half;
   16382 	}
   16383 
   16384     case E_V16HImode:
   16385       half_mode = V8HImode;
   16386       j = 1;
   16387       n = 8;
   16388       goto half;
   16389 
   16390     case E_V8SImode:
   16391       half_mode = V4SImode;
   16392       j = 2;
   16393       n = 4;
   16394       goto half;
   16395 
   16396     case E_V4DImode:
   16397       half_mode = V2DImode;
   16398       j = 3;
   16399       n = 2;
   16400       goto half;
   16401 
   16402     case E_V8SFmode:
   16403       half_mode = V4SFmode;
   16404       j = 4;
   16405       n = 4;
   16406       goto half;
   16407 
   16408     case E_V4DFmode:
   16409       half_mode = V2DFmode;
   16410       j = 5;
   16411       n = 2;
   16412       goto half;
   16413 
   16414 half:
   16415       /* Compute offset.  */
   16416       i = elt / n;
   16417       elt %= n;
   16418 
   16419       gcc_assert (i <= 1);
   16420 
   16421       /* Extract the half.  */
   16422       tmp = gen_reg_rtx (half_mode);
   16423       emit_insn (gen_extract[j][i] (tmp, target));
   16424 
   16425       /* Put val in tmp at elt.  */
   16426       ix86_expand_vector_set (false, tmp, val, elt);
   16427 
   16428       /* Put it back.  */
   16429       emit_insn (gen_insert[j][i] (target, target, tmp));
   16430       return;
   16431 
   16432     case E_V8DFmode:
   16433       if (TARGET_AVX512F)
   16434 	{
   16435 	  mmode = QImode;
   16436 	  gen_blendm = gen_avx512f_blendmv8df;
   16437 	}
   16438       break;
   16439 
   16440     case E_V8DImode:
   16441       if (TARGET_AVX512F)
   16442 	{
   16443 	  mmode = QImode;
   16444 	  gen_blendm = gen_avx512f_blendmv8di;
   16445 	}
   16446       break;
   16447 
   16448     case E_V16SFmode:
   16449       if (TARGET_AVX512F)
   16450 	{
   16451 	  mmode = HImode;
   16452 	  gen_blendm = gen_avx512f_blendmv16sf;
   16453 	}
   16454       break;
   16455 
   16456     case E_V16SImode:
   16457       if (TARGET_AVX512F)
   16458 	{
   16459 	  mmode = HImode;
   16460 	  gen_blendm = gen_avx512f_blendmv16si;
   16461 	}
   16462       break;
   16463 
   16464     case E_V32HFmode:
   16465       if (TARGET_AVX512BW)
   16466 	{
   16467 	  mmode = SImode;
   16468 	  gen_blendm = gen_avx512bw_blendmv32hf;
   16469 	}
   16470       break;
   16471     case E_V32HImode:
   16472       if (TARGET_AVX512BW)
   16473 	{
   16474 	  mmode = SImode;
   16475 	  gen_blendm = gen_avx512bw_blendmv32hi;
   16476 	}
   16477       else if (TARGET_AVX512F)
   16478 	{
   16479 	  half_mode = E_V8HImode;
   16480 	  n = 8;
   16481 	  goto quarter;
   16482 	}
   16483       break;
   16484 
   16485     case E_V64QImode:
   16486       if (TARGET_AVX512BW)
   16487 	{
   16488 	  mmode = DImode;
   16489 	  gen_blendm = gen_avx512bw_blendmv64qi;
   16490 	}
   16491       else if (TARGET_AVX512F)
   16492 	{
   16493 	  half_mode = E_V16QImode;
   16494 	  n = 16;
   16495 	  goto quarter;
   16496 	}
   16497       break;
   16498 
   16499 quarter:
   16500       /* Compute offset.  */
   16501       i = elt / n;
   16502       elt %= n;
   16503 
   16504       gcc_assert (i <= 3);
   16505 
   16506       {
   16507 	/* Extract the quarter.  */
   16508 	tmp = gen_reg_rtx (V4SImode);
   16509 	rtx tmp2 = gen_lowpart (V16SImode, target);
   16510 	rtx mask = gen_reg_rtx (QImode);
   16511 
   16512 	emit_move_insn (mask, constm1_rtx);
   16513 	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
   16514 						   tmp, mask));
   16515 
   16516 	tmp2 = gen_reg_rtx (half_mode);
   16517 	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
   16518 	tmp = tmp2;
   16519 
   16520 	/* Put val in tmp at elt.  */
   16521 	ix86_expand_vector_set (false, tmp, val, elt);
   16522 
   16523 	/* Put it back.  */
   16524 	tmp2 = gen_reg_rtx (V16SImode);
   16525 	rtx tmp3 = gen_lowpart (V16SImode, target);
   16526 	mask = gen_reg_rtx (HImode);
   16527 	emit_move_insn (mask, constm1_rtx);
   16528 	tmp = gen_lowpart (V4SImode, tmp);
   16529 	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
   16530 						  tmp3, mask));
   16531 	emit_move_insn (target, gen_lowpart (mode, tmp2));
   16532       }
   16533       return;
   16534 
   16535     default:
   16536       break;
   16537     }
   16538 
   16539   if (mmode != VOIDmode)
   16540     {
   16541       tmp = gen_reg_rtx (mode);
   16542       emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
   16543       rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
   16544       /* The avx512*_blendm<mode> expanders have different operand order
   16545 	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
   16546 	 elements where the mask is set and second input operand otherwise,
   16547 	 in {sse,avx}*_*blend* the first input operand is used for elements
   16548 	 where the mask is clear and second input operand otherwise.  */
   16549       if (!blendm_const)
   16550 	merge_mask = force_reg (mmode, merge_mask);
   16551       emit_insn (gen_blendm (target, target, tmp, merge_mask));
   16552     }
   16553   else if (use_vec_merge)
   16554     {
   16555 do_vec_merge:
   16556       if (!nonimmediate_operand (val, inner_mode))
   16557 	val = force_reg (inner_mode, val);
   16558       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
   16559       tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
   16560 			       GEN_INT (HOST_WIDE_INT_1U << elt));
   16561       emit_insn (gen_rtx_SET (target, tmp));
   16562     }
   16563   else
   16564     {
   16565       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   16566 
   16567       emit_move_insn (mem, target);
   16568 
   16569       tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
   16570       emit_move_insn (tmp, val);
   16571 
   16572       emit_move_insn (target, mem);
   16573     }
   16574 }
   16575 
/* Extract element ELT of vector VEC (mode MODE) into the scalar register
   TARGET (mode GET_MODE_INNER (MODE)).  MMX_OK says whether MMX registers
   may be used for MODE.  Elements with no direct extract pattern are first
   shuffled into position 0 and extracted from there; 256/512-bit vectors
   are narrowed to their half containing ELT and handled recursively; as a
   last resort the vector is spilled through a stack temporary.  */

void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      /* Without SSE4.1 insertps/extractps, shuffle the wanted element
	 into position 0 and extract from there.  */
      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
				       GEN_INT (elt), GEN_INT (elt),
				       GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  /* unpckhps brings element 2 down to position 0.  */
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  /* Without pextrd, shuffle the wanted element into position 0
	     and extract from there.  */
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      /* punpckhdq brings element 2 down to position 0.  */
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V2HImode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      if (!use_vec_extr
	  && TARGET_SSE2
	  && elt == 0
	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
	{
	  /* Without pextrb, element 0 can still be had cheaply by
	     extracting the low SImode element and taking its low byte.  */
	  tmp = gen_reg_rtx (SImode);
	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
				      0);
	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
	  return;
	}
      break;
    case E_V4QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  /* Extract the 128-bit half that contains ELT, then recurse
	     on that half with the element index reduced accordingly.
	     The same scheme is used for all the wider modes below.  */
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  /* 512-bit modes: extract the 256-bit half containing ELT
	     and recurse, as for the 256-bit modes above.  */
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HFmode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HFmode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      /* Emit a (vec_select:INNER vec (parallel [elt])) move.  */
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  rtx reg = gen_reg_rtx (SImode);
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  emit_move_insn (reg, tmp);
	  /* Mark the lowpart subreg as promoted so later passes can drop
	     redundant zero extensions of the extracted value.  */
	  tmp = gen_lowpart (inner_mode, reg);
	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
	  SUBREG_PROMOTED_SET (tmp, 1);
	}

      emit_move_insn (target, tmp);
    }
  else
    {
      /* No usable extract pattern: spill the vector to the stack and
	 reload the wanted element with a scalar move.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
   16908 
   16909 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   16910    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   16911    The upper bits of DEST are undefined, though they shouldn't cause
   16912    exceptions (some bits from src or all zeros are ok).  */
   16913 
static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  /* D is where the shuffle/shift pattern actually writes; when the chosen
     pattern needs a different mode than DEST, we emit into a fresh pseudo
     and copy the lowpart back to DEST at the end.  */
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
	/* movhlps copies the upper two floats onto the lower two.  */
	tem = gen_sse_movhlps (dest, src, src);
      else
	/* Bring element 1 down to element 0.  */
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    case E_V4QImode:
      /* Integer vectors: logical right shift of the whole register by
	 I/2 bits moves the upper half down to the lower half.  */
      d = gen_reg_rtx (V1SImode);
      tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V4HImode:
      d = gen_reg_rtx (V1DImode);
      tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      /* Shift the full 128-bit register right by I/2 bits.  */
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      if (i == 256)
	/* Swap the two 128-bit lanes.  */
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  /* Swap the two 128-bit lanes.  */
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  /* Shift each 128-bit lane right by I/2 bits.  */
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;
    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
      if (i < 64)
	{
	  /* Small shift counts: shift each 128-bit lane right.  */
	  d = gen_reg_rtx (V4TImode);
	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
					GEN_INT (i / 2));
	  break;
	}
      /* FALLTHRU */
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	/* Move the upper 128-bit chunks of the register down (viewed
	   as V16SI regardless of the actual element mode).  */
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	/* In-lane dword shuffle bringing the wanted elements down.  */
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (0xF),
				    GEN_INT (0xF),
				    GEN_INT (0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  /* Copy back if the pattern wrote to a temporary of another mode.  */
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
   17038 
   17039 /* Expand a vector reduction.  FN is the binary pattern to reduce;
   17040    DEST is the destination; IN is the input vector.  */
   17041 
   17042 void
   17043 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
   17044 {
   17045   rtx half, dst, vec = in;
   17046   machine_mode mode = GET_MODE (in);
   17047   int i;
   17048 
   17049   /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
   17050   if (TARGET_SSE4_1
   17051       && mode == V8HImode
   17052       && fn == gen_uminv8hi3)
   17053     {
   17054       emit_insn (gen_sse4_1_phminposuw (dest, in));
   17055       return;
   17056     }
   17057 
   17058   for (i = GET_MODE_BITSIZE (mode);
   17059        i > GET_MODE_UNIT_BITSIZE (mode);
   17060        i >>= 1)
   17061     {
   17062       half = gen_reg_rtx (mode);
   17063       emit_reduc_half (half, vec, i);
   17064       if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
   17065 	dst = dest;
   17066       else
   17067 	dst = gen_reg_rtx (mode);
   17068       emit_insn (fn (dst, half, vec));
   17069       vec = dst;
   17070     }
   17071 }
   17072 
   17073 /* Output code to perform a conditional jump to LABEL, if C2 flag in
   17074    FP status register is set.  */
   17075 
   17076 void
   17077 ix86_emit_fp_unordered_jump (rtx label)
   17078 {
   17079   rtx reg = gen_reg_rtx (HImode);
   17080   rtx_insn *insn;
   17081   rtx temp;
   17082 
   17083   emit_insn (gen_x86_fnstsw_1 (reg));
   17084 
   17085   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
   17086     {
   17087       emit_insn (gen_x86_sahf_1 (reg));
   17088 
   17089       temp = gen_rtx_REG (CCmode, FLAGS_REG);
   17090       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
   17091     }
   17092   else
   17093     {
   17094       emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
   17095 
   17096       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
   17097       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
   17098     }
   17099 
   17100   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
   17101 			      gen_rtx_LABEL_REF (VOIDmode, label),
   17102 			      pc_rtx);
   17103   insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
   17104   predict_jump (REG_BR_PROB_BASE * 10 / 100);
   17105   JUMP_LABEL (insn) = label;
   17106 }
   17107 
   17108 /* Output code to perform an sinh XFmode calculation.  */
   17109 
void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* sinh(x) = sign(x) * 0.5 * (e + e / (e + 1)) with e = expm1 (|x|).
     Using expm1 keeps the result accurate for small |x|, where
     exp (|x|) - 1 would cancel catastrophically.  */

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1), via the C1 bit (mask 0x02) of the fxam
     result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2; the jump skips the negation when the
     sign bit is clear.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* Either sign of the input is equally likely.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
   17156 
   17157 /* Output code to perform an cosh XFmode calculation.  */
   17158 
   17159 void
   17160 ix86_emit_i387_cosh (rtx op0, rtx op1)
   17161 {
   17162   rtx e1 = gen_reg_rtx (XFmode);
   17163   rtx e2 = gen_reg_rtx (XFmode);
   17164   rtx half = const_double_from_real_value (dconsthalf, XFmode);
   17165   rtx cst1;
   17166 
   17167   /* e1 = exp (op1) */
   17168   emit_insn (gen_expxf2 (e1, op1));
   17169 
   17170   /* e2 = e1 + 1.0 / e1 */
   17171   cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
   17172   emit_insn (gen_divxf3 (e2, cst1, e1));
   17173   emit_insn (gen_addxf3 (e2, e1, e2));
   17174 
   17175   /* op0 = 0.5 * e2 */
   17176   half = force_reg (XFmode, half);
   17177   emit_insn (gen_mulxf3 (op0, e2, half));
   17178 }
   17179 
   17180 /* Output code to perform an tanh XFmode calculation.  */
   17181 
void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* With e = expm1 (-2*|x|), e / (e + 2) equals -tanh (|x|); the sign
     is then patched up from the fxam result.  Using expm1 keeps the
     computation accurate for small |x|.  */

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1), via the C1 bit (mask 0x02) of the fxam
     result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2; e2 holds -tanh (|op1|), which is already
     correct for negative op1, so the jump skips the negation when the
     sign bit is set.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* Either sign of the input is equally likely.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
   17226 
   17227 /* Output code to perform an asinh XFmode calculation.  */
   17228 
void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* asinh(x) = sign(x) * log1p (|x| + x^2 / (sqrt (x^2 + 1) + 1)),
     using sqrt (x^2 + 1) - 1 == x^2 / (sqrt (x^2 + 1) + 1).  The log1p
     form stays accurate for small |x|.  */

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1), via the C1 bit (mask 0x02) of the fxam
     result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2; the jump skips the negation when the
     sign bit is clear.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* Either sign of the input is equally likely.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
   17279 
   17280 /* Output code to perform an acosh XFmode calculation.  */
   17281 
   17282 void
   17283 ix86_emit_i387_acosh (rtx op0, rtx op1)
   17284 {
   17285   rtx e1 = gen_reg_rtx (XFmode);
   17286   rtx e2 = gen_reg_rtx (XFmode);
   17287   rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
   17288 
   17289   /* e2 = sqrt (op1 + 1.0) */
   17290   emit_insn (gen_addxf3 (e2, op1, cst1));
   17291   emit_insn (gen_sqrtxf2 (e2, e2));
   17292 
   17293   /* e1 = sqrt (op1 - 1.0) */
   17294   emit_insn (gen_subxf3 (e1, op1, cst1));
   17295   emit_insn (gen_sqrtxf2 (e1, e1));
   17296 
   17297   /* e1 = e1 * e2 */
   17298   emit_insn (gen_mulxf3 (e1, e1, e2));
   17299 
   17300   /* e1 = e1 + op1 */
   17301   emit_insn (gen_addxf3 (e1, e1, op1));
   17302 
   17303   /* op0 = log (e1) */
   17304   emit_insn (gen_logxf2 (op0, e1));
   17305 }
   17306 
/* Output code to perform an atanh XFmode calculation.  OP0 is the
   output and OP1 the input, both XFmode values.  Computes
     atanh(|x|) = -0.5 * log1p (-(2*|x|) / (|x| + 1.0))
   (note log1p of that argument is log ((1 - |x|) / (1 + |x|)),
   i.e. -2*atanh(|x|)), then restores the sign of the result from
   the FXAM status of OP1.  */

void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2; e2 is already the correctly-signed
     value when the sign bit of OP1 is set, so branch around the
     negation in that case.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* The sign of the input is unpredictable: 50/50 branch estimate.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
   17358 
/* Output code to perform a log1p XFmode calculation, OP0 = log (1.0 + OP1).
   For |OP1| below 1 - sqrt(0.5) (~0.2929, the limit of the argument
   range for which fyl2xp1 is defined) the fyl2xp1 instruction is used
   directly for best accuracy near zero; otherwise fyl2x is applied to
   1.0 + OP1.  Both paths scale by ln(2) (the fldln2 constant) to turn
   the base-2 logarithm into a natural one.  */

void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  /* 1 - sqrt(0.5): the cutoff between the fyl2xp1 and fyl2x paths.  */
  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  /* Large arguments are expected to be rare.  */
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  /* res = fyl2xp1 (op1) * ln2 */
  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  /* res = fyl2x (op1 + 1.0) * ln2 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
   17404 
/* Emit code for round calculation: OP0 = round (OP1), rounding half
   away from zero via
     round (a) = sgn (a) * floor (|a| + 0.5)
   OP1 is SFmode, DFmode or XFmode (narrower inputs are first extended
   to XFmode).  OP0 may be a float mode (round) or HImode/SImode/DImode
   (an lround-style conversion).  The sign is recovered from the FXAM
   status of OP1 and applied to the result afterwards.  */
void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  /* Widen narrower inputs to XFmode; all the arithmetic below is
     done in XFmode.  */
  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  /* Pick floor and negate expanders matching the output mode.  */
  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2); for float output modes the XFmode floor result
     must be truncated to the output mode.  */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
	tmp = gen_reg_rtx (XFmode);

	emit_insn (floor_insn (tmp, e2));
	emit_insn (gen_rtx_SET (res,
				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
						UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res; branch around the negation when the
     sign bit of OP1 is clear.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  /* The sign of the input is unpredictable: 50/50 branch estimate.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
   17517 
   17518 /* Output code to perform a Newton-Rhapson approximation of a single precision
   17519    floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
   17520 
   17521 void
   17522 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
   17523 {
   17524   rtx x0, x1, e0, e1;
   17525 
   17526   x0 = gen_reg_rtx (mode);
   17527   e0 = gen_reg_rtx (mode);
   17528   e1 = gen_reg_rtx (mode);
   17529   x1 = gen_reg_rtx (mode);
   17530 
   17531   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
   17532 
   17533   b = force_reg (mode, b);
   17534 
   17535   /* x0 = rcp(b) estimate */
   17536   if (mode == V16SFmode || mode == V8DFmode)
   17537     {
   17538       if (TARGET_AVX512ER)
   17539 	{
   17540 	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
   17541 						      UNSPEC_RCP28)));
   17542 	  /* res = a * x0 */
   17543 	  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
   17544 	  return;
   17545 	}
   17546       else
   17547 	emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
   17548 						    UNSPEC_RCP14)));
   17549     }
   17550   else
   17551     emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
   17552 						UNSPEC_RCP)));
   17553 
   17554   /* e0 = x0 * b */
   17555   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
   17556 
   17557   /* e0 = x0 * e0 */
   17558   emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
   17559 
   17560   /* e1 = x0 + x0 */
   17561   emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
   17562 
   17563   /* x1 = e1 - e0 */
   17564   emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
   17565 
   17566   /* res = a * x1 */
   17567   emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
   17568 }
   17569 
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root:
   RES = sqrt (A) if !RECIP, RES = rsqrt (A) if RECIP, in MODE,
   refining one hardware rsqrt estimate.  With AVX512ER on V16SFmode
   the 28-bit estimates are used without a refinement step.  */

void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate; sqrt(a) = 1 / rsqrt(a).  */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  /* Constants -3.0 and -0.5 used by the refinement formula below.  */
  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX(mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  /* Zero x0 wherever a == 0.0 by ANDing with an all-ones
	     not-equal comparison mask.  */
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    /* e2 = e0 * x0 - 3.0, fused when an FMA form is available.  */
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3. (mthree holds -3.0, hence the PLUS).  */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
   17681 
/* Expand fabs (OP0) and return a new rtx that holds the result.  The
   mask for masking out the sign-bit is stored in *SMASK, if that is
   non-null.  The absolute value is computed by ANDing OP0 with an
   inverted sign-bit mask; for scalar SF/DF modes the mask is built in
   the corresponding vector mode and element 0 is extracted.  */

static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;
  /* Invert = true: the mask has all bits set EXCEPT the sign bit.  */
  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  /* xa = op0 & ~signbit */
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
   17715 
/* Expands a comparison of OP0 with OP1 using comparison code CODE,
   swapping the operands if SWAP_OPERANDS is true.  The expanded
   code is a forward jump to a newly created label in case the
   comparison is true.  The generated label rtx is returned.  */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
                                  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  /* Set the FP condition flags; wrap the comparison in UNSPEC_NOTRAP
     for codes that must not trap on quiet NaNs.  */
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  /* Conditional forward jump to LABEL when CODE holds.  */
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
   17745 
   17746 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
   17747    using comparison code CODE.  Operands are swapped for the comparison if
   17748    SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
   17749 static rtx
   17750 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
   17751 			      bool swap_operands)
   17752 {
   17753   rtx (*insn)(rtx, rtx, rtx, rtx);
   17754   machine_mode mode = GET_MODE (op0);
   17755   rtx mask = gen_reg_rtx (mode);
   17756 
   17757   if (swap_operands)
   17758     std::swap (op0, op1);
   17759 
   17760   insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
   17761 
   17762   emit_insn (insn (mask, op0, op1,
   17763 		   gen_rtx_fmt_ee (code, mode, op0, op1)));
   17764   return mask;
   17765 }
   17766 
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit.  ABS_VALUE is expected to have its sign bit clear, so
   ORing in the sign bit extracted from SIGN implements copysign.  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else
	vmode = mode;

      /* Invert = false: only the sign bit is set in the mask.  */
      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    /* A caller-supplied MASK has the sign bit clear, so use its
       complement to select the sign bit.  */
    mask = gen_rtx_NOT (mode, mask);
  /* sgn = sign & signbit; result = abs_value | sgn.  */
  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
   17802 
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  OP1 is a scalar float, OP0 the integer result.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0): the largest value strictly below 0.5,
     computed as 0.5 - 2^(-p-1) for a p-bit mantissa, so that inputs
     just under a half-way point are not rounded away from zero.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj, truncating toward zero.  */
  expand_fix (op0, adj, 0);
}
   17833 
/* Expand SSE2 sequence for computing lfloor (DO_FLOOR true) or lceil
   (DO_FLOOR false) from OP1 storing into OP0.  OP1 is a scalar float,
   OP0 the integer result.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1, truncating toward zero.  */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg; for ceil the operands are
     swapped so the same UNLE test implements freg < op1 and the
     adjustment becomes ireg + 1.  */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
   17870 
   17871 /* Generate and return a rtx of mode MODE for 2**n where n is the number
   17872    of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
   17873 
   17874 static rtx
   17875 ix86_gen_TWO52 (machine_mode mode)
   17876 {
   17877   const struct real_format *fmt;
   17878   REAL_VALUE_TYPE TWO52r;
   17879   rtx TWO52;
   17880 
   17881   fmt = REAL_MODE_FORMAT (mode);
   17882   real_2expN (&TWO52r, fmt->p - 1, mode);
   17883   TWO52 = const_double_from_real_value (TWO52r, mode);
   17884   TWO52 = force_reg (mode, TWO52);
   17885 
   17886   return TWO52;
   17887 }
   17888 
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.
   Rounds to integer using the current rounding mode by adding and
   subtracting 2**52 (values >= 2**52 are already integral and are
   returned unchanged).  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* With -frounding-math the add/sub must be done on the signed value
     so the current rounding direction is honored.  */
  if (flag_rounding_math)
    {
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   17943 
/* Expand SSE2 sequence for computing floor (DO_FLOOR true) or ceil
   (DO_FLOOR false) from OPERAND1 storing into OPERAND0.  Relies on a
   float->integer->float round trip, so it needs an integer mode as
   wide as the float mode (see ix86_expand_floorceildf_32 for the
   32-bit DFmode variant).  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; values that large are
     already integral.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x, truncating toward zero.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0); the comparison
     mask is ANDed with 1.0 to get the adjustment, avoiding a branch.
     For ceil the operands are swapped and the adjustment is added.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   18010 
/* Expand SSE2 sequence for computing floor (DO_FLOOR true) or ceil
   (DO_FLOOR false) from OPERAND1 storing into OPERAND0 without
   relying on DImode truncation via cvttsd2siq that is only available
   on 64bit targets.  Rounds by adding and subtracting TWO52 instead
   of a float->integer->float round trip.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; values that large are
     already integral.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; rounds to integer in the current
     rounding mode.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0); the comparison
     mask is ANDed with 1.0 to get the adjustment, avoiding a branch.
     For ceil the operands are swapped and the adjustment is added.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   18081 
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  Truncates toward zero via a
   float->integer->float round trip, so it needs an integer mode as
   wide as the float mode (see ix86_expand_truncdf_32 for the 32-bit
   DFmode variant).  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; values that large are
     already integral.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x; the conversion truncates toward zero.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   18127 
   18128 /* Expand SSE sequence for computing trunc from OPERAND1 storing
   18129    into OPERAND0 without relying on DImode truncation via cvttsd2siq
   18130    that is only available on 64bit targets.  */
   18131 void
   18132 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
   18133 {
   18134   machine_mode mode = GET_MODE (operand0);
   18135   rtx xa, xa2, TWO52, tmp, one, res, mask;
   18136   rtx_code_label *label;
   18137 
   18138   /* C code for SSE variant we expand below.
   18139 	double xa = fabs (x), x2;
   18140 	if (!isless (xa, TWO52))
   18141 	  return x;
   18142 	xa2 = xa + TWO52 - TWO52;
   18143      Compensate:
   18144 	if (xa2 > xa)
   18145 	  xa2 -= 1.0;
   18146 	x2 = copysign (xa2, x);
   18147 	return x2;
   18148    */
   18149 
   18150   TWO52 = ix86_gen_TWO52 (mode);
   18151 
   18152   /* Temporary for holding the result, initialized to the input
   18153      operand to ease control flow.  */
   18154   res =copy_to_reg (operand1);
   18155 
   18156   /* xa = abs (operand1) */
   18157   xa = ix86_expand_sse_fabs (res, &mask);
   18158 
   18159   /* if (!isless (xa, TWO52)) goto label; */
   18160   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   18161 
   18162   /* xa2 = xa + TWO52 - TWO52; */
   18163   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
   18164   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
   18165 
   18166   /* generate 1.0 */
   18167   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
   18168 
   18169   /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0)  */
   18170   tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
   18171   emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
   18172   tmp = expand_simple_binop (mode, MINUS,
   18173 			     xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
   18174   /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
   18175   if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
   18176     tmp = ix86_expand_sse_fabs (tmp, NULL);
   18177 
   18178   /* res = copysign (xa2, operand1) */
   18179   ix86_sse_copysign_to_positive (res, tmp, res, mask);
   18180 
   18181   emit_label (label);
   18182   LABEL_NUSES (label) = 1;
   18183 
   18184   emit_move_insn (operand0, res);
   18185 }
   18186 
   18187 /* Expand SSE sequence for computing round
   18188    from OPERAND1 storing into OPERAND0.  */
   18189 void
   18190 ix86_expand_round (rtx operand0, rtx operand1)
   18191 {
   18192   /* C code for the stuff we're doing below:
   18193 	double xa = fabs (x);
   18194 	if (!isless (xa, TWO52))
   18195 	  return x;
   18196 	xa = (double)(long)(xa + nextafter (0.5, 0.0));
   18197 	return copysign (xa, x);
   18198    */
   18199   machine_mode mode = GET_MODE (operand0);
   18200   rtx res, TWO52, xa, xi, half, mask;
   18201   rtx_code_label *label;
   18202   const struct real_format *fmt;
   18203   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
   18204 
   18205   /* Temporary for holding the result, initialized to the input
   18206      operand to ease control flow.  */
   18207   res = copy_to_reg (operand1);
   18208 
   18209   TWO52 = ix86_gen_TWO52 (mode);
   18210   xa = ix86_expand_sse_fabs (res, &mask);
   18211   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
   18212 
   18213   /* load nextafter (0.5, 0.0) */
   18214   fmt = REAL_MODE_FORMAT (mode);
   18215   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
   18216   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
   18217 
   18218   /* xa = xa + 0.5 */
   18219   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
   18220   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
   18221 
   18222   /* xa = (double)(int64_t)xa */
   18223   xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
   18224   expand_fix (xi, xa, 0);
   18225   expand_float (xa, xi, 0);
   18226 
   18227   /* res = copysign (xa, operand1) */
   18228   ix86_sse_copysign_to_positive (res, xa, res, mask);
   18229 
   18230   emit_label (label);
   18231   LABEL_NUSES (label) = 1;
   18232 
   18233   emit_move_insn (operand0, res);
   18234 }
   18235 
   18236 /* Expand SSE sequence for computing round from OPERAND1 storing
   18237    into OPERAND0 without relying on DImode truncation via cvttsd2siq
   18238    that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* Compensate.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  /* Turn the compare mask into 1.0 where set, 0.0 where clear.  */
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0); tested as -0.5 >= dxa.  */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
   18305 
   18306 /* Expand SSE sequence for computing round
   18307    from OP1 storing into OP0 using sse4 round insn.  */
   18308 void
   18309 ix86_expand_round_sse4 (rtx op0, rtx op1)
   18310 {
   18311   machine_mode mode = GET_MODE (op0);
   18312   rtx e1, e2, res, half;
   18313   const struct real_format *fmt;
   18314   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
   18315   rtx (*gen_copysign) (rtx, rtx, rtx);
   18316   rtx (*gen_round) (rtx, rtx, rtx);
   18317 
   18318   switch (mode)
   18319     {
   18320     case E_SFmode:
   18321       gen_copysign = gen_copysignsf3;
   18322       gen_round = gen_sse4_1_roundsf2;
   18323       break;
   18324     case E_DFmode:
   18325       gen_copysign = gen_copysigndf3;
   18326       gen_round = gen_sse4_1_rounddf2;
   18327       break;
   18328     default:
   18329       gcc_unreachable ();
   18330     }
   18331 
   18332   /* round (a) = trunc (a + copysign (0.5, a)) */
   18333 
   18334   /* load nextafter (0.5, 0.0) */
   18335   fmt = REAL_MODE_FORMAT (mode);
   18336   real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
   18337   real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
   18338   half = const_double_from_real_value (pred_half, mode);
   18339 
   18340   /* e1 = copysign (0.5, op1) */
   18341   e1 = gen_reg_rtx (mode);
   18342   emit_insn (gen_copysign (e1, half, op1));
   18343 
   18344   /* e2 = op1 + e1 */
   18345   e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
   18346 
   18347   /* res = trunc (e2) */
   18348   res = gen_reg_rtx (mode);
   18349   emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
   18350 
   18351   emit_move_insn (op0, res);
   18352 }
   18353 
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  The const0_rtx placeholder operands are patched in
   with the real operands around each use and restored afterwards.  */

static GTY(()) rtx_insn *vselect_insn;
   18359 
   18360 /* Initialize vselect_insn.  */
   18361 
   18362 static void
   18363 init_vselect_insn (void)
   18364 {
   18365   unsigned i;
   18366   rtx x;
   18367 
   18368   x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
   18369   for (i = 0; i < MAX_VECT_LEN; ++i)
   18370     XVECEXP (x, 0, i) = const0_rtx;
   18371   x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
   18372 							const0_rtx), x);
   18373   x = gen_rtx_SET (const0_rtx, x);
   18374   start_sequence ();
   18375   vselect_insn = emit_insn (x);
   18376   end_sequence ();
   18377 }
   18378 
   18379 /* Construct (set target (vec_select op0 (parallel perm))) and
   18380    return true if that's a valid instruction in the active ISA.  */
   18381 
static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* Fill the cached insn's selector PARALLEL with PERM, trimmed to
     NELT entries.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  /* Temporarily splice the real source and destination into the
     cached insn and ask recog whether the result is a valid insn.  */
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  /* Emit a copy of the pattern, not the cached insn itself, which
     must stay reusable.  */
  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  /* Restore the placeholder operands and force re-recognition on the
     next use.  */
  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
   18412 
   18413 /* Similar, but generate a vec_concat from op0 and op1 as well.  */
   18414 
   18415 static bool
   18416 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
   18417 			const unsigned char *perm, unsigned nelt,
   18418 			bool testing_p)
   18419 {
   18420   machine_mode v2mode;
   18421   rtx x;
   18422   bool ok;
   18423 
   18424   if (vselect_insn == NULL_RTX)
   18425     init_vselect_insn ();
   18426 
   18427   if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
   18428     return false;
   18429   x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
   18430   PUT_MODE (x, v2mode);
   18431   XEXP (x, 0) = op0;
   18432   XEXP (x, 1) = op1;
   18433   ok = expand_vselect (target, x, perm, nelt, testing_p);
   18434   XEXP (x, 0) = const0_rtx;
   18435   XEXP (x, 1) = const0_rtx;
   18436   return ok;
   18437 }
   18438 
   18439 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   18440    using movss or movsd.  */
   18441 static bool
   18442 expand_vec_perm_movs (struct expand_vec_perm_d *d)
   18443 {
   18444   machine_mode vmode = d->vmode;
   18445   unsigned i, nelt = d->nelt;
   18446   rtx x;
   18447 
   18448   if (d->one_operand_p)
   18449     return false;
   18450 
   18451   if (!(TARGET_SSE && vmode == V4SFmode)
   18452       && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
   18453       && !(TARGET_SSE2 && vmode == V2DFmode))
   18454     return false;
   18455 
   18456   /* Only the first element is changed.  */
   18457   if (d->perm[0] != nelt && d->perm[0] != 0)
   18458     return false;
   18459   for (i = 1; i < nelt; ++i)
   18460     if (d->perm[i] != i + nelt - d->perm[0])
   18461       return false;
   18462 
   18463   if (d->testing_p)
   18464     return true;
   18465 
   18466   if (d->perm[0] == nelt)
   18467     x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
   18468   else
   18469     x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
   18470 
   18471   emit_insn (gen_rtx_SET (d->target, x));
   18472 
   18473   return true;
   18474 }
   18475 
   18476 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   18477    in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
   18478 
static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  /* A blend needs two distinct operands.  */
  if (d->one_operand_p)
    return false;
  /* Check that a blend insn for this mode exists on the active ISA.  */
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
			     || GET_MODE_SIZE (vmode) == 8
			     || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
	return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  /* MASK accumulates, per destination element, whether the element is
     taken from OP1 (bit or bit-field set) or OP0 (clear).  */
  mask = 0;

  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      /* One mask bit per element; the blend pattern takes it as-is.  */
      for (i = 0; i < nelt; ++i)
	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      /* Recast as a V8HImode pblendw: 4 mask bits per DI element.  */
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      /* Recast as a V4HImode blend: 2 mask bits per SI element.  */
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      /* Recast as a V8HImode pblendw: 2 mask bits per SI element.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8HImode;
      goto do_subreg;

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
	 an immediate argument, rather than pblendvb with a vector
	 argument.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  {
	  use_pblendvb:
	    /* Build the constant selector vector: all-ones (constm1)
	       selects from OP1, zero selects from OP0.  */
	    for (i = 0; i < nelt; ++i)
	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

	  finish_pblendvb:
	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
	    vperm = force_reg (vmode, vperm);

	    if (GET_MODE_SIZE (vmode) == 4)
	      emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 8)
	      emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 16)
	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
	    else
	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }

      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      /* Perform the blend in the recast mode VMODE on lowpart views of
	 the operands; the result is copied back in d->vmode below.  */
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      /* See if bytes move in pairs, else fall back to pblendvb.  */
      for (i = 0; i < 8; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      /* See if bytes move in pairs, else fall back to pblendvb.  */
      for (i = 0; i < 4; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
	 with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
	if (d->perm[i] + 2 != d->perm[i + 2])
	  break;
      if (i < 32)
	{
	  /* See if bytes move the same in both lanes.  If yes,
	     vpblendw with immediate can be used.  */
	  for (i = 0; i < 16; i += 2)
	    if (d->perm[i] + 16 != d->perm[i + 16])
	      goto use_pblendvb;

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i * 2] >= 32) << i;
	  vmode = V16HImode;
	  goto do_subreg;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  break;
      if (i < 16)
	{
	  /* See if words move the same in both lanes.  If not,
	     vpblendvb must be used.  */
	  for (i = 0; i < 8; i++)
	    if (d->perm[i] + 8 != d->perm[i + 8])
	      {
		/* Use vpblendvb.  */
		for (i = 0; i < 32; ++i)
		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

		vmode = V32QImode;
		nelt = 32;
		target = gen_reg_rtx (vmode);
		op0 = gen_lowpart (vmode, op0);
		op1 = gen_lowpart (vmode, op1);
		goto finish_pblendvb;
	      }

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i] >= 16) << i;
	  break;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  /* For the 512-bit modes the mask operand must live in a register;
     choose an integer mode with one bit per vector element.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
   18733 
   18734 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   18735    in terms of the variable form of vpermilps.
   18736 
   18737    Note that we will have already failed the immediate input vpermilps,
   18738    which requires that the high and low part shuffle be identical; the
   18739    variable form doesn't require that.  */
   18740 
   18741 static bool
   18742 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
   18743 {
   18744   rtx rperm[8], vperm;
   18745   unsigned i;
   18746 
   18747   if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
   18748     return false;
   18749 
   18750   /* We can only permute within the 128-bit lane.  */
   18751   for (i = 0; i < 8; ++i)
   18752     {
   18753       unsigned e = d->perm[i];
   18754       if (i < 4 ? e >= 4 : e < 4)
   18755 	return false;
   18756     }
   18757 
   18758   if (d->testing_p)
   18759     return true;
   18760 
   18761   for (i = 0; i < 8; ++i)
   18762     {
   18763       unsigned e = d->perm[i];
   18764 
   18765       /* Within each 128-bit lane, the elements of op0 are numbered
   18766 	 from 0 and the elements of op1 are numbered from 4.  */
   18767       if (e >= 8 + 4)
   18768 	e -= 8;
   18769       else if (e >= 4)
   18770 	e -= 4;
   18771 
   18772       rperm[i] = GEN_INT (e);
   18773     }
   18774 
   18775   vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
   18776   vperm = force_reg (V8SImode, vperm);
   18777   emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
   18778 
   18779   return true;
   18780 }
   18781 
   18782 /* For V*[QHS]Imode permutations, check if the same permutation
   18783    can't be performed in a 2x, 4x or 8x wider inner mode.  */
   18784 
static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
			      struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  /* Map the vector mode to the 2x-wider-element mode with half as
     many elements; bail out for modes with no wider counterpart.  */
  switch (d->vmode)
    {
    case E_V8QImode: mode = V4HImode; break;
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V4HImode: mode = V2SImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  /* Each destination pair must be an aligned consecutive source pair,
     otherwise the permutation cannot be expressed in the wider mode.  */
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  /* Build the halved permutation in ND.  */
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  /* Recurse in place to widen further; DImode elements are the limit.  */
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
	{
	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
	}
      if (d->testing_p)
	/* A raw pseudo suffices when only testing validity.  */
	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
	nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
   18834 
   18835 /* Return true if permutation D can be performed as VMODE permutation
   18836    instead.  */
   18837 
   18838 static bool
   18839 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
   18840 {
   18841   unsigned int i, j, chunk;
   18842 
   18843   if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
   18844       || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
   18845       || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
   18846     return false;
   18847 
   18848   if (GET_MODE_NUNITS (vmode) >= d->nelt)
   18849     return true;
   18850 
   18851   chunk = d->nelt / GET_MODE_NUNITS (vmode);
   18852   for (i = 0; i < d->nelt; i += chunk)
   18853     if (d->perm[i] & (chunk - 1))
   18854       return false;
   18855     else
   18856       for (j = 1; j < chunk; ++j)
   18857 	if (d->perm[i] + j != d->perm[i + j])
   18858 	  return false;
   18859 
   18860   return true;
   18861 }
   18862 
   18863 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   18864    in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
   18865 
   18866 static bool
   18867 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   18868 {
   18869   unsigned i, nelt, eltsz, mask;
   18870   unsigned char perm[64];
   18871   machine_mode vmode;
   18872   struct expand_vec_perm_d nd;
   18873   rtx rperm[64], vperm, target, op0, op1;
   18874 
   18875   nelt = d->nelt;
   18876 
   18877   if (!d->one_operand_p)
   18878     switch (GET_MODE_SIZE (d->vmode))
   18879       {
   18880       case 4:
   18881 	if (!TARGET_XOP)
   18882 	  return false;
   18883 	vmode = V4QImode;
   18884 	break;
   18885 
   18886       case 8:
   18887 	if (!TARGET_XOP)
   18888 	  return false;
   18889 	vmode = V8QImode;
   18890 	break;
   18891 
   18892       case 16:
   18893 	if (!TARGET_XOP)
   18894 	  return false;
   18895 	vmode = V16QImode;
   18896 	break;
   18897 
   18898       case 32:
   18899 	if (!TARGET_AVX2)
   18900 	  return false;
   18901 
   18902 	if (valid_perm_using_mode_p (V2TImode, d))
   18903 	  {
   18904 	    if (d->testing_p)
   18905 	      return true;
   18906 
   18907 	    /* Use vperm2i128 insn.  The pattern uses
   18908 	       V4DImode instead of V2TImode.  */
   18909 	    target = d->target;
   18910 	    if (d->vmode != V4DImode)
   18911 	      target = gen_reg_rtx (V4DImode);
   18912 	    op0 = gen_lowpart (V4DImode, d->op0);
   18913 	    op1 = gen_lowpart (V4DImode, d->op1);
   18914 	    rperm[0]
   18915 	      = GEN_INT ((d->perm[0] / (nelt / 2))
   18916 			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
   18917 	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
   18918 	    if (target != d->target)
   18919 	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
   18920 	    return true;
   18921 	  }
   18922 	/* FALLTHRU */
   18923 
   18924       default:
   18925 	return false;
   18926       }
   18927   else
   18928     switch (GET_MODE_SIZE (d->vmode))
   18929       {
   18930       case 4:
   18931 	if (!TARGET_SSSE3)
   18932 	  return false;
   18933 	vmode = V4QImode;
   18934 	break;
   18935 
   18936       case 8:
   18937 	if (!TARGET_SSSE3)
   18938 	  return false;
   18939 	vmode = V8QImode;
   18940 	break;
   18941 
   18942       case 16:
   18943 	if (!TARGET_SSSE3)
   18944 	  return false;
   18945 	vmode = V16QImode;
   18946 	break;
   18947 
   18948       case 32:
   18949 	if (!TARGET_AVX2)
   18950 	  return false;
   18951 
   18952 	/* V4DImode should be already handled through
   18953 	   expand_vselect by vpermq instruction.  */
   18954 	gcc_assert (d->vmode != V4DImode);
   18955 
   18956 	vmode = V32QImode;
   18957 	if (d->vmode == V8SImode
   18958 	    || d->vmode == V16HImode
   18959 	    || d->vmode == V32QImode)
   18960 	  {
   18961 	    /* First see if vpermq can be used for
   18962 	       V8SImode/V16HImode/V32QImode.  */
   18963 	    if (valid_perm_using_mode_p (V4DImode, d))
   18964 	      {
   18965 		for (i = 0; i < 4; i++)
   18966 		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
   18967 		if (d->testing_p)
   18968 		  return true;
   18969 		target = gen_reg_rtx (V4DImode);
   18970 		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
   18971 				    perm, 4, false))
   18972 		  {
   18973 		    emit_move_insn (d->target,
   18974 				    gen_lowpart (d->vmode, target));
   18975 		    return true;
   18976 		  }
   18977 		return false;
   18978 	      }
   18979 
   18980 	    /* Next see if vpermd can be used.  */
   18981 	    if (valid_perm_using_mode_p (V8SImode, d))
   18982 	      vmode = V8SImode;
   18983 	  }
   18984 	/* Or if vpermps can be used.  */
   18985 	else if (d->vmode == V8SFmode)
   18986 	  vmode = V8SImode;
   18987 
   18988 	if (vmode == V32QImode)
   18989 	  {
   18990 	    /* vpshufb only works intra lanes, it is not
   18991 	       possible to shuffle bytes in between the lanes.  */
   18992 	    for (i = 0; i < nelt; ++i)
   18993 	      if ((d->perm[i] ^ i) & (nelt / 2))
   18994 		return false;
   18995 	  }
   18996 	break;
   18997 
   18998       case 64:
   18999 	if (!TARGET_AVX512BW)
   19000 	  return false;
   19001 
   19002 	/* If vpermq didn't work, vpshufb won't work either.  */
   19003 	if (d->vmode == V8DFmode || d->vmode == V8DImode)
   19004 	  return false;
   19005 
   19006 	vmode = V64QImode;
   19007 	if (d->vmode == V16SImode
   19008 	    || d->vmode == V32HImode
   19009 	    || d->vmode == V64QImode)
   19010 	  {
   19011 	    /* First see if vpermq can be used for
   19012 	       V16SImode/V32HImode/V64QImode.  */
   19013 	    if (valid_perm_using_mode_p (V8DImode, d))
   19014 	      {
   19015 		for (i = 0; i < 8; i++)
   19016 		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
   19017 		if (d->testing_p)
   19018 		  return true;
   19019 		target = gen_reg_rtx (V8DImode);
   19020 		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
   19021 				    perm, 8, false))
   19022 		  {
   19023 		    emit_move_insn (d->target,
   19024 				    gen_lowpart (d->vmode, target));
   19025 		    return true;
   19026 		  }
   19027 		return false;
   19028 	      }
   19029 
   19030 	    /* Next see if vpermd can be used.  */
   19031 	    if (valid_perm_using_mode_p (V16SImode, d))
   19032 	      vmode = V16SImode;
   19033 	  }
   19034 	/* Or if vpermps can be used.  */
   19035 	else if (d->vmode == V16SFmode)
   19036 	  vmode = V16SImode;
   19037 
   19038 	if (vmode == V64QImode)
   19039 	  {
   19040 	    /* vpshufb only works intra lanes, it is not
   19041 	       possible to shuffle bytes in between the lanes.  */
   19042 	    for (i = 0; i < nelt; ++i)
   19043 	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
   19044 		return false;
   19045 	  }
   19046 	break;
   19047 
   19048       default:
   19049 	return false;
   19050       }
   19051 
   19052   if (d->testing_p)
   19053     return true;
   19054 
   19055   /* Try to avoid variable permutation instruction.  */
   19056   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
   19057     {
   19058       emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
   19059       return true;
   19060     }
   19061 
   19062   if (vmode == V8SImode)
   19063     for (i = 0; i < 8; ++i)
   19064       rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
   19065   else if (vmode == V16SImode)
   19066     for (i = 0; i < 16; ++i)
   19067       rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
   19068   else
   19069     {
   19070       eltsz = GET_MODE_UNIT_SIZE (d->vmode);
   19071       if (!d->one_operand_p)
   19072 	mask = 2 * nelt - 1;
   19073       else if (vmode == V64QImode)
   19074 	mask = nelt / 4 - 1;
   19075       else if (vmode == V32QImode)
   19076 	mask = nelt / 2 - 1;
   19077       else
   19078 	mask = nelt - 1;
   19079 
   19080       for (i = 0; i < nelt; ++i)
   19081 	{
   19082 	  unsigned j, e = d->perm[i] & mask;
   19083 	  for (j = 0; j < eltsz; ++j)
   19084 	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
   19085 	}
   19086     }
   19087 
   19088   machine_mode vpmode = vmode;
   19089 
   19090   nelt = GET_MODE_SIZE (vmode);
   19091 
   19092   /* Emulate narrow modes with V16QI instructions.  */
   19093   if (nelt < 16)
   19094     {
   19095       rtx m128 = GEN_INT (-128);
   19096 
   19097       /* Remap elements from the second operand, as we have to
   19098 	 account for inactive top elements from the first operand.  */
   19099       if (!d->one_operand_p)
   19100 	{
   19101 	  for (i = 0; i < nelt; ++i)
   19102 	    {
   19103 	      unsigned ival = UINTVAL (rperm[i]);
   19104 	      if (ival >= nelt)
   19105 		rperm[i] = GEN_INT (ival + 16 - nelt);
   19106 	    }
   19107 	}
   19108 
   19109       /* Fill inactive elements in the top positions with zeros.  */
   19110       for (i = nelt; i < 16; ++i)
   19111 	rperm[i] = m128;
   19112 
   19113       vpmode = V16QImode;
   19114     }
   19115 
   19116   vperm = gen_rtx_CONST_VECTOR (vpmode,
   19117 				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
   19118   vperm = force_reg (vpmode, vperm);
   19119 
   19120   if (vmode == d->vmode)
   19121     target = d->target;
   19122   else
   19123     target = gen_reg_rtx (vmode);
   19124 
   19125   op0 = gen_lowpart (vmode, d->op0);
   19126 
   19127   if (d->one_operand_p)
   19128     {
   19129       rtx (*gen) (rtx, rtx, rtx);
   19130 
   19131       if (vmode == V4QImode)
   19132 	gen = gen_mmx_pshufbv4qi3;
   19133       else if (vmode == V8QImode)
   19134 	gen = gen_mmx_pshufbv8qi3;
   19135       else if (vmode == V16QImode)
   19136 	gen = gen_ssse3_pshufbv16qi3;
   19137       else if (vmode == V32QImode)
   19138 	gen = gen_avx2_pshufbv32qi3;
   19139       else if (vmode == V64QImode)
   19140 	gen = gen_avx512bw_pshufbv64qi3;
   19141       else if (vmode == V8SFmode)
   19142 	gen = gen_avx2_permvarv8sf;
   19143       else if (vmode == V8SImode)
   19144 	gen = gen_avx2_permvarv8si;
   19145       else if (vmode == V16SFmode)
   19146 	gen = gen_avx512f_permvarv16sf;
   19147       else if (vmode == V16SImode)
   19148 	gen = gen_avx512f_permvarv16si;
   19149       else
   19150 	gcc_unreachable ();
   19151 
   19152       emit_insn (gen (target, op0, vperm));
   19153     }
   19154   else
   19155     {
   19156       rtx (*gen) (rtx, rtx, rtx, rtx);
   19157 
   19158       op1 = gen_lowpart (vmode, d->op1);
   19159 
   19160       if (vmode == V4QImode)
   19161 	gen = gen_mmx_ppermv32;
   19162       else if (vmode == V8QImode)
   19163 	gen = gen_mmx_ppermv64;
   19164       else if (vmode == V16QImode)
   19165 	gen = gen_xop_pperm;
   19166       else
   19167 	gcc_unreachable ();
   19168 
   19169       emit_insn (gen (target, op0, op1, vperm));
   19170     }
   19171 
   19172   if (target != d->target)
   19173     emit_move_insn (d->target, gen_lowpart (d->vmode, target));
   19174 
   19175   return true;
   19176 }
   19177 
   19178 /* Try to expand one-operand permutation with constant mask.  */
   19179 
   19180 static bool
   19181 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
   19182 {
   19183   machine_mode mode = GET_MODE (d->op0);
   19184   machine_mode maskmode = mode;
   19185   unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
   19186   rtx (*gen) (rtx, rtx, rtx) = NULL;
   19187   rtx target, op0, mask;
   19188   rtx vec[64];
   19189 
   19190   if (!rtx_equal_p (d->op0, d->op1))
   19191     return false;
   19192 
   19193   if (!TARGET_AVX512F)
   19194     return false;
   19195 
   19196   /* Accept VNxHImode and VNxQImode now.  */
   19197   if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
   19198     return false;
   19199 
   19200   /* vpermw.  */
   19201   if (!TARGET_AVX512BW && inner_size == 2)
   19202     return false;
   19203 
   19204   /* vpermb.  */
   19205   if (!TARGET_AVX512VBMI && inner_size == 1)
   19206     return false;
   19207 
   19208   switch (mode)
   19209     {
   19210     case E_V16SImode:
   19211       gen = gen_avx512f_permvarv16si;
   19212       break;
   19213     case E_V16SFmode:
   19214       gen = gen_avx512f_permvarv16sf;
   19215       maskmode = V16SImode;
   19216       break;
   19217     case E_V8DImode:
   19218       gen = gen_avx512f_permvarv8di;
   19219       break;
   19220     case E_V8DFmode:
   19221       gen = gen_avx512f_permvarv8df;
   19222       maskmode = V8DImode;
   19223       break;
   19224     case E_V32HImode:
   19225       gen = gen_avx512bw_permvarv32hi;
   19226       break;
   19227     case E_V16HImode:
   19228       gen = gen_avx512vl_permvarv16hi;
   19229       break;
   19230     case E_V8HImode:
   19231       gen = gen_avx512vl_permvarv8hi;
   19232       break;
   19233     case E_V64QImode:
   19234       gen = gen_avx512bw_permvarv64qi;
   19235       break;
   19236     case E_V32QImode:
   19237       gen = gen_avx512vl_permvarv32qi;
   19238       break;
   19239     case E_V16QImode:
   19240       gen = gen_avx512vl_permvarv16qi;
   19241       break;
   19242 
   19243     default:
   19244       return false;
   19245     }
   19246 
   19247   if (d->testing_p)
   19248     return true;
   19249 
   19250   target = d->target;
   19251   op0 = d->op0;
   19252   for (int i = 0; i < d->nelt; ++i)
   19253     vec[i] = GEN_INT (d->perm[i]);
   19254   mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
   19255   emit_insn (gen (target, op0, force_reg (maskmode, mask)));
   19256   return true;
   19257 }
   19258 
   19259 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
   19260 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  Strategies are attempted in a deliberate
   order, cheapest/most-constrained forms first; returns true (and
   emits code unless D->testing_p) on the first one that matches.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      /* Classify the (masked) permutation: all indices equal to their
	 position is an identity; all indices zero is a broadcast of
	 element 0.  nd.perm is filled in as a side effect for the
	 expand_vselect attempt below.  */
      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      if (identity_perm)
	{
	  /* Nothing to shuffle; a plain move suffices.  */
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}.  Each mode needs its own pattern,
	     and the 512-bit ones additionally require AVX512F/BW.  */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates.  */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      /* Plain single-operand VEC_SELECT with the masked permutation.  */
      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Finally, try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      /* Flip which operand each element selects from, then retry with
	 op1/op0 swapped.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{w,b,s,d} instructions  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  Recurses at most once, since canonicalize_vector_int_perm
     produces a canonical form.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}
   19450 
   19451 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   19452    in terms of a pair of pshuflw + pshufhw instructions.  */
   19453 
   19454 static bool
   19455 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
   19456 {
   19457   unsigned char perm2[MAX_VECT_LEN];
   19458   unsigned i;
   19459   bool ok;
   19460 
   19461   if (d->vmode != V8HImode || !d->one_operand_p)
   19462     return false;
   19463 
   19464   /* The two permutations only operate in 64-bit lanes.  */
   19465   for (i = 0; i < 4; ++i)
   19466     if (d->perm[i] >= 4)
   19467       return false;
   19468   for (i = 4; i < 8; ++i)
   19469     if (d->perm[i] < 4)
   19470       return false;
   19471 
   19472   if (d->testing_p)
   19473     return true;
   19474 
   19475   /* Emit the pshuflw.  */
   19476   memcpy (perm2, d->perm, 4);
   19477   for (i = 4; i < 8; ++i)
   19478     perm2[i] = i;
   19479   ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
   19480   gcc_assert (ok);
   19481 
   19482   /* Emit the pshufhw.  */
   19483   memcpy (perm2 + 4, d->perm + 4, 4);
   19484   for (i = 0; i < 4; ++i)
   19485     perm2[i] = i;
   19486   ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
   19487   gcc_assert (ok);
   19488 
   19489   return true;
   19490 }
   19491 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  /* Compute the span [min, max] of the selected element indices, both
     for the operands as given (min/max) and with the two operands
     swapped (minswap/maxswap).  For 32-byte vectors the index is first
     folded per 128-bit lane since vpalignr shifts each lane alike.  */
  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  /* The whole span must fit inside one (possibly lane-folded) vector
     width, and a zero min would mean no shift is needed at all; in
     either failing case retry with the operands swapped.  */
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      /* Swap the operands and renormalize the permutation accordingly.  */
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  /* Rebase all indices by MIN (per lane for 32-byte vectors) and check
     whether the alignment alone already yields the desired order.  */
  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  /* Emit the palignr itself; the shift amount is MIN elements in bits.  */
  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (TImode);
      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
				      gen_lowpart (TImode, dcopy.op0), shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
   19624 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
			     || GET_MODE_SIZE (vmode) == 8
			     || GET_MODE_SIZE (vmode) == 16))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  Bit 1 of WHICH is set if some displaced
     element comes from op0, bit 2 if some comes from op1.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  /* Mask the indices down to a single-operand permutation.  */
  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  /* For 16-byte vectors this must succeed (see the testing_p early
     return above); for other sizes failure is simply reported.  */
  ok = expand_vec_perm_1 (&dcopy);
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  /* Blend: take element I from the permuted operand exactly where the
     original permutation selected from the "wrong" operand.  */
  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}
   19708 
   19709 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
   19710 
   19711 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   19712    a two vector permutation into a single vector permutation by using
   19713    an interleave operation to merge the vectors.  */
   19714 
   19715 static bool
   19716 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   19717 {
   19718   struct expand_vec_perm_d dremap, dfinal;
   19719   unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
   19720   unsigned HOST_WIDE_INT contents;
   19721   unsigned char remap[2 * MAX_VECT_LEN];
   19722   rtx_insn *seq;
   19723   bool ok, same_halves = false;
   19724 
   19725   if (GET_MODE_SIZE (d->vmode) == 4
   19726       || GET_MODE_SIZE (d->vmode) == 8
   19727       || GET_MODE_SIZE (d->vmode) == 16)
   19728     {
   19729       if (d->one_operand_p)
   19730 	return false;
   19731     }
   19732   else if (GET_MODE_SIZE (d->vmode) == 32)
   19733     {
   19734       if (!TARGET_AVX)
   19735 	return false;
   19736       /* For 32-byte modes allow even d->one_operand_p.
   19737 	 The lack of cross-lane shuffling in some instructions
   19738 	 might prevent a single insn shuffle.  */
   19739       dfinal = *d;
   19740       dfinal.testing_p = true;
   19741       /* If expand_vec_perm_interleave3 can expand this into
   19742 	 a 3 insn sequence, give up and let it be expanded as
   19743 	 3 insn sequence.  While that is one insn longer,
   19744 	 it doesn't need a memory operand and in the common
   19745 	 case that both interleave low and high permutations
   19746 	 with the same operands are adjacent needs 4 insns
   19747 	 for both after CSE.  */
   19748       if (expand_vec_perm_interleave3 (&dfinal))
   19749 	return false;
   19750     }
   19751   else
   19752     return false;
   19753 
   19754   /* Examine from whence the elements come.  */
   19755   contents = 0;
   19756   for (i = 0; i < nelt; ++i)
   19757     contents |= HOST_WIDE_INT_1U << d->perm[i];
   19758 
   19759   memset (remap, 0xff, sizeof (remap));
   19760   dremap = *d;
   19761 
   19762   if (GET_MODE_SIZE (d->vmode) == 4
   19763       || GET_MODE_SIZE (d->vmode) == 8)
   19764     {
   19765       unsigned HOST_WIDE_INT h1, h2, h3, h4;
   19766 
   19767       /* Split the two input vectors into 4 halves.  */
   19768       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
   19769       h2 = h1 << nelt2;
   19770       h3 = h2 << nelt2;
   19771       h4 = h3 << nelt2;
   19772 
   19773       /* If the elements from the low halves use interleave low,
   19774 	 and similarly for interleave high.  */
   19775       if ((contents & (h1 | h3)) == contents)
   19776 	{
   19777 	  /* punpckl* */
   19778 	  for (i = 0; i < nelt2; ++i)
   19779 	    {
   19780 	      remap[i] = i * 2;
   19781 	      remap[i + nelt] = i * 2 + 1;
   19782 	      dremap.perm[i * 2] = i;
   19783 	      dremap.perm[i * 2 + 1] = i + nelt;
   19784 	    }
   19785 	}
   19786       else if ((contents & (h2 | h4)) == contents)
   19787 	{
   19788 	  /* punpckh* */
   19789 	  for (i = 0; i < nelt2; ++i)
   19790 	    {
   19791 	      remap[i + nelt2] = i * 2;
   19792 	      remap[i + nelt + nelt2] = i * 2 + 1;
   19793 	      dremap.perm[i * 2] = i + nelt2;
   19794 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
   19795 	    }
   19796 	}
   19797       else
   19798 	return false;
   19799     }
   19800   else if (GET_MODE_SIZE (d->vmode) == 16)
   19801     {
   19802       unsigned HOST_WIDE_INT h1, h2, h3, h4;
   19803 
   19804       /* Split the two input vectors into 4 halves.  */
   19805       h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
   19806       h2 = h1 << nelt2;
   19807       h3 = h2 << nelt2;
   19808       h4 = h3 << nelt2;
   19809 
   19810       /* If the elements from the low halves use interleave low, and similarly
   19811 	 for interleave high.  If the elements are from mis-matched halves, we
   19812 	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
   19813       if ((contents & (h1 | h3)) == contents)
   19814 	{
   19815 	  /* punpckl* */
   19816 	  for (i = 0; i < nelt2; ++i)
   19817 	    {
   19818 	      remap[i] = i * 2;
   19819 	      remap[i + nelt] = i * 2 + 1;
   19820 	      dremap.perm[i * 2] = i;
   19821 	      dremap.perm[i * 2 + 1] = i + nelt;
   19822 	    }
   19823 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
   19824 	    dremap.vmode = V4SFmode;
   19825 	}
   19826       else if ((contents & (h2 | h4)) == contents)
   19827 	{
   19828 	  /* punpckh* */
   19829 	  for (i = 0; i < nelt2; ++i)
   19830 	    {
   19831 	      remap[i + nelt2] = i * 2;
   19832 	      remap[i + nelt + nelt2] = i * 2 + 1;
   19833 	      dremap.perm[i * 2] = i + nelt2;
   19834 	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
   19835 	    }
   19836 	  if (!TARGET_SSE2 && d->vmode == V4SImode)
   19837 	    dremap.vmode = V4SFmode;
   19838 	}
   19839       else if ((contents & (h1 | h4)) == contents)
   19840 	{
   19841 	  /* shufps */
   19842 	  for (i = 0; i < nelt2; ++i)
   19843 	    {
   19844 	      remap[i] = i;
   19845 	      remap[i + nelt + nelt2] = i + nelt2;
   19846 	      dremap.perm[i] = i;
   19847 	      dremap.perm[i + nelt2] = i + nelt + nelt2;
   19848 	    }
   19849 	  if (nelt != 4)
   19850 	    {
   19851 	      /* shufpd */
   19852 	      dremap.vmode = V2DImode;
   19853 	      dremap.nelt = 2;
   19854 	      dremap.perm[0] = 0;
   19855 	      dremap.perm[1] = 3;
   19856 	    }
   19857 	}
   19858       else if ((contents & (h2 | h3)) == contents)
   19859 	{
   19860 	  /* shufps */
   19861 	  for (i = 0; i < nelt2; ++i)
   19862 	    {
   19863 	      remap[i + nelt2] = i;
   19864 	      remap[i + nelt] = i + nelt2;
   19865 	      dremap.perm[i] = i + nelt2;
   19866 	      dremap.perm[i + nelt2] = i + nelt;
   19867 	    }
   19868 	  if (nelt != 4)
   19869 	    {
   19870 	      /* shufpd */
   19871 	      dremap.vmode = V2DImode;
   19872 	      dremap.nelt = 2;
   19873 	      dremap.perm[0] = 1;
   19874 	      dremap.perm[1] = 2;
   19875 	    }
   19876 	}
   19877       else
   19878 	return false;
   19879     }
   19880   else
   19881     {
   19882       unsigned int nelt4 = nelt / 4, nzcnt = 0;
   19883       unsigned HOST_WIDE_INT q[8];
   19884       unsigned int nonzero_halves[4];
   19885 
   19886       /* Split the two input vectors into 8 quarters.  */
   19887       q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
   19888       for (i = 1; i < 8; ++i)
   19889 	q[i] = q[0] << (nelt4 * i);
   19890       for (i = 0; i < 4; ++i)
   19891 	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
   19892 	  {
   19893 	    nonzero_halves[nzcnt] = i;
   19894 	    ++nzcnt;
   19895 	  }
   19896 
   19897       if (nzcnt == 1)
   19898 	{
   19899 	  gcc_assert (d->one_operand_p);
   19900 	  nonzero_halves[1] = nonzero_halves[0];
   19901 	  same_halves = true;
   19902 	}
   19903       else if (d->one_operand_p)
   19904 	{
   19905 	  gcc_assert (nonzero_halves[0] == 0);
   19906 	  gcc_assert (nonzero_halves[1] == 1);
   19907 	}
   19908 
   19909       if (nzcnt <= 2)
   19910 	{
   19911 	  if (d->perm[0] / nelt2 == nonzero_halves[1])
   19912 	    {
   19913 	      /* Attempt to increase the likelihood that dfinal
   19914 		 shuffle will be intra-lane.  */
   19915 	      std::swap (nonzero_halves[0], nonzero_halves[1]);
   19916 	    }
   19917 
   19918 	  /* vperm2f128 or vperm2i128.  */
   19919 	  for (i = 0; i < nelt2; ++i)
   19920 	    {
   19921 	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
   19922 	      remap[i + nonzero_halves[0] * nelt2] = i;
   19923 	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
   19924 	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
   19925 	    }
   19926 
   19927 	  if (d->vmode != V8SFmode
   19928 	      && d->vmode != V4DFmode
   19929 	      && d->vmode != V8SImode)
   19930 	    {
   19931 	      dremap.vmode = V8SImode;
   19932 	      dremap.nelt = 8;
   19933 	      for (i = 0; i < 4; ++i)
   19934 		{
   19935 		  dremap.perm[i] = i + nonzero_halves[0] * 4;
   19936 		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
   19937 		}
   19938 	    }
   19939 	}
   19940       else if (d->one_operand_p)
   19941 	return false;
   19942       else if (TARGET_AVX2
   19943 	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
   19944 	{
   19945 	  /* vpunpckl* */
   19946 	  for (i = 0; i < nelt4; ++i)
   19947 	    {
   19948 	      remap[i] = i * 2;
   19949 	      remap[i + nelt] = i * 2 + 1;
   19950 	      remap[i + nelt2] = i * 2 + nelt2;
   19951 	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
   19952 	      dremap.perm[i * 2] = i;
   19953 	      dremap.perm[i * 2 + 1] = i + nelt;
   19954 	      dremap.perm[i * 2 + nelt2] = i + nelt2;
   19955 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
   19956 	    }
   19957 	}
   19958       else if (TARGET_AVX2
   19959 	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
   19960 	{
   19961 	  /* vpunpckh* */
   19962 	  for (i = 0; i < nelt4; ++i)
   19963 	    {
   19964 	      remap[i + nelt4] = i * 2;
   19965 	      remap[i + nelt + nelt4] = i * 2 + 1;
   19966 	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
   19967 	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
   19968 	      dremap.perm[i * 2] = i + nelt4;
   19969 	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
   19970 	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
   19971 	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
   19972 	    }
   19973 	}
   19974       else
   19975 	return false;
   19976     }
   19977 
   19978   /* Use the remapping array set up above to move the elements from their
   19979      swizzled locations into their final destinations.  */
   19980   dfinal = *d;
   19981   for (i = 0; i < nelt; ++i)
   19982     {
   19983       unsigned e = remap[d->perm[i]];
   19984       gcc_assert (e < nelt);
   19985       /* If same_halves is true, both halves of the remapped vector are the
   19986 	 same.  Avoid cross-lane accesses if possible.  */
   19987       if (same_halves && i >= nelt2)
   19988 	{
   19989 	  gcc_assert (e < nelt2);
   19990 	  dfinal.perm[i] = e + nelt2;
   19991 	}
   19992       else
   19993 	dfinal.perm[i] = e;
   19994     }
   19995   if (!d->testing_p)
   19996     {
   19997       dremap.target = gen_reg_rtx (dremap.vmode);
   19998       dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
   19999     }
   20000   dfinal.op1 = dfinal.op0;
   20001   dfinal.one_operand_p = true;
   20002 
   20003   /* Test if the final remap can be done with a single insn.  For V4SFmode or
   20004      V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
   20005   start_sequence ();
   20006   ok = expand_vec_perm_1 (&dfinal);
   20007   seq = get_insns ();
   20008   end_sequence ();
   20009 
   20010   if (!ok)
   20011     return false;
   20012 
   20013   if (d->testing_p)
   20014     return true;
   20015 
   20016   if (dremap.vmode != dfinal.vmode)
   20017     {
   20018       dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
   20019       dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
   20020     }
   20021 
   20022   ok = expand_vec_perm_1 (&dremap);
   20023   gcc_assert (ok);
   20024 
   20025   emit_insn (seq);
   20026   return true;
   20027 }
   20028 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  Returns true (emitting
   insns unless D->testing_p) when the permutation is handled.  */

static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  /* Per half of the result, a bitmask of which of the four 64-bit
     chunks of the input the selected elements come from.  */
  unsigned contents[2];
  bool ok;

  /* Only for one-operand V32QI/V16HI permutations with AVX2, where
     vpermq (on the V4DImode view) can rearrange 64-bit chunks.  */
  if (!(TARGET_AVX2
	&& (d->vmode == V32QImode || d->vmode == V16HImode)
	&& d->one_operand_p))
    return false;

  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  /* Each half of the result may draw from at most two of the four
     64-bit chunks, since vpermq below places only two chunks into
     each 128-bit lane.  */
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
	  return false;
    }

  if (d->testing_p)
    return true;

  /* First permutation: vpermq gathering the needed 64-bit chunks.  */
  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0)
	  dremap.perm[2 * i + cnt++] = j;
      /* Pad unused slots with chunk 0; the final permutation never
	 reads those positions.  */
      for (; cnt < 2; ++cnt)
	dremap.perm[2 * i + cnt] = 0;
    }

  /* Second permutation: intra-lane shuffle of the vpermq result.  */
  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      /* J indexes the pair of dremap.perm entries for the lane
	 element I lands in: 0 for the low half, 2 for the high.  */
      if (i == nelt2)
	j = 2;
      /* Offset within a chunk, placed into the proper lane.  */
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      /* Select whichever of the lane's two chunks holds the source;
	 the contents[] check above guarantees one of them does.  */
      if ((d->perm[i] / nelt4) == dremap.perm[j])
	;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
	dfinal.perm[i] |= nelt4;
      else
	gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}
   20107 
   20108 static bool canonicalize_perm (struct expand_vec_perm_d *d);
   20109 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  /* Requires a 32-byte vector; modes other than V8SF/V4DF
     additionally need AVX2 for vperm2i128.  */
  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  /* Candidate for the in-lane shuffle; with testing_p set it is
     expanded speculatively into a discarded sequence for each PERM
     value tried below.  */
  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
	{
	  /* The second shuffle for e.g. V4DFmode has
	     0123 and ABCD operands.
	     Ignore AB23, as 23 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (1 << 2)) continue;
	  /* And 01CD, as 01 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 0) continue;
	  /* And 4567, as then the vperm2[fi]128 doesn't change
	     anything on the original 4567 second operand.  */
	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
	}
      else
	{
	  /* The second shuffle for e.g. V4DFmode has
	     4567 and ABCD operands.
	     Ignore AB67, as 67 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (3 << 2)) continue;
	  /* And 45CD, as 45 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 2) continue;
	  /* And 0123, as then the vperm2[fi]128 doesn't change
	     anything on the original 0123 first operand.  */
	  if ((perm & 0xf) == (1 << 2)) continue;
	}

      /* See whether every requested element can be taken either from
	 the lanes this PERM's vperm2[fi]128 would produce or from the
	 unchanged operand; stop at the first element that cannot.  */
      for (i = 0; i < nelt; i++)
	{
	  j = d->perm[i] / nelt2;
	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
	  else
	    break;
	}

      if (i == nelt)
	{
	  /* All elements covered; check whether the in-lane shuffle
	     can be done with a single insn.  The sequence is thrown
	     away (dsecond.testing_p is still set).  */
	  start_sequence ();
	  ok = expand_vec_perm_1 (&dsecond);
	  end_sequence ();
	}
      else
	ok = false;

      if (ok)
	{
	  if (d->testing_p)
	    return true;

	  /* Found a usable second shuffle.  dfirst will be
	     vperm2f128 on d->op0 and d->op1.  */
	  dsecond.testing_p = false;
	  dfirst = *d;
	  dfirst.target = gen_reg_rtx (d->vmode);
	  for (i = 0; i < nelt; i++)
	    dfirst.perm[i] = (i & (nelt2 - 1))
			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

	  canonicalize_perm (&dfirst);
	  ok = expand_vec_perm_1 (&dfirst);
	  gcc_assert (ok);

	  /* And dsecond is some single insn shuffle, taking
	     d->op0 and result of vperm2f128 (if perm < 16) or
	     d->op1 and result of vperm2f128 (otherwise).  */
	  if (perm >= 16)
	    dsecond.op0 = dsecond.op1;
	  dsecond.op1 = dfirst.target;

	  ok = expand_vec_perm_1 (&dsecond);
	  gcc_assert (ok);

	  return true;
	}

      /* For one operand, the only useful vperm2f128 permutation is 0x01
	 aka lanes swap.  */
      if (d->one_operand_p)
	return false;
    }

  return false;
}
   20226 
   20227 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   20228    a two vector permutation using 2 intra-lane interleave insns
   20229    and cross-lane shuffle for 32-byte vectors.  */
   20230 
   20231 static bool
   20232 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
   20233 {
   20234   unsigned i, nelt;
   20235   rtx (*gen) (rtx, rtx, rtx);
   20236 
   20237   if (d->one_operand_p)
   20238     return false;
   20239   if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
   20240     ;
   20241   else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
   20242     ;
   20243   else
   20244     return false;
   20245 
   20246   nelt = d->nelt;
   20247   if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
   20248     return false;
   20249   for (i = 0; i < nelt; i += 2)
   20250     if (d->perm[i] != d->perm[0] + i / 2
   20251 	|| d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
   20252       return false;
   20253 
   20254   if (d->testing_p)
   20255     return true;
   20256 
   20257   switch (d->vmode)
   20258     {
   20259     case E_V32QImode:
   20260       if (d->perm[0])
   20261 	gen = gen_vec_interleave_highv32qi;
   20262       else
   20263 	gen = gen_vec_interleave_lowv32qi;
   20264       break;
   20265     case E_V16HImode:
   20266       if (d->perm[0])
   20267 	gen = gen_vec_interleave_highv16hi;
   20268       else
   20269 	gen = gen_vec_interleave_lowv16hi;
   20270       break;
   20271     case E_V8SImode:
   20272       if (d->perm[0])
   20273 	gen = gen_vec_interleave_highv8si;
   20274       else
   20275 	gen = gen_vec_interleave_lowv8si;
   20276       break;
   20277     case E_V4DImode:
   20278       if (d->perm[0])
   20279 	gen = gen_vec_interleave_highv4di;
   20280       else
   20281 	gen = gen_vec_interleave_lowv4di;
   20282       break;
   20283     case E_V8SFmode:
   20284       if (d->perm[0])
   20285 	gen = gen_vec_interleave_highv8sf;
   20286       else
   20287 	gen = gen_vec_interleave_lowv8sf;
   20288       break;
   20289     case E_V4DFmode:
   20290       if (d->perm[0])
   20291 	gen = gen_vec_interleave_highv4df;
   20292       else
   20293 	gen = gen_vec_interleave_lowv4df;
   20294       break;
   20295     default:
   20296       gcc_unreachable ();
   20297     }
   20298 
   20299   emit_insn (gen (d->target, d->op0, d->op1));
   20300   return true;
   20301 }
   20302 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only in the AVX-without-AVX2 case, for one-operand V8SF/V4DF.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  dfirst = *d;
  /* 0xff marks a position not yet assigned.  */
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      /* J is I moved into the lane that already contains the source
	 element d->perm[i], so DFIRST never crosses lanes.  */
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      /* Fail on a position conflict between two different sources.  */
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      /* Elements parked in the other lane are fetched from the
	 lane-swapped copy by the final blend; record them in MSK.  */
      if (j != i)
	msk |= (1 << i);
    }
  /* Fill the unassigned positions with identity.  */
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  /* Expand the intra-lane shuffle into a pending sequence so it can
     be discarded if it fails or if this is only a test.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  /* DSECOND is DFIRST's result with its two 128-bit lanes swapped
     (i ^ nelt2 flips the lane bit of every index).  */
  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  /* Blend the two, taking from the lane-swapped vector the elements
     recorded in MSK.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
   20370 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two single vector permutations and
   {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   of dfirst or dsecond is identity permutation.  */

static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  bool ident1 = true, ident2 = true;

  if (d->one_operand_p)
    return false;

  /* LANE is the number of elements per 128-bit lane: the whole
     vector for 16-byte modes, half of it for 32-byte modes.  */
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (!TARGET_SSE)
	return false;
      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
	return false;
      lane = nelt2;
    }
  else
    return false;

  /* The permutation must strictly alternate between the two operands
     (as the interleave insn's output does); which operand supplies
     the even elements is decided by d->perm[0].  */
  for (i = 1; i < nelt; i++)
    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
      return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  /* Build one-operand permutations DFIRST (of op0) and DSECOND (of
     op1) moving each requested element to the slot the interleave
     will read it from (duplicated into both halves of each lane);
     note whether either shuffle turns out to be the identity.  */
  for (i = 0; i < nelt; i++)
    if (d->perm[i] >= nelt)
      {
	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident2 = false;
	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
	  = d->perm[i] - nelt;
      }
    else
      {
	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident1 = false;
	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
      /* The interleave below always takes the even result elements
	 from op0; swap operands if element 0 comes from op1.  */
      if (d->perm[0] >= nelt)
	std::swap (dfinal.op0, dfinal.op1);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Expand the non-identity shuffles into pending sequences so they
     can be discarded on failure or when only testing.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  /* DFINAL interleaves, within each lane, the low halves of the two
     (possibly pre-shuffled) operands.  */
  for (i = 0; i < nelt; i++)
    {
      dfinal.perm[i] = i / 2;
      if (i >= lane)
	dfinal.perm[i] += lane / 2;
      if ((i & 1) != 0)
	dfinal.perm[i] += nelt;
    }
  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
			       dfinal.perm, dfinal.nelt, false);
  gcc_assert (ok);
  return true;
}
   20489 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1 pblendv
   instruction.  If two_insn, succeed only if one of dfirst or dsecond is
   identity permutation.  */

static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  machine_mode vmode = d->vmode;
  bool ident1 = true, ident2 = true;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
			     || GET_MODE_SIZE (vmode) == 8
			     || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* Split D into DFIRST, a shuffle of op0, and DSECOND, a shuffle of
     op1, each moving its elements straight into their final slots.
     0xff marks positions the respective shuffle does not care about;
     also track whether either shuffle is the identity.  */
  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    if (d->perm[i] >= nelt)
      {
	dfirst.perm[i] = 0xff;
	dsecond.perm[i] = d->perm[i] - nelt;
	if (d->perm[i] != i + nelt)
	  ident2 = false;
      }
    else
      {
	dsecond.perm[i] = 0xff;
	dfirst.perm[i] = d->perm[i];
	if (d->perm[i] != i)
	  ident1 = false;
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  /* For now.  Ideally treat 0xff as a wildcard.  */
  for (i = 0; i < nelt; ++i)
    if (dfirst.perm[i] == 0xff)
      {
	/* For 32-byte modes, mirror the other half's choice so the
	   filled-in value keeps the shuffle lane-parallel; otherwise
	   fall back to identity.  */
	if (GET_MODE_SIZE (vmode) == 32
	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dfirst.perm[i] = i;
      }
    else
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dsecond.perm[i] = i;
      }

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Expand the non-identity shuffles into pending sequences so they
     can be discarded on failure or when only testing.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  /* The final blend picks element I from op1 exactly when the
     original permutation took it from there.  */
  for (i = 0; i < nelt; ++i)
    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);

  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vec_perm_blend (&dfinal);
  gcc_assert (ok);
  return true;
}
   20608 
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
   permutation using two vperm2f128, followed by a vshufpd insn blending
   the two vectors together.  */

static bool
expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  bool ok;

  if (!TARGET_AVX || (d->vmode != V4DFmode))
    return false;

  /* This decomposition always succeeds for V4DFmode on AVX
     (each sub-permutation is asserted to expand below).  */
  if (d->testing_p)
    return true;

  dfirst = *d;
  dsecond = *d;
  dthird = *d;

  /* DFIRST gathers the 128-bit lane pairs containing result
     elements 0 and 2, DSECOND those containing elements 1 and 3
     (perm & ~1 rounds an element index down to its lane).  */
  dfirst.perm[0] = (d->perm[0] & ~1);
  dfirst.perm[1] = (d->perm[0] & ~1) + 1;
  dfirst.perm[2] = (d->perm[2] & ~1);
  dfirst.perm[3] = (d->perm[2] & ~1) + 1;
  dsecond.perm[0] = (d->perm[1] & ~1);
  dsecond.perm[1] = (d->perm[1] & ~1) + 1;
  dsecond.perm[2] = (d->perm[3] & ~1);
  dsecond.perm[3] = (d->perm[3] & ~1) + 1;
  /* DTHIRD (vshufpd) then selects the wanted element of each lane:
     even result elements from DFIRST, odd ones from DSECOND
     (offsets 4/6 index DSECOND's lanes), using the source's parity
     (perm % 2) to pick within the lane.  */
  dthird.perm[0] = (d->perm[0] % 2);
  dthird.perm[1] = (d->perm[1] % 2) + 4;
  dthird.perm[2] = (d->perm[2] % 2) + 2;
  dthird.perm[3] = (d->perm[3] % 2) + 6;

  dfirst.target = gen_reg_rtx (dfirst.vmode);
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  dthird.op0 = dfirst.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = false;

  /* The lane gathers may degenerate into one-operand permutations;
     canonicalize them before expanding.  */
  canonicalize_perm (&dfirst);
  canonicalize_perm (&dsecond);

  ok = expand_vec_perm_1 (&dfirst)
       && expand_vec_perm_1 (&dsecond)
       && expand_vec_perm_1 (&dthird);

  gcc_assert (ok);

  return true;
}
   20659 
   20660 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
   20661 
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only in the AVX-without-AVX2 case, for two-operand V8SF/V4DF.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  /* Split D into DFIRST, whose elements stay in their lane, and
     DSECOND, whose elements need a lane swap.  0xff marks unassigned
     positions; WHICH1/WHICH2 record which input operand(s) each
     shuffle draws from (bit 0 = op0, bit 1 = op1).  */
  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      /* J is I moved into the lane that already contains the source
	 element, making the shuffle intra-lane.  */
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  /* Element I will be taken from the lane-swapped vector;
	     record it in the blend mask.  */
	  msk |= (1U << i);
	}
    }
  /* Punt if every element landed in just one of the two shuffles;
     then no blend would be needed.  */
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  /* Fill unassigned positions with identity elements taken from an
     operand the shuffle already uses (op1 when only op1 is used),
     so each shuffle can remain one-operand where possible.  */
  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }
  /* Expand both sub-permutations recursively (they may take more
     than one insn) into pending sequences, discarded on failure or
     when only testing.  */
  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  /* DTHIRD swaps the two 128-bit lanes of DSECOND's result
     (i ^ nelt2 flips each index's lane bit).  */
  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  /* Blend, taking from the lane-swapped vector the elements in MSK.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
   20759 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;
  machine_mode mode;
  rtx (*gen) (rtx, rtx, rtx);

  /* pshufb requires SSSE3; handle 4-, 8- and 16-byte vectors.  */
  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
			&& GET_MODE_SIZE (d->vmode) != 8
			&& GET_MODE_SIZE (d->vmode) != 4))
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  /* Select the QImode vector mode and pshufb expander that match
     the vector size.  */
  switch (GET_MODE_SIZE (d->vmode))
    {
    case 4:
      mode = V4QImode;
      gen = gen_mmx_pshufbv4qi3;
      break;
    case 8:
      mode = V8QImode;
      gen = gen_mmx_pshufbv8qi3;
      break;
    case 16:
      mode = V16QImode;
      gen = gen_ssse3_pshufbv16qi3;
      break;
    default:
      gcc_unreachable ();
    }

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, k, e = d->perm[i];
      /* WHICH selects the operand (0 = op0, 1 = op1) the element
	 comes from; the other operand's mask gets a zeroing byte.  */
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}

      /* Zero any mask bytes beyond the vector's size (relevant for
	 the 4- and 8-byte modes).  */
      for (k = i*eltsz + j; k < 16; ++k)
	rperm[0][k] = rperm[1][k] = m128;
    }

  /* L = shuffle of op0, zero where op1 supplies the element.  */
  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op0);
  emit_insn (gen (l, op, vperm));

  /* H = shuffle of op1, zero where op0 supplies the element.  */
  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op1);
  emit_insn (gen (h, op, vperm));

  /* Combine with ior, through a scratch register when the target's
     mode differs from the QImode vector mode used for pshufb.  */
  op = d->target;
  if (d->vmode != mode)
    op = gen_reg_rtx (mode);
  ix86_emit_vec_binop (IOR, mode, op, l, h);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   20847 
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      /* E is the element index within a 128-bit lane; WHICH is 16
	 (bytes) when the element must cross lanes, else 0.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  /* H = the cross-lane elements, shuffled into the opposite lane.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  /* L = the in-lane elements, zero everywhere else.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* Combine the two with vpor, through a scratch register when the
     target's mode is not V32QImode.  */
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   20918 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16QImode operand
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.

   Returns true (emitting code unless D->testing_p) when D is such an
   even/odd extraction, false otherwise.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  /* Requires AVX2 and two distinct V32QI/V16HI operands.  */
  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* Accept only even/odd extraction patterns: each index must agree with
     2*i in the bits that select the operand and the position, i.e.
     d->perm[i] == 2*i + (d->perm[0] & 1).  */
  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      /* E: index within the operand half; WHICH: which operand supplies
	 element I; XORV: flips bytes between the middle quarters so that
	 each mask lays its live entries out as described above.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  /* Shuffle op0 with the first mask.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* Shuffle op1 with the second mask.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Merge: the zeroed (bit-7) entries of one mask are filled from the
     other shuffle.  */
  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
   20998 
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
   operands with two "and" and "pack" or two "shift" and "pack" insns.
   We should have already failed all two instruction sequences.

   Extract-even masks the high half of each double-wide element away and
   packs; extract-odd shifts the high half down and packs.  Returns true
   when D matches (emitting code unless D->testing_p).  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  /* Per-mode setup: C is the mask that keeps the low half of each
     double-wide element, S the shift count that extracts the high half,
     HALF_MODE the double-wide view of the operands, and the gen_* hooks
     the matching insn emitters.  END_PERM marks 256-bit modes where the
     lane-wise pack result needs a final vpermq fixup.  */
  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
        return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
        return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
	 are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      /* Extract-even: clear the high half of every double-wide element
	 so the unsigned-saturating pack copies the low halves verbatim.  */
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      /* Extract-odd: shift the high (odd) half down into the low half.  */
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      /* vpack* operates per 128-bit lane, so reorder the four quadwords
	 { 0, 2, 1, 3 } to restore the full-width element order.  */
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx,
				      const2_rtx,
				      const1_rtx,
				      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
   21137 
   21138 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   21139    and extract-odd permutations of two V64QI operands
   21140    with two "shifts", two "truncs" and one "concat" insns for "odd"
   21141    and two "truncs" and one concat insn for "even."
   21142    Have already failed all two instruction sequences.  */
   21143 
   21144 static bool
   21145 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
   21146 {
   21147   rtx t1, t2, t3, t4;
   21148   unsigned i, odd, nelt = d->nelt;
   21149 
   21150   if (!TARGET_AVX512BW
   21151       || d->one_operand_p
   21152       || d->vmode != V64QImode)
   21153     return false;
   21154 
   21155   /* Check that permutation is even or odd.  */
   21156   odd = d->perm[0];
   21157   if (odd > 1)
   21158     return false;
   21159 
   21160   for (i = 1; i < nelt; ++i)
   21161     if (d->perm[i] != 2 * i + odd)
   21162       return false;
   21163 
   21164   if (d->testing_p)
   21165     return true;
   21166 
   21167 
   21168   if (odd)
   21169     {
   21170       t1 = gen_reg_rtx (V32HImode);
   21171       t2 = gen_reg_rtx (V32HImode);
   21172       emit_insn (gen_lshrv32hi3 (t1,
   21173 				 gen_lowpart (V32HImode, d->op0),
   21174 				 GEN_INT (8)));
   21175       emit_insn (gen_lshrv32hi3 (t2,
   21176 				 gen_lowpart (V32HImode, d->op1),
   21177 				 GEN_INT (8)));
   21178     }
   21179   else
   21180     {
   21181       t1 = gen_lowpart (V32HImode, d->op0);
   21182       t2 = gen_lowpart (V32HImode, d->op1);
   21183     }
   21184 
   21185   t3 = gen_reg_rtx (V32QImode);
   21186   t4 = gen_reg_rtx (V32QImode);
   21187   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
   21188   emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
   21189   emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
   21190 
   21191   return true;
   21192 }
   21193 
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
   and extract-odd permutations.

   ODD is 0 for extract-even, 1 for extract-odd.  Dispatches on D->vmode
   to the cheapest available sequence; returns false only when a strategy
   cannot handle the mode/ISA combination.  */

static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	/* 0xdd picks the odd, 0x88 the even elements of each lane pair.  */
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
	return false;
      break;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2 integer shuffles, retry the whole permutation in
	     the same-size float mode and bitcast the result back.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, fall back to the V8SF sequence via bitcasts.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
   21454 
   21455 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   21456    extract-even and extract-odd permutations.  */
   21457 
   21458 static bool
   21459 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
   21460 {
   21461   unsigned i, odd, nelt = d->nelt;
   21462 
   21463   odd = d->perm[0];
   21464   if (odd != 0 && odd != 1)
   21465     return false;
   21466 
   21467   for (i = 1; i < nelt; ++i)
   21468     if (d->perm[i] != 2 * i + odd)
   21469       return false;
   21470 
   21471   if (d->vmode == E_V32HImode
   21472       && d->testing_p
   21473       && !TARGET_AVX512BW)
   21474     return false;
   21475 
   21476   return expand_vec_perm_even_odd_1 (d, odd);
   21477 }
   21478 
/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
   permutations.  We assume that expand_vec_perm_1 has already failed.

   D->perm is known to replicate element D->perm[0] into every position.
   The general strategy for the narrow modes is to interleave the vector
   with itself repeatedly (halving the distance to the wanted element each
   time) until a single pshufd/pshuflw can finish the broadcast.  */

static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
    case E_V4HImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
	return true;

      /* One interleave step brings the wanted byte within reach of a
	 two-element select in the wider mode.  */
      if (elt >= nelt2)
	{
	  gen = gen_mmx_punpckhbw_low;
	  elt -= nelt2;
	}
      else
	gen = gen_mmx_punpcklbw_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      /* Finish with a two-element duplicate select.  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
      /* This can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  /* Pick the interleave half containing ELT, rebasing ELT into
	     that half; each pass doubles the element width.  */
	  if (elt >= nelt2)
	    {
	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
				      : gen_mmx_punpckhwd;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
				    : gen_mmx_punpcklwd;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V2SImode);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  /* Same halving scheme as V8QI, but promoting up to V4SI.  */
	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				     : gen_vec_interleave_lowv8hi;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
	return true;

      /* A single interleave step pairs the wanted half-float, then a
	 V4SI pshufd broadcasts the 32-bit pair.  */
      if (elt >= nelt2)
	{
	  gen = gen_vec_interleave_highv8hf;
	  elt -= nelt2;
	}
      else
	gen = gen_vec_interleave_lowv8hf;
      nelt2 /= 2;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));

      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}
   21654 
   21655 /* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
   21656    broadcast permutations.  */
   21657 
   21658 static bool
   21659 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
   21660 {
   21661   unsigned i, elt, nelt = d->nelt;
   21662 
   21663   if (!d->one_operand_p)
   21664     return false;
   21665 
   21666   elt = d->perm[0];
   21667   for (i = 1; i < nelt; ++i)
   21668     if (d->perm[i] != elt)
   21669       return false;
   21670 
   21671   return expand_vec_perm_broadcast_1 (d);
   21672 }
   21673 
/* Implement arbitrary permutations of two V64QImode operands
   with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.

   The byte permutation is split into two V32HI word permutations (one
   handling destination even bytes, one odd bytes); a vpshufb per half
   then selects the right byte of each word and zeroes the rest, and a
   final vpor merges the two halves.  */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  /* This strategy always succeeds once the mode/ISA check passed.  */
  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  /* Set up the two word-level sub-permutations, operating on the same
     operands viewed as V32HI.  */
  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      /* Word index of the wanted byte goes into the sub-permutation for
	 destination parity i & 1; rperm[0..63] is the vpshufb mask for
	 ds[0]'s result, rperm[64..127] for ds[1]'s, with -1 (bit 7 set)
	 zeroing the bytes the other half provides.  */
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  /* Word permutations are always expandable via vperm[it]2w.  */
  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  /* Byte-select within each permuted half, zeroing the other parity.  */
  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  /* Merge the even-byte and odd-byte halves.  */
  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
   21745 
/* Implement arbitrary permutation of two V32QImode and V16HImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */
   21749 
static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  /* rperm[0]/rperm[2] hold the in-lane byte masks for op0/op1;
     rperm[1]/rperm[3] hold the cross-lane masks (see WHICH below).  */
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  /* Only handles two distinct V32QI/V16HI operands, and needs AVX2
     for the 256-bit vpshufb/vpermq instructions used below.  */
  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* This expansion always succeeds for the modes accepted above, so
     when only querying support there is nothing further to check.  */
  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element is from
     the other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if the element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      /* Bit 1 of WHICH selects the source operand (op0 vs op1); bit 0
	 records whether the element crosses a 128-bit lane boundary.  */
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  /* Emit the cross-lane shuffles; masks that were never populated are
     skipped (and their vpshufb is never emitted).  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X] (vpermq with selector 2,3,0,1).  */
  for (i = 0; i < 2; ++i)
   {
     if (h[i] == NULL_RTX)
       continue;
     op = gen_reg_rtx (V4DImode);
     emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				     const2_rtx, GEN_INT (3), const0_rtx,
				     const1_rtx));
     h[i] = gen_lowpart (V32QImode, op);
   }

  /* Emit the in-lane shuffles.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  /* OR each operand's in-lane and cross-lane contributions together;
     unused positions were forced to zero above, so OR merges cleanly.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  /* Finally merge the two operands' results, going through a V32QI
     temporary when the destination mode is V16HI.  */
  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
  return true;
}
   21860 
   21861 /* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   21862    taken care of, perform the expansion in D and return true on success.  */
   21863 
   21864 static bool
   21865 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   21866 {
   21867   /* Try a single instruction expansion.  */
   21868   if (expand_vec_perm_1 (d))
   21869     return true;
   21870 
   21871   /* Try sequences of two instructions.  */
   21872 
   21873   if (expand_vec_perm_pshuflw_pshufhw (d))
   21874     return true;
   21875 
   21876   if (expand_vec_perm_palignr (d, false))
   21877     return true;
   21878 
   21879   if (expand_vec_perm_interleave2 (d))
   21880     return true;
   21881 
   21882   if (expand_vec_perm_broadcast (d))
   21883     return true;
   21884 
   21885   if (expand_vec_perm_vpermq_perm_1 (d))
   21886     return true;
   21887 
   21888   if (expand_vec_perm_vperm2f128 (d))
   21889     return true;
   21890 
   21891   if (expand_vec_perm_pblendv (d))
   21892     return true;
   21893 
   21894   if (expand_vec_perm_2perm_interleave (d, true))
   21895     return true;
   21896 
   21897   if (expand_vec_perm_2perm_pblendv (d, true))
   21898     return true;
   21899 
   21900   /* Try sequences of three instructions.  */
   21901 
   21902   if (expand_vec_perm_even_odd_pack (d))
   21903     return true;
   21904 
   21905   if (expand_vec_perm_2vperm2f128_vshuf (d))
   21906     return true;
   21907 
   21908   if (expand_vec_perm_pshufb2 (d))
   21909     return true;
   21910 
   21911   if (expand_vec_perm_interleave3 (d))
   21912     return true;
   21913 
   21914   if (expand_vec_perm_vperm2f128_vblend (d))
   21915     return true;
   21916 
   21917   if (expand_vec_perm_2perm_interleave (d, false))
   21918     return true;
   21919 
   21920   if (expand_vec_perm_2perm_pblendv (d, false))
   21921     return true;
   21922 
   21923   /* Try sequences of four instructions.  */
   21924 
   21925   if (expand_vec_perm_even_odd_trunc (d))
   21926     return true;
   21927   if (expand_vec_perm_vpshufb2_vpermq (d))
   21928     return true;
   21929 
   21930   if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
   21931     return true;
   21932 
   21933   if (expand_vec_perm_vpermt2_vpshub2 (d))
   21934     return true;
   21935 
   21936   /* ??? Look for narrow permutations whose element orderings would
   21937      allow the promotion to a wider mode.  */
   21938 
   21939   /* ??? Look for sequences of interleave or a wider permute that place
   21940      the data into the correct lanes for a half-vector shuffle like
   21941      pshuf[lh]w or vpermilps.  */
   21942 
   21943   /* ??? Look for sequences of interleave that produce the desired results.
   21944      The combinatorics of punpck[lh] get pretty ugly... */
   21945 
   21946   if (expand_vec_perm_even_odd (d))
   21947     return true;
   21948 
   21949   /* Even longer sequences.  */
   21950   if (expand_vec_perm_vpshufb4_vpermq2 (d))
   21951     return true;
   21952 
   21953   /* See if we can get the same permutation in different vector integer
   21954      mode.  */
   21955   struct expand_vec_perm_d nd;
   21956   if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
   21957     {
   21958       if (!d->testing_p)
   21959 	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
   21960       return true;
   21961     }
   21962 
   21963   /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
   21964   if (expand_vec_perm2_vperm2f128_vblend (d))
   21965     return true;
   21966 
   21967   return false;
   21968 }
   21969 
   21970 /* If a permutation only uses one operand, make it clear. Returns true
   21971    if the permutation references both operands.  */
   21972 
   21973 static bool
   21974 canonicalize_perm (struct expand_vec_perm_d *d)
   21975 {
   21976   int i, which, nelt = d->nelt;
   21977 
   21978   for (i = which = 0; i < nelt; ++i)
   21979     which |= (d->perm[i] < nelt ? 1 : 2);
   21980 
   21981   d->one_operand_p = true;
   21982   switch (which)
   21983     {
   21984     default:
   21985       gcc_unreachable();
   21986 
   21987     case 3:
   21988       if (!rtx_equal_p (d->op0, d->op1))
   21989         {
   21990 	  d->one_operand_p = false;
   21991 	  break;
   21992         }
   21993       /* The elements of PERM do not suggest that only the first operand
   21994 	 is used, but both operands are identical.  Allow easier matching
   21995 	 of the permutation by folding the permutation into the single
   21996 	 input vector.  */
   21997       /* FALLTHRU */
   21998 
   21999     case 2:
   22000       for (i = 0; i < nelt; ++i)
   22001         d->perm[i] &= nelt - 1;
   22002       d->op0 = d->op1;
   22003       break;
   22004 
   22005     case 1:
   22006       d->op1 = d->op0;
   22007       break;
   22008     }
   22009 
   22010   return (which == 3);
   22011 }
   22012 
   22013 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
   22014 
bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
			       rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
			       GET_MODE_NUNITS (vmode)).require ();
      if (target)
	target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
	op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
	op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  /* A null TARGET means the caller only asks whether the permutation
     is supported; nothing may be emitted on that path.  */
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
	return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
	return false;
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
	return true;
      break;
    case E_V4QImode:
      if (!TARGET_SSE2)
	return false;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  /* Copy the selector, recording in WHICH which inputs are referenced:
     bit 0 for the first operand, bit 1 for the second.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
	      || d.vmode == V4SImode || d.vmode == V2SImode
	      || d.vmode == V4HImode || d.vmode == V2HImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  Use raw
	 placeholder registers since no real insns will survive.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      /* Expand into a scratch sequence that is discarded; only the
	 return value is of interest.  */
      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
	{
	  /* Swap the operands (adjusting the selector to match) so the
	     zero vector is always the second operand.  */
	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
	  std::swap (dzero.op0, dzero.op1);
	  for (i = 0; i < nelt; ++i)
	    dzero.perm[i] ^= nelt;
	}
      else
	d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
				  dzero.perm, nelt, dzero.testing_p))
	return true;
    }

  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
   22232 
   22233 void
   22234 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
   22235 {
   22236   struct expand_vec_perm_d d;
   22237   unsigned i, nelt;
   22238 
   22239   d.target = targ;
   22240   d.op0 = op0;
   22241   d.op1 = op1;
   22242   d.vmode = GET_MODE (targ);
   22243   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   22244   d.one_operand_p = false;
   22245   d.testing_p = false;
   22246 
   22247   for (i = 0; i < nelt; ++i)
   22248     d.perm[i] = i * 2 + odd;
   22249 
   22250   /* We'll either be able to implement the permutation directly...  */
   22251   if (expand_vec_perm_1 (&d))
   22252     return;
   22253 
   22254   /* ... or we use the special-case patterns.  */
   22255   expand_vec_perm_even_odd_1 (&d, odd);
   22256 }
   22257 
   22258 static void
   22259 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
   22260 {
   22261   struct expand_vec_perm_d d;
   22262   unsigned i, nelt, base;
   22263   bool ok;
   22264 
   22265   d.target = targ;
   22266   d.op0 = op0;
   22267   d.op1 = op1;
   22268   d.vmode = GET_MODE (targ);
   22269   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
   22270   d.one_operand_p = false;
   22271   d.testing_p = false;
   22272 
   22273   base = high_p ? nelt / 2 : 0;
   22274   for (i = 0; i < nelt / 2; ++i)
   22275     {
   22276       d.perm[i * 2] = i + base;
   22277       d.perm[i * 2 + 1] = i + base + nelt;
   22278     }
   22279 
   22280   /* Note that for AVX this isn't one instruction.  */
   22281   ok = ix86_expand_vec_perm_const_1 (&d);
   22282   gcc_assert (ok);
   22283 }
   22284 
/* This function is similar to ix86_expand_vecop_qihi,
   but optimized under AVX512BW by using vpmovwb.
   For example, optimize vector MUL generation like

   vpmovzxbw ymm2, xmm0
   vpmovzxbw ymm3, xmm1
   vpmullw   ymm4, ymm2, ymm3
   vpmovwb   xmm0, ymm4

   it would take fewer instructions than ix86_expand_vecop_qihi.
   Return true if success.  */
   22296 
   22297 static bool
   22298 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   22299 {
   22300   machine_mode himode, qimode = GET_MODE (dest);
   22301   rtx hop1, hop2, hdest;
   22302   rtx (*gen_extend)(rtx, rtx);
   22303   rtx (*gen_truncate)(rtx, rtx);
   22304   bool uns_p = (code == ASHIFTRT) ? false : true;
   22305 
   22306   /* There's no V64HImode multiplication instruction.  */
   22307   if (qimode == E_V64QImode)
   22308     return false;
   22309 
   22310   /* vpmovwb only available under AVX512BW.  */
   22311   if (!TARGET_AVX512BW)
   22312     return false;
   22313   if ((qimode == V8QImode || qimode == V16QImode)
   22314       && !TARGET_AVX512VL)
   22315     return false;
   22316   /* Not generate zmm instruction when prefer 128/256 bit vector width.  */
   22317   if (qimode == V32QImode
   22318       && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
   22319     return false;
   22320 
   22321   switch (qimode)
   22322     {
   22323     case E_V8QImode:
   22324       himode = V8HImode;
   22325       gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
   22326       gen_truncate = gen_truncv8hiv8qi2;
   22327       break;
   22328     case E_V16QImode:
   22329       himode = V16HImode;
   22330       gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
   22331       gen_truncate = gen_truncv16hiv16qi2;
   22332       break;
   22333     case E_V32QImode:
   22334       himode = V32HImode;
   22335       gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
   22336       gen_truncate = gen_truncv32hiv32qi2;
   22337       break;
   22338     default:
   22339       gcc_unreachable ();
   22340     }
   22341 
   22342   hop1 = gen_reg_rtx (himode);
   22343   hop2 = gen_reg_rtx (himode);
   22344   hdest = gen_reg_rtx (himode);
   22345   emit_insn (gen_extend (hop1, op1));
   22346   emit_insn (gen_extend (hop2, op2));
   22347   emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
   22348 						      hop1, hop2)));
   22349   emit_insn (gen_truncate (dest, hdest));
   22350   return true;
   22351 }
   22352 
   22353 /* Expand a vector operation shift by constant for a V*QImode in terms of the
   22354    same operation on V*HImode. Return true if success. */
static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when shift amount greater equal 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record the position each byte's sign bit ends up in after a right
     shift by SHIFT_AMOUNT; used by the (x ^ m) - m fixup below.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element.  The HImode
     shift below lets bits leak across byte boundaries; this mask
     clears the leaked bits in each byte afterwards.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  /* Select the HImode shift and the QImode logical insn generators
     matching the vector width.  */
  switch (qimode)
    {
    case V16QImode:
      himode = V8HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv8hi3
	 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case V32QImode:
      himode = V16HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv16hi3
	 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case V64QImode:
      himode = V32HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv32hi3
	 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant, QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest
     i.e. sign-extend each byte via (x ^ m) - m, where m has only the
     shifted sign-bit position set.  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant, QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }
  return true;
}
   22452 
   22453 /* Expand a vector operation CODE for a V*QImode in terms of the
   22454    same operation on V*HImode.  */
   22455 
void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  /* A constant shift count allows a much shorter mask-based sequence.  */
  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  /* With AVX512BW a widen/operate/vpmovwb sequence may be shorter.  */
  if (TARGET_AVX512BW
      && VECTOR_MODE_P (GET_MODE (op2))
      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  /* Select the interleave-low/high insn generators for this width.  */
  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      /* Shifts need properly extended (zero- or sign-) high bytes, so
	 use real unpacks rather than the self-interleave trick.  */
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
	{
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	/* A scalar shift count is shared by both halves.  */
	op2_l = op2_h = op2;

      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform vashr/vlshr/vashl (per-element variable shifts are emitted
     directly as SETs of the binary rtx).  */
  if (code != MULT
      && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
    {
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  /* Perform mult/ashr/lshr/ashl through the generic expanders.  */
  else
    {
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used a full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  /* Record the intended QImode operation for later RTL optimizers.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
   22609 
   22610 /* Helper function of ix86_expand_mul_widen_evenodd.  Return true
   22611    if op is CONST_VECTOR with all odd elements equal to their
   22612    preceding element.  */
   22613 
   22614 static bool
   22615 const_vector_equal_evenodd_p (rtx op)
   22616 {
   22617   machine_mode mode = GET_MODE (op);
   22618   int i, nunits = GET_MODE_NUNITS (mode);
   22619   if (GET_CODE (op) != CONST_VECTOR
   22620       || nunits != CONST_VECTOR_NUNITS (op))
   22621     return false;
   22622   for (i = 0; i < nunits; i += 2)
   22623     if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
   22624       return false;
   22625   return true;
   22626 }
   22627 
/* Expand a widening multiply DEST = OP1 * OP2 of the even (ODD_P false)
   or odd (ODD_P true) SImode elements of the operands, producing
   double-width results in DEST.  UNS_P selects unsigned instead of
   signed multiplication.  */

void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  /* Remember the original operands: constant vectors whose odd elements
     already equal their even neighbours need no shifting below.  */
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      /* Skip the shift for constant vectors whose odd elements are
	 known to equal the even ones.  */
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
   22722 
/* Expand a widening multiply DEST = OP1 * OP2 of the high (HIGH_P true)
   or low (HIGH_P false) half of the elements of OP1 and OP2, producing
   double-width results in DEST.  UNS_P selects unsigned instead of
   signed multiplication.  */

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  /* The interleave already selected the requested half, so
	     multiply the even elements below.  */
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      /* Compute the low halves of the products with a plain multiply
	 and the high halves with [us]mul_highpart ...  */
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      /* ... then interleave the requested half of the low/high pairs
	 to form the double-width results.  */
      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V64QImode:
      /* Widen (unpack) the requested half of each operand first, then
	 multiply in the wider mode.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
   22812 
/* Expand the V4SImode multiplication OP0 = OP1 * OP2 without a native
   PMULLD, by combining the low halves of the unsigned even and odd
   32x32->64-bit widening products.  */

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  /* res_3 = products of the even elements, res_4 = products of the odd
     elements; only the low 32 bits of each 64-bit product are wanted.  */
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  /* Keep hold of the final interleave insn so the REG_EQUAL note below
     attaches to it.  */
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
   22847 
/* Expand the V2DI/V4DI/V8DImode multiplication OP0 = OP1 * OP2.  Use
   the single-insn AVX512DQ multiply (plus AVX512VL for the narrower
   modes) when available, an XOP horizontal-add sequence for V2DImode,
   and otherwise synthesize the product from 32x32->64-bit unsigned
   widening multiplies, shifts and additions.  */

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      /* NOTE(review): OP1/OP2 are narrowed to V4SImode here, so the
	 REG_EQUAL note emitted at the bottom mixes V2DImode with
	 V4SImode operands on this path — verify against upstream.  */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
					gen_lowpart (V4SImode, op1),
					gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      /* Pick the even-element widening multiply that matches MODE.  */
      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();


      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  /* Record the equivalent full multiplication for the optimizers.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
   22944 
/* Return 1 if control transfer instruction INSN
   should be encoded with notrack prefix.  */
   22947 
bool
ix86_notrack_prefixed_insn_p (rtx_insn *insn)
{
  /* The notrack prefix only matters when CET branch tracking is
     enabled.  */
  if (!insn || !((flag_cf_protection & CF_BRANCH)))
    return false;

  if (CALL_P (insn))
    {
      rtx call = get_call_rtx_from (insn);
      gcc_assert (call != NULL_RTX);
      rtx addr = XEXP (call, 0);

      /* Do not emit 'notrack' if it's not an indirect call.  */
      if (MEM_P (addr)
	  && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
	return false;
      else
	/* Indirect call: notrack iff the call carries a
	   REG_CALL_NOCF_CHECK note.  */
	return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
    }

  if (JUMP_P (insn) && !flag_cet_switch)
    {
      rtx target = JUMP_LABEL (insn);
      if (target == NULL_RTX || ANY_RETURN_P (target))
	return false;

      /* Check the jump is a switch table.  */
      rtx_insn *label = as_a<rtx_insn *> (target);
      rtx_insn *table = next_insn (label);
      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
	return false;
      else
	return true;
    }
  return false;
}
   22984 
   22985 /* Calculate integer abs() using only SSE2 instructions.  */
   22986 
void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
	 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  /* tmp0 = (0 > input) ? all-ones : 0, i.e. the sign mask.  */
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  /* tmp0 = -(input >>u 63): the same sign mask built from a
	     logical shift and a negation.  */
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      /* abs = (input ^ mask) - mask.  */
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  /* The expanders above may have placed the result elsewhere.  */
  if (x != target)
    emit_move_insn (target, x);
}
   23064 
   23065 /* Expand an extract from a vector register through pextr insn.
   23066    Return true if successful.  */
   23067 
bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  /* operands[2] is the extracted bit-field width, operands[3] its
     bit position.  */
  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      /* Extract from the full register, folding the subreg offset
	 into the bit position.  */
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	/* Map the field width to the vector mode the extraction
	   operates on, checking the required ISA is available.  */
	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	/* If the extraction went into a temporary, copy it to the real
	   destination.  */
	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
   23167 
   23168 /* Expand an insert into a vector register through pinsr insn.
   23169    Return true if successful.  */
   23170 
bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  /* operands[1] is the inserted bit-field width, operands[2] its
     bit position.  */
  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      /* Insert into the full register, folding the subreg offset
	 into the bit position.  */
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	/* Map the field width to the pinsr variant and the vector mode
	   it operates on, checking the required ISA is available.  */
	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		rtx extr_ops[4];

		/* Non-lowpart source subreg: extract the field into a
		   fresh pseudo first via pextr.  */
		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* The element selector is passed as the one-hot immediate
	   1 << (pos / size).  */
	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));
	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
   23279 
   23280 /* All CPUs prefer to avoid cross-lane operations so perform reductions
   23281    upper against lower halves up to SSE reg size.  */
   23282 
   23283 machine_mode
   23284 ix86_split_reduction (machine_mode mode)
   23285 {
   23286   /* Reduce lowpart against highpart until we reach SSE reg width to
   23287      avoid cross-lane operations.  */
   23288   switch (mode)
   23289     {
   23290     case E_V8DImode:
   23291     case E_V4DImode:
   23292       return V2DImode;
   23293     case E_V16SImode:
   23294     case E_V8SImode:
   23295       return V4SImode;
   23296     case E_V32HImode:
   23297     case E_V16HImode:
   23298       return V8HImode;
   23299     case E_V64QImode:
   23300     case E_V32QImode:
   23301       return V16QImode;
   23302     case E_V16SFmode:
   23303     case E_V8SFmode:
   23304       return V4SFmode;
   23305     case E_V8DFmode:
   23306     case E_V4DFmode:
   23307       return V2DFmode;
   23308     default:
   23309       return mode;
   23310     }
   23311 }
   23312 
   23313 /* Generate call to __divmoddi4.  */
   23314 
void
ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
			    rtx op0, rtx op1,
			    rtx *quot_p, rtx *rem_p)
{
  /* The library function returns the remainder through memory:
     reserve a stack slot and pass its address as the third call
     argument.  */
  rtx rem = assign_386_stack_local (mode, SLOT_TEMP);

  /* Call LIBFUNC (OP0, OP1, &rem); its return value is the quotient.  */
  rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
				      mode, op0, mode, op1, mode,
				      XEXP (rem, 0), Pmode);
  *quot_p = quot;
  *rem_p = rem;
}
   23328 
/* Expand an atomic fetch-and-CODE (AFTER false) or CODE-and-fetch
   (AFTER true) of VAL applied to MEM, storing the requested value in
   TARGET, as a compare-and-swap loop.  DOUBLEWORD is passed through to
   ix86_expand_cmpxchg_loop to select double-word compare-and-swap.  */

void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
				  enum rtx_code code, bool after,
				  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  /* Load the current memory value once outside the loop; every failed
     compare-and-swap refreshes OLD_MEM before looping back.  */
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      /* NOT here denotes NAND: new = ~(old & val).  */
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
				     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
				   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  /* Attempt to install NEW_REG; on failure this loops back to
     LOOP_LABEL with the freshly observed value in OLD_MEM.  */
  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
					  SImode),
			    doubleword, loop_label);
}
   23370 
   23371 /* Relax cmpxchg instruction, param loop_label indicates whether
   23372    the instruction should be relaxed with a pause loop.  If not,
   23373    it will be relaxed to an atomic load + compare, and skip
   23374    cmpxchg instruction if mem != exp_input.  */
   23375 
void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
			  rtx mem, rtx exp_input, rtx new_input,
			  rtx mem_model, bool doubleword,
			  rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  /* Create a QImode success flag unless the caller supplied one.  */
  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  /* Select the cmpxchg expander for MODE; double-word modes compare
     and pass their two halves in HMODE.  */
  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
	{
	  gendw = gen_atomic_compare_and_swapdi_doubleword;
	  hmode = SImode;
	}
      else
	gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
			     GET_MODE (exp_input), 1, cmp_label,
			     profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
		      gen_lowpart (hmode, new_input),
		      gen_highpart (hmode, new_input),
		      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
  {
    /* No retry loop requested: jump to DONE after the cmpxchg, or
       fall in from CMP with the loaded value when the compare
       short-circuited, then materialize the success flag.  */
    emit_jump_insn (gen_jump (done_label));
    emit_barrier ();
    emit_label (cmp_label);
    emit_move_insn (target_val, new_mem);
    emit_label (done_label);
    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
		       const0_rtx);
  }
  else
  {
    /* Retry-loop variant: record the cmpxchg result and branch back
       to LOOP_LABEL on failure.  */
    ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
		       const0_rtx);
    emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
			     GET_MODE (target_bool), 1, loop_label,
			     profile_probability::guessed_never ());
    emit_jump_insn (gen_jump (done_label));
    emit_barrier ();

    /* If mem is not expected, pause and loop back.  */
    emit_label (cmp_label);
    emit_move_insn (target_val, new_mem);
    emit_insn (gen_pause ());
    emit_jump_insn (gen_jump (loop_label));
    emit_barrier ();
    emit_label (done_label);
  }

  *ptarget_bool = target_bool;
}
   23488 
   23489 #include "gt-i386-expand.h"
   23490