/* Copyright (C) 1988-2022 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-options.h"
93 #include "i386-builtins.h"
94 #include "i386-expand.h"
95
96 /* Split one or more double-mode RTL references into pairs of half-mode
97 references. The RTL can be REG, offsettable MEM, integer constant, or
98 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
99 split and "num" is its length. lo_half and hi_half are output arrays
100 that parallel "operands". */
101
102 void
103 split_double_mode (machine_mode mode, rtx operands[],
104 int num, rtx lo_half[], rtx hi_half[])
105 {
106 machine_mode half_mode;
107 unsigned int byte;
108 rtx mem_op = NULL_RTX;
109 int mem_num = 0;
110
111 switch (mode)
112 {
113 case E_TImode:
114 half_mode = DImode;
115 break;
116 case E_DImode:
117 half_mode = SImode;
118 break;
119 case E_P2HImode:
120 half_mode = HImode;
121 break;
122 case E_P2QImode:
123 half_mode = QImode;
124 break;
125 default:
126 gcc_unreachable ();
127 }
128
129 byte = GET_MODE_SIZE (half_mode);
130
131 while (num--)
132 {
133 rtx op = operands[num];
134
135 /* simplify_subreg refuse to split volatile memory addresses,
136 but we still have to handle it. */
137 if (MEM_P (op))
138 {
139 if (mem_op && rtx_equal_p (op, mem_op))
140 {
141 lo_half[num] = lo_half[mem_num];
142 hi_half[num] = hi_half[mem_num];
143 }
144 else
145 {
146 mem_op = op;
147 mem_num = num;
148 lo_half[num] = adjust_address (op, half_mode, 0);
149 hi_half[num] = adjust_address (op, half_mode, byte);
150 }
151 }
152 else
153 {
154 lo_half[num] = simplify_gen_subreg (half_mode, op,
155 GET_MODE (op) == VOIDmode
156 ? mode : GET_MODE (op), 0);
157
158 rtx tmp = simplify_gen_subreg (half_mode, op,
159 GET_MODE (op) == VOIDmode
160 ? mode : GET_MODE (op), byte);
161 /* simplify_gen_subreg will return NULL RTX for the
162 high half of the paradoxical subreg. */
163 hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
164 }
165 }
166 }
167
168 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
169 for the target. */
170
171 void
172 ix86_expand_clear (rtx dest)
173 {
174 rtx tmp;
175
176 /* We play register width games, which are only valid after reload. */
177 gcc_assert (reload_completed);
178
179 /* Avoid HImode and its attendant prefix byte. */
180 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
181 dest = gen_rtx_REG (SImode, REGNO (dest));
182 tmp = gen_rtx_SET (dest, const0_rtx);
183
184 if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
185 {
186 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
187 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
188 }
189
190 emit_insn (tmp);
191 }
192
193 /* Return true if V can be broadcasted from an integer of WIDTH bits
194 which is returned in VAL_BROADCAST. Otherwise, return false. */
195
196 static bool
197 ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
198 HOST_WIDE_INT &val_broadcast)
199 {
200 wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
201 val_broadcast = wi::extract_uhwi (val, 0, width);
202 for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
203 {
204 HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
205 if (val_broadcast != each)
206 return false;
207 }
208 val_broadcast = sext_hwi (val_broadcast, width);
209 return true;
210 }
211
/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.
   Returns a vector-valued rtx viewed in MODE on success, or nullptr
   when OP cannot (or should not) be loaded as a broadcast.  */

static rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  The number of
     HOST_WIDE_INT elements must exactly fill MODE.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode)
      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
	  != GET_MODE_BITSIZE (mode)))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  /* Pick the narrowest element the first HWI can be broadcast from:
     QI/HI broadcasts require AVX2, DI requires a 64-bit target.  */
  if (TARGET_AVX2
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
			 val_broadcast))
    broadcast_mode = QImode;
  else if (TARGET_AVX2
	   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
			      val_broadcast))
    broadcast_mode = HImode;
  else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
			   val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT
	   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
			      val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcasted from VAL: every HWI element must
     match the first one.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
			 / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
					       target,
					       GEN_INT (val_broadcast));
  gcc_assert (ok);
  /* View the broadcast vector in the originally requested mode.  */
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
270
/* Expand a scalar move of MODE from operands[1] to operands[0],
   legitimizing TLS and symbolic addresses, GOT loads, PIC references
   and awkward constants before emitting the final SET.  */

void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload:
     move through a fresh pseudo, recursing to expand the first half.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      tmp = XEXP (op1, 0);

      /* Only (const (plus (symbol_ref) (const_int))) is handled here;
	 anything else keeps op1 unchanged.  */
      if (GET_CODE (tmp) != PLUS
	  || GET_CODE (XEXP (tmp, 0)) != SYMBOL_REF)
	break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
	op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
	{
	  /* Load the external function address via GOT slot to avoid PLT.  */
	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
				(TARGET_64BIT
				 ? UNSPEC_GOTPCREL
				 : UNSPEC_GOT));
	  op1 = gen_rtx_CONST (Pmode, op1);
	  op1 = gen_const_mem (Pmode, op1);
	  set_mem_alias_set (op1, ix86_GOT_alias_set ());
	}
      else
	{
	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
	  if (tmp)
	    {
	      op1 = tmp;
	      /* Without an addend the legitimized symbol is final.  */
	      if (!addend)
		break;
	    }
	  else
	    {
	      /* Not a PE/COFF target: restore the original operand.  */
	      op1 = operands[1];
	      break;
	    }
	}

      /* Re-apply the constant addend stripped off the CONST above.  */
      if (addend)
	{
	  op1 = force_operand (op1, NULL_RTX);
	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
				     op0, 1, OPTAB_DIRECT);
	}
      else
	op1 = force_operand (op1, op0);

      if (op1 == op0)
	return;

      op1 = convert_to_mode (mode, op1, 1);
      /* Falls through to the default break below.  */

    default:
      break;

    case SUBREG:
      /* As not all values in XFmode are representable in real_value,
	 we might be called with unfoldable SUBREGs of constants.  */
      if (mode == XFmode
	  && CONSTANT_P (SUBREG_REG (op1))
	  && can_create_pseudo_p ())
	{
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
      /* 32-bit Darwin has its own PIC/indirection scheme.  */
      if (TARGET_MACHO && !TARGET_64BIT)
	{
#if TARGET_MACHO
	  /* dynamic-no-pic */
	  if (MACHOPIC_INDIRECT)
	    {
	      rtx temp = (op0 && REG_P (op0) && mode == Pmode)
			 ? op0 : gen_reg_rtx (Pmode);
	      op1 = machopic_indirect_data_reference (op1, temp);
	      if (MACHOPIC_PURE)
		op1 = machopic_legitimize_pic_address (op1, mode,
						       temp == op1 ? 0 : temp);
	    }
	  if (op0 != op1 && GET_CODE (op0) != MEM)
	    {
	      rtx insn = gen_rtx_SET (op0, op1);
	      emit_insn (insn);
	      return;
	    }
	  if (GET_CODE (op0) == MEM)
	    op1 = force_reg (Pmode, op1);
	  else
	    {
	      rtx temp = op0;
	      if (GET_CODE (temp) != REG)
		temp = gen_reg_rtx (Pmode);
	      temp = legitimize_pic_address (op1, temp);
	      if (temp == op0)
		return;
	      op1 = temp;
	    }
	  /* dynamic-no-pic */
#endif
	}
      else
	{
	  if (MEM_P (op0))
	    op1 = force_reg (mode, op1);
	  else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
	    {
	      rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
	      op1 = legitimize_pic_address (op1, reg);
	      if (op0 == op1)
		return;
	      op1 = convert_to_mode (mode, op1, 1);
	    }
	}
    }
  else
    {
      /* Avoid mem->mem moves, except for pushes where the target
	 handles them directly.  */
      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
	  && MEM_P (op1))
	op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
	  && ! general_no_elim_operand (op1, mode))
	op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
	 to get them CSEed.  */
      if (can_create_pseudo_p ()
	  && (mode == DImode) && TARGET_64BIT
	  && immediate_operand (op1, mode)
	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
	  && !register_operand (op0, mode)
	  && optimize)
	op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
	{
	  if (CONST_DOUBLE_P (op1))
	    {
	      /* If we are loading a floating point constant to a
		 register, force the value to memory now, since we'll
		 get better code out the back end.  */

	      op1 = validize_mem (force_const_mem (mode, op1));
	      if (!register_operand (op0, mode))
		{
		  rtx temp = gen_reg_rtx (mode);
		  emit_insn (gen_rtx_SET (temp, op1));
		  emit_move_insn (op0, temp);
		  return;
		}
	    }
	  else if (GET_MODE_SIZE (mode) >= 16)
	    {
	      /* Wide integer constants may be loadable as a cheaper
		 broadcast; nullptr means no broadcast form exists.  */
	      rtx tmp = ix86_convert_const_wide_int_to_broadcast
		(GET_MODE (op0), op1);
	      if (tmp != nullptr)
		op1 = tmp;
	    }
	}
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
474
/* OP is a memref of CONST_VECTOR, return scalar constant mem
   if CONST_VECTOR is a vec_duplicate, else return NULL.  */
static rtx
ix86_broadcast_from_constant (machine_mode mode, rtx op)
{
  /* A broadcast needs at least two elements.  */
  int nunits = GET_MODE_NUNITS (mode);
  if (nunits < 2)
    return nullptr;

  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC
      && INTEGRAL_MODE_P (mode))
    return nullptr;

  /* Convert CONST_VECTOR to a non-standard SSE constant integer
     broadcast only if vector broadcast is available.  */
  if (!(TARGET_AVX2
	|| (TARGET_AVX
	    && (GET_MODE_INNER (mode) == SImode
		|| GET_MODE_INNER (mode) == DImode))
	|| FLOAT_MODE_P (mode))
      || standard_sse_constant_p (op, mode))
    return nullptr;

  /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
     We can still put 64-bit integer constant in memory when
     avx512 embed broadcast is available.  */
  if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
      && (!TARGET_AVX512F
	  || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
    return nullptr;

  /* No 128-bit element broadcasts.  */
  if (GET_MODE_INNER (mode) == TImode)
    return nullptr;

  rtx constant = get_pool_constant (XEXP (op, 0));
  if (GET_CODE (constant) != CONST_VECTOR)
    return nullptr;

  /* There could be some rtx like
     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
     but with "*.LC1" refer to V2DI constant vector.  */
  if (GET_MODE (constant) != mode)
    {
      constant = simplify_subreg (mode, constant, GET_MODE (constant),
				  0);
      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
	return nullptr;
    }

  rtx first = XVECEXP (constant, 0, 0);

  for (int i = 1; i < nunits; ++i)
    {
      rtx tmp = XVECEXP (constant, 0, i);
      /* Vector duplicate value.  */
      if (!rtx_equal_p (tmp, first))
	return nullptr;
    }

  /* All elements match: FIRST is the scalar to broadcast.  */
  return first;
}
538
/* Expand a vector-mode move from operands[1] to operands[0], forcing
   constants to memory or broadcasts, and routing insufficiently
   aligned SSE operands through the misaligned-move expander.  */

void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
			? GET_MODE_BITSIZE (mode)
			: GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
	  || (SUBREG_P (op1)
	      && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
	   && !standard_sse_constant_p (op1, mode))
	  /* ix86_expand_vector_move_misalign() does not like constants.  */
	  || (SSE_REG_MODE_P (mode)
	      && MEM_P (op0)
	      && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
	{
	  /* Force the inner constant to memory (or a register if it
	     has no memory form) and re-wrap it in the outer mode.  */
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      else
	{
	  /* NOTE: this intentionally shadows the outer MODE with the
	     mode of op0.  Prefer a broadcast load when one exists.  */
	  machine_mode mode = GET_MODE (op0);
	  rtx tmp = ix86_convert_const_wide_int_to_broadcast
	    (mode, op1);
	  if (tmp == nullptr)
	    op1 = validize_mem (force_const_mem (mode, op1));
	  else
	    op1 = tmp;
	}
    }

  /* A constant-pool load of a duplicated vector can be turned into a
     register broadcast from the scalar element.  */
  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
	  && SYMBOL_REF_P (XEXP (op1, 0))
	  && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
	{
	  /* Broadcast to XMM/YMM/ZMM register from an integer
	     constant or scalar mem.  */
	  op1 = gen_reg_rtx (mode);
	  if (FLOAT_MODE_P (mode)
	      || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
	    first = force_const_mem (GET_MODE_INNER (mode), first);
	  bool ok = ix86_expand_vector_init_duplicate (false, mode,
						       op1, first);
	  gcc_assert (ok);
	  emit_move_insn (op0, op1);
	  return;
	}
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
	 arguments in memory.  */
      if (!register_operand (op0, mode)
	  && !register_operand (op1, mode))
	{
	  rtx scratch = ix86_gen_scratch_sse_rtx (mode);
	  emit_move_insn (scratch, op1);
	  op1 = scratch;
	}

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Special case TImode to V1TImode conversions, via V2DI.  */
  if (mode == V1TImode
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && can_create_pseudo_p ())
    {
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
      return;
    }

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
677
/* Split 32-byte AVX unaligned load and store if needed.  Emits either
   a plain SET (when the target does not want the split) or a pair of
   128-bit halves via vextractf128 / vec_concat.  */

static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  /* Only split when the relevant tuning flag asks for it.  */
  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      /* Canonicalize all integer modes to V32QI; a non-MEM destination
	 gets a fresh V32QI pseudo and is copied back at the end.  */
      if (mode != V32QImode)
	{
	  if (!MEM_P (op0))
	    {
	      orig_op0 = op0;
	      op0 = gen_reg_rtx (V32QImode);
	    }
	  else
	    op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
	  mode = V32QImode;
	}
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  /* Select the 128-bit extractor and the half mode.  */
  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      /* Unaligned load: load the low half, then concatenate it with
	 the high half straight from memory.  */
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      /* Unaligned store: extract each 128-bit half into memory.  */
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  /* Copy the canonicalized result back to the original destination.  */
  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
763
764 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
765 straight to ix86_expand_vector_move. */
766 /* Code generation for scalar reg-reg moves of single and double precision data:
767 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
768 movaps reg, reg
769 else
770 movss reg, reg
771 if (x86_sse_partial_reg_dependency == true)
772 movapd reg, reg
773 else
774 movsd reg, reg
775
776 Code generation for scalar loads of double precision data:
777 if (x86_sse_split_regs == true)
778 movlpd mem, reg (gas syntax)
779 else
780 movsd mem, reg
781
782 Code generation for unaligned packed loads of single precision data
783 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
784 if (x86_sse_unaligned_move_optimal)
785 movups mem, reg
786
787 if (x86_sse_partial_reg_dependency == true)
788 {
789 xorps reg, reg
790 movlps mem, reg
791 movhps mem+8, reg
792 }
793 else
794 {
795 movlps mem, reg
796 movhps mem+8, reg
797 }
798
799 Code generation for unaligned packed loads of double precision data
800 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
801 if (x86_sse_unaligned_move_optimal)
802 movupd mem, reg
803
804 if (x86_sse_split_regs == true)
805 {
806 movlpd mem, reg
807 movhpd mem+8, reg
808 }
809 else
810 {
811 movsd mem, reg
812 movhpd mem+8, reg
813 }
814 */
815
/* Expand a misaligned vector move of MODE from operands[1] to
   operands[0].  See the comment block above for the code-generation
   strategy per target tuning.  Exactly one of the operands is MEM.  */

void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      if (GET_MODE_SIZE (mode) == 32)
	ix86_avx256_split_vector_move_misalign (op0, op1);
      else
	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
	emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      /* Unaligned load: assemble the destination from two half loads.  */
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  rtx zero;

	  /* When SSE registers are split into halves, we can avoid
	     writing to the top half twice.  */
	  if (TARGET_SSE_SPLIT_REGS)
	    {
	      emit_clobber (op0);
	      zero = op0;
	    }
	  else
	    {
	      /* ??? Not sure about the best option for the Intel chips.
		 The following would seem to satisfy; the register is
		 entirely cleared, breaking the dependency chain.  We
		 then store to the upper half, with a dependency depth
		 of one.  A rumor has it that Intel recommends two movsd
		 followed by an unpacklpd, but this is unconfirmed.  And
		 given that the dependency depth of the unpacklpd would
		 still be one, I'm not sure why this would be better.  */
	      zero = CONST0_RTX (V2DFmode);
	    }

	  m = adjust_address (op1, DFmode, 0);
	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
	  m = adjust_address (op1, DFmode, 8);
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
	{
	  rtx t;

	  /* Non-V4SF modes go through a V4SF temporary.  */
	  if (mode != V4SFmode)
	    t = gen_reg_rtx (V4SFmode);
	  else
	    t = op0;

	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (t, CONST0_RTX (V4SFmode));
	  else
	    emit_clobber (t);

	  m = adjust_address (op1, V2SFmode, 0);
	  emit_insn (gen_sse_loadlps (t, t, m));
	  m = adjust_address (op1, V2SFmode, 8);
	  emit_insn (gen_sse_loadhps (t, t, m));
	  if (mode != V4SFmode)
	    emit_move_insn (op0, gen_lowpart (mode, t));
	}
    }
  else if (MEM_P (op0))
    {
      /* Unaligned store: write the two halves separately.  */
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  m = adjust_address (op0, DFmode, 0);
	  emit_insn (gen_sse2_storelpd (m, op1));
	  m = adjust_address (op0, DFmode, 8);
	  emit_insn (gen_sse2_storehpd (m, op1));
	}
      else
	{
	  if (mode != V4SFmode)
	    op1 = gen_lowpart (V4SFmode, op1);

	  m = adjust_address (op0, V2SFmode, 0);
	  emit_insn (gen_sse_storelps (m, op1));
	  m = adjust_address (op0, V2SFmode, 8);
	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
	}
    }
  else
    gcc_unreachable ();
}
933
934 /* Move bits 64:95 to bits 32:63. */
935
936 void
937 ix86_move_vector_high_sse_to_mmx (rtx op)
938 {
939 rtx mask = gen_rtx_PARALLEL (VOIDmode,
940 gen_rtvec (4, GEN_INT (0), GEN_INT (2),
941 GEN_INT (0), GEN_INT (0)));
942 rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
943 op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
944 rtx insn = gen_rtx_SET (dest, op);
945 emit_insn (insn);
946 }
947
/* Split MMX pack with signed/unsigned saturation with SSE/SSE2.
   CODE is the saturating truncation to apply (e.g. SS_TRUNCATE or
   US_TRUNCATE); operands[1..2] are packed into operands[0].  */

void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
					    nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
						 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
					    nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation: each source is
     truncated to a half-width destination vector and the two halves
     are concatenated into the full SSE destination.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
						    op1, op2));
  emit_insn (insn);

  /* The MMX-sized result lives in the low half; fix up the lane that
     the SSE pack left in the wrong position.  */
  ix86_move_vector_high_sse_to_mmx (op0);
}
987
/* Split MMX punpcklXX/punpckhXX with SSE punpcklXX.  HIGH_P selects
   the punpckh variant; operands[1..2] are interleaved into
   operands[0].  */

void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op0);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  /* Build the low-interleave selection mask for the SSE-sized mode;
     indices >= nunits select from the second (concatenated) source.  */
  switch (mode)
    {
    case E_V4QImode:
    case E_V8QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (16,
					  GEN_INT (0), GEN_INT (16),
					  GEN_INT (1), GEN_INT (17),
					  GEN_INT (2), GEN_INT (18),
					  GEN_INT (3), GEN_INT (19),
					  GEN_INT (4), GEN_INT (20),
					  GEN_INT (5), GEN_INT (21),
					  GEN_INT (6), GEN_INT (22),
					  GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
    case E_V2HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (8,
					  GEN_INT (0), GEN_INT (8),
					  GEN_INT (1), GEN_INT (9),
					  GEN_INT (2), GEN_INT (10),
					  GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX: select interleaved lanes from the
     concatenation of the two (SSE-widened) sources.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits: the punpckh result is the high part
     of the SSE punpckl result, so shuffle it down.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
	{
	  mask = gen_rtx_PARALLEL (VOIDmode,
				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
					      GEN_INT (4), GEN_INT (5)));
	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
	}
      else
	{
	  int sz = GET_MODE_SIZE (mode);

	  /* Pick the shuffle by the MMX-mode size: 4-byte modes move
	     one SImode lane, 8-byte modes move two.  */
	  if (sz == 4)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
						GEN_INT (0), GEN_INT (1)));
	  else if (sz == 8)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
						GEN_INT (0), GEN_INT (1)));
	  else
	    gcc_unreachable ();

	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
	}

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
1097
1098 /* Helper function of ix86_fixup_binary_operands to canonicalize
1099 operand order. Returns true if the operands should be swapped. */
1100
1101 static bool
1102 ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1103 rtx operands[])
1104 {
1105 rtx dst = operands[0];
1106 rtx src1 = operands[1];
1107 rtx src2 = operands[2];
1108
1109 /* If the operation is not commutative, we can't do anything. */
1110 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1111 && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1112 return false;
1113
1114 /* Highest priority is that src1 should match dst. */
1115 if (rtx_equal_p (dst, src1))
1116 return false;
1117 if (rtx_equal_p (dst, src2))
1118 return true;
1119
1120 /* Next highest priority is that immediate constants come second. */
1121 if (immediate_operand (src2, mode))
1122 return false;
1123 if (immediate_operand (src1, mode))
1124 return true;
1125
1126 /* Lowest priority is that memory references should come second. */
1127 if (MEM_P (src2))
1128 return false;
1129 if (MEM_P (src1))
1130 return true;
1131
1132 return false;
1133 }
1134
1135
1136 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1137 destination to use for the operation. If different from the true
1138 destination in operands[0], a copy operation will be required. */
1139
/* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
   destination to use for the operation.  If different from the true
   destination in operands[0], a copy operation will be required.
   The (possibly reloaded/swapped) sources are written back into
   operands[1] and operands[2]; operands[0] is left untouched, so the
   caller must emit the final copy when the return value differs.  */

rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
			    rtx operands[])
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
	{
	  src2 = force_reg (mode, src2);
	  src1 = src2;
	}
      else if (rtx_equal_p (dst, src1))
	src2 = force_reg (mode, src2);
      else
	src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  The caller copies the scratch
     back to operands[0].  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
1195
1196 /* Similarly, but assume that the destination has already been
1197 set up properly. */
1198
1199 void
1200 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1201 machine_mode mode, rtx operands[])
1202 {
1203 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
1204 gcc_assert (dst == operands[0]);
1205 }
1206
1207 /* Attempt to expand a binary operator. Make the expansion closer to the
1208 actual machine, then just general_operand, which will allow 3 separate
1209 memory references (one output, two input) in a single insn. */
1210
1211 void
1212 ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1213 rtx operands[])
1214 {
1215 rtx src1, src2, dst, op, clob;
1216
1217 dst = ix86_fixup_binary_operands (code, mode, operands);
1218 src1 = operands[1];
1219 src2 = operands[2];
1220
1221 /* Emit the instruction. */
1222
1223 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1224
1225 if (reload_completed
1226 && code == PLUS
1227 && !rtx_equal_p (dst, src1))
1228 {
1229 /* This is going to be an LEA; avoid splitting it later. */
1230 emit_insn (op);
1231 }
1232 else
1233 {
1234 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1235 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1236 }
1237
1238 /* Fix up the destination if needed. */
1239 if (dst != operands[0])
1240 emit_move_insn (operands[0], dst);
1241 }
1242
1243 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1244 the given OPERANDS. */
1245
/* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
   the given OPERANDS.  */

void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  /* op1/op2 are set only when at least one source is a SUBREG;
     op1 always names the SUBREG operand.  */
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || GET_CODE (op2) == CONST_VECTOR)
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (GET_CODE (op2) == CONST_VECTOR
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  /* Do the operation in the float vector mode and copy the
	     result to operands[0] through a lowpart.  */
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (GET_CODE (op2) == CONST_VECTOR)
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      /* Both sources are SUBREGs; restore original order.  */
	      op1 = operands[1];
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  /* Other float vector modes fall through to the generic path.  */
	  break;
	}
    }
  /* Generic path: emit the logical operation directly in MODE.  */
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}
1320
1321 /* Return TRUE or FALSE depending on whether the binary operator meets the
1322 appropriate constraints. */
1323
1324 bool
1325 ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1326 rtx operands[3])
1327 {
1328 rtx dst = operands[0];
1329 rtx src1 = operands[1];
1330 rtx src2 = operands[2];
1331
1332 /* Both source operands cannot be in memory. */
1333 if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1334 && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1335 return false;
1336
1337 /* Canonicalize operand order for commutative operators. */
1338 if (ix86_swap_binary_operands_p (code, mode, operands))
1339 std::swap (src1, src2);
1340
1341 /* If the destination is memory, we must have a matching source operand. */
1342 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1343 return false;
1344
1345 /* Source 1 cannot be a constant. */
1346 if (CONSTANT_P (src1))
1347 return false;
1348
1349 /* Source 1 cannot be a non-matching memory. */
1350 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
1351 /* Support "andhi/andsi/anddi" as a zero-extending move. */
1352 return (code == AND
1353 && (mode == HImode
1354 || mode == SImode
1355 || (TARGET_64BIT && mode == DImode))
1356 && satisfies_constraint_L (src2));
1357
1358 return true;
1359 }
1360
1361 /* Attempt to expand a unary operator. Make the expansion closer to the
1362 actual machine, then just general_operand, which will allow 2 separate
1363 memory references (one output, one input) in a single insn. */
1364
1365 void
1366 ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1367 rtx operands[])
1368 {
1369 bool matching_memory = false;
1370 rtx src, dst, op, clob;
1371
1372 dst = operands[0];
1373 src = operands[1];
1374
1375 /* If the destination is memory, and we do not have matching source
1376 operands, do things in registers. */
1377 if (MEM_P (dst))
1378 {
1379 if (rtx_equal_p (dst, src))
1380 matching_memory = true;
1381 else
1382 dst = gen_reg_rtx (mode);
1383 }
1384
1385 /* When source operand is memory, destination must match. */
1386 if (MEM_P (src) && !matching_memory)
1387 src = force_reg (mode, src);
1388
1389 /* Emit the instruction. */
1390
1391 op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1392
1393 if (code == NOT)
1394 emit_insn (op);
1395 else
1396 {
1397 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1398 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1399 }
1400
1401 /* Fix up the destination if needed. */
1402 if (dst != operands[0])
1403 emit_move_insn (operands[0], dst);
1404 }
1405
1406 /* Predict just emitted jump instruction to be taken with probability PROB. */
1407
1408 static void
1409 predict_jump (int prob)
1410 {
1411 rtx_insn *insn = get_last_insn ();
1412 gcc_assert (JUMP_P (insn));
1413 add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1414 }
1415
1416 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1417 divisor are within the range [0-255]. */
1418
/* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
   divisor are within the range [0-255].  Emits a runtime check on
   (operands[2] | operands[3]): if no bit above bit 7 is set, a cheap
   8-bit unsigned divide is used; otherwise the full divmod runs.
   operands[0] receives the quotient, operands[1] the remainder.  */

void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  /* Pick the divmod generator matching MODE and the (possibly
     DImode zero-extended) modes of the two output operands.  */
  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divimod if dividend and divisor are within
     the range [0-255].  Testing (op2 | op3) against -0x100 sets ZF
     exactly when both values fit in 8 bits.  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  /* Both outcomes are assumed equally likely.  */
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divimod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
			    operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  /* Build the rtl forms used below as REG_EQUAL notes describing
     the true quotient and remainder.  */
  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
	div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
			       GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
1527
1528 /* Emit x86 binary operand CODE in mode MODE, where the first operand
1529 matches destination. RTX includes clobber of FLAGS_REG. */
1530
1531 void
1532 ix86_emit_binop (enum rtx_code code, machine_mode mode,
1533 rtx dst, rtx src)
1534 {
1535 rtx op, clob;
1536
1537 op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1538 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1539
1540 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1541 }
1542
1543 /* Return true if regno1 def is nearest to the insn. */
1544
1545 static bool
1546 find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1547 {
1548 rtx_insn *prev = insn;
1549 rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1550
1551 if (insn == start)
1552 return false;
1553 while (prev && prev != start)
1554 {
1555 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1556 {
1557 prev = PREV_INSN (prev);
1558 continue;
1559 }
1560 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1561 return true;
1562 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1563 return false;
1564 prev = PREV_INSN (prev);
1565 }
1566
1567 /* None of the regs is defined in the bb. */
1568 return false;
1569 }
1570
1571 /* INSN_UID of the last insn emitted by zero store peephole2s. */
1572 int ix86_last_zero_store_uid;
1573
1574 /* Split lea instructions into a sequence of instructions
1575 which are executed on ALU to avoid AGU stalls.
1576 It is assumed that it is allowed to clobber flags register
1577 at lea position. */
1578
/* Split lea instructions into a sequence of instructions
   which are executed on ALU to avoid AGU stalls.
   It is assumed that it is allowed to clobber flags register
   at lea position.  operands[0] is the destination register,
   operands[1] the address expression to decompose.  */

void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  /* regno0 = destination; regno1 = base (if any); regno2 = index
     (if any).  INVALID_REGNUM marks an absent part.  */
  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  /* Emit scale-many additions of the index instead.  */
	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling, but emit it as MULT instead
	     to avoid it being immediately peephole2 optimized back
	     into lea.  */
	  ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      /* Pure displacement: a plain move suffices.  */
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      /* Scale 1: at most base + index + disp.  */
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      rtx tmp1;

	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
1693
1694 /* Post-reload splitter for converting an SF or DFmode value in an
1695 SSE register into an unsigned SImode. */
1696
/* Post-reload splitter for converting an SF or DFmode value in an
   SSE register into an unsigned SImode.  operands[0] holds the result
   register, operands[1..4] are scratch/constant vector operands
   (large, zero_or_two31, input, two31).  The algorithm subtracts 2^31
   when the value is >= 2^31, does a signed truncating conversion, and
   xors the sign bit back in.  */

void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss (value, value, input));
      else
	emit_insn (gen_sse2_movsd (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  /* large = (2^31 <= value) mask.  */
  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  /* zero_or_two31 = 2^31 where the mask is set, else 0.  */
  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  /* Bring the value into signed SImode range.  */
  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  /* Turn the all-ones compare mask into just the sign bit (bit 31).  */
  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  /* Restore the high bit for values that were >= 2^31.  */
  emit_insn (gen_xorv4si3 (value, value, large));
}
1753
1754 static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
1755 machine_mode mode, rtx target,
1756 rtx var, int one_var);
1757
1758 /* Convert an unsigned DImode value into a DFmode, using only SSE.
1759 Expects the 64-bit DImode to be supplied in a pair of integral
1760 registers. Requires SSE2; will use SSE3 if available. For x86_32,
1761 -mfpmath=sse, !optimize_size only. */
1762
/* Convert an unsigned DImode value into a DFmode, using only SSE.
   Expects the 64-bit DImode to be supplied in a pair of integral
   registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
   -mfpmath=sse, !optimize_size only.  */

void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  /* Move the 64-bit input into an SSE register, using whichever
     transfer strategy the tuning flags allow.  */
  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  /* Exponent words that, when juxtaposed with the 32-bit halves of
     the input, form biased double-precision values (see below).  */
  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
1825
1826 /* Not used, but eases macroization of patterns. */
1827 void
1828 ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1829 {
1830 gcc_unreachable ();
1831 }
1832
1833 static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1834
1835 /* Convert an unsigned SImode value into a DFmode. Only currently used
1836 for SSE, but applicable anywhere. */
1837
1838 void
1839 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1840 {
1841 REAL_VALUE_TYPE TWO31r;
1842 rtx x, fp;
1843
1844 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1845 NULL, 1, OPTAB_DIRECT);
1846
1847 fp = gen_reg_rtx (DFmode);
1848 emit_insn (gen_floatsidf2 (fp, x));
1849
1850 real_ldexp (&TWO31r, &dconst1, 31);
1851 x = const_double_from_real_value (TWO31r, DFmode);
1852
1853 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1854
1855 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
1856 if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
1857 x = ix86_expand_sse_fabs (x, NULL);
1858
1859 if (x != target)
1860 emit_move_insn (target, x);
1861 }
1862
1863 /* Convert a signed DImode value into a DFmode. Only used for SSE in
1864 32-bit mode; otherwise we have a direct convert instruction. */
1865
1866 void
1867 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
1868 {
1869 REAL_VALUE_TYPE TWO32r;
1870 rtx fp_lo, fp_hi, x;
1871
1872 fp_lo = gen_reg_rtx (DFmode);
1873 fp_hi = gen_reg_rtx (DFmode);
1874
1875 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
1876
1877 real_ldexp (&TWO32r, &dconst1, 32);
1878 x = const_double_from_real_value (TWO32r, DFmode);
1879 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
1880
1881 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
1882
1883 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
1884 0, OPTAB_DIRECT);
1885 if (x != target)
1886 emit_move_insn (target, x);
1887 }
1888
1889 /* Convert an unsigned SImode value into a SFmode, using only SSE.
1890 For x86_32, -mfpmath=sse, !optimize_size only. */
1891 void
1892 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
1893 {
1894 REAL_VALUE_TYPE ONE16r;
1895 rtx fp_hi, fp_lo, int_hi, int_lo, x;
1896
1897 real_ldexp (&ONE16r, &dconst1, 16);
1898 x = const_double_from_real_value (ONE16r, SFmode);
1899 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
1900 NULL, 0, OPTAB_DIRECT);
1901 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
1902 NULL, 0, OPTAB_DIRECT);
1903 fp_hi = gen_reg_rtx (SFmode);
1904 fp_lo = gen_reg_rtx (SFmode);
1905 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
1906 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
1907 if (TARGET_FMA)
1908 {
1909 x = validize_mem (force_const_mem (SFmode, x));
1910 fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
1911 emit_move_insn (target, fp_hi);
1912 }
1913 else
1914 {
1915 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
1916 0, OPTAB_DIRECT);
1917 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
1918 0, OPTAB_DIRECT);
1919 if (!rtx_equal_p (target, fp_hi))
1920 emit_move_insn (target, fp_hi);
1921 }
1922 }
1923
1924 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
1925 a vector of unsigned ints VAL to vector of floats TARGET. */
1926
1927 void
1928 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
1929 {
1930 rtx tmp[8];
1931 REAL_VALUE_TYPE TWO16r;
1932 machine_mode intmode = GET_MODE (val);
1933 machine_mode fltmode = GET_MODE (target);
1934 rtx (*cvt) (rtx, rtx);
1935
1936 if (intmode == V4SImode)
1937 cvt = gen_floatv4siv4sf2;
1938 else
1939 cvt = gen_floatv8siv8sf2;
1940 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
1941 tmp[0] = force_reg (intmode, tmp[0]);
1942 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
1943 OPTAB_DIRECT);
1944 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
1945 NULL_RTX, 1, OPTAB_DIRECT);
1946 tmp[3] = gen_reg_rtx (fltmode);
1947 emit_insn (cvt (tmp[3], tmp[1]));
1948 tmp[4] = gen_reg_rtx (fltmode);
1949 emit_insn (cvt (tmp[4], tmp[2]));
1950 real_ldexp (&TWO16r, &dconst1, 16);
1951 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
1952 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
1953 if (TARGET_FMA)
1954 {
1955 tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
1956 emit_move_insn (target, tmp[6]);
1957 }
1958 else
1959 {
1960 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
1961 NULL_RTX, 1, OPTAB_DIRECT);
1962 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
1963 target, 1, OPTAB_DIRECT);
1964 if (tmp[7] != target)
1965 emit_move_insn (target, tmp[7]);
1966 }
1967 }
1968
1969 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
1970 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
1971 This is done by doing just signed conversion if < 0x1p31, and otherwise by
1972 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
1973
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
   This is done by doing just signed conversion if < 0x1p31, and otherwise by
   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.
   Returns the adjusted value; *XORP receives the vector of sign-bit
   corrections the caller must XOR into the converted result.  */

rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  /* Pick the mask-compare generator for MODE.  */
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  /* tmp[0] = all-ones mask where 2^31 <= val.  */
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  /* tmp[1] = 2^31 where the mask is set, else 0.  */
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  /* *xorp = 0x80000000 in each masked lane: derive it either by
     shifting the mask left 31 bits, or (when a vector shift isn't
     available for this width) by ANDing with a 0x80000000 vector.  */
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  /* Return val - (masked 2^31), now within signed range.  */
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}
2020
2021 /* Generate code for floating point ABS or NEG. */
2022
/* Generate code for floating point ABS or NEG.  CODE is ABS or NEG,
   MODE the scalar or vector FP mode of operands[0]/operands[1].
   Emits a PARALLEL whose shape (USE of the sign-bit mask and/or a
   flags clobber) matches what the insn patterns expect.  */

void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  /* Decide whether SSE bit-mask ops are used, and in which vector
     mode the sign-bit mask must be built.  */
  if (vector_mode || mode == TFmode || mode == HFmode)
    {
      use_sse = true;
      if (mode == HFmode)
	vmode = V8HFmode;
    }
  else if (TARGET_SSE_MATH)
    {
      use_sse = SSE_FLOAT_MODE_P (mode);
      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
	par = gen_rtvec (2, set, use);
      else
	{
	  /* Scalar SSE may fall back to integer ops, so also clobber
	     the flags.  */
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
2081
2082 /* Deconstruct a floating point ABS or NEG operation
2083 with integer registers into integer operations. */
2084
/* Deconstruct a floating point ABS or NEG operation
   with integer registers into integer operations: AND with the
   sign-bit cleared mask for ABS, XOR with the sign bit for NEG.
   operands[0] and operands[1] must already match.  */

void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      /* Sign bit is bit 31 of the single SImode word.  */
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  /* Operate directly on bit 63 via ZERO_EXTRACT.  */
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  /* 32-bit: the sign bit lives in the high SImode word.  */
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      /* The sign bit sits in bit 15 of the third (64-bit) or fifth
	 (32-bit) 32-bit word of the extended-precision value.  */
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
2168
2169 /* Expand a copysign operation. Special case operand 0 being a constant. */
2170
/* Expand a copysign operation.  Special case operand 0 being a constant.
   operands[0] = copysign (operands[1], operands[2]): magnitude from
   operands[1], sign from operands[2].  The work happens in the vector
   mode VMODE that embeds the scalar MODE.  */

void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  /* Select the vector mode used for the bitwise sign manipulation.  */
  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else if (mode == TFmode)
    vmode = mode;
  else
    gcc_unreachable ();

  /* copysign (x, x) is just x.  */
  if (rtx_equal_p (operands[1], operands[2]))
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* Work on a vector-mode view of the destination; when no such
     subreg exists, use a scratch and copy back at the end (dest is
     then kept non-NULL as a reminder).  */
  dest = operands[0];
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  if (CONST_DOUBLE_P (operands[1]))
    {
      op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
      /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a.  */
      if (op0 == CONST0_RTX (mode))
	{
	  emit_move_insn (vdest, gen_rtx_AND (vmode, mask, op1));
	  if (dest)
	    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
	  return;
	}

      /* Broadcast the scalar constant into a full vector constant.  */
      if (GET_MODE_SIZE (mode) < 16)
	op0 = ix86_build_const_vector (vmode, false, op0);
      op0 = force_reg (vmode, op0);
    }
  else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);

  /* result = (magnitude & ~signmask) | (sign & signmask).  */
  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  emit_move_insn (op2, gen_rtx_AND (vmode,
				    gen_rtx_NOT (vmode, mask),
				    op0));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
2234
2235 /* Expand an xorsign operation. */
2236
void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  /* operands[0] = operands[1] ^ signbit (operands[2]): flip the sign of
     op1 whenever op2 is negative.  */
  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  /* Scalar FP logic ops are done in the low element of a full SSE
     vector; pick the vector mode with matching element type.  */
  if (mode == HFmode)
    vmode = V8HFmode;
  else if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    gcc_unreachable ();

  temp = gen_reg_rtx (vmode);
  /* Mask with only the FP sign bit set in each element.  */
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  /* temp = op1 & signmask -- isolate the sign of op1.  */
  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  /* result = op0 ^ temp.  */
  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  /* Compute into a vector-mode view of the destination when one exists;
     otherwise use a fresh pseudo and copy the lowpart back below.  */
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  emit_insn (gen_rtx_SET (vdest, x));

  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
2278
2279 static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2280
/* Emit a conditional jump to LABEL taken when OP0 <CODE> OP1 holds.
   Handles scalar integer and FP modes, vector-integer equality via
   ptest, and double-word (DImode on 32-bit, TImode on 64-bit)
   comparisons by splitting them into word-sized pieces.  */

void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      rtx flag = gen_rtx_REG (CCZmode, FLAGS_REG);
      machine_mode p_mode = GET_MODE_SIZE (mode) == 32 ? V4DImode : V2DImode;

      gcc_assert (code == EQ || code == NE);
      /* Generate XOR since we can't check that one operand is zero vector.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
      tmp = gen_lowpart (p_mode, tmp);
      /* ptest tmp, tmp sets ZF iff tmp is all-zero, i.e. op0 == op1.  */
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
			      gen_rtx_UNSPEC (CCmode,
					      gen_rtvec (2, tmp, tmp),
					      UNSPEC_PTEST)));
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_HFmode:
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      /* Single compare + single conditional jump.  */
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* For 32-bit target DI comparison may be performed on
	 SSE registers.  To allow this we should avoid split
	 to SI mode which is achieved by doing xor in DI mode
	 and then comparing with zero (which is recognized by
	 STV pass).  We don't compare using xor when optimizing
	 for size.  */
      if (!optimize_insn_for_size_p ()
	  && TARGET_STV
	  && (code == EQ || code == NE))
	{
	  op0 = force_reg (mode, gen_rtx_XOR (mode, op0, op1));
	  op1 = const0_rtx;
	}
      /* FALLTHRU */
    case E_TImode:
      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	/* Canonicalize so that a constant, if any, is the second
	   operand.  */
	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
	   avoid two branches.  This costs one extra insn, so disable when
	   optimizing for size.  */

	if ((code == EQ || code == NE)
	    && (!optimize_insn_for_size_p ()
	        || hi[1] == const0_rtx || lo[1] == const0_rtx))
	  {
	    rtx xor0, xor1;

	    xor1 = hi[0];
	    if (hi[1] != const0_rtx)
	      xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
				   NULL_RTX, 0, OPTAB_WIDEN);

	    xor0 = lo[0];
	    if (lo[1] != const0_rtx)
	      xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
				   NULL_RTX, 0, OPTAB_WIDEN);

	    tmp = expand_binop (submode, ior_optab, xor1, xor0,
				NULL_RTX, 0, OPTAB_WIDEN);

	    ix86_expand_branch (code, tmp, const0_rtx, label);
	    return;
	  }

	/* Otherwise, if we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      /* cmp lo; sbb hi -- the resulting carry/sign/overflow
		 flags reflect the full double-word comparison.  */
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }

	  default:
	    break;
	  }

	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code1 = code;
	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:
	    break;

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	  case NE:   code2 = UNKNOWN; break;

	  default:
	    gcc_unreachable ();
	  }

	/*
	 * a < b =>
	 *    if (hi(a) < hi(b)) goto true;
	 *    if (hi(a) > hi(b)) goto false;
	 *    if (lo(a) < lo(b)) goto true;
	 *  false:
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	/* Low words compare unsigned regardless of the signedness of
	   the original comparison.  */
	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);
	return;
      }

    default:
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
2511
2512 /* Figure out whether to use unordered fp comparisons. */
2513
2514 static bool
2515 ix86_unordered_fp_compare (enum rtx_code code)
2516 {
2517 if (!TARGET_IEEE_FP)
2518 return false;
2519
2520 switch (code)
2521 {
2522 case LT:
2523 case LE:
2524 case GT:
2525 case GE:
2526 case LTGT:
2527 return false;
2528
2529 case EQ:
2530 case NE:
2531
2532 case UNORDERED:
2533 case ORDERED:
2534 case UNLT:
2535 case UNLE:
2536 case UNGT:
2537 case UNGE:
2538 case UNEQ:
2539 return true;
2540
2541 default:
2542 gcc_unreachable ();
2543 }
2544 }
2545
2546 /* Return a comparison we can do and that it is equivalent to
2547 swap_condition (code) apart possibly from orderedness.
2548 But, never change orderedness if TARGET_IEEE_FP, returning
2549 UNKNOWN in that case if necessary. */
2550
2551 static enum rtx_code
2552 ix86_fp_swap_condition (enum rtx_code code)
2553 {
2554 switch (code)
2555 {
2556 case GT: /* GTU - CF=0 & ZF=0 */
2557 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2558 case GE: /* GEU - CF=0 */
2559 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2560 case UNLT: /* LTU - CF=1 */
2561 return TARGET_IEEE_FP ? UNKNOWN : GT;
2562 case UNLE: /* LEU - CF=1 | ZF=1 */
2563 return TARGET_IEEE_FP ? UNKNOWN : GE;
2564 default:
2565 return swap_condition (code);
2566 }
2567 }
2568
2569 /* Return cost of comparison CODE using the best strategy for performance.
2570 All following functions do use number of instructions as a cost metrics.
2571 In future this should be tweaked to compute bytes for optimize_size and
2572 take into account performance of various instructions on various CPUs. */
2573
2574 static int
2575 ix86_fp_comparison_cost (enum rtx_code code)
2576 {
2577 int arith_cost;
2578
2579 /* The cost of code using bit-twiddling on %ah. */
2580 switch (code)
2581 {
2582 case UNLE:
2583 case UNLT:
2584 case LTGT:
2585 case GT:
2586 case GE:
2587 case UNORDERED:
2588 case ORDERED:
2589 case UNEQ:
2590 arith_cost = 4;
2591 break;
2592 case LT:
2593 case NE:
2594 case EQ:
2595 case UNGE:
2596 arith_cost = TARGET_IEEE_FP ? 5 : 4;
2597 break;
2598 case LE:
2599 case UNGT:
2600 arith_cost = TARGET_IEEE_FP ? 6 : 4;
2601 break;
2602 default:
2603 gcc_unreachable ();
2604 }
2605
2606 switch (ix86_fp_comparison_strategy (code))
2607 {
2608 case IX86_FPCMP_COMI:
2609 return arith_cost > 4 ? 3 : 2;
2610 case IX86_FPCMP_SAHF:
2611 return arith_cost > 4 ? 4 : 3;
2612 default:
2613 return arith_cost;
2614 }
2615 }
2616
2617 /* Swap, force into registers, or otherwise massage the two operands
2618 to a fp comparison. The operands are updated in place; the new
2619 comparison code is returned. */
2620
static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      /* Swap when op0 is not loadable as a standard 387 constant, or
	 is in memory while op1 could profitably take its place --
	 provided the condition can be swapped without changing
	 orderedness.  */
      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    /* Not a 387 immediate: spill it to the constant pool.  */
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      /* fldz (tmp == 1) can stay an immediate operand unless
		 cmov needs it in a register.  */
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
2698
2699 /* Generate insn patterns to do a floating point compare of OPERANDS. */
2700
static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  /* Wrap the compare so later passes know it must not trap on NaNs.  */
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      /* fcomi/ucomi sets the CPU flags directly.  */
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      /* fnstsw into %ax, then sahf copies %ah into the flags.  */
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      /* fnstsw into a scratch, then test/and/cmp bits of %ah by hand.
	 The FPU status bits of interest in %ah are C0 (0x01), C2 (0x04)
	 and C3 (0x40); 0x45 tests all three at once.  */
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  /* C2 (0x04) is set when the operands are unordered.  */
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
2859
2860 /* Generate insn patterns to do an integer compare of OPERANDS. */
2861
2862 static rtx
2863 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
2864 {
2865 machine_mode cmpmode;
2866 rtx tmp, flags;
2867
2868 /* Swap operands to emit carry flag comparison. */
2869 if ((code == GTU || code == LEU)
2870 && nonimmediate_operand (op1, VOIDmode))
2871 {
2872 std::swap (op0, op1);
2873 code = swap_condition (code);
2874 }
2875
2876 cmpmode = SELECT_CC_MODE (code, op0, op1);
2877 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
2878
2879 /* This is very simple, but making the interface the same as in the
2880 FP case makes the rest of the code easier. */
2881 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
2882 emit_insn (gen_rtx_SET (flags, tmp));
2883
2884 /* Return the test that should be put into the flags user, i.e.
2885 the bcc, scc, or cmov instruction. */
2886 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
2887 }
2888
2889 static rtx
2890 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
2891 {
2892 rtx ret;
2893
2894 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
2895 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
2896
2897 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
2898 {
2899 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
2900 ret = ix86_expand_fp_compare (code, op0, op1);
2901 }
2902 else
2903 ret = ix86_expand_int_compare (code, op0, op1);
2904
2905 return ret;
2906 }
2907
2908 void
2909 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
2910 {
2911 rtx ret;
2912
2913 gcc_assert (GET_MODE (dest) == QImode);
2914
2915 ret = ix86_expand_compare (code, op0, op1);
2916 PUT_MODE (ret, QImode);
2917 emit_insn (gen_rtx_SET (dest, ret));
2918 }
2919
2920 /* Expand floating point op0 <=> op1, i.e.
2921 dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : 2. */
2922
void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1)
{
  /* The %ah-twiddling strategy cannot reuse one flag setting for
     several tests, so this expansion requires comi or sahf.  */
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = gen_label_rtx ();	/* op0 == op1 -> dest = 0.  */
  rtx l1 = gen_label_rtx ();	/* op0 > op1  -> dest = 1.  */
  /* Unordered (l2 -> dest = 2) only matters with IEEE conformance.  */
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;
  if (l2)
    {
      /* Branch to l2 when the comparison above was unordered (NaN).  */
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
    }
  /* Branch to l0 on equality (UNEQ: ordered NaNs already went to l2,
     or are ignored without IEEE conformance).  */
  rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			   gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
			      gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::unlikely ());
  /* Branch to l1 on greater-than; fall through means less-than.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
			      gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
  jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  add_reg_br_prob_note (jmp, profile_probability::even ());
  /* Fall-through: op0 < op1.  */
  emit_move_insn (dest, constm1_rtx);
  emit_jump (lend);
  emit_label (l0);
  emit_move_insn (dest, const0_rtx);
  emit_jump (lend);
  emit_label (l1);
  emit_move_insn (dest, const1_rtx);
  emit_jump (lend);
  if (l2)
    {
      emit_label (l2);
      emit_move_insn (dest, const2_rtx);
    }
  emit_label (lend);
}
2968
2969 /* Expand comparison setting or clearing carry flag. Return true when
2970 successful and set pop for the operation. */
static bool
ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
{
  machine_mode mode
    = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);

  /* Do not handle double-mode compares that go through special path.  */
  if (mode == (TARGET_64BIT ? TImode : DImode))
    return false;

  if (SCALAR_FLOAT_MODE_P (mode))
    {
      rtx compare_op;
      rtx_insn *compare_seq;

      gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));

      /* Shortcut:  following common codes never translate
	 into carry flag compares.  */
      if (code == EQ || code == NE || code == UNEQ || code == LTGT
	  || code == ORDERED || code == UNORDERED)
	return false;

      /* These comparisons require zero flag; swap operands so they won't.  */
      if ((code == GT || code == UNLE || code == LE || code == UNGT)
	  && !TARGET_IEEE_FP)
	{
	  std::swap (op0, op1);
	  code = swap_condition (code);
	}

      /* Try to expand the comparison and verify that we end up with
	 carry flag based comparison.  This fails to be true only when
	 we decide to expand comparison using arithmetic that is not
	 too common scenario.  */
      start_sequence ();
      compare_op = ix86_expand_fp_compare (code, op0, op1);
      compare_seq = get_insns ();
      end_sequence ();

      if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
	code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
      else
	code = GET_CODE (compare_op);

      /* Only LTU/GEU read nothing but the carry flag.  */
      if (code != LTU && code != GEU)
	return false;

      /* The trial sequence is usable; emit it for real.  */
      emit_insn (compare_seq);
      *pop = compare_op;
      return true;
    }

  if (!INTEGRAL_MODE_P (mode))
    return false;

  /* Rewrite integer comparisons into an equivalent LTU/GEU form where
     possible, adjusting op1 as needed.  */
  switch (code)
    {
    case LTU:
    case GEU:
      break;

    /* Convert a==0 into (unsigned)a<1.  */
    case EQ:
    case NE:
      if (op1 != const0_rtx)
	return false;
      op1 = const1_rtx;
      code = (code == EQ ? LTU : GEU);
      break;

    /* Convert a>b into b<a or a>=b-1.  */
    case GTU:
    case LEU:
      if (CONST_INT_P (op1))
	{
	  op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
	  /* Bail out on overflow.  We still can swap operands but that
	     would force loading of the constant into register.  */
	  if (op1 == const0_rtx
	      || !x86_64_immediate_operand (op1, GET_MODE (op1)))
	    return false;
	  code = (code == GTU ? GEU : LTU);
	}
      else
	{
	  std::swap (op0, op1);
	  code = (code == GTU ? LTU : GEU);
	}
      break;

    /* Convert a>=0 into (unsigned)a<0x80000000.  */
    case LT:
    case GE:
      if (mode == DImode || op1 != const0_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LT ? GEU : LTU);
      break;
    case LE:
    case GT:
      if (mode == DImode || op1 != constm1_rtx)
	return false;
      op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
      code = (code == LE ? GEU : LTU);
      break;

    default:
      return false;
    }
  /* Swapping operands may cause constant to appear as first operand.  */
  if (!nonimmediate_operand (op0, VOIDmode))
    {
      if (!can_create_pseudo_p ())
	return false;
      op0 = force_reg (mode, op0);
    }
  *pop = ix86_expand_compare (code, op0, op1);
  gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
  return true;
}
3092
3093 /* Expand conditional increment or decrement using adb/sbb instructions.
3094 The default case using setcc followed by the conditional move can be
3095 done by generic code. */
bool
ix86_expand_int_addcc (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx flags;
  rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
  rtx compare_op;
  rtx val = const0_rtx;
  bool fpcmp = false;
  machine_mode mode;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  /* Only +/-1 increments can be done with a single adc/sbb.  */
  if (operands[3] != const1_rtx
      && operands[3] != constm1_rtx)
    return false;
  /* The comparison must be expressible as a carry-flag test.  */
  if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
    return false;
  code = GET_CODE (compare_op);

  flags = XEXP (compare_op, 0);

  if (GET_MODE (flags) == CCFPmode)
    {
      fpcmp = true;
      code = ix86_fp_compare_code_to_integer (code);
    }

  /* Canonicalize on LTU (carry set); for GEU reverse the condition and
     start from -1 so adc/sbb still produce the right result.  */
  if (code != LTU)
    {
      val = constm1_rtx;
      if (fpcmp)
	PUT_CODE (compare_op,
		  reverse_condition_maybe_unordered
		  (GET_CODE (compare_op)));
      else
	PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
    }

  mode = GET_MODE (operands[0]);

  /* Construct either adc or sbb insn.  */
  if ((code == LTU) == (operands[3] == constm1_rtx))
    insn = gen_sub3_carry;
  else
    insn = gen_add3_carry;

  emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));

  return true;
}
3147
3148 bool
3149 ix86_expand_int_movcc (rtx operands[])
3150 {
3151 enum rtx_code code = GET_CODE (operands[1]), compare_code;
3152 rtx_insn *compare_seq;
3153 rtx compare_op;
3154 machine_mode mode = GET_MODE (operands[0]);
3155 bool sign_bit_compare_p = false;
3156 rtx op0 = XEXP (operands[1], 0);
3157 rtx op1 = XEXP (operands[1], 1);
3158 rtx op2 = operands[2];
3159 rtx op3 = operands[3];
3160
3161 if (GET_MODE (op0) == TImode
3162 || (GET_MODE (op0) == DImode
3163 && !TARGET_64BIT))
3164 return false;
3165
3166 start_sequence ();
3167 compare_op = ix86_expand_compare (code, op0, op1);
3168 compare_seq = get_insns ();
3169 end_sequence ();
3170
3171 compare_code = GET_CODE (compare_op);
3172
3173 if ((op1 == const0_rtx && (code == GE || code == LT))
3174 || (op1 == constm1_rtx && (code == GT || code == LE)))
3175 sign_bit_compare_p = true;
3176
3177 /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3178 but if op1 is a constant, the latter form allows more optimizations,
3179 either through the last 2 ops being constant handling, or the one
3180 constant and one variable cases. On the other side, for cmov the
3181 former might be better as we don't need to load the constant into
3182 another register. */
3183 if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3184 op2 = op1;
3185 /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3186 else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3187 op3 = op1;
3188
3189 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3190 HImode insns, we'd be swallowed in word prefix ops. */
3191
3192 if ((mode != HImode || TARGET_FAST_PREFIX)
3193 && (mode != (TARGET_64BIT ? TImode : DImode))
3194 && CONST_INT_P (op2)
3195 && CONST_INT_P (op3))
3196 {
3197 rtx out = operands[0];
3198 HOST_WIDE_INT ct = INTVAL (op2);
3199 HOST_WIDE_INT cf = INTVAL (op3);
3200 HOST_WIDE_INT diff;
3201
3202 diff = ct - cf;
3203 /* Sign bit compares are better done using shifts than we do by using
3204 sbb. */
3205 if (sign_bit_compare_p
3206 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3207 {
3208 /* Detect overlap between destination and compare sources. */
3209 rtx tmp = out;
3210
3211 if (!sign_bit_compare_p)
3212 {
3213 rtx flags;
3214 bool fpcmp = false;
3215
3216 compare_code = GET_CODE (compare_op);
3217
3218 flags = XEXP (compare_op, 0);
3219
3220 if (GET_MODE (flags) == CCFPmode)
3221 {
3222 fpcmp = true;
3223 compare_code
3224 = ix86_fp_compare_code_to_integer (compare_code);
3225 }
3226
3227 /* To simplify rest of code, restrict to the GEU case. */
3228 if (compare_code == LTU)
3229 {
3230 std::swap (ct, cf);
3231 compare_code = reverse_condition (compare_code);
3232 code = reverse_condition (code);
3233 }
3234 else
3235 {
3236 if (fpcmp)
3237 PUT_CODE (compare_op,
3238 reverse_condition_maybe_unordered
3239 (GET_CODE (compare_op)));
3240 else
3241 PUT_CODE (compare_op,
3242 reverse_condition (GET_CODE (compare_op)));
3243 }
3244 diff = ct - cf;
3245
3246 if (reg_overlap_mentioned_p (out, op0)
3247 || reg_overlap_mentioned_p (out, op1))
3248 tmp = gen_reg_rtx (mode);
3249
3250 if (mode == DImode)
3251 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3252 else
3253 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3254 flags, compare_op));
3255 }
3256 else
3257 {
3258 if (code == GT || code == GE)
3259 code = reverse_condition (code);
3260 else
3261 {
3262 std::swap (ct, cf);
3263 diff = ct - cf;
3264 }
3265 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3266 }
3267
3268 if (diff == 1)
3269 {
3270 /*
3271 * cmpl op0,op1
3272 * sbbl dest,dest
3273 * [addl dest, ct]
3274 *
3275 * Size 5 - 8.
3276 */
3277 if (ct)
3278 tmp = expand_simple_binop (mode, PLUS,
3279 tmp, GEN_INT (ct),
3280 copy_rtx (tmp), 1, OPTAB_DIRECT);
3281 }
3282 else if (cf == -1)
3283 {
3284 /*
3285 * cmpl op0,op1
3286 * sbbl dest,dest
3287 * orl $ct, dest
3288 *
3289 * Size 8.
3290 */
3291 tmp = expand_simple_binop (mode, IOR,
3292 tmp, GEN_INT (ct),
3293 copy_rtx (tmp), 1, OPTAB_DIRECT);
3294 }
3295 else if (diff == -1 && ct)
3296 {
3297 /*
3298 * cmpl op0,op1
3299 * sbbl dest,dest
3300 * notl dest
3301 * [addl dest, cf]
3302 *
3303 * Size 8 - 11.
3304 */
3305 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3306 if (cf)
3307 tmp = expand_simple_binop (mode, PLUS,
3308 copy_rtx (tmp), GEN_INT (cf),
3309 copy_rtx (tmp), 1, OPTAB_DIRECT);
3310 }
3311 else
3312 {
3313 /*
3314 * cmpl op0,op1
3315 * sbbl dest,dest
3316 * [notl dest]
3317 * andl cf - ct, dest
3318 * [addl dest, ct]
3319 *
3320 * Size 8 - 11.
3321 */
3322
3323 if (cf == 0)
3324 {
3325 cf = ct;
3326 ct = 0;
3327 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3328 }
3329
3330 tmp = expand_simple_binop (mode, AND,
3331 copy_rtx (tmp),
3332 gen_int_mode (cf - ct, mode),
3333 copy_rtx (tmp), 1, OPTAB_DIRECT);
3334 if (ct)
3335 tmp = expand_simple_binop (mode, PLUS,
3336 copy_rtx (tmp), GEN_INT (ct),
3337 copy_rtx (tmp), 1, OPTAB_DIRECT);
3338 }
3339
3340 if (!rtx_equal_p (tmp, out))
3341 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3342
3343 return true;
3344 }
3345
3346 if (diff < 0)
3347 {
3348 machine_mode cmp_mode = GET_MODE (op0);
3349 enum rtx_code new_code;
3350
3351 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3352 {
3353 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3354
3355 /* We may be reversing a non-trapping
3356 comparison to a trapping comparison. */
3357 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3358 && code != EQ && code != NE
3359 && code != ORDERED && code != UNORDERED)
3360 new_code = UNKNOWN;
3361 else
3362 new_code = reverse_condition_maybe_unordered (code);
3363 }
3364 else
3365 new_code = ix86_reverse_condition (code, cmp_mode);
3366 if (new_code != UNKNOWN)
3367 {
3368 std::swap (ct, cf);
3369 diff = -diff;
3370 code = new_code;
3371 }
3372 }
3373
3374 compare_code = UNKNOWN;
3375 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3376 && CONST_INT_P (op1))
3377 {
3378 if (op1 == const0_rtx
3379 && (code == LT || code == GE))
3380 compare_code = code;
3381 else if (op1 == constm1_rtx)
3382 {
3383 if (code == LE)
3384 compare_code = LT;
3385 else if (code == GT)
3386 compare_code = GE;
3387 }
3388 }
3389
3390 /* Optimize dest = (op0 < 0) ? -1 : cf. */
3391 if (compare_code != UNKNOWN
3392 && GET_MODE (op0) == GET_MODE (out)
3393 && (cf == -1 || ct == -1))
3394 {
3395 /* If lea code below could be used, only optimize
3396 if it results in a 2 insn sequence. */
3397
3398 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3399 || diff == 3 || diff == 5 || diff == 9)
3400 || (compare_code == LT && ct == -1)
3401 || (compare_code == GE && cf == -1))
3402 {
3403 /*
3404 * notl op1 (if necessary)
3405 * sarl $31, op1
3406 * orl cf, op1
3407 */
3408 if (ct != -1)
3409 {
3410 cf = ct;
3411 ct = -1;
3412 code = reverse_condition (code);
3413 }
3414
3415 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3416
3417 out = expand_simple_binop (mode, IOR,
3418 out, GEN_INT (cf),
3419 out, 1, OPTAB_DIRECT);
3420 if (out != operands[0])
3421 emit_move_insn (operands[0], out);
3422
3423 return true;
3424 }
3425 }
3426
3427
3428 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3429 || diff == 3 || diff == 5 || diff == 9)
3430 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3431 && (mode != DImode
3432 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3433 {
3434 /*
3435 * xorl dest,dest
3436 * cmpl op1,op2
3437 * setcc dest
3438 * lea cf(dest*(ct-cf)),dest
3439 *
3440 * Size 14.
3441 *
3442 * This also catches the degenerate setcc-only case.
3443 */
3444
3445 rtx tmp;
3446 int nops;
3447
3448 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3449
3450 nops = 0;
3451 /* On x86_64 the lea instruction operates on Pmode, so we need
3452 to get arithmetics done in proper mode to match. */
3453 if (diff == 1)
3454 tmp = copy_rtx (out);
3455 else
3456 {
3457 rtx out1;
3458 out1 = copy_rtx (out);
3459 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3460 nops++;
3461 if (diff & 1)
3462 {
3463 tmp = gen_rtx_PLUS (mode, tmp, out1);
3464 nops++;
3465 }
3466 }
3467 if (cf != 0)
3468 {
3469 tmp = plus_constant (mode, tmp, cf);
3470 nops++;
3471 }
3472 if (!rtx_equal_p (tmp, out))
3473 {
3474 if (nops == 1)
3475 out = force_operand (tmp, copy_rtx (out));
3476 else
3477 emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3478 }
3479 if (!rtx_equal_p (out, operands[0]))
3480 emit_move_insn (operands[0], copy_rtx (out));
3481
3482 return true;
3483 }
3484
3485 /*
3486 * General case: Jumpful:
3487 * xorl dest,dest cmpl op1, op2
3488 * cmpl op1, op2 movl ct, dest
3489 * setcc dest jcc 1f
3490 * decl dest movl cf, dest
3491 * andl (cf-ct),dest 1:
3492 * addl ct,dest
3493 *
3494 * Size 20. Size 14.
3495 *
3496 * This is reasonably steep, but branch mispredict costs are
3497 * high on modern cpus, so consider failing only if optimizing
3498 * for space.
3499 */
3500
3501 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3502 && BRANCH_COST (optimize_insn_for_speed_p (),
3503 false) >= 2)
3504 {
3505 if (cf == 0)
3506 {
3507 machine_mode cmp_mode = GET_MODE (op0);
3508 enum rtx_code new_code;
3509
3510 if (SCALAR_FLOAT_MODE_P (cmp_mode))
3511 {
3512 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3513
3514 /* We may be reversing a non-trapping
3515 comparison to a trapping comparison. */
3516 if (HONOR_NANS (cmp_mode) && flag_trapping_math
3517 && code != EQ && code != NE
3518 && code != ORDERED && code != UNORDERED)
3519 new_code = UNKNOWN;
3520 else
3521 new_code = reverse_condition_maybe_unordered (code);
3522
3523 }
3524 else
3525 {
3526 new_code = ix86_reverse_condition (code, cmp_mode);
3527 if (compare_code != UNKNOWN && new_code != UNKNOWN)
3528 compare_code = reverse_condition (compare_code);
3529 }
3530
3531 if (new_code != UNKNOWN)
3532 {
3533 cf = ct;
3534 ct = 0;
3535 code = new_code;
3536 }
3537 }
3538
3539 if (compare_code != UNKNOWN)
3540 {
3541 /* notl op1 (if needed)
3542 sarl $31, op1
3543 andl (cf-ct), op1
3544 addl ct, op1
3545
3546 For x < 0 (resp. x <= -1) there will be no notl,
3547 so if possible swap the constants to get rid of the
3548 complement.
3549 True/false will be -1/0 while code below (store flag
3550 followed by decrement) is 0/-1, so the constants need
3551 to be exchanged once more. */
3552
3553 if (compare_code == GE || !cf)
3554 {
3555 code = reverse_condition (code);
3556 compare_code = LT;
3557 }
3558 else
3559 std::swap (ct, cf);
3560
3561 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3562 }
3563 else
3564 {
3565 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3566
3567 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
3568 constm1_rtx,
3569 copy_rtx (out), 1, OPTAB_DIRECT);
3570 }
3571
3572 out = expand_simple_binop (mode, AND, copy_rtx (out),
3573 gen_int_mode (cf - ct, mode),
3574 copy_rtx (out), 1, OPTAB_DIRECT);
3575 if (ct)
3576 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
3577 copy_rtx (out), 1, OPTAB_DIRECT);
3578 if (!rtx_equal_p (out, operands[0]))
3579 emit_move_insn (operands[0], copy_rtx (out));
3580
3581 return true;
3582 }
3583 }
3584
3585 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3586 {
3587 /* Try a few things more with specific constants and a variable. */
3588
3589 optab op;
3590 rtx var, orig_out, out, tmp;
3591
3592 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
3593 return false;
3594
3595 operands[2] = op2;
3596 operands[3] = op3;
3597
3598 /* If one of the two operands is an interesting constant, load a
3599 constant with the above and mask it in with a logical operation. */
3600
3601 if (CONST_INT_P (operands[2]))
3602 {
3603 var = operands[3];
3604 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
3605 operands[3] = constm1_rtx, op = and_optab;
3606 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
3607 operands[3] = const0_rtx, op = ior_optab;
3608 else
3609 return false;
3610 }
3611 else if (CONST_INT_P (operands[3]))
3612 {
3613 var = operands[2];
3614 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
3615 {
3616 /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
3617 "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
3618 if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
3619 operands[1] = simplify_gen_relational (LT, VOIDmode,
3620 GET_MODE (op0),
3621 op0, const0_rtx);
3622
3623 operands[2] = constm1_rtx;
3624 op = and_optab;
3625 }
3626 else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
3627 operands[2] = const0_rtx, op = ior_optab;
3628 else
3629 return false;
3630 }
3631 else
3632 return false;
3633
3634 orig_out = operands[0];
3635 tmp = gen_reg_rtx (mode);
3636 operands[0] = tmp;
3637
3638 /* Recurse to get the constant loaded. */
3639 if (!ix86_expand_int_movcc (operands))
3640 return false;
3641
3642 /* Mask in the interesting variable. */
3643 out = expand_binop (mode, op, var, tmp, orig_out, 0,
3644 OPTAB_WIDEN);
3645 if (!rtx_equal_p (out, orig_out))
3646 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
3647
3648 return true;
3649 }
3650
3651 /*
3652 * For comparison with above,
3653 *
3654 * movl cf,dest
3655 * movl ct,tmp
3656 * cmpl op1,op2
3657 * cmovcc tmp,dest
3658 *
3659 * Size 15.
3660 */
3661
3662 if (! nonimmediate_operand (operands[2], mode))
3663 operands[2] = force_reg (mode, operands[2]);
3664 if (! nonimmediate_operand (operands[3], mode))
3665 operands[3] = force_reg (mode, operands[3]);
3666
3667 if (! register_operand (operands[2], VOIDmode)
3668 && (mode == QImode
3669 || ! register_operand (operands[3], VOIDmode)))
3670 operands[2] = force_reg (mode, operands[2]);
3671
3672 if (mode == QImode
3673 && ! register_operand (operands[3], VOIDmode))
3674 operands[3] = force_reg (mode, operands[3]);
3675
3676 emit_insn (compare_seq);
3677 emit_insn (gen_rtx_SET (operands[0],
3678 gen_rtx_IF_THEN_ELSE (mode,
3679 compare_op, operands[2],
3680 operands[3])));
3681 return true;
3682 }
3683
3684 /* Detect conditional moves that exactly match min/max operational
3685 semantics. Note that this is IEEE safe, as long as we don't
3686 interchange the operands.
3687
3688 Returns FALSE if this conditional move doesn't match a MIN/MAX,
3689 and TRUE if the operation is successful and instructions are emitted. */
3690
static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode;
  bool is_min;
  rtx tmp;

  /* Only a LT comparison matches the min/max pattern directly; UNGE is
     its exact inverse, so swapping the selected arms reduces it to the
     LT shape.  Anything else is rejected.  */
  if (code == LT)
    ;
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  /* a < b ? a : b is MIN; the mirrored operand pairing is MAX.  Any
     other pairing is not a min/max.  */
  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  mode = GET_MODE (dest);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      /* Wrap the operation in an UNSPEC so later passes cannot reorder
	 or commute the operands, preserving the IEEE NaN/signed-zero
	 behavior of the underlying min/max instruction.  */
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      /* Fast-math: a plain SMIN/SMAX is acceptable.  Keep at most one
	 memory operand so the pattern stays valid.  */
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
3737
/* Return true if MODE is valid for a vector compare into a mask register;
   the same result holds for a conditional vector move with a mask
   register.  */
3740 static bool
3741 ix86_valid_mask_cmp_mode (machine_mode mode)
3742 {
3743 /* XOP has its own vector conditional movement. */
3744 if (TARGET_XOP && !TARGET_AVX512F)
3745 return false;
3746
3747 /* HFmode only supports vcmpsh whose dest is mask register. */
3748 if (TARGET_AVX512FP16 && mode == HFmode)
3749 return true;
3750
3751 /* AVX512F is needed for mask operation. */
3752 if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
3753 return false;
3754
3755 /* AVX512BW is needed for vector QI/HImode,
3756 AVX512VL is needed for 128/256-bit vector. */
3757 machine_mode inner_mode = GET_MODE_INNER (mode);
3758 int vector_size = GET_MODE_SIZE (mode);
3759 if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
3760 return false;
3761
3762 return vector_size == 64 || TARGET_AVX512VL;
3763 }
3764
3765 /* Return true if integer mask comparison should be used. */
3766 static bool
3767 ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
3768 rtx op_true, rtx op_false)
3769 {
3770 int vector_size = GET_MODE_SIZE (mode);
3771
3772 if (cmp_mode == HFmode)
3773 return true;
3774 else if (vector_size < 16)
3775 return false;
3776 else if (vector_size == 64)
3777 return true;
3778 else if (GET_MODE_INNER (cmp_mode) == HFmode)
3779 return true;
3780
3781 /* When op_true is NULL, op_false must be NULL, or vice versa. */
3782 gcc_assert (!op_true == !op_false);
3783
3784 /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
3785 vector dest is required. */
3786 if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
3787 return false;
3788
3789 /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
3790 if (op_false == CONST0_RTX (mode)
3791 || op_true == CONST0_RTX (mode)
3792 || (INTEGRAL_MODE_P (mode)
3793 && (op_true == CONSTM1_RTX (mode)
3794 || op_false == CONSTM1_RTX (mode))))
3795 return false;
3796
3797 return true;
3798 }
3799
3800 /* Expand an SSE comparison. Return the register with the result. */
3801
static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      /* One mask bit per compared element; use the smallest integer
	 mode that holds them all, but never narrower than QImode.  */
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  /* The second compare operand may stay in memory if the mode's
     operand predicate accepts it.  */
  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  /* Use a fresh pseudo instead of DEST when optimizing, when the mask
     result mode differs from DEST's mode, or when DEST overlaps one of
     the values to be selected (writing the compare result into DEST
     would clobber an input).  */
  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  /* AVX-512 mask comparisons are emitted via UNSPEC_PCMP patterns.  */
  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      /* Comparison produced a different mode; materialize it and
	 convert into DEST.  */
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
3858
3859 /* Emit x86 binary operand CODE in mode MODE for SSE vector
3860 instructions that can be performed using GP registers. */
3861
3862 static void
3863 ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
3864 rtx dst, rtx src1, rtx src2)
3865 {
3866 rtx tmp;
3867
3868 tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
3869
3870 if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
3871 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
3872 {
3873 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
3874 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
3875 }
3876
3877 emit_insn (tmp);
3878 }
3879
3880 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
3881 operations. This is used for both scalar and vector conditional moves. */
3882
void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  /* Invert the mask so the zero arm becomes the false arm;
	     DImode masks without TARGET_64BIT need the knotdi pattern.  */
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    {
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    }
	  else
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
	  cmp = x;
	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);
	}

      if (mode == HFmode)
	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
				                   op_true, op_false, cmp)));
      return;
    }

  /* Special-case arms that are all-ones/all-zeros: the compare result
     itself, or a single AND/ANDN/IOR with it, already IS the answer.  */
  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      /* cmp ? -1 : 0 is just the comparison mask.  */
      emit_move_insn (dest, cmp);
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      /* cmp ? t : 0  ==>  cmp & t.  */
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      /* cmp ? 0 : f  ==>  ~cmp & f.  */
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  else if (vector_all_ones_operand (op_true, mode))
    {
      /* cmp ? -1 : f  ==>  cmp | f.  */
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }

  /* XOP provides a real vector conditional move.  */
  if (TARGET_XOP)
    {
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));
      return;
    }

  /* Otherwise pick a blend instruction for MODE, possibly operating in
     a different (byte-element) mode via lowparts.  */
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE4_1)
	gen = gen_sse4_1_blendvsd;
      break;
    case E_V8QImode:
    case E_V4HImode:
    case E_V2SImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v8qi;
	  blend_mode = V8QImode;
	}
      break;
    case E_V4QImode:
    case E_V2HImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v4qi;
	  blend_mode = V4QImode;
	}
      break;
    case E_V2QImode:
      if (TARGET_SSE4_1)
	gen = gen_mmx_pblendvb_v2qi;
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      if (TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  blend_mode = V16QImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2)
	{
	  gen = gen_avx2_pblendvb;
	  blend_mode = V32QImode;
	}
      break;

    /* 512-bit modes reach here only with an all-ones/all-zeros arm
       excluded above; AVX-512 blendm handles them.  */
    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      /* Blend in BLEND_MODE, reinterpreting the operands as needed,
	 then copy back to DEST's mode.  */
      if (blend_mode == mode)
	x = dest;
      else
	{
	  x = gen_reg_rtx (blend_mode);
	  op_false = gen_lowpart (blend_mode, op_false);
	  op_true = gen_lowpart (blend_mode, op_true);
	  cmp = gen_lowpart (blend_mode, cmp);
	}

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
	emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      /* No blend available: dest = (t & cmp) | (f & ~cmp).  */
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
    }
}
4127
4128 /* Swap, force into registers, or otherwise massage the two operands
4129 to an sse comparison with a mask result. Thus we differ a bit from
4130 ix86_prepare_fp_compare_args which expects to produce a flags result.
4131
4132 The DEST operand exists to help determine whether to commute commutative
4133 operators. The POP0/POP1 operands are updated in place. The new
4134 comparison code is returned, or UNKNOWN if not implementable. */
4135
static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* DEST equals the second operand; swapping is harmless for these
	 commutative codes and puts DEST first.  */
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  /* The (possibly swapped) code to emit; operands were updated in
     place through POP0/POP1.  */
  return code;
}
4191
4192 /* Expand a floating-point conditional move. Return true if successful. */
4193
bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  /* SSE/AVX path: expand as compare + mask select rather than fcmov.  */
  if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      /* Prefer a single min/max instruction when the select matches
	 that pattern.  */
      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      /* Otherwise: build the comparison mask, then blend the arms.  */
      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  /* No fcmov path for TImode, nor DImode without 64-bit support.  */
  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      /* Materialize the condition with setcc, then test that byte
	 against zero — an fcmov-compatible condition.  */
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
4253
4254 /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4255
4256 static int
4257 ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4258 {
4259 switch (code)
4260 {
4261 case EQ:
4262 return 0;
4263 case LT:
4264 case LTU:
4265 return 1;
4266 case LE:
4267 case LEU:
4268 return 2;
4269 case NE:
4270 return 4;
4271 case GE:
4272 case GEU:
4273 return 5;
4274 case GT:
4275 case GTU:
4276 return 6;
4277 default:
4278 gcc_unreachable ();
4279 }
4280 }
4281
4282 /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4283
4284 static int
4285 ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4286 {
4287 switch (code)
4288 {
4289 case EQ:
4290 return 0x00;
4291 case NE:
4292 return 0x04;
4293 case GT:
4294 return 0x0e;
4295 case LE:
4296 return 0x02;
4297 case GE:
4298 return 0x0d;
4299 case LT:
4300 return 0x01;
4301 case UNLE:
4302 return 0x0a;
4303 case UNLT:
4304 return 0x09;
4305 case UNGE:
4306 return 0x05;
4307 case UNGT:
4308 return 0x06;
4309 case UNEQ:
4310 return 0x18;
4311 case LTGT:
4312 return 0x0c;
4313 case ORDERED:
4314 return 0x07;
4315 case UNORDERED:
4316 return 0x03;
4317 default:
4318 gcc_unreachable ();
4319 }
4320 }
4321
4322 /* Return immediate value to be used in UNSPEC_PCMP
4323 for comparison CODE in MODE. */
4324
4325 static int
4326 ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4327 {
4328 if (FLOAT_MODE_P (mode))
4329 return ix86_fp_cmp_code_to_pcmp_immediate (code);
4330 return ix86_int_cmp_code_to_pcmp_immediate (code);
4331 }
4332
4333 /* Expand AVX-512 vector comparison. */
4334
4335 bool
4336 ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4337 {
4338 machine_mode mask_mode = GET_MODE (dest);
4339 machine_mode cmp_mode = GET_MODE (cmp_op0);
4340 rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4341 int unspec_code;
4342 rtx unspec;
4343
4344 switch (code)
4345 {
4346 case LEU:
4347 case GTU:
4348 case GEU:
4349 case LTU:
4350 unspec_code = UNSPEC_UNSIGNED_PCMP;
4351 break;
4352
4353 default:
4354 unspec_code = UNSPEC_PCMP;
4355 }
4356
4357 unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4358 unspec_code);
4359 emit_insn (gen_rtx_SET (dest, unspec));
4360
4361 return true;
4362 }
4363
4364 /* Expand fp vector comparison. */
4365
bool
ix86_expand_fp_vec_cmp (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx cmp;

  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)
    {
      /* UNKNOWN here means pre-AVX LTGT/UNEQ: synthesize from two
	 supported compares combined with AND/IOR.  */
      rtx temp;
      switch (GET_CODE (operands[1]))
	{
	case LTGT:
	  /* LTGT = NE & ORDERED.  */
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				     operands[3], NULL, NULL);
	  code = AND;
	  break;
	case UNEQ:
	  /* UNEQ = EQ | UNORDERED.  */
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				     operands[3], NULL, NULL);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
    }
  else
    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
			       NULL, NULL);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
4408
4409 static rtx
4410 ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
4411 rtx op_true, rtx op_false, bool *negate)
4412 {
4413 machine_mode data_mode = GET_MODE (dest);
4414 machine_mode mode = GET_MODE (cop0);
4415 rtx x;
4416
4417 *negate = false;
4418
4419 /* XOP supports all of the comparisons on all 128-bit vector int types. */
4420 if (TARGET_XOP
4421 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
4422 && GET_MODE_SIZE (mode) <= 16)
4423 ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
4426 else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
4427 ;
4428 else
4429 {
4430 /* Canonicalize the comparison to EQ, GT, GTU. */
4431 switch (code)
4432 {
4433 case EQ:
4434 case GT:
4435 case GTU:
4436 break;
4437
4438 case NE:
4439 case LE:
4440 case LEU:
4441 code = reverse_condition (code);
4442 *negate = true;
4443 break;
4444
4445 case GE:
4446 case GEU:
4447 code = reverse_condition (code);
4448 *negate = true;
4449 /* FALLTHRU */
4450
4451 case LT:
4452 case LTU:
4453 std::swap (cop0, cop1);
4454 code = swap_condition (code);
4455 break;
4456
4457 default:
4458 gcc_unreachable ();
4459 }
4460
4461 /* Only SSE4.1/SSE4.2 supports V2DImode. */
4462 if (mode == V2DImode)
4463 {
4464 switch (code)
4465 {
4466 case EQ:
4467 /* SSE4.1 supports EQ. */
4468 if (!TARGET_SSE4_1)
4469 return NULL;
4470 break;
4471
4472 case GT:
4473 case GTU:
4474 /* SSE4.2 supports GT/GTU. */
4475 if (!TARGET_SSE4_2)
4476 return NULL;
4477 break;
4478
4479 default:
4480 gcc_unreachable ();
4481 }
4482 }
4483
4484 rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
4485 rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
4486 if (*negate)
4487 std::swap (optrue, opfalse);
4488
4489 /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
4490 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
4491 min (x, y) == x). While we add one instruction (the minimum),
4492 we remove the need for two instructions in the negation, as the
4493 result is done this way.
4494 When using masks, do it for SI/DImode element types, as it is shorter
4495 than the two subtractions. */
4496 if ((code != EQ
4497 && GET_MODE_SIZE (mode) != 64
4498 && vector_all_ones_operand (opfalse, data_mode)
4499 && optrue == CONST0_RTX (data_mode))
4500 || (code == GTU
4501 && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
4502 /* Don't do it if not using integer masks and we'd end up with
4503 the right values in the registers though. */
4504 && (GET_MODE_SIZE (mode) == 64
4505 || !vector_all_ones_operand (optrue, data_mode)
4506 || opfalse != CONST0_RTX (data_mode))))
4507 {
4508 rtx (*gen) (rtx, rtx, rtx) = NULL;
4509
4510 switch (mode)
4511 {
4512 case E_V16SImode:
4513 gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
4514 break;
4515 case E_V8DImode:
4516 gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
4517 cop0 = force_reg (mode, cop0);
4518 cop1 = force_reg (mode, cop1);
4519 break;
4520 case E_V32QImode:
4521 if (TARGET_AVX2)
4522 gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
4523 break;
4524 case E_V16HImode:
4525 if (TARGET_AVX2)
4526 gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
4527 break;
4528 case E_V8SImode:
4529 if (TARGET_AVX2)
4530 gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
4531 break;
4532 case E_V4DImode:
4533 if (TARGET_AVX512VL)
4534 {
4535 gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
4536 cop0 = force_reg (mode, cop0);
4537 cop1 = force_reg (mode, cop1);
4538 }
4539 break;
4540 case E_V16QImode:
4541 if (code == GTU && TARGET_SSE2)
4542 gen = gen_uminv16qi3;
4543 else if (code == GT && TARGET_SSE4_1)
4544 gen = gen_sminv16qi3;
4545 break;
4546 case E_V8QImode:
4547 if (code == GTU && TARGET_SSE2)
4548 gen = gen_uminv8qi3;
4549 else if (code == GT && TARGET_SSE4_1)
4550 gen = gen_sminv8qi3;
4551 break;
4552 case E_V4QImode:
4553 if (code == GTU && TARGET_SSE2)
4554 gen = gen_uminv4qi3;
4555 else if (code == GT && TARGET_SSE4_1)
4556 gen = gen_sminv4qi3;
4557 break;
4558 case E_V2QImode:
4559 if (code == GTU && TARGET_SSE2)
4560 gen = gen_uminv2qi3;
4561 else if (code == GT && TARGET_SSE4_1)
4562 gen = gen_sminv2qi3;
4563 break;
4564 case E_V8HImode:
4565 if (code == GTU && TARGET_SSE4_1)
4566 gen = gen_uminv8hi3;
4567 else if (code == GT && TARGET_SSE2)
4568 gen = gen_sminv8hi3;
4569 break;
4570 case E_V4HImode:
4571 if (code == GTU && TARGET_SSE4_1)
4572 gen = gen_uminv4hi3;
4573 else if (code == GT && TARGET_SSE2)
4574 gen = gen_sminv4hi3;
4575 break;
4576 case E_V2HImode:
4577 if (code == GTU && TARGET_SSE4_1)
4578 gen = gen_uminv2hi3;
4579 else if (code == GT && TARGET_SSE2)
4580 gen = gen_sminv2hi3;
4581 break;
4582 case E_V4SImode:
4583 if (TARGET_SSE4_1)
4584 gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
4585 break;
4586 case E_V2SImode:
4587 if (TARGET_SSE4_1)
4588 gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
4589 break;
4590 case E_V2DImode:
4591 if (TARGET_AVX512VL)
4592 {
4593 gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
4594 cop0 = force_reg (mode, cop0);
4595 cop1 = force_reg (mode, cop1);
4596 }
4597 break;
4598 default:
4599 break;
4600 }
4601
4602 if (gen)
4603 {
4604 rtx tem = gen_reg_rtx (mode);
4605 if (!vector_operand (cop0, mode))
4606 cop0 = force_reg (mode, cop0);
4607 if (!vector_operand (cop1, mode))
4608 cop1 = force_reg (mode, cop1);
4609 *negate = !*negate;
4610 emit_insn (gen (tem, cop0, cop1));
4611 cop1 = tem;
4612 code = EQ;
4613 }
4614 }
4615
4616 /* Unsigned parallel compare is not supported by the hardware.
4617 Play some tricks to turn this into a signed comparison
4618 against 0. */
4619 if (code == GTU)
4620 {
4621 cop0 = force_reg (mode, cop0);
4622
4623 switch (mode)
4624 {
4625 case E_V16SImode:
4626 case E_V8DImode:
4627 case E_V8SImode:
4628 case E_V4DImode:
4629 case E_V4SImode:
4630 case E_V2SImode:
4631 case E_V2DImode:
4632 {
4633 rtx t1, t2, mask;
4634
4635 /* Subtract (-(INT MAX) - 1) from both operands to make
4636 them signed. */
4637 mask = ix86_build_signbit_mask (mode, true, false);
4638 t1 = gen_reg_rtx (mode);
4639 emit_insn (gen_sub3_insn (t1, cop0, mask));
4640
4641 t2 = gen_reg_rtx (mode);
4642 emit_insn (gen_sub3_insn (t2, cop1, mask));
4643
4644 cop0 = t1;
4645 cop1 = t2;
4646 code = GT;
4647 }
4648 break;
4649
4650 case E_V64QImode:
4651 case E_V32HImode:
4652 case E_V32QImode:
4653 case E_V16HImode:
4654 case E_V16QImode:
4655 case E_V8QImode:
4656 case E_V4QImode:
4657 case E_V2QImode:
4658 case E_V8HImode:
4659 case E_V4HImode:
4660 case E_V2HImode:
4661 /* Perform a parallel unsigned saturating subtraction. */
4662 x = gen_reg_rtx (mode);
4663 emit_insn (gen_rtx_SET
4664 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
4665 cop0 = x;
4666 cop1 = CONST0_RTX (mode);
4667 code = EQ;
4668 *negate = !*negate;
4669 break;
4670
4671 default:
4672 gcc_unreachable ();
4673 }
4674 }
4675 }
4676
4677 if (*negate)
4678 std::swap (op_true, op_false);
4679
4680 /* Allow the comparison to be done in one mode, but the movcc to
4681 happen in another mode. */
4682 if (data_mode == mode)
4683 {
4684 x = ix86_expand_sse_cmp (dest, code, cop0, cop1,
4685 op_true, op_false);
4686 }
4687 else
4688 {
4689 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
4690 x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
4691 op_true, op_false);
4692 if (GET_MODE (x) == mode)
4693 x = gen_lowpart (data_mode, x);
4694 }
4695
4696 return x;
4697 }
4698
4699 /* Expand integer vector comparison. */
4700
4701 bool
4702 ix86_expand_int_vec_cmp (rtx operands[])
4703 {
4704 rtx_code code = GET_CODE (operands[1]);
4705 bool negate = false;
4706 rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
4707 operands[3], NULL, NULL, &negate);
4708
4709 if (!cmp)
4710 return false;
4711
4712 if (negate)
4713 cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
4714 CONST0_RTX (GET_MODE (cmp)),
4715 NULL, NULL, &negate);
4716
4717 gcc_assert (!negate);
4718
4719 if (operands[0] != cmp)
4720 emit_move_insn (operands[0], cmp);
4721
4722 return true;
4723 }
4724
4725 /* Expand a floating-point vector conditional move; a vcond operation
4726 rather than a movcc operation. */
4727
4728 bool
4729 ix86_expand_fp_vcond (rtx operands[])
4730 {
4731 enum rtx_code code = GET_CODE (operands[3]);
4732 rtx cmp;
4733
4734 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4735 &operands[4], &operands[5]);
4736 if (code == UNKNOWN)
4737 {
4738 rtx temp;
4739 switch (GET_CODE (operands[3]))
4740 {
4741 case LTGT:
4742 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
4743 operands[5], operands[0], operands[0]);
4744 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
4745 operands[5], operands[1], operands[2]);
4746 code = AND;
4747 break;
4748 case UNEQ:
4749 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
4750 operands[5], operands[0], operands[0]);
4751 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
4752 operands[5], operands[1], operands[2]);
4753 code = IOR;
4754 break;
4755 default:
4756 gcc_unreachable ();
4757 }
4758 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4759 OPTAB_DIRECT);
4760 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4761 return true;
4762 }
4763
4764 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
4765 operands[5], operands[1], operands[2]))
4766 return true;
4767
4768 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
4769 operands[1], operands[2]);
4770 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
4771 return true;
4772 }
4773
/* Expand a signed/unsigned integral vector conditional move.
   operands[0] = destination, operands[1]/operands[2] = the two value
   arms, operands[3] = the comparison rtx, operands[4]/operands[5] = the
   compared operands.  Returns false if the expansion is not possible.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      /* operands[1 + (code == LT)] is the arm that must be zero:
	 the "else" arm for LT, the "then" arm for GE.  */
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      /* The arm selected for negative elements.  */
      rtx negop = operands[2 - (code == LT)];
      /* Shift count that moves the sign bit to bit 0.  */
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  /* x < 0 ? 1 : 0: logical right shift of the sign bit.  */
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  /* x < 0 ? -1 : 0: arithmetic right shift replicates the sign
	     bit into every bit.  DImode elements are excluded here —
	     presumably because no vector DImode arithmetic right shift
	     is available in these ISA levels; verify before relaxing.  */
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  /* If the comparison was emitted inverted, compensate by swapping
     the two value arms of the conditional move.  */
  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
4837
/* Try to expand a two-input variable permutation with a single AVX-512
   VPERMT2 instruction.  Arguments come either through D (constant
   permutation) or through TARGET/MASK/OP0/OP1 (variable permutation).
   Returns false if no suitable instruction pattern is available for the
   mode/ISA combination.  */

static bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  /* Mode of the index operand; for FP vectors this is the equally-sized
     integer vector mode.  */
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  /* Select the vpermt2var pattern for MODE, guarded by the ISA feature
     that provides it; leave GEN as NULL when unavailable.  */
  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv16sf3;
	  maskmode = V16SImode;
	}
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv8df3;
	  maskmode = V8DImode;
	}
      break;
    default:
      break;
    }

  if (gen == NULL)
    return false;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      /* Materialize the compile-time permutation D as a CONST_VECTOR
	 index operand.  */
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
4963
/* Expand a variable vector permutation: operands[0] receives elements
   selected from operands[1] (and operands[2] for a two-input shuffle)
   according to the variable index vector operands[3].  */

void
ix86_expand_vec_perm (rtx operands[])
{
  rtx target = operands[0];
  rtx op0 = operands[1];
  rtx op1 = operands[2];
  rtx mask = operands[3];
  rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
  machine_mode mode = GET_MODE (op0);
  machine_mode maskmode = GET_MODE (mask);
  int w, e, i;
  /* A single-input shuffle admits cheaper sequences below.  */
  bool one_operand_shuffle = rtx_equal_p (op0, op1);

  /* Number of elements in the vector.  */
  w = GET_MODE_NUNITS (mode);
  /* Size in bytes of one element.  */
  e = GET_MODE_UNIT_SIZE (mode);
  gcc_assert (w <= 64);

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (mode) == HFmode)
    {
      machine_mode orig_mode = mode;
      mode = mode_for_vector (HImode, w).require ();
      target = lowpart_subreg (mode, target, orig_mode);
      op0 = lowpart_subreg (mode, op0, orig_mode);
      op1 = lowpart_subreg (mode, op1, orig_mode);
    }

  /* AVX-512 has single-instruction variable permutes for these modes.  */
  if (TARGET_AVX512F && one_operand_shuffle)
    {
      rtx (*gen) (rtx, rtx, rtx) = NULL;
      switch (mode)
	{
	case E_V16SImode:
	  gen =gen_avx512f_permvarv16si;
	  break;
	case E_V16SFmode:
	  gen = gen_avx512f_permvarv16sf;
	  break;
	case E_V8DImode:
	  gen = gen_avx512f_permvarv8di;
	  break;
	case E_V8DFmode:
	  gen = gen_avx512f_permvarv8df;
	  break;
	default:
	  break;
	}
      if (gen != NULL)
	{
	  emit_insn (gen (target, op0, mask));
	  return;
	}
    }

  /* Next preference: a single VPERMT2 if the ISA provides one.  */
  if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
    return;

  if (TARGET_AVX2)
    {
      if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
	{
	  /* Unfortunately, the VPERMQ and VPERMPD instructions only support
	     an constant shuffle operand.  With a tiny bit of effort we can
	     use VPERMD instead.  A re-interpretation stall for V4DFmode is
	     unfortunate but there's no avoiding it.
	     Similarly for V16HImode we don't have instructions for variable
	     shuffling, while for V32QImode we can use after preparing suitable
	     masks vpshufb; vpshufb; vpermq; vpor.  */

	  if (mode == V16HImode)
	    {
	      maskmode = mode = V32QImode;
	      w = 32;
	      e = 1;
	    }
	  else
	    {
	      maskmode = mode = V8SImode;
	      w = 8;
	      e = 4;
	    }
	  t1 = gen_reg_rtx (maskmode);

	  /* Replicate the low bits of the V4DImode mask into V8SImode:
	     mask = { A B C D }
	     t1 = { A A B B C C D D }.  */
	  for (i = 0; i < w / 2; ++i)
	    vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
	  vt = force_reg (maskmode, vt);
	  mask = gen_lowpart (maskmode, mask);
	  if (maskmode == V8SImode)
	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
	  else
	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));

	  /* Multiply the shuffle indices by two (via self-addition).  */
	  t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
				    OPTAB_DIRECT);

	  /* Add one to the odd shuffle indices:
	     t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
	  for (i = 0; i < w / 2; ++i)
	    {
	      vec[i * 2] = const0_rtx;
	      vec[i * 2 + 1] = const1_rtx;
	    }
	  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
	  vt = validize_mem (force_const_mem (maskmode, vt));
	  t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
				    OPTAB_DIRECT);

	  /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
	  operands[3] = mask = t1;
	  target = gen_reg_rtx (mode);
	  op0 = gen_lowpart (mode, op0);
	  op1 = gen_lowpart (mode, op1);
	}

      switch (mode)
	{
	case E_V8SImode:
	  /* The VPERMD and VPERMPS instructions already properly ignore
	     the high bits of the shuffle elements.  No need for us to
	     perform an AND ourselves.  */
	  if (one_operand_shuffle)
	    {
	      emit_insn (gen_avx2_permvarv8si (target, op0, mask));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	    }
	  else
	    {
	      /* Permute each input separately, then merge elementwise
		 at merge_two: below.  */
	      t1 = gen_reg_rtx (V8SImode);
	      t2 = gen_reg_rtx (V8SImode);
	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V8SFmode:
	  mask = gen_lowpart (V8SImode, mask);
	  if (one_operand_shuffle)
	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
	  else
	    {
	      t1 = gen_reg_rtx (V8SFmode);
	      t2 = gen_reg_rtx (V8SFmode);
	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
	      goto merge_two;
	    }
	  return;

	case E_V4SImode:
	  /* By combining the two 128-bit input vectors into one 256-bit
	     input vector, we can use VPERMD and VPERMPS for the full
	     two-operand shuffle.  */
	  t1 = gen_reg_rtx (V8SImode);
	  t2 = gen_reg_rtx (V8SImode);
	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
	  return;

	case E_V4SFmode:
	  t1 = gen_reg_rtx (V8SFmode);
	  t2 = gen_reg_rtx (V8SImode);
	  mask = gen_lowpart (V4SImode, mask);
	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
	  return;

	case E_V32QImode:
	  t1 = gen_reg_rtx (V32QImode);
	  t2 = gen_reg_rtx (V32QImode);
	  t3 = gen_reg_rtx (V32QImode);
	  vt2 = GEN_INT (-128);
	  /* vt = all bytes 0x80 (the pshufb "zero this lane" bit).  */
	  vt = gen_const_vec_duplicate (V32QImode, vt2);
	  vt = force_reg (V32QImode, vt);
	  /* vt2 = 0x80 in the low 128-bit lane, 0 in the high lane.  */
	  for (i = 0; i < 32; i++)
	    vec[i] = i < 16 ? vt2 : const0_rtx;
	  vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
	  vt2 = force_reg (V32QImode, vt2);
	  /* From mask create two adjusted masks, which contain the same
	     bits as mask in the low 7 bits of each vector element.
	     The first mask will have the most significant bit clear
	     if it requests element from the same 128-bit lane
	     and MSB set if it requests element from the other 128-bit lane.
	     The second mask will have the opposite values of the MSB,
	     and additionally will have its 128-bit lanes swapped.
	     E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
	     t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
	     t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
	     stands for other 12 bytes.  */
	  /* The bit whether element is from the same lane or the other
	     lane is bit 4, so shift it up by 3 to the MSB position.  */
	  t5 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
				    GEN_INT (3)));
	  /* Clear MSB bits from the mask just in case it had them set.  */
	  emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
	  /* After this t1 will have MSB set for elements from other lane.  */
	  emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
	  /* Clear bits other than MSB.  */
	  emit_insn (gen_andv32qi3 (t1, t1, vt));
	  /* Or in the lower bits from mask into t3.  */
	  emit_insn (gen_iorv32qi3 (t3, t1, t2));
	  /* And invert MSB bits in t1, so MSB is set for elements from the same
	     lane.  */
	  emit_insn (gen_xorv32qi3 (t1, t1, vt));
	  /* Swap 128-bit lanes in t3.  */
	  t6 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  /* And or in the lower bits from mask into t1.  */
	  emit_insn (gen_iorv32qi3 (t1, t1, t2));
	  if (one_operand_shuffle)
	    {
	      /* Each of these shuffles will put 0s in places where
		 element from the other 128-bit lane is needed, otherwise
		 will shuffle in the requested value.  */
	      emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
						gen_lowpart (V32QImode, t6)));
	      emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
	      /* For t3 the 128-bit lanes are swapped again.  */
	      t7 = gen_reg_rtx (V4DImode);
	      emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
					      const2_rtx, GEN_INT (3),
					      const0_rtx, const1_rtx));
	      /* And oring both together leads to the result.  */
	      emit_insn (gen_iorv32qi3 (target, t1,
					gen_lowpart (V32QImode, t7)));
	      if (target != operands[0])
		emit_move_insn (operands[0],
				gen_lowpart (GET_MODE (operands[0]), target));
	      return;
	    }

	  t4 = gen_reg_rtx (V32QImode);
	  /* Similarly to the above one_operand_shuffle code,
	     just for repeated twice for each operand.  merge_two:
	     code will merge the two results together.  */
	  emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
					    gen_lowpart (V32QImode, t6)));
	  emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
	  emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
	  t7 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  t8 = gen_reg_rtx (V4DImode);
	  emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
					  const2_rtx, GEN_INT (3),
					  const0_rtx, const1_rtx));
	  emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
	  emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
	  t1 = t4;
	  t2 = t3;
	  goto merge_two;

	default:
	  gcc_assert (GET_MODE_SIZE (mode) <= 16);
	  break;
	}
    }

  if (TARGET_XOP)
    {
      /* The XOP VPPERM insn supports three inputs.  By ignoring the
	 one_operand_shuffle special case, we avoid creating another
	 set of constant vectors in memory.  */
      one_operand_shuffle = false;

      /* mask = mask & {2*w-1, ...} */
      vt = GEN_INT (2*w - 1);
    }
  else
    {
      /* mask = mask & {w-1, ...} */
      vt = GEN_INT (w - 1);
    }

  vt = gen_const_vec_duplicate (maskmode, vt);
  mask = expand_simple_binop (maskmode, AND, mask, vt,
			      NULL_RTX, 0, OPTAB_DIRECT);

  /* For non-QImode operations, convert the word permutation control
     into a byte permutation control.  */
  if (mode != V16QImode)
    {
      /* Scale element indices to byte offsets of the element start.  */
      mask = expand_simple_binop (maskmode, ASHIFT, mask,
				  GEN_INT (exact_log2 (e)),
				  NULL_RTX, 0, OPTAB_DIRECT);

      /* Convert mask to vector of chars.  */
      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));

      /* Replicate each of the input bytes into byte positions:
	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i/e * e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      if (TARGET_XOP)
	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
      else
	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));

      /* Convert it into the byte positions by doing
	 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
      for (i = 0; i < 16; ++i)
	vec[i] = GEN_INT (i % e);
      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
      vt = validize_mem (force_const_mem (V16QImode, vt));
      emit_insn (gen_addv16qi3 (mask, mask, vt));
    }

  /* The actual shuffle operations all operate on V16QImode.  */
  op0 = gen_lowpart (V16QImode, op0);
  op1 = gen_lowpart (V16QImode, op1);

  if (TARGET_XOP)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_xop_pperm (target, op0, op1, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else if (one_operand_shuffle)
    {
      if (GET_MODE (target) != V16QImode)
	target = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
  else
    {
      rtx xops[6];
      bool ok;

      /* Shuffle the two input vectors independently.  */
      t1 = gen_reg_rtx (V16QImode);
      t2 = gen_reg_rtx (V16QImode);
      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));

 merge_two:
      /* Then merge them together.  The key is whether any given control
	 element contained a bit set that indicates the second word.
	 Note: MASK is re-read from operands[3] because the AVX2 paths
	 above stored their rewritten control vector there.  */
      mask = operands[3];
      vt = GEN_INT (w);
      if (maskmode == V2DImode && !TARGET_SSE4_1)
	{
	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
	     more shuffle to convert the V2DI input mask into a V4SI
	     input mask.  At which point the masking that expand_int_vcond
	     will work as desired.  */
	  rtx t3 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
					const0_rtx, const0_rtx,
					const2_rtx, const2_rtx));
	  mask = t3;
	  maskmode = V4SImode;
	  e = w = 4;
	}

      /* Isolate the "second operand" bit (value w) of each control
	 element, then select elementwise with an EQ-based vcond.  */
      vt = gen_const_vec_duplicate (maskmode, vt);
      vt = force_reg (maskmode, vt);
      mask = expand_simple_binop (maskmode, AND, mask, vt,
				  NULL_RTX, 0, OPTAB_DIRECT);

      if (GET_MODE (target) != mode)
	target = gen_reg_rtx (mode);
      xops[0] = target;
      xops[1] = gen_lowpart (mode, t2);
      xops[2] = gen_lowpart (mode, t1);
      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
      xops[4] = mask;
      xops[5] = vt;
      ok = ix86_expand_int_vcond (xops);
      gcc_assert (ok);
      if (target != operands[0])
	emit_move_insn (operands[0],
			gen_lowpart (GET_MODE (operands[0]), target));
    }
}
5368
5369 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
5370 true if we should do zero extension, else sign extension. HIGH_P is
5371 true if we want the N/2 high elements, else the low elements. */
5372
5373 void
5374 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
5375 {
5376 machine_mode imode = GET_MODE (src);
5377 rtx tmp;
5378
5379 if (TARGET_SSE4_1)
5380 {
5381 rtx (*unpack)(rtx, rtx);
5382 rtx (*extract)(rtx, rtx) = NULL;
5383 machine_mode halfmode = BLKmode;
5384
5385 switch (imode)
5386 {
5387 case E_V64QImode:
5388 if (unsigned_p)
5389 unpack = gen_avx512bw_zero_extendv32qiv32hi2;
5390 else
5391 unpack = gen_avx512bw_sign_extendv32qiv32hi2;
5392 halfmode = V32QImode;
5393 extract
5394 = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
5395 break;
5396 case E_V32QImode:
5397 if (unsigned_p)
5398 unpack = gen_avx2_zero_extendv16qiv16hi2;
5399 else
5400 unpack = gen_avx2_sign_extendv16qiv16hi2;
5401 halfmode = V16QImode;
5402 extract
5403 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
5404 break;
5405 case E_V32HImode:
5406 if (unsigned_p)
5407 unpack = gen_avx512f_zero_extendv16hiv16si2;
5408 else
5409 unpack = gen_avx512f_sign_extendv16hiv16si2;
5410 halfmode = V16HImode;
5411 extract
5412 = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
5413 break;
5414 case E_V16HImode:
5415 if (unsigned_p)
5416 unpack = gen_avx2_zero_extendv8hiv8si2;
5417 else
5418 unpack = gen_avx2_sign_extendv8hiv8si2;
5419 halfmode = V8HImode;
5420 extract
5421 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
5422 break;
5423 case E_V16SImode:
5424 if (unsigned_p)
5425 unpack = gen_avx512f_zero_extendv8siv8di2;
5426 else
5427 unpack = gen_avx512f_sign_extendv8siv8di2;
5428 halfmode = V8SImode;
5429 extract
5430 = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
5431 break;
5432 case E_V8SImode:
5433 if (unsigned_p)
5434 unpack = gen_avx2_zero_extendv4siv4di2;
5435 else
5436 unpack = gen_avx2_sign_extendv4siv4di2;
5437 halfmode = V4SImode;
5438 extract
5439 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
5440 break;
5441 case E_V16QImode:
5442 if (unsigned_p)
5443 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
5444 else
5445 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
5446 break;
5447 case E_V8HImode:
5448 if (unsigned_p)
5449 unpack = gen_sse4_1_zero_extendv4hiv4si2;
5450 else
5451 unpack = gen_sse4_1_sign_extendv4hiv4si2;
5452 break;
5453 case E_V4SImode:
5454 if (unsigned_p)
5455 unpack = gen_sse4_1_zero_extendv2siv2di2;
5456 else
5457 unpack = gen_sse4_1_sign_extendv2siv2di2;
5458 break;
5459 case E_V8QImode:
5460 if (unsigned_p)
5461 unpack = gen_sse4_1_zero_extendv4qiv4hi2;
5462 else
5463 unpack = gen_sse4_1_sign_extendv4qiv4hi2;
5464 break;
5465 case E_V4HImode:
5466 if (unsigned_p)
5467 unpack = gen_sse4_1_zero_extendv2hiv2si2;
5468 else
5469 unpack = gen_sse4_1_sign_extendv2hiv2si2;
5470 break;
5471 case E_V4QImode:
5472 if (unsigned_p)
5473 unpack = gen_sse4_1_zero_extendv2qiv2hi2;
5474 else
5475 unpack = gen_sse4_1_sign_extendv2qiv2hi2;
5476 break;
5477 default:
5478 gcc_unreachable ();
5479 }
5480
5481 if (GET_MODE_SIZE (imode) >= 32)
5482 {
5483 tmp = gen_reg_rtx (halfmode);
5484 emit_insn (extract (tmp, src));
5485 }
5486 else if (high_p)
5487 {
5488 switch (GET_MODE_SIZE (imode))
5489 {
5490 case 16:
5491 /* Shift higher 8 bytes to lower 8 bytes. */
5492 tmp = gen_reg_rtx (V1TImode);
5493 emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
5494 GEN_INT (64)));
5495 break;
5496 case 8:
5497 /* Shift higher 4 bytes to lower 4 bytes. */
5498 tmp = gen_reg_rtx (V1DImode);
5499 emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
5500 GEN_INT (32)));
5501 break;
5502 case 4:
5503 /* Shift higher 2 bytes to lower 2 bytes. */
5504 tmp = gen_reg_rtx (V1SImode);
5505 emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
5506 GEN_INT (16)));
5507 break;
5508 default:
5509 gcc_unreachable ();
5510 }
5511
5512 tmp = gen_lowpart (imode, tmp);
5513 }
5514 else
5515 tmp = src;
5516
5517 emit_insn (unpack (dest, tmp));
5518 }
5519 else
5520 {
5521 rtx (*unpack)(rtx, rtx, rtx);
5522
5523 switch (imode)
5524 {
5525 case E_V16QImode:
5526 if (high_p)
5527 unpack = gen_vec_interleave_highv16qi;
5528 else
5529 unpack = gen_vec_interleave_lowv16qi;
5530 break;
5531 case E_V8HImode:
5532 if (high_p)
5533 unpack = gen_vec_interleave_highv8hi;
5534 else
5535 unpack = gen_vec_interleave_lowv8hi;
5536 break;
5537 case E_V4SImode:
5538 if (high_p)
5539 unpack = gen_vec_interleave_highv4si;
5540 else
5541 unpack = gen_vec_interleave_lowv4si;
5542 break;
5543 case E_V8QImode:
5544 if (high_p)
5545 unpack = gen_mmx_punpckhbw;
5546 else
5547 unpack = gen_mmx_punpcklbw;
5548 break;
5549 case E_V4HImode:
5550 if (high_p)
5551 unpack = gen_mmx_punpckhwd;
5552 else
5553 unpack = gen_mmx_punpcklwd;
5554 break;
5555 case E_V4QImode:
5556 if (high_p)
5557 unpack = gen_mmx_punpckhbw_low;
5558 else
5559 unpack = gen_mmx_punpcklbw_low;
5560 break;
5561 default:
5562 gcc_unreachable ();
5563 }
5564
5565 if (unsigned_p)
5566 tmp = force_reg (imode, CONST0_RTX (imode));
5567 else
5568 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5569 src, pc_rtx, pc_rtx);
5570
5571 rtx tmp2 = gen_reg_rtx (imode);
5572 emit_insn (unpack (tmp2, src, tmp));
5573 emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
5574 }
5575 }
5576
5577 /* Return true if mem is pool constant which contains a const_vector
5578 perm index, assign the index to PERM. */
5579 bool
5580 ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
5581 {
5582 machine_mode mode = GET_MODE (mem);
5583 int nelt = GET_MODE_NUNITS (mode);
5584
5585 if (!INTEGRAL_MODE_P (mode))
5586 return false;
5587
5588 /* Needs to be constant pool. */
5589 if (!(MEM_P (mem))
5590 || !SYMBOL_REF_P (XEXP (mem, 0))
5591 || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
5592 return false;
5593
5594 rtx constant = get_pool_constant (XEXP (mem, 0));
5595
5596 if (GET_CODE (constant) != CONST_VECTOR)
5597 return false;
5598
5599 /* There could be some rtx like
5600 (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
5601 but with "*.LC1" refer to V2DI constant vector. */
5602 if (GET_MODE (constant) != mode)
5603 {
5604 constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
5605
5606 if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
5607 return false;
5608 }
5609
5610 for (int i = 0; i != nelt; i++)
5611 perm[i] = UINTVAL (XVECEXP (constant, 0, i));
5612
5613 return true;
5614 }
5615
/* Split OPERAND of mode MODE into word-sized parts stored in PARTS and
   return the number of parts (2..4).  Similar to split_double_mode,
   but works for floating point values and nonoffsettable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally four parts are generated (ia32
   TFmode).  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  /* Part count: 32-bit chunks on ia32 (XFmode is always 3 chunks
     regardless of its 12- or 16-byte padded size), 64-bit chunks on
     64-bit targets (the "+ 4" rounds 80-bit XFmode up to 2 parts).  */
  if (!TARGET_64BIT)
    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  /* MMX registers cannot be split into parts.  */
  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsetable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      /* All parts alias the same word-mode push rtx; the emitted
	 pushes themselves step the stack pointer in the right order.  */
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (GET_CODE (operand) == CONST_VECTOR)
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
	 the operand may actually have a different mode now.  That's
	 ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
	{
	  int i;

	  if (REG_P (operand))
	    {
	      /* Post-reload: consecutive SImode hard registers.  */
	      gcc_assert (reload_completed);
	      for (i = 0; i < size; i++)
		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      /* Consecutive SImode slices of the memory operand.  */
	      operand = adjust_address (operand, SImode, 0);
	      parts[0] = operand;
	      for (i = 1; i < size; i++)
		parts[i] = adjust_address (operand, SImode, 4 * i);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      const REAL_VALUE_TYPE *r;
	      long l[4];

	      r = CONST_DOUBLE_REAL_VALUE (operand);
	      switch (mode)
		{
		case E_TFmode:
		  real_to_target (l, r, mode);
		  parts[3] = gen_int_mode (l[3], SImode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_XFmode:
		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
		     long double may not be 80-bit.  */
		  real_to_target (l, r, mode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_DFmode:
		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
		  break;
		default:
		  gcc_unreachable ();
		}
	      /* The low two 32-bit pieces are common to all modes.  */
	      parts[1] = gen_int_mode (l[1], SImode);
	      parts[0] = gen_int_mode (l[0], SImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }
  else
    {
      if (mode == TImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
	{
	  /* XFmode's upper part holds only 16 valid bits, so SImode
	     suffices; TFmode needs a full DImode upper part.  */
	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, DImode, 0);
	      parts[0] = operand;
	      parts[1] = adjust_address (operand, upper_mode, 8);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      long l[4];

	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

	      /* real_to_target puts 32-bit pieces in each long.  */
	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
					  << 32), DImode);

	      if (upper_mode == SImode)
	        parts[1] = gen_int_mode (l[2], SImode);
	      else
	        parts[1]
		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
				     << 32), DImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  return size;
}
5761
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   The word-sized destination parts are stored into operands 2 onward
   and the matching source parts into operands 6 onward, in the order
   the component moves must be emitted.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];		/* part[0] = dest parts, part[1] = src parts.  */
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
	 fp moves, that force all constants to memory to allow combining.  */

      if (MEM_P (operands[1])
	  && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
	operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
	{
	  operands[0] = copy_rtx (operands[0]);
	  PUT_MODE (operands[0], word_mode);
	}
      else
	operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
		|| offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
	src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
	 automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
	part[1][i] = change_address (part[1][i],
				     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      /* Count destination parts mentioned in the source address.  */
      for (i = 0; i < nparts; i++)
	{
	  collisionparts[i]
	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
	  if (collisionparts[i])
	    collisions++;
	}

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts [1])
	{
	  std::swap (part[0][1], part[0][2]);
	  std::swap (part[1][1], part[1][2]);
	}
      else if (collisions == 1
	       && nparts == 4
	       && (collisionparts [1] || collisionparts [2]))
	{
	  if (collisionparts [1])
	    {
	      std::swap (part[0][1], part[0][2]);
	      std::swap (part[1][1], part[1][2]);
	    }
	  else
	    {
	      std::swap (part[0][2], part[0][3]);
	      std::swap (part[1][2], part[1][3]);
	    }
	}

      /* If there are more collisions, we can't handle it by reordering.
	 Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
	{
	  rtx base, addr;

	  collisions = 1;

	  base = part[0][nparts - 1];

	  /* Handle the case when the last part isn't valid for lea.
	     Happens in 64-bit mode storing the 12-byte XFmode.  */
	  if (GET_MODE (base) != Pmode)
	    base = gen_rtx_REG (Pmode, REGNO (base));

	  addr = XEXP (part[1][0], 0);
	  if (TARGET_TLS_DIRECT_SEG_REFS)
	    {
	      struct ix86_address parts;
	      int ok = ix86_decompose_address (addr, &parts);
	      gcc_assert (ok);
	      /* It is not valid to use %gs: or %fs: in lea.  */
	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
	    }
	  /* lea the source address into BASE and re-base all source
	     parts on it at consecutive word offsets.  */
	  emit_insn (gen_rtx_SET (base, addr));
	  part[1][0] = replace_equiv_address (part[1][0], base);
	  for (i = 1; i < nparts; i++)
	    {
	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
	      part[1][i] = replace_equiv_address (part[1][i], tmp);
	    }
	}
    }

  if (push)
    {
      if (!TARGET_64BIT)
	{
	  /* Push the parts beyond the first two from high to low; the
	     two shared trailing pushes below finish the job.  */
	  if (nparts == 3)
	    {
	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
		emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	  else if (nparts == 4)
	    {
	      emit_move_insn (part[0][3], part[1][3]);
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	}
      else
	{
	  /* In 64bit mode we don't have 32bit push available.  In case this is
	     register, it is OK - we will just use larger counterpart.  We also
	     retype memory - these comes from attempt to avoid REX prefix on
	     moving of second half of TFmode value.  */
	  if (GET_MODE (part[1][1]) == SImode)
	    {
	      switch (GET_CODE (part[1][1]))
		{
		case MEM:
		  part[1][1] = adjust_address (part[1][1], DImode, 0);
		  break;

		case REG:
		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
		  break;

		default:
		  gcc_unreachable ();
		}

	      if (GET_MODE (part[1][0]) == SImode)
		part[1][0] = part[1][1];
	    }
	}
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
	   || (nparts == 3
	       && REGNO (part[0][0]) == REGNO (part[1][2]))
	   || (nparts == 4
	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      /* Copy from the last part downwards.  */
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
	{
	  operands[2 + i] = part[0][j];
	  operands[6 + i] = part[1][j];
	}
    }
  else
    {
      for (i = 0; i < nparts; i++)
	{
	  operands[2 + i] = part[0][i];
	  operands[6 + i] = part[1][i];
	}
    }

  /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
  if (optimize_insn_for_size_p ())
    {
      for (j = 0; j < nparts - 1; j++)
	if (CONST_INT_P (operands[6 + j])
	    && operands[6 + j] != const0_rtx
	    && REG_P (operands[2 + j]))
	  for (i = j; i < nparts - 1; i++)
	    if (CONST_INT_P (operands[7 + i])
		&& INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
	      operands[7 + i] = operands[2 + j];
    }

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);

  return;
}
5990
5991 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
5992 left shift by a constant, either using a single shift or
5993 a sequence of add instructions. */
5994
5995 static void
5996 ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
5997 {
5998 if (count == 1
5999 || (count * ix86_cost->add <= ix86_cost->shift_const
6000 && !optimize_insn_for_size_p ()))
6001 {
6002 while (count-- > 0)
6003 emit_insn (gen_add2_insn (operand, operand));
6004 }
6005 else
6006 {
6007 rtx (*insn)(rtx, rtx, rtx);
6008
6009 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6010 emit_insn (insn (operand, operand, GEN_INT (count)));
6011 }
6012 }
6013
/* Split a double-word left shift OPERANDS[0] = OPERANDS[1] << OPERANDS[2]
   into operations on the two half-word parts.  MODE is the double-word
   mode (DImode on ia32, TImode on 64-bit).  SCRATCH, if non-NULL, is a
   spare half-word register enabling a branch-free cmove fixup for the
   variable-count case.  */

void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      /* Mask the count the way the hardware would.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* Shift by at least a half-word: the low source part becomes
	     the high result, the low result is zero.  */
	  emit_move_insn (high[0], low[1]);
	  emit_move_insn (low[0], const0_rtx);

	  if (count > half_width)
	    ix86_expand_ashl_const (high[0], count - half_width, mode);
	}
      else
	{
	  /* Small constant count: shld feeds the high part with bits
	     shifted out of the low part, then the low part is shifted.  */
	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
	  ix86_expand_ashl_const (low[0], count, mode);
	}
      return;
    }

  /* Variable shift count from here on.  */
  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode capable registers, then 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
	{
	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

	  /* Test the half-width bit of N: setcc puts the single 1 bit
	     into the low part when it is clear, the high part when
	     set; the final shifts then move it into position.  */
	  ix86_expand_clear (low[0]);
	  ix86_expand_clear (high[0]);
	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

	  d = gen_lowpart (QImode, low[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));

	  d = gen_lowpart (QImode, high[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_NE (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));
	}

      /* Otherwise, we can get the same results by manually performing
	 a bit extract operation on bit 5/6, and then performing the two
	 shifts.  The two methods of getting 0/1 into low/high are exactly
	 the same size.  Avoiding the shift in the bit extract case helps
	 pentium4 a bit; no one else seems to care much either way.  */
      else
	{
	  rtx (*gen_lshr3)(rtx, rtx, rtx);
	  rtx (*gen_and3)(rtx, rtx, rtx);
	  rtx (*gen_xor3)(rtx, rtx, rtx);
	  HOST_WIDE_INT bits;
	  rtx x;

	  if (mode == DImode)
	    {
	      gen_lshr3 = gen_lshrsi3;
	      gen_and3 = gen_andsi3;
	      gen_xor3 = gen_xorsi3;
	      bits = 5;
	    }
	  else
	    {
	      gen_lshr3 = gen_lshrdi3;
	      gen_and3 = gen_anddi3;
	      gen_xor3 = gen_xordi3;
	      bits = 6;
	    }

	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
	  else
	    x = gen_lowpart (half_mode, operands[2]);
	  emit_insn (gen_rtx_SET (high[0], x));

	  /* high = (N >> bits) & 1; low = high ^ 1.  */
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
	  emit_move_insn (low[0], high[0]);
	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
	}

      /* Both half-word shifts see the hardware-masked count, so the
	 single surviving 1 bit ends up in the right position.  */
      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
	 know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
	emit_move_insn (high[0], low[0]);
      else
	emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  /* Fix up the result when the masked count is >= half_width; the
     half-word shifts above only honored the low bits of the count.  */
  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
		 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
6156
/* Split a double-word arithmetic right shift
   OPERANDS[0] = OPERANDS[1] >> OPERANDS[2] into operations on the two
   half-word parts.  MODE is the double-word mode (DImode on ia32,
   TImode on 64-bit).  SCRATCH, if non-NULL, enables a branch-free
   cmove fixup for the variable-count case.  */

void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      /* Mask the count the way the hardware would.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
	{
	  /* Shift by width-1: both result parts are the sign mask.  */
	  emit_move_insn (high[0], high[1]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);

	}
      else if (count >= half_width)
	{
	  /* The high source part becomes the low result; the high
	     result is the sign extension of the source.  */
	  emit_move_insn (low[0], high[1]);
	  emit_move_insn (high[0], low[0]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));

	  if (count > half_width)
	    emit_insn (gen_ashr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  /* Small constant count: shrd feeds the low part with bits
	     shifted out of the high part, then the high part is
	     shifted arithmetically.  */
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable shift count.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      /* Fix up when the masked count is >= half_width; SCRATCH holds
	 the sign extension for the cmove variant.  */
      if (TARGET_CMOVE && scratch)
	{
	  emit_move_insn (scratch, high[0]);
	  emit_insn (gen_ashr3 (scratch, scratch,
				GEN_INT (half_width - 1)));
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_3
		   (half_mode, low[0], high[0], operands[2]));
    }
}
6231
/* Split a double-word logical right shift
   OPERANDS[0] = OPERANDS[1] >> OPERANDS[2] into operations on the two
   half-word parts.  MODE is the double-word mode (DImode on ia32,
   TImode on 64-bit).  SCRATCH, if non-NULL, enables a branch-free
   cmove fixup for the variable-count case.  */

void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      /* Mask the count the way the hardware would.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* The high source part becomes the low result; the high
	     result is zero.  */
	  emit_move_insn (low[0], high[1]);
	  ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_lshr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else
	{
	  /* Small constant count: shrd feeds the low part with bits
	     shifted out of the high part, then the high part is
	     shifted logically.  */
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable shift count.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      /* Fix up when the masked count is >= half_width.  */
      if (TARGET_CMOVE && scratch)
	{
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_2
		   (half_mode, low[0], high[0], operands[2]));
    }
}
6294
6295 /* Expand move of V1TI mode register X to a new TI mode register. */
6296 static rtx
6297 ix86_expand_v1ti_to_ti (rtx x)
6298 {
6299 rtx result = gen_reg_rtx (TImode);
6300 if (TARGET_SSE2)
6301 {
6302 rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
6303 rtx lo = gen_lowpart (DImode, result);
6304 emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
6305 rtx hi = gen_highpart (DImode, result);
6306 emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
6307 }
6308 else
6309 emit_move_insn (result, gen_lowpart (TImode, x));
6310 return result;
6311 }
6312
6313 /* Expand move of TI mode register X to a new V1TI mode register. */
6314 static rtx
6315 ix86_expand_ti_to_v1ti (rtx x)
6316 {
6317 if (TARGET_SSE2)
6318 {
6319 rtx lo = gen_lowpart (DImode, x);
6320 rtx hi = gen_highpart (DImode, x);
6321 rtx tmp = gen_reg_rtx (V2DImode);
6322 emit_insn (gen_vec_concatv2di (tmp, lo, hi));
6323 return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
6324 }
6325
6326 return force_reg (V1TImode, gen_lowpart (V1TImode, x));
6327 }
6328
/* Expand V1TI mode shift (of rtx_code CODE, ASHIFT or LSHIFTRT) by
   constant; a non-constant count falls back to TImode arithmetic.  */
void
ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable count: bounce through TImode integer registers.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*shift) (rtx, rtx, rtx)
	= (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
      emit_insn (shift (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Reduce the count mod 128, as the TImode shifter would.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Whole-byte shifts map to a single pslldq/psrldq.  */
      rtx tmp = gen_reg_rtx (V1TImode);
      if (code == ASHIFT)
	emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
      else
	emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
      emit_move_insn (operands[0], tmp);
      return;
    }

  /* General case: combine a 64-bit byte shift with per-lane V2DImode
     bit shifts.  */
  rtx tmp1 = gen_reg_rtx (V1TImode);
  if (code == ASHIFT)
    emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
  else
    emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

  /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
  rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));

  /* tmp3 will be the V2DImode result.  */
  rtx tmp3 = gen_reg_rtx (V2DImode);

  if (bits > 64)
    {
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
      else
	emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    }
  else
    {
      /* tmp4 is operands[1], in V2DImode.  */
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));

      rtx tmp5 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
      else
	emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      /* tmp6 recovers the bits that cross the 64-bit lane boundary,
	 shifted the opposite direction out of the 64-byte-shifted copy.  */
      rtx tmp6 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
      else
	emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));

      emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
    }

  /* Convert the result back to V1TImode and store in operands[0].  */
  rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
  emit_move_insn (operands[0], tmp7);
}
6409
/* Expand V1TI mode rotate (of rtx_code CODE, ROTATE or ROTATERT) by
   constant; a non-constant count falls back to TImode arithmetic.  */
void
ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable count: bounce through TImode integer registers.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*rotate) (rtx, rtx, rtx)
	= (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
      emit_insn (rotate (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Reduce the count mod 128.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  /* Canonicalize to a left rotate.  */
  if (code == ROTATERT)
    bits = 128 - bits;

  if ((bits & 31) == 0)
    {
      /* Whole-dword rotates are a single pshufd lane permutation;
	 0x93, 0x4e and 0x39 rotate the four lanes by 1, 2 and 3
	 positions respectively.  */
      rtx tmp2 = gen_reg_rtx (V4SImode);
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      if (bits == 32)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
      else if (bits == 64)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
      else
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Whole-byte rotates: OR together opposing pslldq/psrldq.  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      rtx tmp2 = gen_reg_rtx (V1TImode);
      rtx tmp3 = gen_reg_rtx (V1TImode);

      emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
      emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
      emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* General case: shift each 32-bit lane by (bits & 31) and pull the
     lane-crossing bits from a pshufd-pre-rotated copy.  */
  rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));

  rtx lobits;
  rtx hibits;

  /* Select the two lane-rotated inputs depending on which 32-bit
     "quadrant" the count falls into.  */
  switch (bits >> 5)
    {
    case 0:
      lobits = op1_v4si;
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
      break;

    case 1:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
      break;

    case 2:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
      break;

    default:
      lobits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
      hibits = op1_v4si;
      break;
    }

  rtx tmp1 = gen_reg_rtx (V4SImode);
  rtx tmp2 = gen_reg_rtx (V4SImode);
  rtx tmp3 = gen_reg_rtx (V4SImode);

  emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
  emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
  emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));

  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
}
6510
6511 /* Expand V1TI mode ashiftrt by constant. */
6512 void
6513 ix86_expand_v1ti_ashiftrt (rtx operands[])
6514 {
6515 rtx op1 = force_reg (V1TImode, operands[1]);
6516
6517 if (!CONST_INT_P (operands[2]))
6518 {
6519 rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
6520 rtx tmp2 = gen_reg_rtx (TImode);
6521 emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
6522 rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
6523 emit_move_insn (operands[0], tmp3);
6524 return;
6525 }
6526
6527 HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
6528
6529 if (bits == 0)
6530 {
6531 emit_move_insn (operands[0], op1);
6532 return;
6533 }
6534
6535 if (bits == 127)
6536 {
6537 /* Two operations. */
6538 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6539 rtx tmp2 = gen_reg_rtx (V4SImode);
6540 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6541
6542 rtx tmp3 = gen_reg_rtx (V4SImode);
6543 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6544
6545 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
6546 return;
6547 }
6548
6549 if (bits == 64)
6550 {
6551 /* Three operations. */
6552 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6553 rtx tmp2 = gen_reg_rtx (V4SImode);
6554 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6555
6556 rtx tmp3 = gen_reg_rtx (V4SImode);
6557 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6558
6559 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6560 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6561 rtx tmp6 = gen_reg_rtx (V2DImode);
6562 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6563
6564 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6565 return;
6566 }
6567
6568 if (bits == 96)
6569 {
6570 /* Three operations. */
6571 rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
6572 rtx tmp2 = gen_reg_rtx (V4SImode);
6573 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6574
6575 rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6576 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6577 rtx tmp5 = gen_reg_rtx (V2DImode);
6578 emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
6579
6580 rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
6581 rtx tmp7 = gen_reg_rtx (V4SImode);
6582 emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
6583
6584 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6585 return;
6586 }
6587
6588 if (bits >= 111)
6589 {
6590 /* Three operations. */
6591 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6592 rtx tmp2 = gen_reg_rtx (V4SImode);
6593 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6594
6595 rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6596 rtx tmp4 = gen_reg_rtx (V8HImode);
6597 emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
6598
6599 rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
6600 rtx tmp6 = gen_reg_rtx (V4SImode);
6601 emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
6602
6603 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6604 return;
6605 }
6606
6607 if (TARGET_AVX2 || TARGET_SSE4_1)
6608 {
6609 /* Three operations. */
6610 if (bits == 32)
6611 {
6612 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6613 rtx tmp2 = gen_reg_rtx (V4SImode);
6614 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
6615
6616 rtx tmp3 = gen_reg_rtx (V1TImode);
6617 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
6618
6619 if (TARGET_AVX2)
6620 {
6621 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6622 rtx tmp5 = gen_reg_rtx (V4SImode);
6623 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6624 GEN_INT (7)));
6625
6626 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6627 }
6628 else
6629 {
6630 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6631 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6632 rtx tmp6 = gen_reg_rtx (V8HImode);
6633 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6634 GEN_INT (0x3f)));
6635
6636 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6637 }
6638 return;
6639 }
6640
6641 /* Three operations. */
6642 if (bits == 8 || bits == 16 || bits == 24)
6643 {
6644 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6645 rtx tmp2 = gen_reg_rtx (V4SImode);
6646 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6647
6648 rtx tmp3 = gen_reg_rtx (V1TImode);
6649 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
6650
6651 if (TARGET_AVX2)
6652 {
6653 rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
6654 rtx tmp5 = gen_reg_rtx (V4SImode);
6655 emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
6656 GEN_INT (7)));
6657
6658 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
6659 }
6660 else
6661 {
6662 rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6663 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6664 rtx tmp6 = gen_reg_rtx (V8HImode);
6665 emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
6666 GEN_INT (0x3f)));
6667
6668 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
6669 }
6670 return;
6671 }
6672 }
6673
6674 if (bits > 96)
6675 {
6676 /* Four operations. */
6677 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6678 rtx tmp2 = gen_reg_rtx (V4SImode);
6679 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
6680
6681 rtx tmp3 = gen_reg_rtx (V4SImode);
6682 emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
6683
6684 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
6685 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6686 rtx tmp6 = gen_reg_rtx (V2DImode);
6687 emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
6688
6689 rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
6690 rtx tmp8 = gen_reg_rtx (V4SImode);
6691 emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
6692
6693 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
6694 return;
6695 }
6696
6697 if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
6698 {
6699 /* Four operations. */
6700 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6701 rtx tmp2 = gen_reg_rtx (V4SImode);
6702 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6703
6704 rtx tmp3 = gen_reg_rtx (V4SImode);
6705 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6706
6707 rtx tmp4 = gen_reg_rtx (V1TImode);
6708 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6709
6710 rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
6711 rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
6712 rtx tmp7 = gen_reg_rtx (V8HImode);
6713 emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
6714 GEN_INT (bits == 48 ? 0x1f : 0x07)));
6715
6716 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
6717 return;
6718 }
6719
6720 if ((bits & 7) == 0)
6721 {
6722 /* Five operations. */
6723 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6724 rtx tmp2 = gen_reg_rtx (V4SImode);
6725 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6726
6727 rtx tmp3 = gen_reg_rtx (V4SImode);
6728 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6729
6730 rtx tmp4 = gen_reg_rtx (V1TImode);
6731 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
6732
6733 rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6734 rtx tmp6 = gen_reg_rtx (V1TImode);
6735 emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
6736
6737 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6738 rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
6739 rtx tmp9 = gen_reg_rtx (V2DImode);
6740 emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
6741
6742 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
6743 return;
6744 }
6745
6746 if (TARGET_AVX2 && bits < 32)
6747 {
6748 /* Six operations. */
6749 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6750 rtx tmp2 = gen_reg_rtx (V4SImode);
6751 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6752
6753 rtx tmp3 = gen_reg_rtx (V1TImode);
6754 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6755
6756 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6757 rtx tmp5 = gen_reg_rtx (V2DImode);
6758 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6759
6760 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6761 rtx tmp7 = gen_reg_rtx (V2DImode);
6762 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6763
6764 rtx tmp8 = gen_reg_rtx (V2DImode);
6765 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6766
6767 rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
6768 rtx tmp10 = gen_reg_rtx (V4SImode);
6769 emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
6770
6771 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
6772 return;
6773 }
6774
6775 if (TARGET_SSE4_1 && bits < 15)
6776 {
6777 /* Six operations. */
6778 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6779 rtx tmp2 = gen_reg_rtx (V4SImode);
6780 emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
6781
6782 rtx tmp3 = gen_reg_rtx (V1TImode);
6783 emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
6784
6785 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6786 rtx tmp5 = gen_reg_rtx (V2DImode);
6787 emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
6788
6789 rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6790 rtx tmp7 = gen_reg_rtx (V2DImode);
6791 emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
6792
6793 rtx tmp8 = gen_reg_rtx (V2DImode);
6794 emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
6795
6796 rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
6797 rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
6798 rtx tmp11 = gen_reg_rtx (V8HImode);
6799 emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
6800
6801 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
6802 return;
6803 }
6804
6805 if (bits == 1)
6806 {
6807 /* Eight operations. */
6808 rtx tmp1 = gen_reg_rtx (V1TImode);
6809 emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
6810
6811 rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6812 rtx tmp3 = gen_reg_rtx (V2DImode);
6813 emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
6814
6815 rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
6816 rtx tmp5 = gen_reg_rtx (V2DImode);
6817 emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
6818
6819 rtx tmp6 = gen_reg_rtx (V2DImode);
6820 emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
6821
6822 rtx tmp7 = gen_reg_rtx (V2DImode);
6823 emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
6824
6825 rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
6826 rtx tmp9 = gen_reg_rtx (V4SImode);
6827 emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
6828
6829 rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
6830 rtx tmp11 = gen_reg_rtx (V2DImode);
6831 emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
6832
6833 rtx tmp12 = gen_reg_rtx (V2DImode);
6834 emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
6835
6836 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
6837 return;
6838 }
6839
6840 if (bits > 64)
6841 {
6842 /* Eight operations. */
6843 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6844 rtx tmp2 = gen_reg_rtx (V4SImode);
6845 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6846
6847 rtx tmp3 = gen_reg_rtx (V4SImode);
6848 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6849
6850 rtx tmp4 = gen_reg_rtx (V1TImode);
6851 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6852
6853 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6854 rtx tmp6 = gen_reg_rtx (V2DImode);
6855 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
6856
6857 rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6858 rtx tmp8 = gen_reg_rtx (V1TImode);
6859 emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
6860
6861 rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
6862 rtx tmp10 = gen_reg_rtx (V2DImode);
6863 emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
6864
6865 rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
6866 rtx tmp12 = gen_reg_rtx (V2DImode);
6867 emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
6868
6869 rtx tmp13 = gen_reg_rtx (V2DImode);
6870 emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
6871
6872 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
6873 }
6874 else
6875 {
6876 /* Nine operations. */
6877 rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
6878 rtx tmp2 = gen_reg_rtx (V4SImode);
6879 emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
6880
6881 rtx tmp3 = gen_reg_rtx (V4SImode);
6882 emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
6883
6884 rtx tmp4 = gen_reg_rtx (V1TImode);
6885 emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
6886
6887 rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
6888 rtx tmp6 = gen_reg_rtx (V2DImode);
6889 emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
6890
6891 rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
6892 rtx tmp8 = gen_reg_rtx (V2DImode);
6893 emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
6894
6895 rtx tmp9 = gen_reg_rtx (V2DImode);
6896 emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
6897
6898 rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
6899 rtx tmp11 = gen_reg_rtx (V1TImode);
6900 emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
6901
6902 rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
6903 rtx tmp13 = gen_reg_rtx (V2DImode);
6904 emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
6905
6906 rtx tmp14 = gen_reg_rtx (V2DImode);
6907 emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
6908
6909 emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
6910 }
6911 }
6912
6913 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
6914 DImode for constant loop counts. */
6915
6916 static machine_mode
6917 counter_mode (rtx count_exp)
6918 {
6919 if (GET_MODE (count_exp) != VOIDmode)
6920 return GET_MODE (count_exp);
6921 if (!CONST_INT_P (count_exp))
6922 return Pmode;
6923 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
6924 return DImode;
6925 return SImode;
6926 }
6927
6928 /* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
6929 to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
6930 specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
6931 memory by VALUE (supposed to be in MODE).
6932
6933 The size is rounded down to whole number of chunk size moved at once.
6934 SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
6935
6936
static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx count, machine_mode mode, int unroll,
			       int expected_size, bool issetmem)
{
  rtx_code_label *out_label, *top_label;
  rtx iter, tmp;
  /* Mode of the loop counter; see counter_mode above.  */
  machine_mode iter_mode = counter_mode (count);
  /* Number of bytes handled per loop iteration.  */
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  /* SIZE = COUNT rounded down to a whole number of pieces.  */
  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
			      NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      /* Skip the loop entirely when there is nothing to do; predicted
	 unlikely.  */
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
			       true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using single temporary.
	 Also using 4 temporaries is overkill in 32bit mode.  */
      /* NOTE(review): this branch is disabled by the "&& 0" below and
	 kept for reference; the load-all/store-all path is always taken.  */
      if (!TARGET_64BIT && 0)
	{
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		{
		  destmem = adjust_address (copy_rtx (destmem), mode,
					    GET_MODE_SIZE (mode));
		  srcmem = adjust_address (copy_rtx (srcmem), mode,
					   GET_MODE_SIZE (mode));
		}
	      emit_move_insn (destmem, srcmem);
	    }
	}
      else
	{
	  rtx tmpreg[4];
	  gcc_assert (unroll <= 4);
	  /* Load all UNROLL pieces into temporaries first, then store
	     them, decoupling the loads from the stores.  */
	  for (i = 0; i < unroll; i++)
	    {
	      tmpreg[i] = gen_reg_rtx (mode);
	      if (i)
		srcmem = adjust_address (copy_rtx (srcmem), mode,
					 GET_MODE_SIZE (mode));
	      emit_move_insn (tmpreg[i], srcmem);
	    }
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		destmem = adjust_address (copy_rtx (destmem), mode,
					  GET_MODE_SIZE (mode));
	      emit_move_insn (destmem, tmpreg[i]);
	    }
	}
    }
  else
    /* Setmem: store VALUE UNROLL times at consecutive offsets.  */
    for (i = 0; i < unroll; i++)
      {
	if (i)
	  destmem = adjust_address (copy_rtx (destmem), mode,
				    GET_MODE_SIZE (mode));
	emit_move_insn (destmem, value);
      }

  /* ITER += PIECE_SIZE; loop back while ITER < SIZE.  */
  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
			   true, top_label);
  if (expected_size != -1)
    {
      /* Derive the backward-branch probability from the expected trip
	 count.  */
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
	predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
	predict_jump (REG_BR_PROB_BASE - 1);
      else
	predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2)
		      / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  /* Finally advance the destination (and source) pointers past the
     processed region so callers can emit an epilogue.  */
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
				 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
	emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
7064
7065 /* Divide COUNTREG by SCALE. */
7066 static rtx
7067 scale_counter (rtx countreg, int scale)
7068 {
7069 rtx sc;
7070
7071 if (scale == 1)
7072 return countreg;
7073 if (CONST_INT_P (countreg))
7074 return GEN_INT (INTVAL (countreg) / scale);
7075 gcc_assert (REG_P (countreg));
7076
7077 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
7078 GEN_INT (exact_log2 (scale)),
7079 NULL, 1, OPTAB_DIRECT);
7080 return sc;
7081 }
7082
7083 /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
7084 When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
7085 When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
7086 For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
7087 ORIG_VALUE is the original value passed to memset to fill the memory with.
7088 Other arguments have same meaning as for previous function. */
7089
static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			      rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			      rtx count,
			      machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  /* Make DESTMEM a BLKmode reference through DESTPTR.  */
  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  /* Scale the byte count down to the number of MODE-sized chunks.  */
  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  /* DESTEXP is the final value of the destination pointer, i.e.
     DESTPTR + COUNTREG * GET_MODE_SIZE (MODE).  */
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  /* Attach the known size (rounded down to whole chunks) to the MEM
     for alias analysis, otherwise drop any stale size info.  */
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      /* rep stos: fill memory with VALUE narrowed to MODE.  */
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      /* rep mov: mirror the destination bookkeeping for the source.  */
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else
	{
	  if (MEM_SIZE_KNOWN_P (srcmem))
	    clear_mem_size (srcmem);
	}
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}
7164
7165 /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
7166 DESTMEM.
7167 SRC is passed by pointer to be updated on return.
7168 Return value is updated DST. */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  /* No suitable vector mode; fall back to word-sized moves.  */
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      /* Advance both pointer registers past the piece just copied.  */
      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
7233
/* Helper function for the string operations below.  Test whether VARIABLE
   is aligned to VALUE bytes.  If so, jump to the label.  */
7236
7237 static rtx_code_label *
7238 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
7239 {
7240 rtx_code_label *label = gen_label_rtx ();
7241 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
7242 if (GET_MODE (variable) == DImode)
7243 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
7244 else
7245 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
7246 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
7247 1, label);
7248 if (epilogue)
7249 predict_jump (REG_BR_PROB_BASE * 50 / 100);
7250 else
7251 predict_jump (REG_BR_PROB_BASE * 90 / 100);
7252 return label;
7253 }
7254
7255
7256 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
7257
static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      /* Constant count: emit straight-line moves, one per power of two
	 set in the residual byte count.  */
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	}
      return;
    }
  if (max_size > 8)
    {
      /* Too many cases for straight-line code; mask the count down and
	 copy the remaining bytes with a byte loop.  */
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
				   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      /* OFFSET tracks how far we have advanced within the epilogue so
	 the base pointers themselves stay unchanged.  */
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  /* NOTE(review): TMP is passed as the add target here while the
	     4-byte case above passes NULL — verify this is intentional.  */
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}
7369
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      /* PROMOTED_VAL is wider than the block to fill; narrow it.  */
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  /* Word-sized or smaller: the strset pattern advances DESTPTR
	     itself.  */
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      /* Wider (vector) store: emit the move and bump DESTPTR manually.  */
      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
7426 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
7427 static void
7428 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
7429 rtx count, int max_size)
7430 {
7431 count = expand_simple_binop (counter_mode (count), AND, count,
7432 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
7433 expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
7434 gen_lowpart (QImode, value), count, QImode,
7435 1, max_size / 2, true);
7436 }
7437
7438 /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  if (CONST_INT_P (count))
    {
      /* Constant count: emit straight-line stores, one per power of two
	 set in the residual byte count.  */
      HOST_WIDE_INT countval = INTVAL (count);
      HOST_WIDE_INT epilogue_size = countval % max_size;
      int i;

      /* For now MAX_SIZE should be a power of 2.  This assert could be
	 relaxed, but it'll require a bit more complicated epilogue
	 expanding.  */
      gcc_assert ((max_size & (max_size - 1)) == 0);
      for (i = max_size; i >= 1; i >>= 1)
	{
	  if (epilogue_size & i)
	    {
	      /* Use the wide vector value when the piece is wider than
		 the scalar VALUE.  */
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	}
      return;
    }
  if (max_size > 32)
    {
      /* Too many cases for straight-line code; use a byte loop.  */
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  /* Variable count: test each power-of-two bit of COUNT and store that
     many bytes when it is set.  */
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
7539
7540 /* Adjust COUNTER by the VALUE. */
7541 static void
7542 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
7543 {
7544 emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
7545 }
7546
7547 /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
7548 DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
7549 Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
7550 ignored.
7551 Return value is updated DESTMEM. */
7552
static rtx
expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx vec_value, rtx count, int align,
			       int desired_alignment, bool issetmem)
{
  int i;
  /* For each power of two between ALIGN and DESIRED_ALIGNMENT, emit a
     conditional block that handles I bytes when DESTPTR is not yet
     aligned beyond I.  */
  for (i = 1; i < desired_alignment; i <<= 1)
    {
      if (align <= i)
	{
	  rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
	  if (issetmem)
	    {
	      /* Prefer the wide vector value when the piece is wider
		 than the scalar VALUE.  */
	      if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
		destmem = emit_memset (destmem, destptr, vec_value, i);
	      else
		destmem = emit_memset (destmem, destptr, value, i);
	    }
	  else
	    destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
	  /* Account for the bytes just handled.  */
	  ix86_adjust_counter (count, i);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	  /* Past this point the destination is known 2*I-byte aligned.  */
	  set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
	}
    }
  return destmem;
}
7582
/* Test if COUNT&SIZE is nonzero and if so, expand cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
	mode = TARGET_AVX ? V32QImode : TARGET_SSE ? V16QImode : DImode;
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  /* First pass: copy/set SIZE bytes from the start of the region.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  /* Second pass: copy/set the last SIZE bytes, ending at DEST+COUNT.
     The two passes may overlap, which is harmless and covers any
     length in SIZE..2*SIZE-1.  */
  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  /* Fall-through label for the COUNT & SIZE == 0 case.  */
  emit_label (label);
  LABEL_NUSES (label) = 1;
}
7661
/* Handle small memcpy (up to SIZE bytes, where SIZE is supposed to be a small
   power of 2) and get ready for the main memcpy loop by copying the initial
   DESIRED_ALIGN-ALIGN bytes and the last SIZE bytes, adjusting
   DESTPTR/SRCPTR/COUNT in a way that lets us proceed with a loop copying SIZE
   bytes at once.  Do moves in MODE.
   DONE_LABEL is a label after the whole copying sequence.  The label is created
   on demand if *DONE_LABEL is NULL.
   MIN_SIZE is minimal size of block copied.  This value gets adjusted for new
   bounds after the initial copies.

   DESTMEM/SRCMEM are memory expressions pointing to the copied block,
   DESTPTR/SRCPTR are pointers to the block.  DYNAMIC_CHECK indicates whether
   we will dispatch to a library call for large blocks.
7674
7675 In pseudocode we do:
7676
7677 if (COUNT < SIZE)
7678 {
7679 Assume that SIZE is 4. Bigger sizes are handled analogously
7680 if (COUNT & 4)
7681 {
7682 copy 4 bytes from SRCPTR to DESTPTR
7683 copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
7684 goto done_label
7685 }
7686 if (!COUNT)
7687 goto done_label;
7688 copy 1 byte from SRCPTR to DESTPTR
7689 if (COUNT & 2)
7690 {
7691 copy 2 bytes from SRCPTR to DESTPTR
7692 copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
7693 }
7694 }
7695 else
7696 {
7697 copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
7698 copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
7699
7700 OLD_DESPTR = DESTPTR;
7701 Align DESTPTR up to DESIRED_ALIGN
7702 SRCPTR += DESTPTR - OLD_DESTPTR
7703 COUNT -= DEST_PTR - OLD_DESTPTR
7704 if (DYNAMIC_CHECK)
7705 Round COUNT down to multiple of SIZE
7706 << optional caller supplied zero size guard is here >>
7707 << optional caller supplied dynamic check is here >>
7708 << caller supplied main copy loop is here >>
7709 }
7710 done_label:
7711 */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose proper value to copy: the vector-promoted one for vector modes,
     the scalar-promoted one otherwise.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  This branch is
     needed only when the runtime count may be smaller than SIZE.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      /* COUNT >= SIZE: skip straight to the main copying code.  */
      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3: one SIZE2..2*SIZE2-1 dispatch per power of two.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3: a single HImode move at COUNT - 2 covers the
	 tail, possibly overlapping the byte copy above.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    /* Small blocks are impossible here; verify our preconditions.  */
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
       emit_label (loop_label);
       LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes, so the main loop may operate on an
     aligned destination (pointers themselves are adjusted below).  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }


  /* Copy last SIZE bytes, so the epilogue is covered even when the main
     loop rounds COUNT down.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       saveddest, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  SAVEDDEST now holds OLD - NEW, i.e. the
	 negated skip amount, hence MINUS for src and PLUS for count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
         library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
7902
7903
/* This function is like expand_set_or_cpymem_prologue, except here we know
   how many bytes need to be copied.  That allows us to update alignment not
   only of DST, which is returned, but also of SRC, which is passed as a
   pointer for that reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					rtx srcreg, rtx value, rtx vec_value,
					int desired_align, int align_bytes,
					bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  /* ALIGN_BYTES decomposed into powers of two tells exactly which piece
     sizes are needed to reach DESIRED_ALIGN.  */
  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      /* Use the vector value only when the piece is wider than the
		 scalar promoted value.  */
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  /* Record the alignment we just established and shrink the known size.  */
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      /* Derive what alignment the source has after skipping ALIGN_BYTES:
	 find the largest power of two on which source and destination
	 offsets agree.  */
      int src_align_bytes = get_mem_align_offset (src, desired_align
						       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		   == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
7977
7978 /* Return true if ALG can be used in current context.
7979 Assume we expand memset if MEMSET is true. */
7980 static bool
7981 alg_usable_p (enum stringop_alg alg, bool memset, bool have_as)
7982 {
7983 if (alg == no_stringop)
7984 return false;
7985 if (alg == vector_loop)
7986 return TARGET_SSE || TARGET_AVX;
7987 /* Algorithms using the rep prefix want at least edi and ecx;
7988 additionally, memset wants eax and memcpy wants esi. Don't
7989 consider such algorithms if the user has appropriated those
7990 registers for their own purposes, or if we have a non-default
7991 address space, since some string insns cannot override the segment. */
7992 if (alg == rep_prefix_1_byte
7993 || alg == rep_prefix_4_byte
7994 || alg == rep_prefix_8_byte)
7995 {
7996 if (have_as)
7997 return false;
7998 if (fixed_regs[CX_REG]
7999 || fixed_regs[DI_REG]
8000 || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG]))
8001 return false;
8002 }
8003 return true;
8004 }
8005
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.
   MIN_SIZE/MAX_SIZE bound the possible block sizes; MEMSET/ZERO_MEMSET
   describe the operation; HAVE_AS is true for a non-default address space.
   On return *DYNAMIC_CHECK is a size threshold for a runtime libcall
   dispatch (or -1) and *NOALIGN tells the caller to skip alignment code.
   RECUR guards against unbounded self-recursion.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, bool have_as,
	    int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
 	  && (max_size < 256
              || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, have_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, have_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      /* Word-sized rep needs a count divisible by 4 and, for memset, a
	 known zero value; otherwise fall back to the byte variant.  */
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, have_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, have_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     setup.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      /* Scan the per-size-range cost table for the first entry covering
	 EXPECTED_SIZE with a usable algorithm.  */
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, have_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, have_as)
		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
			    && candidate == rep_prefix_1_byte
			    /* NB: If min_size != max_size, size is
			       unknown.  */
			    && min_size != max_size))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, have_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      /* Retry with a concrete expected size; RECUR=true bounds the depth.  */
      alg = decide_alg (count, new_expected_size, min_size, max_size, memset,
			zero_memset, have_as, dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }
  /* Fall back to the table's default for unknown sizes.  */
  return (alg_usable_p (algs->unknown_size, memset, have_as)
	  ? algs->unknown_size : libcall);
}
8162
8163 /* Decide on alignment. We know that the operand is already aligned to ALIGN
8164 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
8165 static int
8166 decide_alignment (int align,
8167 enum stringop_alg alg,
8168 int expected_size,
8169 machine_mode move_mode)
8170 {
8171 int desired_align = 0;
8172
8173 gcc_assert (alg != no_stringop);
8174
8175 if (alg == libcall)
8176 return 0;
8177 if (move_mode == VOIDmode)
8178 return 0;
8179
8180 desired_align = GET_MODE_SIZE (move_mode);
8181 /* PentiumPro has special logic triggering for 8 byte aligned blocks.
8182 copying whole cacheline at once. */
8183 if (TARGET_CPU_P (PENTIUMPRO)
8184 && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
8185 desired_align = 8;
8186
8187 if (optimize_size)
8188 desired_align = 1;
8189 if (desired_align < align)
8190 desired_align = align;
8191 if (expected_size != -1 && expected_size < 4)
8192 desired_align = align;
8193
8194 return desired_align;
8195 }
8196
8197
/* Helper function for memset.  For QImode value 0xXY produce
   0xXYXYXYXY of width specified by MODE.  This is essentially
   a * 0x10101010, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  machine_mode valmode = GET_MODE (val);
  rtx tmp;
  /* Number of shift+or steps required: 3 for DImode (8->16->32->64),
     2 for SImode.  */
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode || val == const0_rtx);
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));
  if (CONST_INT_P (val))
    {
      /* Constant value: duplicate the low byte at compile time.  */
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
        v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;
  /* Without partial register stalls the insv trick below saves one step.  */
  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  /* Compare the cost of a multiply by 0x01010101 against the cost of the
     explicit shift/or sequence and pick the cheaper expansion.  */
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
          + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  /* Duplicate the byte into bits 8..15 via shift and or.  */
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      /* Widen 16-bit pattern to 32 bits, and for DImode to 64 bits.  */
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
			         NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
8266
8267 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
8268 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
8269 alignment from ALIGN to DESIRED_ALIGN. */
8270 static rtx
8271 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
8272 int align)
8273 {
8274 rtx promoted_val;
8275
8276 if (TARGET_64BIT
8277 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
8278 promoted_val = promote_duplicated_reg (DImode, val);
8279 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
8280 promoted_val = promote_duplicated_reg (SImode, val);
8281 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
8282 promoted_val = promote_duplicated_reg (HImode, val);
8283 else
8284 promoted_val = val;
8285
8286 return promoted_val;
8287 }
8288
8289 /* Copy the address to a Pmode register. This is used for x32 to
8290 truncate DImode TLS address to a SImode register. */
8291
8292 static rtx
8293 ix86_copy_addr_to_reg (rtx addr)
8294 {
8295 rtx reg;
8296 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
8297 {
8298 reg = copy_addr_to_reg (addr);
8299 REG_POINTER (reg) = 1;
8300 return reg;
8301 }
8302 else
8303 {
8304 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
8305 reg = copy_to_mode_reg (DImode, addr);
8306 REG_POINTER (reg) = 1;
8307 return gen_rtx_SUBREG (SImode, reg, 0);
8308 }
8309 }
8310
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
8312 operations when profitable. The code depends upon architecture, block size
8313 and alignment, but always has one of the following overall structures:
8314
8315 Aligned move sequence:
8316
8317 1) Prologue guard: Conditional that jumps up to epilogues for small
8318 blocks that can be handled by epilogue alone. This is faster
8319 but also needed for correctness, since prologue assume the block
8320 is larger than the desired alignment.
8321
8322 Optional dynamic check for size and libcall for large
8323 blocks is emitted here too, with -minline-stringops-dynamically.
8324
8325 2) Prologue: copy first few bytes in order to get destination
8326 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
8327 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
8328 copied. We emit either a jump tree on power of two sized
8329 blocks, or a byte loop.
8330
8331 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8332 with specified algorithm.
8333
8334 4) Epilogue: code copying tail of the block that is too small to be
8335 handled by main body (or up to size guarded by prologue guard).
8336
   Misaligned move sequence

   1) misaligned move prologue/epilogue containing:
      a) Prologue handling small memory blocks and jumping to done_label
	 (skipped if blocks are known to be large enough)
      b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
	 needed by single possibly misaligned move
	 (skipped if alignment is not needed)
      c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
8346
8347 2) Zero size guard dispatching to done_label, if needed
8348
8349 3) dispatch to library call, if needed,
8350
8351 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
8352 with specified algorithm. */
8353 bool
8354 ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
8355 rtx align_exp, rtx expected_align_exp,
8356 rtx expected_size_exp, rtx min_size_exp,
8357 rtx max_size_exp, rtx probable_max_size_exp,
8358 bool issetmem)
8359 {
8360 rtx destreg;
8361 rtx srcreg = NULL;
8362 rtx_code_label *label = NULL;
8363 rtx tmp;
8364 rtx_code_label *jump_around_label = NULL;
8365 HOST_WIDE_INT align = 1;
8366 unsigned HOST_WIDE_INT count = 0;
8367 HOST_WIDE_INT expected_size = -1;
8368 int size_needed = 0, epilogue_size_needed;
8369 int desired_align = 0, align_bytes = 0;
8370 enum stringop_alg alg;
8371 rtx promoted_val = NULL;
8372 rtx vec_promoted_val = NULL;
8373 bool force_loopy_epilogue = false;
8374 int dynamic_check;
8375 bool need_zero_guard = false;
8376 bool noalign;
8377 machine_mode move_mode = VOIDmode;
8378 machine_mode wider_mode;
8379 int unroll_factor = 1;
8380 /* TODO: Once value ranges are available, fill in proper data. */
8381 unsigned HOST_WIDE_INT min_size = 0;
8382 unsigned HOST_WIDE_INT max_size = -1;
8383 unsigned HOST_WIDE_INT probable_max_size = -1;
8384 bool misaligned_prologue_used = false;
8385 bool have_as;
8386
8387 if (CONST_INT_P (align_exp))
8388 align = INTVAL (align_exp);
8389 /* i386 can do misaligned access on reasonably increased cost. */
8390 if (CONST_INT_P (expected_align_exp)
8391 && INTVAL (expected_align_exp) > align)
8392 align = INTVAL (expected_align_exp);
8393 /* ALIGN is the minimum of destination and source alignment, but we care here
8394 just about destination alignment. */
8395 else if (!issetmem
8396 && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
8397 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
8398
8399 if (CONST_INT_P (count_exp))
8400 {
8401 min_size = max_size = probable_max_size = count = expected_size
8402 = INTVAL (count_exp);
8403 /* When COUNT is 0, there is nothing to do. */
8404 if (!count)
8405 return true;
8406 }
8407 else
8408 {
8409 if (min_size_exp)
8410 min_size = INTVAL (min_size_exp);
8411 if (max_size_exp)
8412 max_size = INTVAL (max_size_exp);
8413 if (probable_max_size_exp)
8414 probable_max_size = INTVAL (probable_max_size_exp);
8415 if (CONST_INT_P (expected_size_exp))
8416 expected_size = INTVAL (expected_size_exp);
8417 }
8418
8419 /* Make sure we don't need to care about overflow later on. */
8420 if (count > (HOST_WIDE_INT_1U << 30))
8421 return false;
8422
8423 have_as = !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (dst));
8424 if (!issetmem)
8425 have_as |= !ADDR_SPACE_GENERIC_P (MEM_ADDR_SPACE (src));
8426
8427 /* Step 0: Decide on preferred algorithm, desired alignment and
8428 size of chunks to be copied by main loop. */
8429 alg = decide_alg (count, expected_size, min_size, probable_max_size,
8430 issetmem,
8431 issetmem && val_exp == const0_rtx, have_as,
8432 &dynamic_check, &noalign, false);
8433
8434 if (dump_file)
8435 fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
8436 stringop_alg_names[alg]);
8437
8438 if (alg == libcall)
8439 return false;
8440 gcc_assert (alg != no_stringop);
8441
8442 /* For now vector-version of memset is generated only for memory zeroing, as
8443 creating of promoted vector value is very cheap in this case. */
8444 if (issetmem && alg == vector_loop && val_exp != const0_rtx)
8445 alg = unrolled_loop;
8446
8447 if (!count)
8448 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
8449 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
8450 if (!issetmem)
8451 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
8452
8453 unroll_factor = 1;
8454 move_mode = word_mode;
8455 switch (alg)
8456 {
8457 case libcall:
8458 case no_stringop:
8459 case last_alg:
8460 gcc_unreachable ();
8461 case loop_1_byte:
8462 need_zero_guard = true;
8463 move_mode = QImode;
8464 break;
8465 case loop:
8466 need_zero_guard = true;
8467 break;
8468 case unrolled_loop:
8469 need_zero_guard = true;
8470 unroll_factor = (TARGET_64BIT ? 4 : 2);
8471 break;
8472 case vector_loop:
8473 need_zero_guard = true;
8474 unroll_factor = 4;
8475 /* Find the widest supported mode. */
8476 move_mode = word_mode;
8477 while (GET_MODE_WIDER_MODE (move_mode).exists (&wider_mode)
8478 && optab_handler (mov_optab, wider_mode) != CODE_FOR_nothing)
8479 move_mode = wider_mode;
8480
8481 if (TARGET_AVX256_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 128)
8482 move_mode = TImode;
8483 if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (move_mode) > 256)
8484 move_mode = OImode;
8485
8486 /* Find the corresponding vector mode with the same size as MOVE_MODE.
8487 MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8488 if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8489 {
8490 int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8491 if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8492 || optab_handler (mov_optab, move_mode) == CODE_FOR_nothing)
8493 move_mode = word_mode;
8494 }
8495 gcc_assert (optab_handler (mov_optab, move_mode) != CODE_FOR_nothing);
8496 break;
8497 case rep_prefix_8_byte:
8498 move_mode = DImode;
8499 break;
8500 case rep_prefix_4_byte:
8501 move_mode = SImode;
8502 break;
8503 case rep_prefix_1_byte:
8504 move_mode = QImode;
8505 break;
8506 }
8507 size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
8508 epilogue_size_needed = size_needed;
8509
8510 /* If we are going to call any library calls conditionally, make sure any
8511 pending stack adjustment happen before the first conditional branch,
8512 otherwise they will be emitted before the library call only and won't
8513 happen from the other branches. */
8514 if (dynamic_check != -1)
8515 do_pending_stack_adjust ();
8516
8517 desired_align = decide_alignment (align, alg, expected_size, move_mode);
8518 if (!TARGET_ALIGN_STRINGOPS || noalign)
8519 align = desired_align;
8520
8521 /* Step 1: Prologue guard. */
8522
8523 /* Alignment code needs count to be in register. */
8524 if (CONST_INT_P (count_exp) && desired_align > align)
8525 {
8526 if (INTVAL (count_exp) > desired_align
8527 && INTVAL (count_exp) > size_needed)
8528 {
8529 align_bytes
8530 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
8531 if (align_bytes <= 0)
8532 align_bytes = 0;
8533 else
8534 align_bytes = desired_align - align_bytes;
8535 }
8536 if (align_bytes == 0)
8537 count_exp = force_reg (counter_mode (count_exp), count_exp);
8538 }
8539 gcc_assert (desired_align >= 1 && align >= 1);
8540
8541 /* Misaligned move sequences handle both prologue and epilogue at once.
8542 Default code generation results in a smaller code for large alignments
8543 and also avoids redundant job when sizes are known precisely. */
8544 misaligned_prologue_used
8545 = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
8546 && MAX (desired_align, epilogue_size_needed) <= 32
8547 && desired_align <= epilogue_size_needed
8548 && ((desired_align > align && !align_bytes)
8549 || (!count && epilogue_size_needed > 1)));
8550
8551 /* Do the cheap promotion to allow better CSE across the
8552 main loop and epilogue (ie one load of the big constant in the
8553 front of all code.
8554 For now the misaligned move sequences do not have fast path
8555 without broadcasting. */
8556 if (issetmem && ((CONST_INT_P (val_exp) || misaligned_prologue_used)))
8557 {
8558 if (alg == vector_loop)
8559 {
8560 gcc_assert (val_exp == const0_rtx);
8561 vec_promoted_val = promote_duplicated_reg (move_mode, val_exp);
8562 promoted_val = promote_duplicated_reg_to_size (val_exp,
8563 GET_MODE_SIZE (word_mode),
8564 desired_align, align);
8565 }
8566 else
8567 {
8568 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8569 desired_align, align);
8570 }
8571 }
8572 /* Misaligned move sequences handles both prologues and epilogues at once.
8573 Default code generation results in smaller code for large alignments and
8574 also avoids redundant job when sizes are known precisely. */
8575 if (misaligned_prologue_used)
8576 {
8577 /* Misaligned move prologue handled small blocks by itself. */
8578 expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
8579 (dst, src, &destreg, &srcreg,
8580 move_mode, promoted_val, vec_promoted_val,
8581 &count_exp,
8582 &jump_around_label,
8583 desired_align < align
8584 ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
8585 desired_align, align, &min_size, dynamic_check, issetmem);
8586 if (!issetmem)
8587 src = change_address (src, BLKmode, srcreg);
8588 dst = change_address (dst, BLKmode, destreg);
8589 set_mem_align (dst, desired_align * BITS_PER_UNIT);
8590 epilogue_size_needed = 0;
8591 if (need_zero_guard
8592 && min_size < (unsigned HOST_WIDE_INT) size_needed)
8593 {
8594 /* It is possible that we copied enough so the main loop will not
8595 execute. */
8596 gcc_assert (size_needed > 1);
8597 if (jump_around_label == NULL_RTX)
8598 jump_around_label = gen_label_rtx ();
8599 emit_cmp_and_jump_insns (count_exp,
8600 GEN_INT (size_needed),
8601 LTU, 0, counter_mode (count_exp), 1, jump_around_label);
8602 if (expected_size == -1
8603 || expected_size < (desired_align - align) / 2 + size_needed)
8604 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8605 else
8606 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8607 }
8608 }
8609 /* Ensure that alignment prologue won't copy past end of block. */
8610 else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
8611 {
8612 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
8613 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
8614 Make sure it is power of 2. */
8615 epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
8616
8617 /* To improve performance of small blocks, we jump around the VAL
8618 promoting mode. This mean that if the promoted VAL is not constant,
8619 we might not use it in the epilogue and have to use byte
8620 loop variant. */
8621 if (issetmem && epilogue_size_needed > 2 && !promoted_val)
8622 force_loopy_epilogue = true;
8623 if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8624 || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8625 {
8626 /* If main algorithm works on QImode, no epilogue is needed.
8627 For small sizes just don't align anything. */
8628 if (size_needed == 1)
8629 desired_align = align;
8630 else
8631 goto epilogue;
8632 }
8633 else if (!count
8634 && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
8635 {
8636 label = gen_label_rtx ();
8637 emit_cmp_and_jump_insns (count_exp,
8638 GEN_INT (epilogue_size_needed),
8639 LTU, 0, counter_mode (count_exp), 1, label);
8640 if (expected_size == -1 || expected_size < epilogue_size_needed)
8641 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8642 else
8643 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8644 }
8645 }
8646
8647 /* Emit code to decide on runtime whether library call or inline should be
8648 used. */
8649 if (dynamic_check != -1)
8650 {
8651 if (!issetmem && CONST_INT_P (count_exp))
8652 {
8653 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
8654 {
8655 emit_block_copy_via_libcall (dst, src, count_exp);
8656 count_exp = const0_rtx;
8657 goto epilogue;
8658 }
8659 }
8660 else
8661 {
8662 rtx_code_label *hot_label = gen_label_rtx ();
8663 if (jump_around_label == NULL_RTX)
8664 jump_around_label = gen_label_rtx ();
8665 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
8666 LEU, 0, counter_mode (count_exp),
8667 1, hot_label);
8668 predict_jump (REG_BR_PROB_BASE * 90 / 100);
8669 if (issetmem)
8670 set_storage_via_libcall (dst, count_exp, val_exp);
8671 else
8672 emit_block_copy_via_libcall (dst, src, count_exp);
8673 emit_jump (jump_around_label);
8674 emit_label (hot_label);
8675 }
8676 }
8677
8678 /* Step 2: Alignment prologue. */
8679 /* Do the expensive promotion once we branched off the small blocks. */
8680 if (issetmem && !promoted_val)
8681 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
8682 desired_align, align);
8683
8684 if (desired_align > align && !misaligned_prologue_used)
8685 {
8686 if (align_bytes == 0)
8687 {
8688 /* Except for the first move in prologue, we no longer know
8689 constant offset in aliasing info. It don't seems to worth
8690 the pain to maintain it for the first move, so throw away
8691 the info early. */
8692 dst = change_address (dst, BLKmode, destreg);
8693 if (!issetmem)
8694 src = change_address (src, BLKmode, srcreg);
8695 dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
8696 promoted_val, vec_promoted_val,
8697 count_exp, align, desired_align,
8698 issetmem);
8699 /* At most desired_align - align bytes are copied. */
8700 if (min_size < (unsigned)(desired_align - align))
8701 min_size = 0;
8702 else
8703 min_size -= desired_align - align;
8704 }
8705 else
8706 {
8707 /* If we know how many bytes need to be stored before dst is
8708 sufficiently aligned, maintain aliasing info accurately. */
8709 dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
8710 srcreg,
8711 promoted_val,
8712 vec_promoted_val,
8713 desired_align,
8714 align_bytes,
8715 issetmem);
8716
8717 count_exp = plus_constant (counter_mode (count_exp),
8718 count_exp, -align_bytes);
8719 count -= align_bytes;
8720 min_size -= align_bytes;
8721 max_size -= align_bytes;
8722 }
8723 if (need_zero_guard
8724 && min_size < (unsigned HOST_WIDE_INT) size_needed
8725 && (count < (unsigned HOST_WIDE_INT) size_needed
8726 || (align_bytes == 0
8727 && count < ((unsigned HOST_WIDE_INT) size_needed
8728 + desired_align - align))))
8729 {
8730 /* It is possible that we copied enough so the main loop will not
8731 execute. */
8732 gcc_assert (size_needed > 1);
8733 if (label == NULL_RTX)
8734 label = gen_label_rtx ();
8735 emit_cmp_and_jump_insns (count_exp,
8736 GEN_INT (size_needed),
8737 LTU, 0, counter_mode (count_exp), 1, label);
8738 if (expected_size == -1
8739 || expected_size < (desired_align - align) / 2 + size_needed)
8740 predict_jump (REG_BR_PROB_BASE * 20 / 100);
8741 else
8742 predict_jump (REG_BR_PROB_BASE * 60 / 100);
8743 }
8744 }
8745 if (label && size_needed == 1)
8746 {
8747 emit_label (label);
8748 LABEL_NUSES (label) = 1;
8749 label = NULL;
8750 epilogue_size_needed = 1;
8751 if (issetmem)
8752 promoted_val = val_exp;
8753 }
8754 else if (label == NULL_RTX && !misaligned_prologue_used)
8755 epilogue_size_needed = size_needed;
8756
8757 /* Step 3: Main loop. */
8758
8759 switch (alg)
8760 {
8761 case libcall:
8762 case no_stringop:
8763 case last_alg:
8764 gcc_unreachable ();
8765 case loop_1_byte:
8766 case loop:
8767 case unrolled_loop:
8768 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
8769 count_exp, move_mode, unroll_factor,
8770 expected_size, issetmem);
8771 break;
8772 case vector_loop:
8773 expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
8774 vec_promoted_val, count_exp, move_mode,
8775 unroll_factor, expected_size, issetmem);
8776 break;
8777 case rep_prefix_8_byte:
8778 case rep_prefix_4_byte:
8779 case rep_prefix_1_byte:
8780 expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
8781 val_exp, count_exp, move_mode, issetmem);
8782 break;
8783 }
8784 /* Adjust properly the offset of src and dest memory for aliasing. */
8785 if (CONST_INT_P (count_exp))
8786 {
8787 if (!issetmem)
8788 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
8789 (count / size_needed) * size_needed);
8790 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
8791 (count / size_needed) * size_needed);
8792 }
8793 else
8794 {
8795 if (!issetmem)
8796 src = change_address (src, BLKmode, srcreg);
8797 dst = change_address (dst, BLKmode, destreg);
8798 }
8799
8800 /* Step 4: Epilogue to copy the remaining bytes. */
8801 epilogue:
8802 if (label)
8803 {
8804 /* When the main loop is done, COUNT_EXP might hold original count,
8805 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
8806 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
8807 bytes. Compensate if needed. */
8808
8809 if (size_needed < epilogue_size_needed)
8810 {
8811 tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
8812 GEN_INT (size_needed - 1), count_exp, 1,
8813 OPTAB_DIRECT);
8814 if (tmp != count_exp)
8815 emit_move_insn (count_exp, tmp);
8816 }
8817 emit_label (label);
8818 LABEL_NUSES (label) = 1;
8819 }
8820
8821 if (count_exp != const0_rtx && epilogue_size_needed > 1)
8822 {
8823 if (force_loopy_epilogue)
8824 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
8825 epilogue_size_needed);
8826 else
8827 {
8828 if (issetmem)
8829 expand_setmem_epilogue (dst, destreg, promoted_val,
8830 vec_promoted_val, count_exp,
8831 epilogue_size_needed);
8832 else
8833 expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
8834 epilogue_size_needed);
8835 }
8836 }
8837 if (jump_around_label)
8838 emit_label (jump_around_label);
8839 return true;
8840 }
8841
/* Expand cmpstrn or memcmp into an inline "repz cmpsb" sequence.

   RESULT receives the comparison result (sign-extended to SImode from
   the QImode flag materialization below).  SRC1 and SRC2 are the MEMs
   being compared, LENGTH the byte count (for strncmp, the maximum
   count), ALIGN the known alignment, and IS_CMPSTRN selects the
   cmpstrn (strncmp) pattern rather than cmpmem (memcmp).
   Return true iff the comparison was expanded inline.  */

bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
			       rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
  */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
	 than actual string lengths.  We can expand the cmpstrn pattern
	 to "repz cmpsb" only if one of the strings is a constant so
	 that expand_builtin_strncmp() can write the length argument to
	 be the minimum of the const string length and the actual length
	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
		 == STRING_CST))
	    || (t2 && TREE_CODE (t2) == MEM_REF
		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
		    == STRING_CST))))
	return false;
    }

  /* Force both source addresses into registers and rewrite the MEMs to
     reference them there.  */
  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      /* A constant zero length compares equal: just set RESULT to 0.  */
      if (length == const0_rtx)
	{
	  emit_move_insn (result, const0_rtx);
	  return true;
	}
      /* Constant nonzero length: the _nz pattern does not need to cope
	 with a zero count.  */
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
				     src1, src2));
    }
  else
    {
      /* Runtime length may be zero: test it first so the following
	 pattern can handle that case.  */
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
				  src1, src2));
    }

  /* Materialize the comparison outcome from the flags into RESULT
     (QImode result from cmpintqi, sign-extended to SImode).  */
  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
8921
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the startaddress when
	not aligned, otherwise undefined

   This is just the body.  It needs the initializations mentioned above and
   some address computing at the end.  These things are done in i386.md.

   On return OUT holds the address of the terminating zero byte
   (memchr-style); the caller computes the length by subtracting the
   start address (see ix86_expand_strlen).  */

static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
	{
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);

	  /* Dispatch on (address & 3): 0 -> fully aligned, 2 -> two
	     bytes to check, 3 -> one byte, 1 -> fall through and check
	     all three.  */
	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	}
      else
	{
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if is aligned to 4 - byte.  */

	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	}

      /* MEM is a QImode reference through register OUT, so each compare
	 below re-reads the byte at OUT's current (advanced) value.  */
      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned byte on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
	{
	  emit_label (align_2_label);

	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
				   end_0_label);

	  emit_insn (gen_add2_insn (out, const1_rtx));

	  emit_label (align_3_label);
	}

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
			       end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.
     It is the standard zero-byte test:
     (X - 0x01010101) & ~X & 0x80808080 != 0  <=>  some byte of X is 0.  */

  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
			   align_4_label);

  /* A zero byte was found somewhere in the last 4 bytes loaded; locate
     it.  With cmov, select branch-free; otherwise use a short branch.  */
  if (TARGET_CMOVE)
    {
      rtx reg = gen_reg_rtx (SImode);
      rtx reg2 = gen_reg_rtx (Pmode);
      emit_move_insn (reg, tmpreg);
      emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

      /* If zero is not in the first two bytes, move two bytes forward.  */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (tmpreg,
			      gen_rtx_IF_THEN_ELSE (SImode, tmp,
						    reg,
						    tmpreg)));
      /* Emit lea manually to avoid clobbering of flags.  */
      emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

      /* Second cmov reads the same flags as the first one above.  */
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (out,
			      gen_rtx_IF_THEN_ELSE (Pmode, tmp,
						    reg2,
						    out)));
    }
  else
    {
      rtx_code_label *end_2_label = gen_label_rtx ();
      /* Is zero in the first two bytes? */

      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
				  pc_rtx);
      tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      JUMP_LABEL (tmp) = end_2_label;

      /* Not in the first two.  Move two bytes forward.  */
      emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
      emit_insn (gen_add2_insn (out, const2_rtx));

      emit_label (end_2_label);

    }

  /* Avoid branch in fixing the byte.  OUT currently points 4 past the
     loaded word (adjusted by 2 above if the zero was in the high half);
     the subtract-with-carry below folds the low-byte test into a
     single adjustment of OUT.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
9093
9094 /* Expand strlen. */
9095
9096 bool
9097 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
9098 {
9099 if (TARGET_UNROLL_STRLEN
9100 && TARGET_INLINE_ALL_STRINGOPS
9101 && eoschar == const0_rtx
9102 && optimize > 1)
9103 {
9104 /* The generic case of strlen expander is long. Avoid it's
9105 expanding unless TARGET_INLINE_ALL_STRINGOPS. */
9106 rtx addr = force_reg (Pmode, XEXP (src, 0));
9107 /* Well it seems that some optimizer does not combine a call like
9108 foo(strlen(bar), strlen(bar));
9109 when the move and the subtraction is done here. It does calculate
9110 the length just once when these instructions are done inside of
9111 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
9112 often used and I use one fewer register for the lifetime of
9113 output_strlen_unroll() this is better. */
9114
9115 emit_move_insn (out, addr);
9116
9117 ix86_expand_strlensi_unroll_1 (out, src, align);
9118
9119 /* strlensi_unroll_1 returns the address of the zero at the end of
9120 the string, like memchr(), so compute the length by subtracting
9121 the start address. */
9122 emit_insn (gen_sub2_insn (out, addr));
9123 return true;
9124 }
9125 else
9126 return false;
9127 }
9128
9129 /* For given symbol (function) construct code to compute address of it's PLT
9130 entry in large x86-64 PIC model. */
9131
9132 static rtx
9133 construct_plt_address (rtx symbol)
9134 {
9135 rtx tmp, unspec;
9136
9137 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
9138 gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
9139 gcc_assert (Pmode == DImode);
9140
9141 tmp = gen_reg_rtx (Pmode);
9142 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
9143
9144 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
9145 emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
9146 return tmp;
9147 }
9148
/* Additional registers that are clobbered by SYSV calls.
   NOTE(review): judging by the use in ix86_expand_call below, these
   are registers that are call-clobbered under the SysV AMD64 ABI but
   call-saved under the MS ABI, so an MS-ABI function must treat them
   as clobbered when it calls a SysV-ABI function -- confirm against
   the ABI documents.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
  [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
9159
/* Expand a call and return the emitted call insn.

   RETVAL is the destination for the returned value, or NULL for a call
   with no used result.  FNADDR is a MEM whose address is the callee.
   CALLARG1 becomes the second operand of the CALL rtx.  CALLARG2 is,
   on 64-bit targets, the value loaded into AL before the call (the
   vector register count for varargs -- see the comment below);
   NOTE(review): INTVAL (callarg2) == -2 also suppresses the MS->SYSV
   clobber handling -- confirm the marker's meaning with the callers.
   POP is the number of argument bytes the callee pops (32-bit only),
   or NULL/const0_rtx for none.  SIBCALL is true for sibling calls.  */

rtx_insn *
ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
		  rtx callarg2,
		  rtx pop, bool sibcall)
{
  rtx vec[3];
  rtx use = NULL, call;
  unsigned int vec_len = 0;
  tree fndecl;

  /* Diagnose direct calls to interrupt service routines.  */
  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
    {
      fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
      if (fndecl
	  && (lookup_attribute ("interrupt",
				TYPE_ATTRIBUTES (TREE_TYPE (fndecl)))))
	error ("interrupt service routine cannot be called directly");
    }
  else
    fndecl = NULL_TREE;

  /* Canonicalize a zero callee-pop amount to "no pop"; no 64-bit ABI
     uses callee-pop.  */
  if (pop == const0_rtx)
    pop = NULL;
  gcc_assert (!TARGET_64BIT || !pop);

  rtx addr = XEXP (fnaddr, 0);
  if (TARGET_MACHO && !TARGET_64BIT)
    {
#if TARGET_MACHO
      if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
	fnaddr = machopic_indirect_call_target (fnaddr);
#endif
    }
  else
    {
      /* Static functions and indirect calls don't need the pic register.  Also,
	 check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
	 it an indirect call.  */
      if (flag_pic
	  && GET_CODE (addr) == SYMBOL_REF
	  && ix86_call_use_plt_p (addr))
	{
	  if (flag_plt
	      && (SYMBOL_REF_DECL (addr) == NULL_TREE
		  || !lookup_attribute ("noplt",
					DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
	    {
	      /* A PLT call needs the PIC register on 32-bit targets and
		 in the 64-bit large PIC model (except MS ABI).  */
	      if (!TARGET_64BIT
		  || (ix86_cmodel == CM_LARGE_PIC
		      && DEFAULT_ABI != MS_ABI))
		{
		  use_reg (&use, gen_rtx_REG (Pmode,
					      REAL_PIC_OFFSET_TABLE_REGNUM));
		  if (ix86_use_pseudo_pic_reg ())
		    emit_move_insn (gen_rtx_REG (Pmode,
						 REAL_PIC_OFFSET_TABLE_REGNUM),
				    pic_offset_table_rtx);
		}
	    }
	  else if (!TARGET_PECOFF && !TARGET_MACHO)
	    {
	      /* PLT avoided: load the callee's address from the GOT and
		 call indirectly through it.  */
	      if (TARGET_64BIT
		  && ix86_cmodel == CM_LARGE_PIC
		  && DEFAULT_ABI != MS_ABI)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = force_reg (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
		}
	      else if (TARGET_64BIT)
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode,
					   gen_rtvec (1, addr),
					   UNSPEC_GOTPCREL);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		}
	      else
		{
		  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
					   UNSPEC_GOT);
		  fnaddr = gen_rtx_CONST (Pmode, fnaddr);
		  fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
					 fnaddr);
		}
	      fnaddr = gen_const_mem (Pmode, fnaddr);
	      /* Pmode may not be the same as word_mode for x32, which
		 doesn't support indirect branch via 32-bit memory slot.
		 Since x32 GOT slot is 64 bit with zero upper 32 bits,
		 indirect branch via x32 GOT slot is OK.  */
	      if (GET_MODE (fnaddr) != word_mode)
		fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
	      fnaddr = gen_rtx_MEM (QImode, fnaddr);
	    }
	}
    }

  /* Skip setting up RAX register for -mskip-rax-setup when there are no
     parameters passed in vector registers.  */
  if (TARGET_64BIT
      && (INTVAL (callarg2) > 0
	  || (INTVAL (callarg2) == 0
	      && (TARGET_SSE || !flag_skip_rax_setup))))
    {
      rtx al = gen_rtx_REG (QImode, AX_REG);
      emit_move_insn (al, callarg2);
      use_reg (&use, al);
    }

  /* In the large PIC model a non-local symbol must be called through
     its PLT entry address computed by construct_plt_address.  */
  if (ix86_cmodel == CM_LARGE_PIC
      && !TARGET_PECOFF
      && MEM_P (fnaddr)
      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
  /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
     branch via x32 GOT slot is OK.  */
  else if (!(TARGET_X32
	     && MEM_P (fnaddr)
	     && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
	     && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode))
	   && (sibcall
	       ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
	       : !call_insn_operand (XEXP (fnaddr, 0), word_mode)))
    {
      /* Address doesn't match the (sib)call predicates: force it into
	 a register.  */
      fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
      fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
    }

  /* Build the CALL rtx, wrapped in a SET when a return value is used.  */
  call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);

  if (retval)
    call = gen_rtx_SET (retval, call);
  vec[vec_len++] = call;

  /* Callee-pop: bundle the stack adjustment into the call PARALLEL.  */
  if (pop)
    {
      pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
      pop = gen_rtx_SET (stack_pointer_rtx, pop);
      vec[vec_len++] = pop;
    }

  if (cfun->machine->no_caller_saved_registers
      && (!fndecl
	  || (!TREE_THIS_VOLATILE (fndecl)
	      && !lookup_attribute ("no_caller_saved_registers",
				    TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
    {
      static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
      bool is_64bit_ms_abi = (TARGET_64BIT
			      && ix86_function_abi (fndecl) == MS_ABI);
      char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);

      /* If there are no caller-saved registers, add all registers
	 that are clobbered by the call which returns.  */
      for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	if (!fixed_regs[i]
	    && (ix86_call_used_regs[i] == 1
		|| (ix86_call_used_regs[i] & c_mask))
	    && !STACK_REGNO_P (i)
	    && !MMX_REGNO_P (i))
	  clobber_reg (&use,
		       gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
    }
  else if (TARGET_64BIT_MS_ABI
	   && (!callarg2 || INTVAL (callarg2) != -2))
    {
      /* An MS-ABI function calling out: record the extra registers a
	 SysV callee would clobber (see the table above).  */
      unsigned i;

      for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
	{
	  int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
	  machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;

	  clobber_reg (&use, gen_rtx_REG (mode, regno));
	}

      /* Set here, but it may get cleared later.  */
      if (TARGET_CALL_MS2SYSV_XLOGUES)
	{
	  if (!TARGET_SSE)
	    ;

	  /* Don't break hot-patched functions.  */
	  else if (ix86_function_ms_hook_prologue (current_function_decl))
	    ;

	  /* TODO: Cases not yet examined.  */
	  else if (flag_split_stack)
	    warn_once_call_ms2sysv_xlogues ("-fsplit-stack");

	  else
	    {
	      gcc_assert (!reload_completed);
	      cfun->machine->call_ms2sysv = true;
	    }
	}
    }

  if (TARGET_MACHO && TARGET_64BIT && !sibcall
      && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
	  || !fndecl || TREE_PUBLIC (fndecl)))
    {
      /* We allow public functions defined in a TU to bind locally for PIC
	 code (the default) on 64bit Mach-O.
	 If such functions are not inlined, we cannot tell at compile-time if
	 they will be called via the lazy symbol resolver (this can depend on
	 options given at link-time).  Therefore, we must assume that the lazy
	 resolver could be used which clobbers R11 and R10.  */
      clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
      clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
    }

  /* Emit the call insn and attach the accumulated USE/CLOBBER list.  */
  if (vec_len > 1)
    call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
  rtx_insn *call_insn = emit_call_insn (call);
  if (use)
    CALL_INSN_FUNCTION_USAGE (call_insn) = use;

  return call_insn;
}
9382
/* Split simple return with popping POPC bytes from stack to indirect
   branch with stack adjustment.  Used for the 32-bit callee-pop
   ("pascal"-style) return: pop the return address into %ecx, drop the
   POPC argument bytes, then jump indirectly through %ecx.  */

void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  /* Pop the return address into %ecx and track the frame state.  */
  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  /* Unwind info: the CFA moved by one word and the return address now
     lives in %ecx.  */
  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Pop the POPC bytes of arguments, again with a CFA adjustment
     note.  */
  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
9415
9416 /* Errors in the source file can cause expand_expr to return const0_rtx
9417 where we expect a vector. To avoid crashing, use one of the vector
9418 clear instructions. */
9419
9420 static rtx
9421 safe_vector_operand (rtx x, machine_mode mode)
9422 {
9423 if (x == const0_rtx)
9424 x = CONST0_RTX (mode);
9425 return x;
9426 }
9427
/* Subroutine of ix86_expand_builtin to take care of binop insns.

   ICODE is the insn pattern to emit, EXP the builtin CALL_EXPR (two
   arguments), and TARGET the suggested destination (may be NULL or
   unsuitable, in which case a fresh pseudo is used).  Return the rtx
   holding the result, or 0 if the pattern could not be generated.  */

static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  /* Guard against error-mark operands (see safe_vector_operand).  */
  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  /* An SImode operand feeding a TImode pattern operand is loaded into
     the low element of a V4SI vector and viewed as TImode.  */
  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  /* Copy operands that do not satisfy their pattern predicate into
     registers of the required mode.  */
  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
9472
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.

   ICODE is the insn pattern to emit, EXP the CALL_EXPR being expanded
   and TARGET an optional preferred destination.  M_TYPE selects the
   argument count and flavor: plain, immediate last argument (_IMM/_I),
   comparison (_CMP) or true/false selector (_TF); SUB_CODE carries the
   rtx code used by the _CMP and _TF flavors.  Returns the rtx holding
   the result, or 0 if the pattern could not be generated.  */

static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
			       enum ix86_builtin_func_type m_type,
			       enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  bool comparison_p = false;
  bool tf_p = false;
  bool last_arg_constant = false;
  int num_memory = 0;
  rtx xops[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  /* Decode M_TYPE into the argument count and the per-flavor special
     handling flags used below.  */
  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  /* Reuse TARGET only when it already has the right mode and satisfies
     the output operand's predicate; otherwise make a fresh pseudo.  A
     reused memory target counts against the one-memory-operand limit
     enforced below.  */
  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      /* For the comparison flavors insn operand 1 is the comparison
	 rtx itself, so call argument I maps to insn operand I + 2
	 instead of I + 1.  */
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
	{
	  /* The _IMM/_I flavors never have comparison_p set, so the
	     unadjusted operand index I + 1 is correct here.  */
	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
	    {
	      enum insn_code new_icode = icode;
	      switch (icode)
		{
		case CODE_FOR_xop_vpermil2v2df3:
		case CODE_FOR_xop_vpermil2v4sf3:
		case CODE_FOR_xop_vpermil2v4df3:
		case CODE_FOR_xop_vpermil2v8sf3:
		  error ("the last argument must be a 2-bit immediate");
		  /* Return a dummy register so expansion can continue
		     after the error.  */
		  return gen_reg_rtx (tmode);
		case CODE_FOR_xop_rotlv2di3:
		  new_icode = CODE_FOR_rotlv2di3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv4si3:
		  new_icode = CODE_FOR_rotlv4si3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv8hi3:
		  new_icode = CODE_FOR_rotlv8hi3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv16qi3:
		  new_icode = CODE_FOR_rotlv16qi3;
		xop_rotl:
		  if (CONST_INT_P (op))
		    {
		      /* Rotate counts are taken modulo the element
			 width, so mask the constant into range.  */
		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
		      op = GEN_INT (INTVAL (op) & mask);
		      gcc_checking_assert
			(insn_data[icode].operand[i + 1].predicate (op, mode));
		    }
		  else
		    {
		      /* Non-constant rotate count: fall back to the
			 generic rotl pattern, which must agree with the
			 XOP one in operand modes and predicates for the
			 operands already processed.  */
		      gcc_checking_assert
			(nargs == 2
			 && insn_data[new_icode].operand[0].mode == tmode
			 && insn_data[new_icode].operand[1].mode == tmode
			 && insn_data[new_icode].operand[2].mode == mode
			 && insn_data[new_icode].operand[0].predicate
			    == insn_data[icode].operand[0].predicate
			 && insn_data[new_icode].operand[1].predicate
			    == insn_data[icode].operand[1].predicate);
		      icode = new_icode;
		      goto non_constant;
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    }
	}
      else
	{
	non_constant:
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

	  if (optimize
	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
	      || num_memory > 1)
	    op = force_reg (mode, op);
	}

      xops[i] = op;
    }

  /* Emit the insn, supplying the extra sub_code / comparison operand
     the _TF and _CMP flavors require.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;

    case 2:
      if (tf_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1],
			       GEN_INT ((int)sub_code));
      else if (! comparison_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      else
	{
	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
				       xops[0], xops[1]);

	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
	}
      break;

    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
9702
9703 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
9704 insns with vec_merge. */
9705
9706 static rtx
9707 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
9708 rtx target)
9709 {
9710 rtx pat;
9711 tree arg0 = CALL_EXPR_ARG (exp, 0);
9712 rtx op1, op0 = expand_normal (arg0);
9713 machine_mode tmode = insn_data[icode].operand[0].mode;
9714 machine_mode mode0 = insn_data[icode].operand[1].mode;
9715
9716 if (optimize || !target
9717 || GET_MODE (target) != tmode
9718 || !insn_data[icode].operand[0].predicate (target, tmode))
9719 target = gen_reg_rtx (tmode);
9720
9721 if (VECTOR_MODE_P (mode0))
9722 op0 = safe_vector_operand (op0, mode0);
9723
9724 if ((optimize && !register_operand (op0, mode0))
9725 || !insn_data[icode].operand[1].predicate (op0, mode0))
9726 op0 = copy_to_mode_reg (mode0, op0);
9727
9728 op1 = op0;
9729 if (!insn_data[icode].operand[2].predicate (op1, mode0))
9730 op1 = copy_to_mode_reg (mode0, op1);
9731
9732 pat = GEN_FCN (icode) (target, op0, op1);
9733 if (! pat)
9734 return 0;
9735 emit_insn (pat);
9736 return target;
9737 }
9738
9739 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
9740
9741 static rtx
9742 ix86_expand_sse_compare (const struct builtin_description *d,
9743 tree exp, rtx target, bool swap)
9744 {
9745 rtx pat;
9746 tree arg0 = CALL_EXPR_ARG (exp, 0);
9747 tree arg1 = CALL_EXPR_ARG (exp, 1);
9748 rtx op0 = expand_normal (arg0);
9749 rtx op1 = expand_normal (arg1);
9750 rtx op2;
9751 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9752 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9753 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9754 enum rtx_code comparison = d->comparison;
9755
9756 if (VECTOR_MODE_P (mode0))
9757 op0 = safe_vector_operand (op0, mode0);
9758 if (VECTOR_MODE_P (mode1))
9759 op1 = safe_vector_operand (op1, mode1);
9760
9761 /* Swap operands if we have a comparison that isn't available in
9762 hardware. */
9763 if (swap)
9764 std::swap (op0, op1);
9765
9766 if (optimize || !target
9767 || GET_MODE (target) != tmode
9768 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9769 target = gen_reg_rtx (tmode);
9770
9771 if ((optimize && !register_operand (op0, mode0))
9772 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
9773 op0 = copy_to_mode_reg (mode0, op0);
9774 if ((optimize && !register_operand (op1, mode1))
9775 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
9776 op1 = copy_to_mode_reg (mode1, op1);
9777
9778 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
9779 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9780 if (! pat)
9781 return 0;
9782 emit_insn (pat);
9783 return target;
9784 }
9785
/* Subroutine of ix86_expand_builtin to take care of comi insns.

   D describes the builtin (icode and the rtx comparison to apply to the
   resulting flags), EXP is the CALL_EXPR.  The incoming TARGET is
   ignored; the boolean result is materialized in a fresh SImode pseudo,
   which is returned.  Returns 0 if the pattern could not be
   generated.  */

static rtx
ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
		      rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  /* The comi pattern has no separate output operand: operands 0 and 1
     are the two vector inputs.  */
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Zero an SImode pseudo first so the QImode setcc written through a
     strict_low_part subreg below leaves the upper bits defined.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  /* SET_DEST (pat) is the flags register the comi set; test it against
     zero with D->comparison and store the QImode result into the low
     byte of the pseudo.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}
9828
9829 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
9830
9831 static rtx
9832 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
9833 rtx target)
9834 {
9835 rtx pat;
9836 tree arg0 = CALL_EXPR_ARG (exp, 0);
9837 rtx op1, op0 = expand_normal (arg0);
9838 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9839 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9840
9841 if (optimize || target == 0
9842 || GET_MODE (target) != tmode
9843 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9844 target = gen_reg_rtx (tmode);
9845
9846 if (VECTOR_MODE_P (mode0))
9847 op0 = safe_vector_operand (op0, mode0);
9848
9849 if ((optimize && !register_operand (op0, mode0))
9850 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9851 op0 = copy_to_mode_reg (mode0, op0);
9852
9853 op1 = GEN_INT (d->comparison);
9854
9855 pat = GEN_FCN (d->icode) (target, op0, op1);
9856 if (! pat)
9857 return 0;
9858 emit_insn (pat);
9859 return target;
9860 }
9861
9862 static rtx
9863 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
9864 tree exp, rtx target)
9865 {
9866 rtx pat;
9867 tree arg0 = CALL_EXPR_ARG (exp, 0);
9868 tree arg1 = CALL_EXPR_ARG (exp, 1);
9869 rtx op0 = expand_normal (arg0);
9870 rtx op1 = expand_normal (arg1);
9871 rtx op2;
9872 machine_mode tmode = insn_data[d->icode].operand[0].mode;
9873 machine_mode mode0 = insn_data[d->icode].operand[1].mode;
9874 machine_mode mode1 = insn_data[d->icode].operand[2].mode;
9875
9876 if (optimize || target == 0
9877 || GET_MODE (target) != tmode
9878 || !insn_data[d->icode].operand[0].predicate (target, tmode))
9879 target = gen_reg_rtx (tmode);
9880
9881 op0 = safe_vector_operand (op0, mode0);
9882 op1 = safe_vector_operand (op1, mode1);
9883
9884 if ((optimize && !register_operand (op0, mode0))
9885 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
9886 op0 = copy_to_mode_reg (mode0, op0);
9887 if ((optimize && !register_operand (op1, mode1))
9888 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
9889 op1 = copy_to_mode_reg (mode1, op1);
9890
9891 op2 = GEN_INT (d->comparison);
9892
9893 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
9894 if (! pat)
9895 return 0;
9896 emit_insn (pat);
9897 return target;
9898 }
9899
/* Subroutine of ix86_expand_builtin to take care of ptest insns.

   D describes the builtin (icode and the rtx comparison to apply to the
   resulting flags), EXP is the CALL_EXPR.  The incoming TARGET is
   ignored; the boolean result is materialized in a fresh SImode pseudo,
   which is returned.  Returns 0 if the pattern could not be
   generated.  */

static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  /* The ptest pattern has no separate output operand: operands 0 and 1
     are the two vector inputs; the result goes to the flags.  */
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Zero an SImode pseudo first so the QImode setcc written through a
     strict_low_part subreg below leaves the upper bits defined.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  /* SET_DEST (pat) is the flags register the ptest set; test it against
     zero with D->comparison and store the QImode result into the low
     byte of the pseudo.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}
9942
/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.

   The insn produces two results at once -- an index (operand 0) and a
   mask (operand 1) -- plus the flags.  Operands 2-5 are the two
   data/length pairs and operand 6 the imm8 control byte, taken from the
   five call arguments in EXP.  Depending on D->code the index result,
   the mask result, or (via D->flag, which encodes the CC mode of the
   flags bit to test) a single bit of the flags register is returned.  */

static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  /* Force each input into a form its insn operand's predicate
     accepts.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  /* The control byte must be a compile-time constant.  */
  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      /* Index variant: TARGET receives operand 0, the mask output goes
	 to a scratch pseudo.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      /* Mask variant: TARGET receives operand 1, the index output goes
	 to a scratch pseudo.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      /* Flag-extraction variant: both explicit outputs are dead, only
	 the flags result matters.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Materialize the tested flags bit as a 0/1 SImode value, using
	 the zeroed-pseudo / strict_low_part idiom so the upper bits are
	 defined.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
10046
10047
/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.

   Implicit-length counterpart of ix86_expand_sse_pcmpestr: the insn
   yields an index (operand 0) and a mask (operand 1) plus the flags,
   with operands 2-3 the two data vectors and operand 4 the imm8 control
   byte.  Depending on D->code the index result, the mask result, or
   (via D->flag, which encodes the CC mode of the flags bit to test) a
   single bit of the flags register is returned.  */

static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  /* Force each input into a form its insn operand's predicate
     accepts.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  /* The control byte must be a compile-time constant.  */
  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      /* Index variant: TARGET receives operand 0, the mask output goes
	 to a scratch pseudo.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      /* Mask variant: TARGET receives operand 1, the index output goes
	 to a scratch pseudo.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      /* Flag-extraction variant: both explicit outputs are dead, only
	 the flags result matters.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Materialize the tested flags bit as a 0/1 SImode value, using
	 the zeroed-pseudo / strict_low_part idiom so the upper bits are
	 defined.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
10141
10142 /* Fixup modeless constants to fit required mode. */
10143
10144 static rtx
10145 fixup_modeless_constant (rtx x, machine_mode mode)
10146 {
10147 if (GET_MODE (x) == VOIDmode)
10148 x = convert_to_mode (mode, x, 1);
10149 return x;
10150 }
10151
10152 /* Subroutine of ix86_expand_builtin to take care of insns with
10153 variable number of operands. */
10154
10155 static rtx
10156 ix86_expand_args_builtin (const struct builtin_description *d,
10157 tree exp, rtx target)
10158 {
10159 rtx pat, real_target;
10160 unsigned int i, nargs;
10161 unsigned int nargs_constant = 0;
10162 unsigned int mask_pos = 0;
10163 int num_memory = 0;
10164 rtx xops[6];
10165 bool second_arg_count = false;
10166 enum insn_code icode = d->icode;
10167 const struct insn_data_d *insn_p = &insn_data[icode];
10168 machine_mode tmode = insn_p->operand[0].mode;
10169 machine_mode rmode = VOIDmode;
10170 bool swap = false;
10171 enum rtx_code comparison = d->comparison;
10172
10173 switch ((enum ix86_builtin_func_type) d->flag)
10174 {
10175 case V2DF_FTYPE_V2DF_ROUND:
10176 case V4DF_FTYPE_V4DF_ROUND:
10177 case V8DF_FTYPE_V8DF_ROUND:
10178 case V4SF_FTYPE_V4SF_ROUND:
10179 case V8SF_FTYPE_V8SF_ROUND:
10180 case V16SF_FTYPE_V16SF_ROUND:
10181 case V8HF_FTYPE_V8HF_ROUND:
10182 case V16HF_FTYPE_V16HF_ROUND:
10183 case V32HF_FTYPE_V32HF_ROUND:
10184 case V4SI_FTYPE_V4SF_ROUND:
10185 case V8SI_FTYPE_V8SF_ROUND:
10186 case V16SI_FTYPE_V16SF_ROUND:
10187 return ix86_expand_sse_round (d, exp, target);
10188 case V4SI_FTYPE_V2DF_V2DF_ROUND:
10189 case V8SI_FTYPE_V4DF_V4DF_ROUND:
10190 case V16SI_FTYPE_V8DF_V8DF_ROUND:
10191 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
10192 case INT_FTYPE_V8SF_V8SF_PTEST:
10193 case INT_FTYPE_V4DI_V4DI_PTEST:
10194 case INT_FTYPE_V4DF_V4DF_PTEST:
10195 case INT_FTYPE_V4SF_V4SF_PTEST:
10196 case INT_FTYPE_V2DI_V2DI_PTEST:
10197 case INT_FTYPE_V2DF_V2DF_PTEST:
10198 return ix86_expand_sse_ptest (d, exp, target);
10199 case FLOAT128_FTYPE_FLOAT128:
10200 case FLOAT_FTYPE_FLOAT:
10201 case INT_FTYPE_INT:
10202 case UINT_FTYPE_UINT:
10203 case UINT16_FTYPE_UINT16:
10204 case UINT64_FTYPE_INT:
10205 case UINT64_FTYPE_UINT64:
10206 case INT64_FTYPE_INT64:
10207 case INT64_FTYPE_V4SF:
10208 case INT64_FTYPE_V2DF:
10209 case INT_FTYPE_V16QI:
10210 case INT_FTYPE_V8QI:
10211 case INT_FTYPE_V8SF:
10212 case INT_FTYPE_V4DF:
10213 case INT_FTYPE_V4SF:
10214 case INT_FTYPE_V2DF:
10215 case INT_FTYPE_V32QI:
10216 case V16QI_FTYPE_V16QI:
10217 case V8SI_FTYPE_V8SF:
10218 case V8SI_FTYPE_V4SI:
10219 case V8HI_FTYPE_V8HI:
10220 case V8HI_FTYPE_V16QI:
10221 case V8QI_FTYPE_V8QI:
10222 case V8SF_FTYPE_V8SF:
10223 case V8SF_FTYPE_V8SI:
10224 case V8SF_FTYPE_V4SF:
10225 case V8SF_FTYPE_V8HI:
10226 case V4SI_FTYPE_V4SI:
10227 case V4SI_FTYPE_V16QI:
10228 case V4SI_FTYPE_V4SF:
10229 case V4SI_FTYPE_V8SI:
10230 case V4SI_FTYPE_V8HI:
10231 case V4SI_FTYPE_V4DF:
10232 case V4SI_FTYPE_V2DF:
10233 case V4HI_FTYPE_V4HI:
10234 case V4DF_FTYPE_V4DF:
10235 case V4DF_FTYPE_V4SI:
10236 case V4DF_FTYPE_V4SF:
10237 case V4DF_FTYPE_V2DF:
10238 case V4SF_FTYPE_V4SF:
10239 case V4SF_FTYPE_V4SI:
10240 case V4SF_FTYPE_V8SF:
10241 case V4SF_FTYPE_V4DF:
10242 case V4SF_FTYPE_V8HI:
10243 case V4SF_FTYPE_V2DF:
10244 case V2DI_FTYPE_V2DI:
10245 case V2DI_FTYPE_V16QI:
10246 case V2DI_FTYPE_V8HI:
10247 case V2DI_FTYPE_V4SI:
10248 case V2DF_FTYPE_V2DF:
10249 case V2DF_FTYPE_V4SI:
10250 case V2DF_FTYPE_V4DF:
10251 case V2DF_FTYPE_V4SF:
10252 case V2DF_FTYPE_V2SI:
10253 case V2SI_FTYPE_V2SI:
10254 case V2SI_FTYPE_V4SF:
10255 case V2SI_FTYPE_V2SF:
10256 case V2SI_FTYPE_V2DF:
10257 case V2SF_FTYPE_V2SF:
10258 case V2SF_FTYPE_V2SI:
10259 case V32QI_FTYPE_V32QI:
10260 case V32QI_FTYPE_V16QI:
10261 case V16HI_FTYPE_V16HI:
10262 case V16HI_FTYPE_V8HI:
10263 case V8SI_FTYPE_V8SI:
10264 case V16HI_FTYPE_V16QI:
10265 case V8SI_FTYPE_V16QI:
10266 case V4DI_FTYPE_V16QI:
10267 case V8SI_FTYPE_V8HI:
10268 case V4DI_FTYPE_V8HI:
10269 case V4DI_FTYPE_V4SI:
10270 case V4DI_FTYPE_V2DI:
10271 case UQI_FTYPE_UQI:
10272 case UHI_FTYPE_UHI:
10273 case USI_FTYPE_USI:
10274 case USI_FTYPE_UQI:
10275 case USI_FTYPE_UHI:
10276 case UDI_FTYPE_UDI:
10277 case UHI_FTYPE_V16QI:
10278 case USI_FTYPE_V32QI:
10279 case UDI_FTYPE_V64QI:
10280 case V16QI_FTYPE_UHI:
10281 case V32QI_FTYPE_USI:
10282 case V64QI_FTYPE_UDI:
10283 case V8HI_FTYPE_UQI:
10284 case V16HI_FTYPE_UHI:
10285 case V32HI_FTYPE_USI:
10286 case V4SI_FTYPE_UQI:
10287 case V8SI_FTYPE_UQI:
10288 case V4SI_FTYPE_UHI:
10289 case V8SI_FTYPE_UHI:
10290 case UQI_FTYPE_V8HI:
10291 case UHI_FTYPE_V16HI:
10292 case USI_FTYPE_V32HI:
10293 case UQI_FTYPE_V4SI:
10294 case UQI_FTYPE_V8SI:
10295 case UHI_FTYPE_V16SI:
10296 case UQI_FTYPE_V2DI:
10297 case UQI_FTYPE_V4DI:
10298 case UQI_FTYPE_V8DI:
10299 case V16SI_FTYPE_UHI:
10300 case V2DI_FTYPE_UQI:
10301 case V4DI_FTYPE_UQI:
10302 case V16SI_FTYPE_INT:
10303 case V16SF_FTYPE_V8SF:
10304 case V16SI_FTYPE_V8SI:
10305 case V16SF_FTYPE_V4SF:
10306 case V16SI_FTYPE_V4SI:
10307 case V16SI_FTYPE_V16SF:
10308 case V16SI_FTYPE_V16SI:
10309 case V64QI_FTYPE_V64QI:
10310 case V32HI_FTYPE_V32HI:
10311 case V16SF_FTYPE_V16SF:
10312 case V8DI_FTYPE_UQI:
10313 case V8DI_FTYPE_V8DI:
10314 case V8DF_FTYPE_V4DF:
10315 case V8DF_FTYPE_V2DF:
10316 case V8DF_FTYPE_V8DF:
10317 case V4DI_FTYPE_V4DI:
10318 case V16HI_FTYPE_V16SF:
10319 case V8HI_FTYPE_V8SF:
10320 case V8HI_FTYPE_V4SF:
10321 nargs = 1;
10322 break;
10323 case V4SF_FTYPE_V4SF_VEC_MERGE:
10324 case V2DF_FTYPE_V2DF_VEC_MERGE:
10325 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
10326 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
10327 case V16QI_FTYPE_V16QI_V16QI:
10328 case V16QI_FTYPE_V8HI_V8HI:
10329 case V16HF_FTYPE_V16HF_V16HF:
10330 case V16SF_FTYPE_V16SF_V16SF:
10331 case V8QI_FTYPE_V8QI_V8QI:
10332 case V8QI_FTYPE_V4HI_V4HI:
10333 case V8HI_FTYPE_V8HI_V8HI:
10334 case V8HI_FTYPE_V16QI_V16QI:
10335 case V8HI_FTYPE_V4SI_V4SI:
10336 case V8HF_FTYPE_V8HF_V8HF:
10337 case V8SF_FTYPE_V8SF_V8SF:
10338 case V8SF_FTYPE_V8SF_V8SI:
10339 case V8DF_FTYPE_V8DF_V8DF:
10340 case V4SI_FTYPE_V4SI_V4SI:
10341 case V4SI_FTYPE_V8HI_V8HI:
10342 case V4SI_FTYPE_V2DF_V2DF:
10343 case V4HI_FTYPE_V4HI_V4HI:
10344 case V4HI_FTYPE_V8QI_V8QI:
10345 case V4HI_FTYPE_V2SI_V2SI:
10346 case V4DF_FTYPE_V4DF_V4DF:
10347 case V4DF_FTYPE_V4DF_V4DI:
10348 case V4SF_FTYPE_V4SF_V4SF:
10349 case V4SF_FTYPE_V4SF_V4SI:
10350 case V4SF_FTYPE_V4SF_V2SI:
10351 case V4SF_FTYPE_V4SF_V2DF:
10352 case V4SF_FTYPE_V4SF_UINT:
10353 case V4SF_FTYPE_V4SF_DI:
10354 case V4SF_FTYPE_V4SF_SI:
10355 case V2DI_FTYPE_V2DI_V2DI:
10356 case V2DI_FTYPE_V16QI_V16QI:
10357 case V2DI_FTYPE_V4SI_V4SI:
10358 case V2DI_FTYPE_V2DI_V16QI:
10359 case V2SI_FTYPE_V2SI_V2SI:
10360 case V2SI_FTYPE_V4HI_V4HI:
10361 case V2SI_FTYPE_V2SF_V2SF:
10362 case V2DF_FTYPE_V2DF_V2DF:
10363 case V2DF_FTYPE_V2DF_V4SF:
10364 case V2DF_FTYPE_V2DF_V2DI:
10365 case V2DF_FTYPE_V2DF_DI:
10366 case V2DF_FTYPE_V2DF_SI:
10367 case V2DF_FTYPE_V2DF_UINT:
10368 case V2SF_FTYPE_V2SF_V2SF:
10369 case V1DI_FTYPE_V1DI_V1DI:
10370 case V1DI_FTYPE_V8QI_V8QI:
10371 case V1DI_FTYPE_V2SI_V2SI:
10372 case V32QI_FTYPE_V16HI_V16HI:
10373 case V16HI_FTYPE_V8SI_V8SI:
10374 case V64QI_FTYPE_V64QI_V64QI:
10375 case V32QI_FTYPE_V32QI_V32QI:
10376 case V16HI_FTYPE_V32QI_V32QI:
10377 case V16HI_FTYPE_V16HI_V16HI:
10378 case V8SI_FTYPE_V4DF_V4DF:
10379 case V8SI_FTYPE_V8SI_V8SI:
10380 case V8SI_FTYPE_V16HI_V16HI:
10381 case V4DI_FTYPE_V4DI_V4DI:
10382 case V4DI_FTYPE_V8SI_V8SI:
10383 case V8DI_FTYPE_V64QI_V64QI:
10384 if (comparison == UNKNOWN)
10385 return ix86_expand_binop_builtin (icode, exp, target);
10386 nargs = 2;
10387 break;
10388 case V4SF_FTYPE_V4SF_V4SF_SWAP:
10389 case V2DF_FTYPE_V2DF_V2DF_SWAP:
10390 gcc_assert (comparison != UNKNOWN);
10391 nargs = 2;
10392 swap = true;
10393 break;
10394 case V16HI_FTYPE_V16HI_V8HI_COUNT:
10395 case V16HI_FTYPE_V16HI_SI_COUNT:
10396 case V8SI_FTYPE_V8SI_V4SI_COUNT:
10397 case V8SI_FTYPE_V8SI_SI_COUNT:
10398 case V4DI_FTYPE_V4DI_V2DI_COUNT:
10399 case V4DI_FTYPE_V4DI_INT_COUNT:
10400 case V8HI_FTYPE_V8HI_V8HI_COUNT:
10401 case V8HI_FTYPE_V8HI_SI_COUNT:
10402 case V4SI_FTYPE_V4SI_V4SI_COUNT:
10403 case V4SI_FTYPE_V4SI_SI_COUNT:
10404 case V4HI_FTYPE_V4HI_V4HI_COUNT:
10405 case V4HI_FTYPE_V4HI_SI_COUNT:
10406 case V2DI_FTYPE_V2DI_V2DI_COUNT:
10407 case V2DI_FTYPE_V2DI_SI_COUNT:
10408 case V2SI_FTYPE_V2SI_V2SI_COUNT:
10409 case V2SI_FTYPE_V2SI_SI_COUNT:
10410 case V1DI_FTYPE_V1DI_V1DI_COUNT:
10411 case V1DI_FTYPE_V1DI_SI_COUNT:
10412 nargs = 2;
10413 second_arg_count = true;
10414 break;
10415 case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
10416 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
10417 case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
10418 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
10419 case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
10420 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
10421 case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
10422 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
10423 case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
10424 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
10425 case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
10426 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
10427 case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
10428 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
10429 case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
10430 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
10431 case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
10432 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
10433 nargs = 4;
10434 second_arg_count = true;
10435 break;
10436 case UINT64_FTYPE_UINT64_UINT64:
10437 case UINT_FTYPE_UINT_UINT:
10438 case UINT_FTYPE_UINT_USHORT:
10439 case UINT_FTYPE_UINT_UCHAR:
10440 case UINT16_FTYPE_UINT16_INT:
10441 case UINT8_FTYPE_UINT8_INT:
10442 case UQI_FTYPE_UQI_UQI:
10443 case UHI_FTYPE_UHI_UHI:
10444 case USI_FTYPE_USI_USI:
10445 case UDI_FTYPE_UDI_UDI:
10446 case V16SI_FTYPE_V8DF_V8DF:
10447 case V32HI_FTYPE_V16SF_V16SF:
10448 case V16HI_FTYPE_V8SF_V8SF:
10449 case V8HI_FTYPE_V4SF_V4SF:
10450 case V16HI_FTYPE_V16SF_UHI:
10451 case V8HI_FTYPE_V8SF_UQI:
10452 case V8HI_FTYPE_V4SF_UQI:
10453 nargs = 2;
10454 break;
10455 case V2DI_FTYPE_V2DI_INT_CONVERT:
10456 nargs = 2;
10457 rmode = V1TImode;
10458 nargs_constant = 1;
10459 break;
10460 case V4DI_FTYPE_V4DI_INT_CONVERT:
10461 nargs = 2;
10462 rmode = V2TImode;
10463 nargs_constant = 1;
10464 break;
10465 case V8DI_FTYPE_V8DI_INT_CONVERT:
10466 nargs = 2;
10467 rmode = V4TImode;
10468 nargs_constant = 1;
10469 break;
10470 case V8HI_FTYPE_V8HI_INT:
10471 case V8HI_FTYPE_V8SF_INT:
10472 case V16HI_FTYPE_V16SF_INT:
10473 case V8HI_FTYPE_V4SF_INT:
10474 case V8SF_FTYPE_V8SF_INT:
10475 case V4SF_FTYPE_V16SF_INT:
10476 case V16SF_FTYPE_V16SF_INT:
10477 case V4SI_FTYPE_V4SI_INT:
10478 case V4SI_FTYPE_V8SI_INT:
10479 case V4HI_FTYPE_V4HI_INT:
10480 case V4DF_FTYPE_V4DF_INT:
10481 case V4DF_FTYPE_V8DF_INT:
10482 case V4SF_FTYPE_V4SF_INT:
10483 case V4SF_FTYPE_V8SF_INT:
10484 case V2DI_FTYPE_V2DI_INT:
10485 case V2DF_FTYPE_V2DF_INT:
10486 case V2DF_FTYPE_V4DF_INT:
10487 case V16HI_FTYPE_V16HI_INT:
10488 case V8SI_FTYPE_V8SI_INT:
10489 case V16SI_FTYPE_V16SI_INT:
10490 case V4SI_FTYPE_V16SI_INT:
10491 case V4DI_FTYPE_V4DI_INT:
10492 case V2DI_FTYPE_V4DI_INT:
10493 case V4DI_FTYPE_V8DI_INT:
10494 case UQI_FTYPE_UQI_UQI_CONST:
10495 case UHI_FTYPE_UHI_UQI:
10496 case USI_FTYPE_USI_UQI:
10497 case UDI_FTYPE_UDI_UQI:
10498 nargs = 2;
10499 nargs_constant = 1;
10500 break;
10501 case V16QI_FTYPE_V16QI_V16QI_V16QI:
10502 case V8SF_FTYPE_V8SF_V8SF_V8SF:
10503 case V4DF_FTYPE_V4DF_V4DF_V4DF:
10504 case V4SF_FTYPE_V4SF_V4SF_V4SF:
10505 case V2DF_FTYPE_V2DF_V2DF_V2DF:
10506 case V32QI_FTYPE_V32QI_V32QI_V32QI:
10507 case UHI_FTYPE_V16SI_V16SI_UHI:
10508 case UQI_FTYPE_V8DI_V8DI_UQI:
10509 case V16HI_FTYPE_V16SI_V16HI_UHI:
10510 case V16QI_FTYPE_V16SI_V16QI_UHI:
10511 case V16QI_FTYPE_V8DI_V16QI_UQI:
10512 case V32HF_FTYPE_V32HF_V32HF_USI:
10513 case V16SF_FTYPE_V16SF_V16SF_UHI:
10514 case V16SF_FTYPE_V4SF_V16SF_UHI:
10515 case V16SI_FTYPE_SI_V16SI_UHI:
10516 case V16SI_FTYPE_V16HI_V16SI_UHI:
10517 case V16SI_FTYPE_V16QI_V16SI_UHI:
10518 case V8SF_FTYPE_V4SF_V8SF_UQI:
10519 case V4DF_FTYPE_V2DF_V4DF_UQI:
10520 case V8SI_FTYPE_V4SI_V8SI_UQI:
10521 case V8SI_FTYPE_SI_V8SI_UQI:
10522 case V4SI_FTYPE_V4SI_V4SI_UQI:
10523 case V4SI_FTYPE_SI_V4SI_UQI:
10524 case V4DI_FTYPE_V2DI_V4DI_UQI:
10525 case V4DI_FTYPE_DI_V4DI_UQI:
10526 case V2DI_FTYPE_V2DI_V2DI_UQI:
10527 case V2DI_FTYPE_DI_V2DI_UQI:
10528 case V64QI_FTYPE_V64QI_V64QI_UDI:
10529 case V64QI_FTYPE_V16QI_V64QI_UDI:
10530 case V64QI_FTYPE_QI_V64QI_UDI:
10531 case V32QI_FTYPE_V32QI_V32QI_USI:
10532 case V32QI_FTYPE_V16QI_V32QI_USI:
10533 case V32QI_FTYPE_QI_V32QI_USI:
10534 case V16QI_FTYPE_V16QI_V16QI_UHI:
10535 case V16QI_FTYPE_QI_V16QI_UHI:
10536 case V32HI_FTYPE_V8HI_V32HI_USI:
10537 case V32HI_FTYPE_HI_V32HI_USI:
10538 case V16HI_FTYPE_V8HI_V16HI_UHI:
10539 case V16HI_FTYPE_HI_V16HI_UHI:
10540 case V8HI_FTYPE_V8HI_V8HI_UQI:
10541 case V8HI_FTYPE_HI_V8HI_UQI:
10542 case V16HF_FTYPE_V16HF_V16HF_UHI:
10543 case V8SF_FTYPE_V8HI_V8SF_UQI:
10544 case V4SF_FTYPE_V8HI_V4SF_UQI:
10545 case V8SI_FTYPE_V8HF_V8SI_UQI:
10546 case V8SF_FTYPE_V8HF_V8SF_UQI:
10547 case V8SI_FTYPE_V8SF_V8SI_UQI:
10548 case V4SI_FTYPE_V4SF_V4SI_UQI:
10549 case V4SI_FTYPE_V8HF_V4SI_UQI:
10550 case V4SF_FTYPE_V8HF_V4SF_UQI:
10551 case V4DI_FTYPE_V8HF_V4DI_UQI:
10552 case V4DI_FTYPE_V4SF_V4DI_UQI:
10553 case V2DI_FTYPE_V8HF_V2DI_UQI:
10554 case V2DI_FTYPE_V4SF_V2DI_UQI:
10555 case V8HF_FTYPE_V8HF_V8HF_UQI:
10556 case V8HF_FTYPE_V8HF_V8HF_V8HF:
10557 case V8HF_FTYPE_V8HI_V8HF_UQI:
10558 case V8HF_FTYPE_V8SI_V8HF_UQI:
10559 case V8HF_FTYPE_V8SF_V8HF_UQI:
10560 case V8HF_FTYPE_V4SI_V8HF_UQI:
10561 case V8HF_FTYPE_V4SF_V8HF_UQI:
10562 case V8HF_FTYPE_V4DI_V8HF_UQI:
10563 case V8HF_FTYPE_V4DF_V8HF_UQI:
10564 case V8HF_FTYPE_V2DI_V8HF_UQI:
10565 case V8HF_FTYPE_V2DF_V8HF_UQI:
10566 case V4SF_FTYPE_V4DI_V4SF_UQI:
10567 case V4SF_FTYPE_V2DI_V4SF_UQI:
10568 case V4DF_FTYPE_V4DI_V4DF_UQI:
10569 case V4DF_FTYPE_V8HF_V4DF_UQI:
10570 case V2DF_FTYPE_V8HF_V2DF_UQI:
10571 case V2DF_FTYPE_V2DI_V2DF_UQI:
10572 case V16QI_FTYPE_V8HI_V16QI_UQI:
10573 case V16QI_FTYPE_V16HI_V16QI_UHI:
10574 case V16QI_FTYPE_V4SI_V16QI_UQI:
10575 case V16QI_FTYPE_V8SI_V16QI_UQI:
10576 case V8HI_FTYPE_V8HF_V8HI_UQI:
10577 case V8HI_FTYPE_V4SI_V8HI_UQI:
10578 case V8HI_FTYPE_V8SI_V8HI_UQI:
10579 case V16QI_FTYPE_V2DI_V16QI_UQI:
10580 case V16QI_FTYPE_V4DI_V16QI_UQI:
10581 case V8HI_FTYPE_V2DI_V8HI_UQI:
10582 case V8HI_FTYPE_V4DI_V8HI_UQI:
10583 case V4SI_FTYPE_V2DI_V4SI_UQI:
10584 case V4SI_FTYPE_V4DI_V4SI_UQI:
10585 case V32QI_FTYPE_V32HI_V32QI_USI:
10586 case UHI_FTYPE_V16QI_V16QI_UHI:
10587 case USI_FTYPE_V32QI_V32QI_USI:
10588 case UDI_FTYPE_V64QI_V64QI_UDI:
10589 case UQI_FTYPE_V8HI_V8HI_UQI:
10590 case UHI_FTYPE_V16HI_V16HI_UHI:
10591 case USI_FTYPE_V32HI_V32HI_USI:
10592 case UQI_FTYPE_V4SI_V4SI_UQI:
10593 case UQI_FTYPE_V8SI_V8SI_UQI:
10594 case UQI_FTYPE_V2DI_V2DI_UQI:
10595 case UQI_FTYPE_V4DI_V4DI_UQI:
10596 case V4SF_FTYPE_V2DF_V4SF_UQI:
10597 case V4SF_FTYPE_V4DF_V4SF_UQI:
10598 case V16SI_FTYPE_V16SI_V16SI_UHI:
10599 case V16SI_FTYPE_V4SI_V16SI_UHI:
10600 case V2DI_FTYPE_V4SI_V2DI_UQI:
10601 case V2DI_FTYPE_V8HI_V2DI_UQI:
10602 case V2DI_FTYPE_V16QI_V2DI_UQI:
10603 case V4DI_FTYPE_V4DI_V4DI_UQI:
10604 case V4DI_FTYPE_V4SI_V4DI_UQI:
10605 case V4DI_FTYPE_V8HI_V4DI_UQI:
10606 case V4DI_FTYPE_V16QI_V4DI_UQI:
10607 case V4DI_FTYPE_V4DF_V4DI_UQI:
10608 case V2DI_FTYPE_V2DF_V2DI_UQI:
10609 case V4SI_FTYPE_V4DF_V4SI_UQI:
10610 case V4SI_FTYPE_V2DF_V4SI_UQI:
10611 case V4SI_FTYPE_V8HI_V4SI_UQI:
10612 case V4SI_FTYPE_V16QI_V4SI_UQI:
10613 case V4DI_FTYPE_V4DI_V4DI_V4DI:
10614 case V8DF_FTYPE_V2DF_V8DF_UQI:
10615 case V8DF_FTYPE_V4DF_V8DF_UQI:
10616 case V8DF_FTYPE_V8DF_V8DF_UQI:
10617 case V8SF_FTYPE_V8SF_V8SF_UQI:
10618 case V8SF_FTYPE_V8SI_V8SF_UQI:
10619 case V4DF_FTYPE_V4DF_V4DF_UQI:
10620 case V4SF_FTYPE_V4SF_V4SF_UQI:
10621 case V2DF_FTYPE_V2DF_V2DF_UQI:
10622 case V2DF_FTYPE_V4SF_V2DF_UQI:
10623 case V2DF_FTYPE_V4SI_V2DF_UQI:
10624 case V4SF_FTYPE_V4SI_V4SF_UQI:
10625 case V4DF_FTYPE_V4SF_V4DF_UQI:
10626 case V4DF_FTYPE_V4SI_V4DF_UQI:
10627 case V8SI_FTYPE_V8SI_V8SI_UQI:
10628 case V8SI_FTYPE_V8HI_V8SI_UQI:
10629 case V8SI_FTYPE_V16QI_V8SI_UQI:
10630 case V8DF_FTYPE_V8SI_V8DF_UQI:
10631 case V8DI_FTYPE_DI_V8DI_UQI:
10632 case V16SF_FTYPE_V8SF_V16SF_UHI:
10633 case V16SI_FTYPE_V8SI_V16SI_UHI:
10634 case V16HF_FTYPE_V16HI_V16HF_UHI:
10635 case V16HF_FTYPE_V16HF_V16HF_V16HF:
10636 case V16HI_FTYPE_V16HF_V16HI_UHI:
10637 case V16HI_FTYPE_V16HI_V16HI_UHI:
10638 case V8HI_FTYPE_V16QI_V8HI_UQI:
10639 case V16HI_FTYPE_V16QI_V16HI_UHI:
10640 case V32HI_FTYPE_V32HI_V32HI_USI:
10641 case V32HI_FTYPE_V32QI_V32HI_USI:
10642 case V8DI_FTYPE_V16QI_V8DI_UQI:
10643 case V8DI_FTYPE_V2DI_V8DI_UQI:
10644 case V8DI_FTYPE_V4DI_V8DI_UQI:
10645 case V8DI_FTYPE_V8DI_V8DI_UQI:
10646 case V8DI_FTYPE_V8HI_V8DI_UQI:
10647 case V8DI_FTYPE_V8SI_V8DI_UQI:
10648 case V8HI_FTYPE_V8DI_V8HI_UQI:
10649 case V8SI_FTYPE_V8DI_V8SI_UQI:
10650 case V4SI_FTYPE_V4SI_V4SI_V4SI:
10651 case V16SI_FTYPE_V16SI_V16SI_V16SI:
10652 case V8DI_FTYPE_V8DI_V8DI_V8DI:
10653 case V32HI_FTYPE_V32HI_V32HI_V32HI:
10654 case V2DI_FTYPE_V2DI_V2DI_V2DI:
10655 case V16HI_FTYPE_V16HI_V16HI_V16HI:
10656 case V8SI_FTYPE_V8SI_V8SI_V8SI:
10657 case V8HI_FTYPE_V8HI_V8HI_V8HI:
10658 case V32HI_FTYPE_V16SF_V16SF_USI:
10659 case V16HI_FTYPE_V8SF_V8SF_UHI:
10660 case V8HI_FTYPE_V4SF_V4SF_UQI:
10661 case V16HI_FTYPE_V16SF_V16HI_UHI:
10662 case V8HI_FTYPE_V8SF_V8HI_UQI:
10663 case V8HI_FTYPE_V4SF_V8HI_UQI:
10664 case V16SF_FTYPE_V16SF_V32HI_V32HI:
10665 case V8SF_FTYPE_V8SF_V16HI_V16HI:
10666 case V4SF_FTYPE_V4SF_V8HI_V8HI:
10667 nargs = 3;
10668 break;
10669 case V32QI_FTYPE_V32QI_V32QI_INT:
10670 case V16HI_FTYPE_V16HI_V16HI_INT:
10671 case V16QI_FTYPE_V16QI_V16QI_INT:
10672 case V4DI_FTYPE_V4DI_V4DI_INT:
10673 case V8HI_FTYPE_V8HI_V8HI_INT:
10674 case V8SI_FTYPE_V8SI_V8SI_INT:
10675 case V8SI_FTYPE_V8SI_V4SI_INT:
10676 case V8SF_FTYPE_V8SF_V8SF_INT:
10677 case V8SF_FTYPE_V8SF_V4SF_INT:
10678 case V4SI_FTYPE_V4SI_V4SI_INT:
10679 case V4DF_FTYPE_V4DF_V4DF_INT:
10680 case V16SF_FTYPE_V16SF_V16SF_INT:
10681 case V16SF_FTYPE_V16SF_V4SF_INT:
10682 case V16SI_FTYPE_V16SI_V4SI_INT:
10683 case V4DF_FTYPE_V4DF_V2DF_INT:
10684 case V4SF_FTYPE_V4SF_V4SF_INT:
10685 case V2DI_FTYPE_V2DI_V2DI_INT:
10686 case V4DI_FTYPE_V4DI_V2DI_INT:
10687 case V2DF_FTYPE_V2DF_V2DF_INT:
10688 case UQI_FTYPE_V8DI_V8UDI_INT:
10689 case UQI_FTYPE_V8DF_V8DF_INT:
10690 case UQI_FTYPE_V2DF_V2DF_INT:
10691 case UQI_FTYPE_V4SF_V4SF_INT:
10692 case UHI_FTYPE_V16SI_V16SI_INT:
10693 case UHI_FTYPE_V16SF_V16SF_INT:
10694 case V64QI_FTYPE_V64QI_V64QI_INT:
10695 case V32HI_FTYPE_V32HI_V32HI_INT:
10696 case V16SI_FTYPE_V16SI_V16SI_INT:
10697 case V8DI_FTYPE_V8DI_V8DI_INT:
10698 nargs = 3;
10699 nargs_constant = 1;
10700 break;
10701 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
10702 nargs = 3;
10703 rmode = V4DImode;
10704 nargs_constant = 1;
10705 break;
10706 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
10707 nargs = 3;
10708 rmode = V2DImode;
10709 nargs_constant = 1;
10710 break;
10711 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
10712 nargs = 3;
10713 rmode = DImode;
10714 nargs_constant = 1;
10715 break;
10716 case V2DI_FTYPE_V2DI_UINT_UINT:
10717 nargs = 3;
10718 nargs_constant = 2;
10719 break;
10720 case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
10721 nargs = 3;
10722 rmode = V8DImode;
10723 nargs_constant = 1;
10724 break;
10725 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
10726 nargs = 5;
10727 rmode = V8DImode;
10728 mask_pos = 2;
10729 nargs_constant = 1;
10730 break;
10731 case QI_FTYPE_V8DF_INT_UQI:
10732 case QI_FTYPE_V4DF_INT_UQI:
10733 case QI_FTYPE_V2DF_INT_UQI:
10734 case HI_FTYPE_V16SF_INT_UHI:
10735 case QI_FTYPE_V8SF_INT_UQI:
10736 case QI_FTYPE_V4SF_INT_UQI:
10737 case QI_FTYPE_V8HF_INT_UQI:
10738 case HI_FTYPE_V16HF_INT_UHI:
10739 case SI_FTYPE_V32HF_INT_USI:
10740 case V4SI_FTYPE_V4SI_V4SI_UHI:
10741 case V8SI_FTYPE_V8SI_V8SI_UHI:
10742 nargs = 3;
10743 mask_pos = 1;
10744 nargs_constant = 1;
10745 break;
10746 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
10747 nargs = 5;
10748 rmode = V4DImode;
10749 mask_pos = 2;
10750 nargs_constant = 1;
10751 break;
10752 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
10753 nargs = 5;
10754 rmode = V2DImode;
10755 mask_pos = 2;
10756 nargs_constant = 1;
10757 break;
10758 case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
10759 case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
10760 case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
10761 case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
10762 case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
10763 case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
10764 case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
10765 case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
10766 case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
10767 case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
10768 case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
10769 case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
10770 case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
10771 case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
10772 case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
10773 case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
10774 case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
10775 case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
10776 case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
10777 case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
10778 case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
10779 case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
10780 case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
10781 case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
10782 case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
10783 case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
10784 case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
10785 case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
10786 case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
10787 case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
10788 case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
10789 case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
10790 case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
10791 case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
10792 case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
10793 case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
10794 case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
10795 case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
10796 case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
10797 case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
10798 case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
10799 case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
10800 case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
10801 case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
10802 case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
10803 case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
10804 case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
10805 case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
10806 case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
10807 case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
10808 case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
10809 case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
10810 case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
10811 case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
10812 case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
10813 case V32HI_FTYPE_V16SF_V16SF_V32HI_USI:
10814 case V16HI_FTYPE_V8SF_V8SF_V16HI_UHI:
10815 case V8HI_FTYPE_V4SF_V4SF_V8HI_UQI:
10816 nargs = 4;
10817 break;
10818 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
10819 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
10820 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
10821 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
10822 case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
10823 nargs = 4;
10824 nargs_constant = 1;
10825 break;
10826 case UQI_FTYPE_V4DI_V4DI_INT_UQI:
10827 case UQI_FTYPE_V8SI_V8SI_INT_UQI:
10828 case QI_FTYPE_V4DF_V4DF_INT_UQI:
10829 case QI_FTYPE_V8SF_V8SF_INT_UQI:
10830 case UHI_FTYPE_V16HF_V16HF_INT_UHI:
10831 case UQI_FTYPE_V2DI_V2DI_INT_UQI:
10832 case UQI_FTYPE_V4SI_V4SI_INT_UQI:
10833 case UQI_FTYPE_V2DF_V2DF_INT_UQI:
10834 case UQI_FTYPE_V4SF_V4SF_INT_UQI:
10835 case UQI_FTYPE_V8HF_V8HF_INT_UQI:
10836 case UDI_FTYPE_V64QI_V64QI_INT_UDI:
10837 case USI_FTYPE_V32QI_V32QI_INT_USI:
10838 case UHI_FTYPE_V16QI_V16QI_INT_UHI:
10839 case USI_FTYPE_V32HI_V32HI_INT_USI:
10840 case USI_FTYPE_V32HF_V32HF_INT_USI:
10841 case UHI_FTYPE_V16HI_V16HI_INT_UHI:
10842 case UQI_FTYPE_V8HI_V8HI_INT_UQI:
10843 nargs = 4;
10844 mask_pos = 1;
10845 nargs_constant = 1;
10846 break;
10847 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
10848 nargs = 4;
10849 nargs_constant = 2;
10850 break;
10851 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
10852 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
10853 case V16SF_FTYPE_V16SF_V32HI_V32HI_UHI:
10854 case V8SF_FTYPE_V8SF_V16HI_V16HI_UQI:
10855 case V4SF_FTYPE_V4SF_V8HI_V8HI_UQI:
10856 nargs = 4;
10857 break;
10858 case UQI_FTYPE_V8DI_V8DI_INT_UQI:
10859 case UHI_FTYPE_V16SI_V16SI_INT_UHI:
10860 mask_pos = 1;
10861 nargs = 4;
10862 nargs_constant = 1;
10863 break;
10864 case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
10865 case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
10866 case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
10867 case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
10868 case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
10869 case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
10870 case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
10871 case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
10872 case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
10873 case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
10874 case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
10875 case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
10876 case V32HI_FTYPE_V32HI_INT_V32HI_USI:
10877 case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
10878 case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
10879 case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
10880 case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
10881 case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
10882 case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
10883 case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
10884 case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
10885 case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
10886 case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
10887 case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
10888 case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
10889 case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
10890 case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
10891 case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
10892 case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
10893 case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
10894 case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
10895 case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
10896 nargs = 4;
10897 mask_pos = 2;
10898 nargs_constant = 1;
10899 break;
10900 case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
10901 case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
10902 case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
10903 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
10904 case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
10905 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
10906 case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
10907 case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
10908 case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
10909 case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
10910 case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
10911 case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
10912 case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
10913 case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
10914 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
10915 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
10916 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
10917 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
10918 case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
10919 case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
10920 case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
10921 case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
10922 case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
10923 case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
10924 case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
10925 case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
10926 case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
10927 nargs = 5;
10928 mask_pos = 2;
10929 nargs_constant = 1;
10930 break;
10931 case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
10932 case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
10933 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
10934 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
10935 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
10936 case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
10937 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
10938 case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
10939 case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
10940 case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
10941 nargs = 5;
10942 mask_pos = 1;
10943 nargs_constant = 1;
10944 break;
10945 case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
10946 case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
10947 case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
10948 case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
10949 case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
10950 case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
10951 case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
10952 case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
10953 case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
10954 case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
10955 case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
10956 case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
10957 nargs = 5;
10958 mask_pos = 1;
10959 nargs_constant = 2;
10960 break;
10961
10962 default:
10963 gcc_unreachable ();
10964 }
10965
10966 gcc_assert (nargs <= ARRAY_SIZE (xops));
10967
10968 if (comparison != UNKNOWN)
10969 {
10970 gcc_assert (nargs == 2);
10971 return ix86_expand_sse_compare (d, exp, target, swap);
10972 }
10973
10974 if (rmode == VOIDmode || rmode == tmode)
10975 {
10976 if (optimize
10977 || target == 0
10978 || GET_MODE (target) != tmode
10979 || !insn_p->operand[0].predicate (target, tmode))
10980 target = gen_reg_rtx (tmode);
10981 else if (memory_operand (target, tmode))
10982 num_memory++;
10983 real_target = target;
10984 }
10985 else
10986 {
10987 real_target = gen_reg_rtx (tmode);
10988 target = lowpart_subreg (rmode, real_target, tmode);
10989 }
10990
10991 for (i = 0; i < nargs; i++)
10992 {
10993 tree arg = CALL_EXPR_ARG (exp, i);
10994 rtx op = expand_normal (arg);
10995 machine_mode mode = insn_p->operand[i + 1].mode;
10996 bool match = insn_p->operand[i + 1].predicate (op, mode);
10997
10998 if (second_arg_count && i == 1)
10999 {
11000 /* SIMD shift insns take either an 8-bit immediate or
11001 register as count. But builtin functions take int as
11002 count. If count doesn't match, we put it in register.
11003 The instructions are using 64-bit count, if op is just
11004 32-bit, zero-extend it, as negative shift counts
11005 are undefined behavior and zero-extension is more
11006 efficient. */
11007 if (!match)
11008 {
11009 if (SCALAR_INT_MODE_P (GET_MODE (op)))
11010 op = convert_modes (mode, GET_MODE (op), op, 1);
11011 else
11012 op = lowpart_subreg (mode, op, GET_MODE (op));
11013 if (!insn_p->operand[i + 1].predicate (op, mode))
11014 op = copy_to_reg (op);
11015 }
11016 }
11017 else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11018 (!mask_pos && (nargs - i) <= nargs_constant))
11019 {
11020 if (!match)
11021 switch (icode)
11022 {
11023 case CODE_FOR_avx_vinsertf128v4di:
11024 case CODE_FOR_avx_vextractf128v4di:
11025 error ("the last argument must be an 1-bit immediate");
11026 return const0_rtx;
11027
11028 case CODE_FOR_avx512f_cmpv8di3_mask:
11029 case CODE_FOR_avx512f_cmpv16si3_mask:
11030 case CODE_FOR_avx512f_ucmpv8di3_mask:
11031 case CODE_FOR_avx512f_ucmpv16si3_mask:
11032 case CODE_FOR_avx512vl_cmpv4di3_mask:
11033 case CODE_FOR_avx512vl_cmpv8si3_mask:
11034 case CODE_FOR_avx512vl_ucmpv4di3_mask:
11035 case CODE_FOR_avx512vl_ucmpv8si3_mask:
11036 case CODE_FOR_avx512vl_cmpv2di3_mask:
11037 case CODE_FOR_avx512vl_cmpv4si3_mask:
11038 case CODE_FOR_avx512vl_ucmpv2di3_mask:
11039 case CODE_FOR_avx512vl_ucmpv4si3_mask:
11040 error ("the last argument must be a 3-bit immediate");
11041 return const0_rtx;
11042
11043 case CODE_FOR_sse4_1_roundsd:
11044 case CODE_FOR_sse4_1_roundss:
11045
11046 case CODE_FOR_sse4_1_roundpd:
11047 case CODE_FOR_sse4_1_roundps:
11048 case CODE_FOR_avx_roundpd256:
11049 case CODE_FOR_avx_roundps256:
11050
11051 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
11052 case CODE_FOR_sse4_1_roundps_sfix:
11053 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
11054 case CODE_FOR_avx_roundps_sfix256:
11055
11056 case CODE_FOR_sse4_1_blendps:
11057 case CODE_FOR_avx_blendpd256:
11058 case CODE_FOR_avx_vpermilv4df:
11059 case CODE_FOR_avx_vpermilv4df_mask:
11060 case CODE_FOR_avx512f_getmantv8df_mask:
11061 case CODE_FOR_avx512f_getmantv16sf_mask:
11062 case CODE_FOR_avx512vl_getmantv16hf_mask:
11063 case CODE_FOR_avx512vl_getmantv8sf_mask:
11064 case CODE_FOR_avx512vl_getmantv4df_mask:
11065 case CODE_FOR_avx512fp16_getmantv8hf_mask:
11066 case CODE_FOR_avx512vl_getmantv4sf_mask:
11067 case CODE_FOR_avx512vl_getmantv2df_mask:
11068 case CODE_FOR_avx512dq_rangepv8df_mask_round:
11069 case CODE_FOR_avx512dq_rangepv16sf_mask_round:
11070 case CODE_FOR_avx512dq_rangepv4df_mask:
11071 case CODE_FOR_avx512dq_rangepv8sf_mask:
11072 case CODE_FOR_avx512dq_rangepv2df_mask:
11073 case CODE_FOR_avx512dq_rangepv4sf_mask:
11074 case CODE_FOR_avx_shufpd256_mask:
11075 error ("the last argument must be a 4-bit immediate");
11076 return const0_rtx;
11077
11078 case CODE_FOR_sha1rnds4:
11079 case CODE_FOR_sse4_1_blendpd:
11080 case CODE_FOR_avx_vpermilv2df:
11081 case CODE_FOR_avx_vpermilv2df_mask:
11082 case CODE_FOR_xop_vpermil2v2df3:
11083 case CODE_FOR_xop_vpermil2v4sf3:
11084 case CODE_FOR_xop_vpermil2v4df3:
11085 case CODE_FOR_xop_vpermil2v8sf3:
11086 case CODE_FOR_avx512f_vinsertf32x4_mask:
11087 case CODE_FOR_avx512f_vinserti32x4_mask:
11088 case CODE_FOR_avx512f_vextractf32x4_mask:
11089 case CODE_FOR_avx512f_vextracti32x4_mask:
11090 case CODE_FOR_sse2_shufpd:
11091 case CODE_FOR_sse2_shufpd_mask:
11092 case CODE_FOR_avx512dq_shuf_f64x2_mask:
11093 case CODE_FOR_avx512dq_shuf_i64x2_mask:
11094 case CODE_FOR_avx512vl_shuf_i32x4_mask:
11095 case CODE_FOR_avx512vl_shuf_f32x4_mask:
11096 error ("the last argument must be a 2-bit immediate");
11097 return const0_rtx;
11098
11099 case CODE_FOR_avx_vextractf128v4df:
11100 case CODE_FOR_avx_vextractf128v8sf:
11101 case CODE_FOR_avx_vextractf128v8si:
11102 case CODE_FOR_avx_vinsertf128v4df:
11103 case CODE_FOR_avx_vinsertf128v8sf:
11104 case CODE_FOR_avx_vinsertf128v8si:
11105 case CODE_FOR_avx512f_vinsertf64x4_mask:
11106 case CODE_FOR_avx512f_vinserti64x4_mask:
11107 case CODE_FOR_avx512f_vextractf64x4_mask:
11108 case CODE_FOR_avx512f_vextracti64x4_mask:
11109 case CODE_FOR_avx512dq_vinsertf32x8_mask:
11110 case CODE_FOR_avx512dq_vinserti32x8_mask:
11111 case CODE_FOR_avx512vl_vinsertv4df:
11112 case CODE_FOR_avx512vl_vinsertv4di:
11113 case CODE_FOR_avx512vl_vinsertv8sf:
11114 case CODE_FOR_avx512vl_vinsertv8si:
11115 error ("the last argument must be a 1-bit immediate");
11116 return const0_rtx;
11117
11118 case CODE_FOR_avx_vmcmpv2df3:
11119 case CODE_FOR_avx_vmcmpv4sf3:
11120 case CODE_FOR_avx_cmpv2df3:
11121 case CODE_FOR_avx_cmpv4sf3:
11122 case CODE_FOR_avx_cmpv4df3:
11123 case CODE_FOR_avx_cmpv8sf3:
11124 case CODE_FOR_avx512f_cmpv8df3_mask:
11125 case CODE_FOR_avx512f_cmpv16sf3_mask:
11126 case CODE_FOR_avx512f_vmcmpv2df3_mask:
11127 case CODE_FOR_avx512f_vmcmpv4sf3_mask:
11128 case CODE_FOR_avx512bw_cmpv32hf3_mask:
11129 case CODE_FOR_avx512vl_cmpv16hf3_mask:
11130 case CODE_FOR_avx512fp16_cmpv8hf3_mask:
11131 error ("the last argument must be a 5-bit immediate");
11132 return const0_rtx;
11133
11134 default:
11135 switch (nargs_constant)
11136 {
11137 case 2:
11138 if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
11139 (!mask_pos && (nargs - i) == nargs_constant))
11140 {
11141 error ("the next to last argument must be an 8-bit immediate");
11142 break;
11143 }
11144 /* FALLTHRU */
11145 case 1:
11146 error ("the last argument must be an 8-bit immediate");
11147 break;
11148 default:
11149 gcc_unreachable ();
11150 }
11151 return const0_rtx;
11152 }
11153 }
11154 else
11155 {
11156 if (VECTOR_MODE_P (mode))
11157 op = safe_vector_operand (op, mode);
11158
11159 /* If we aren't optimizing, only allow one memory operand to
11160 be generated. */
11161 if (memory_operand (op, mode))
11162 num_memory++;
11163
11164 op = fixup_modeless_constant (op, mode);
11165
11166 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
11167 {
11168 if (optimize || !match || num_memory > 1)
11169 op = copy_to_mode_reg (mode, op);
11170 }
11171 else
11172 {
11173 op = copy_to_reg (op);
11174 op = lowpart_subreg (mode, op, GET_MODE (op));
11175 }
11176 }
11177
11178 xops[i] = op;
11179 }
11180
11181 switch (nargs)
11182 {
11183 case 1:
11184 pat = GEN_FCN (icode) (real_target, xops[0]);
11185 break;
11186 case 2:
11187 pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
11188 break;
11189 case 3:
11190 pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
11191 break;
11192 case 4:
11193 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11194 xops[2], xops[3]);
11195 break;
11196 case 5:
11197 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11198 xops[2], xops[3], xops[4]);
11199 break;
11200 case 6:
11201 pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
11202 xops[2], xops[3], xops[4], xops[5]);
11203 break;
11204 default:
11205 gcc_unreachable ();
11206 }
11207
11208 if (! pat)
11209 return 0;
11210
11211 emit_insn (pat);
11212 return target;
11213 }
11214
11215 /* Transform pattern of following layout:
11216 (set A
11217 (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
11218 )
11219 into:
11220 (set (A B)) */
11221
11222 static rtx
11223 ix86_erase_embedded_rounding (rtx pat)
11224 {
11225 if (GET_CODE (pat) == INSN)
11226 pat = PATTERN (pat);
11227
11228 gcc_assert (GET_CODE (pat) == SET);
11229 rtx src = SET_SRC (pat);
11230 gcc_assert (XVECLEN (src, 0) == 2);
11231 rtx p0 = XVECEXP (src, 0, 0);
11232 gcc_assert (GET_CODE (src) == UNSPEC
11233 && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
11234 rtx res = gen_rtx_SET (SET_DEST (pat), p0);
11235 return res;
11236 }
11237
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding.  Expands a four-argument builtin (two vector
   operands, a _CMP_* predicate constant, and a rounding/SAE
   immediate) into a COMI or UCOMI compare followed by a setcc on the
   flags register, returning the SImode 0/1 result.  */
static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target)
{
  rtx pat, set_dst;
  /* Builtin arguments: op0/op1 are the vectors to compare, op2 is the
     _CMP_* predicate (must be a constant in [0, 32)), op3 is the
     rounding operand (NO_ROUND or ROUND_SAE).  */
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  These three tables are indexed by the
     _CMP_* predicate value: the RTL comparison code to use, whether
     the predicate is the ordered variant, and whether it is the
     quiet (non-signaling) variant.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };

  /* Validate the predicate and rounding operands before expanding.  */
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  /* Initial value of the result register; NE (below) flips it to 1 so
     the setcc only needs to handle the "comparison holds" case.  */
  rtx const_val = const0_rtx;

  /* Pick the CC mode to read the COMI/UCOMI flags in; CCFPmode is the
     default, the ORDERED/UNORDERED/EQ/NE cases below override it.  */
  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case ORDERED:
      if (!ordered)
	{
	  /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCSmode;
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
	  if (non_signaling)
	    ordered = false;
	  mode = CCPmode;
	}
      comparison = NE;
      break;
    case UNORDERED:
      if (ordered)
	{
	  /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
	  if (non_signaling)
	    ordered = false;
	  mode = CCSmode;
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCPmode;
	}
      comparison = EQ;
      break;

    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
    case UNGE:	/* -> UNLE  */
    case UNGT:	/* -> UNLT  */
      /* COMI/UCOMI only provide >/>= style flags; swap the operands
	 and the comparison to express </<= forms.  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
    case UNEQ:
    case UNLT:
    case UNLE:
    case LTGT:
      /* These are supported by CCFPmode.  NB: Use ordered/signaling
	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
	 with NAN operands.  */
      if (ordered == non_signaling)
	ordered = !ordered;
      break;
    case EQ:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_EQ_OQ/_CMP_EQ_OS.  */
      check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_NEQ_UQ/_CMP_NEQ_US.  */
      gcc_assert (!ordered);
      check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  /* Materialize the result: preload CONST_VAL into an SImode pseudo
     and write the setcc result through a QImode subreg of it.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /*
     1. COMI: ordered and signaling.
     2. UCOMI: unordered and non-signaling.
   */
  if (non_signaling)
    icode = (icode == CODE_FOR_sse_comi_round
	     ? CODE_FOR_sse_ucomi_round
	     : CODE_FOR_sse2_ucomi_round);

  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      /* Strip the embedded-rounding UNSPEC so the pattern is a plain
	 flag-setting SET.  */
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
	return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      /* Emit a conditional jump over the setcc when the compare was
	 unordered, leaving TARGET at its preloaded CONST_VAL.  */
      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  /* setcc into the low byte of TARGET; STRICT_LOW_PART preserves the
     upper bits preloaded with CONST_VAL.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  /* Return the full SImode pseudo underlying the QImode subreg.  */
  return SUBREG_REG (target);
}
11462
/* Subroutine of ix86_expand_builtin to take care of builtins whose last
   argument is an embedded rounding-mode / SAE operand.  D describes the
   builtin, EXP is the CALL_EXPR and TARGET is a suggested place for the
   result.  Returns the result rtx, or const0_rtx after diagnosing a bad
   immediate or rounding operand.  */

static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
			   tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  rtx xops[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  /* When non-zero, operand NARGS - NARGS_CONSTANT must be an immediate
     and gets a width-specific diagnostic below if it is not.  */
  unsigned int nargs_constant = 0;
  /* Set to 1 when the rounding operand turns out to be NO_ROUND and the
     embedded rounding must be stripped from the generated pattern.  */
  unsigned int redundant_embed_rnd = 0;

  /* Derive the operand count (and which operand is the checked
     immediate) from the builtin's prototype.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT64_FTYPE_V8HF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case UINT_FTYPE_V8HF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT64_FTYPE_V8HF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:
    case INT_FTYPE_V8HF_INT:
      nargs = 2;
      break;
    case V32HF_FTYPE_V32HF_V32HF_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT:
    case V8HF_FTYPE_V8HF_INT_INT:
    case V8HF_FTYPE_V8HF_UINT_INT:
    case V8HF_FTYPE_V8HF_INT64_INT:
    case V8HF_FTYPE_V8HF_UINT64_INT:
    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:
      nargs = 3;
      break;
    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V32HI_FTYPE_V32HF_V32HI_USI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
    case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
    case V32HF_FTYPE_V32HI_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
    case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
    case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
    case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
    case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
      nargs = 4;
      break;
    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;
      nargs = 4;
      break;
    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      /* Comparisons that set a scalar result are expanded separately.  */
      return ix86_expand_sse_comi_round (d, exp, target);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
    case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
      nargs = 5;
      break;
    case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
      nargs_constant = 4;
      nargs = 5;
      break;
    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
    case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
      nargs_constant = 3;
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
      nargs = 6;
      nargs_constant = 4;
      break;
    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs = 6;
      nargs_constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  /* Allocate a fresh pseudo when TARGET is absent or unsuitable for the
     insn's output operand.  */
  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  /* Expand and legitimize each call argument into xops[].  */
  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
	{
	  /* This operand must be an immediate; if it is not, diagnose
	     with the immediate width this particular insn accepts.  */
	  if (!match)
	    {
	      switch (icode)
		{
		case CODE_FOR_avx512f_getmantv8df_mask_round:
		case CODE_FOR_avx512f_getmantv16sf_mask_round:
		case CODE_FOR_avx512bw_getmantv32hf_mask_round:
		case CODE_FOR_avx512f_vgetmantv2df_round:
		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
		case CODE_FOR_avx512f_vgetmantv4sf_round:
		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
		case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
		  error ("the immediate argument must be a 4-bit immediate");
		  return const0_rtx;
		case CODE_FOR_avx512f_cmpv8df3_mask_round:
		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
		case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
		  error ("the immediate argument must be a 5-bit immediate");
		  return const0_rtx;
		default:
		  error ("the immediate argument must be an 8-bit immediate");
		  return const0_rtx;
		}
	    }
	}
      else if (i == nargs-1)
	{
	  /* The last operand is always the rounding-mode immediate.  */
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("incorrect rounding operand");
	      return const0_rtx;
	    }

	  /* If there is no rounding use normal version of the pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    {
	      /* Skip erasing embedded rounding for the expanders below
		 that generate multiple insns.  In ix86_erase_embedded_rounding
		 the pattern will be transformed to a single set, and emit_insn
		 appends the set instead of inserting it to the chain.  So the
		 insns emitted inside the define_expand would be ignored.  */
	      switch (icode)
		{
		case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
		  redundant_embed_rnd = 0;
		  break;
		default:
		  redundant_embed_rnd = 1;
		  break;
		}
	    }
	}
      else
	{
	  /* A regular (vector or scalar) input operand: force it into a
	     form the insn predicate accepts.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      /* Mode mismatch: copy to a register and view the low part
		 in the expected mode.  */
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }

  /* Emit the insn with the right arity.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  /* Strip the (now meaningless) embedded rounding from the pattern.  */
  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}
11738
/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands.  These are typically loads and
   stores (possibly masked, possibly non-temporal) where one operand
   is a memory reference built from a pointer argument.  D describes
   the builtin, EXP is the CALL_EXPR and TARGET is a suggested place
   for the result; returns the result rtx, or 0 for store-class
   builtins which produce no value.  */

static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
				  tree exp, rtx target)
{
  tree arg;
  rtx pat, op;
  /* MEMORY is the index within xops[] of the memory operand;
     ARRAY_SIZE (xops) means the memory operand is TARGET itself
     (i.e. the destination of a store).  */
  unsigned int i, nargs, arg_adjust, memory;
  /* True for insns that require their memory operand to be aligned
     to the mode's natural alignment (e.g. non-temporal moves and
     aligned masked loads/stores).  */
  bool aligned_mem = false;
  rtx xops[3];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  enum { load, store } klass;

  /* Classify the builtin by its prototype: operand count, load vs
     store, which operand is memory, and alignment requirements.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));
      return 0;
    case VOID_FTYPE_UINT64:
    case VOID_FTYPE_UNSIGNED:
      nargs = 0;
      klass = store;
      memory = 0;
      break;

    case INT_FTYPE_VOID:
    case USHORT_FTYPE_VOID:
    case UINT64_FTYPE_VOID:
    case UINT_FTYPE_VOID:
    case UINT8_FTYPE_VOID:
    case UNSIGNED_FTYPE_VOID:
      nargs = 0;
      klass = load;
      memory = 0;
      break;
    case UINT64_FTYPE_PUNSIGNED:
    case V2DI_FTYPE_PV2DI:
    case V4DI_FTYPE_PV4DI:
    case V32QI_FTYPE_PCCHAR:
    case V16QI_FTYPE_PCCHAR:
    case V8SF_FTYPE_PCV4SF:
    case V8SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT:
    case V4DF_FTYPE_PCV2DF:
    case V4DF_FTYPE_PCDOUBLE:
    case V2DF_FTYPE_PCDOUBLE:
    case VOID_FTYPE_PVOID:
    case V8DI_FTYPE_PV8DI:
      nargs = 1;
      klass = load;
      memory = 0;
      switch (icode)
	{
	/* Non-temporal loads require aligned memory.  */
	case CODE_FOR_sse4_1_movntdqa:
	case CODE_FOR_avx2_movntdqa:
	case CODE_FOR_avx512f_movntdqa:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PV2SF_V4SF:
    case VOID_FTYPE_PV8DI_V8DI:
    case VOID_FTYPE_PV4DI_V4DI:
    case VOID_FTYPE_PV2DI_V2DI:
    case VOID_FTYPE_PCHAR_V32QI:
    case VOID_FTYPE_PCHAR_V16QI:
    case VOID_FTYPE_PFLOAT_V16SF:
    case VOID_FTYPE_PFLOAT_V8SF:
    case VOID_FTYPE_PFLOAT_V4SF:
    case VOID_FTYPE_PDOUBLE_V8DF:
    case VOID_FTYPE_PDOUBLE_V4DF:
    case VOID_FTYPE_PDOUBLE_V2DF:
    case VOID_FTYPE_PLONGLONG_LONGLONG:
    case VOID_FTYPE_PULONGLONG_ULONGLONG:
    case VOID_FTYPE_PUNSIGNED_UNSIGNED:
    case VOID_FTYPE_PINT_INT:
      nargs = 1;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx_movntv4di:
	case CODE_FOR_sse2_movntv2di:
	case CODE_FOR_avx_movntv8sf:
	case CODE_FOR_sse_movntv4sf:
	case CODE_FOR_sse4a_vmmovntv4sf:
	case CODE_FOR_avx_movntv4df:
	case CODE_FOR_sse2_movntv2df:
	case CODE_FOR_sse4a_vmmovntv2df:
	case CODE_FOR_sse2_movntidi:
	case CODE_FOR_sse_movntq:
	case CODE_FOR_sse2_movntisi:
	case CODE_FOR_avx512f_movntv16sf:
	case CODE_FOR_avx512f_movntv8df:
	case CODE_FOR_avx512f_movntv8di:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PVOID_PCVOID:
      nargs = 1;
      klass = store;
      memory = 0;

      break;
    case V4SF_FTYPE_V4SF_PCV2SF:
    case V2DF_FTYPE_V2DF_PCDOUBLE:
      nargs = 2;
      klass = load;
      memory = 1;
      break;
    case V8SF_FTYPE_PCV8SF_V8SI:
    case V4DF_FTYPE_PCV4DF_V4DI:
    case V4SF_FTYPE_PCV4SF_V4SI:
    case V2DF_FTYPE_PCV2DF_V2DI:
    case V8SI_FTYPE_PCV8SI_V8SI:
    case V4DI_FTYPE_PCV4DI_V4DI:
    case V4SI_FTYPE_PCV4SI_V4SI:
    case V2DI_FTYPE_PCV2DI_V2DI:
    case VOID_FTYPE_INT_INT64:
      nargs = 2;
      klass = load;
      memory = 0;
      break;
    case VOID_FTYPE_PV8DF_V8DF_UQI:
    case VOID_FTYPE_PV4DF_V4DF_UQI:
    case VOID_FTYPE_PV2DF_V2DF_UQI:
    case VOID_FTYPE_PV16SF_V16SF_UHI:
    case VOID_FTYPE_PV8SF_V8SF_UQI:
    case VOID_FTYPE_PV4SF_V4SF_UQI:
    case VOID_FTYPE_PV8DI_V8DI_UQI:
    case VOID_FTYPE_PV4DI_V4DI_UQI:
    case VOID_FTYPE_PV2DI_V2DI_UQI:
    case VOID_FTYPE_PV16SI_V16SI_UHI:
    case VOID_FTYPE_PV8SI_V8SI_UQI:
    case VOID_FTYPE_PV4SI_V4SI_UQI:
    case VOID_FTYPE_PV64QI_V64QI_UDI:
    case VOID_FTYPE_PV32HI_V32HI_USI:
    case VOID_FTYPE_PV32QI_V32QI_USI:
    case VOID_FTYPE_PV16QI_V16QI_UHI:
    case VOID_FTYPE_PV16HI_V16HI_UHI:
    case VOID_FTYPE_PV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_storev16sf_mask:
	case CODE_FOR_avx512f_storev16si_mask:
	case CODE_FOR_avx512f_storev8df_mask:
	case CODE_FOR_avx512f_storev8di_mask:
	case CODE_FOR_avx512vl_storev8sf_mask:
	case CODE_FOR_avx512vl_storev8si_mask:
	case CODE_FOR_avx512vl_storev4df_mask:
	case CODE_FOR_avx512vl_storev4di_mask:
	case CODE_FOR_avx512vl_storev4sf_mask:
	case CODE_FOR_avx512vl_storev4si_mask:
	case CODE_FOR_avx512vl_storev2df_mask:
	case CODE_FOR_avx512vl_storev2di_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case VOID_FTYPE_PV8SF_V8SI_V8SF:
    case VOID_FTYPE_PV4DF_V4DI_V4DF:
    case VOID_FTYPE_PV4SF_V4SI_V4SF:
    case VOID_FTYPE_PV2DF_V2DI_V2DF:
    case VOID_FTYPE_PV8SI_V8SI_V8SI:
    case VOID_FTYPE_PV4DI_V4DI_V4DI:
    case VOID_FTYPE_PV4SI_V4SI_V4SI:
    case VOID_FTYPE_PV2DI_V2DI_V2DI:
    case VOID_FTYPE_PV8SI_V8DI_UQI:
    case VOID_FTYPE_PV8HI_V8DI_UQI:
    case VOID_FTYPE_PV16HI_V16SI_UHI:
    case VOID_FTYPE_PUDI_V8DI_UQI:
    case VOID_FTYPE_PV16QI_V16SI_UHI:
    case VOID_FTYPE_PV4SI_V4DI_UQI:
    case VOID_FTYPE_PUDI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V4DI_UQI:
    case VOID_FTYPE_PUSI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V8SI_UQI:
    case VOID_FTYPE_PUDI_V4SI_UQI:
    case VOID_FTYPE_PUSI_V4DI_UQI:
    case VOID_FTYPE_PUHI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V8SI_UQI:
    case VOID_FTYPE_PUSI_V4SI_UQI:
    case VOID_FTYPE_PCHAR_V64QI_UDI:
    case VOID_FTYPE_PCHAR_V32QI_USI:
    case VOID_FTYPE_PCHAR_V16QI_UHI:
    case VOID_FTYPE_PSHORT_V32HI_USI:
    case VOID_FTYPE_PSHORT_V16HI_UHI:
    case VOID_FTYPE_PSHORT_V8HI_UQI:
    case VOID_FTYPE_PINT_V16SI_UHI:
    case VOID_FTYPE_PINT_V8SI_UQI:
    case VOID_FTYPE_PINT_V4SI_UQI:
    case VOID_FTYPE_PINT64_V8DI_UQI:
    case VOID_FTYPE_PINT64_V4DI_UQI:
    case VOID_FTYPE_PINT64_V2DI_UQI:
    case VOID_FTYPE_PDOUBLE_V8DF_UQI:
    case VOID_FTYPE_PDOUBLE_V4DF_UQI:
    case VOID_FTYPE_PDOUBLE_V2DF_UQI:
    case VOID_FTYPE_PFLOAT_V16SF_UHI:
    case VOID_FTYPE_PFLOAT_V8SF_UQI:
    case VOID_FTYPE_PFLOAT_V4SF_UQI:
    case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
    case VOID_FTYPE_PV32QI_V32HI_USI:
    case VOID_FTYPE_PV16QI_V16HI_UHI:
    case VOID_FTYPE_PUDI_V8HI_UQI:
      nargs = 2;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      break;
    case V4SF_FTYPE_PCV4SF_V4SF_UQI:
    case V8SF_FTYPE_PCV8SF_V8SF_UQI:
    case V16SF_FTYPE_PCV16SF_V16SF_UHI:
    case V4SI_FTYPE_PCV4SI_V4SI_UQI:
    case V8SI_FTYPE_PCV8SI_V8SI_UQI:
    case V16SI_FTYPE_PCV16SI_V16SI_UHI:
    case V2DF_FTYPE_PCV2DF_V2DF_UQI:
    case V4DF_FTYPE_PCV4DF_V4DF_UQI:
    case V8DF_FTYPE_PCV8DF_V8DF_UQI:
    case V2DI_FTYPE_PCV2DI_V2DI_UQI:
    case V4DI_FTYPE_PCV4DI_V4DI_UQI:
    case V8DI_FTYPE_PCV8DI_V8DI_UQI:
    case V64QI_FTYPE_PCV64QI_V64QI_UDI:
    case V32HI_FTYPE_PCV32HI_V32HI_USI:
    case V32QI_FTYPE_PCV32QI_V32QI_USI:
    case V16QI_FTYPE_PCV16QI_V16QI_UHI:
    case V16HI_FTYPE_PCV16HI_V16HI_UHI:
    case V8HI_FTYPE_PCV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_loadv16sf_mask:
	case CODE_FOR_avx512f_loadv16si_mask:
	case CODE_FOR_avx512f_loadv8df_mask:
	case CODE_FOR_avx512f_loadv8di_mask:
	case CODE_FOR_avx512vl_loadv8sf_mask:
	case CODE_FOR_avx512vl_loadv8si_mask:
	case CODE_FOR_avx512vl_loadv4df_mask:
	case CODE_FOR_avx512vl_loadv4di_mask:
	case CODE_FOR_avx512vl_loadv4sf_mask:
	case CODE_FOR_avx512vl_loadv4si_mask:
	case CODE_FOR_avx512vl_loadv2df_mask:
	case CODE_FOR_avx512vl_loadv2di_mask:
	case CODE_FOR_avx512bw_loadv64qi_mask:
	case CODE_FOR_avx512vl_loadv32qi_mask:
	case CODE_FOR_avx512vl_loadv16qi_mask:
	case CODE_FOR_avx512bw_loadv32hi_mask:
	case CODE_FOR_avx512vl_loadv16hi_mask:
	case CODE_FOR_avx512vl_loadv8hi_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case V64QI_FTYPE_PCCHAR_V64QI_UDI:
    case V32QI_FTYPE_PCCHAR_V32QI_USI:
    case V16QI_FTYPE_PCCHAR_V16QI_UHI:
    case V32HI_FTYPE_PCSHORT_V32HI_USI:
    case V16HI_FTYPE_PCSHORT_V16HI_UHI:
    case V8HI_FTYPE_PCSHORT_V8HI_UQI:
    case V16SI_FTYPE_PCINT_V16SI_UHI:
    case V8SI_FTYPE_PCINT_V8SI_UQI:
    case V4SI_FTYPE_PCINT_V4SI_UQI:
    case V8DI_FTYPE_PCINT64_V8DI_UQI:
    case V4DI_FTYPE_PCINT64_V4DI_UQI:
    case V2DI_FTYPE_PCINT64_V2DI_UQI:
    case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
    case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
    case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
    case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
    case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
    case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
    case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
      nargs = 3;
      klass = load;
      memory = 0;
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (klass == store)
    {
      /* For stores the first call argument is the destination; it
	 becomes the insn's output operand (TARGET), and the remaining
	 arguments are shifted down by one (ARG_ADJUST).  */
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
	{
	  op = ix86_zero_extend_to_Pmode (op);
	  target = gen_rtx_MEM (tmode, op);
	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
	    align = GET_MODE_ALIGNMENT (tmode);
	  if (MEM_ALIGN (target) < align)
	    set_mem_align (target, align);
	}
      else
	target = force_reg (tmode, op);
      arg_adjust = 1;
    }
  else
    {
      arg_adjust = 0;
      /* Allocate a fresh pseudo when TARGET is absent or unsuitable.  */
      if (optimize
	  || target == 0
	  || !register_operand (target, tmode)
	  || GET_MODE (target) != tmode)
	target = gen_reg_rtx (tmode);
    }

  /* Expand and legitimize the remaining arguments into xops[].  */
  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = expand_normal (arg);

      if (i == memory)
	{
	  /* This must be the memory operand.  */
	  op = ix86_zero_extend_to_Pmode (op);
	  op = gen_rtx_MEM (mode, op);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
	    align = GET_MODE_ALIGNMENT (mode);
	  if (MEM_ALIGN (op) < align)
	    set_mem_align (op, align);
	}
      else
	{
	  /* This must be register.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  /* NB: 3-operands load implied it's a mask load or v{p}expand*,
	     and that mask operand should be at the end.
	     Keep all-ones mask which would be simplified by the expander.  */
	  if (nargs == 3 && i == 2 && klass == load
	      && constm1_operand (op, mode)
	      && insn_p->operand[i].predicate (op, mode))
	    ;
	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    op = copy_to_mode_reg (mode, op);
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i]= op;
    }

  /* Emit the insn with the right arity.  */
  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  /* Stores have no value to return.  */
  return klass == store ? 0 : target;
}
12149
12150 /* Return the integer constant in ARG. Constrain it to be in the range
12151 of the subparts of VEC_TYPE; issue an error if not. */
12152
12153 static int
12154 get_element_number (tree vec_type, tree arg)
12155 {
12156 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
12157
12158 if (!tree_fits_uhwi_p (arg)
12159 || (elt = tree_to_uhwi (arg), elt > max))
12160 {
12161 error ("selector must be an integer constant in the range "
12162 "[0, %wi]", max);
12163 return 0;
12164 }
12165
12166 return elt;
12167 }
12168
12169 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12170 ix86_expand_vector_init. We DO have language-level syntax for this, in
12171 the form of (type){ init-list }. Except that since we can't place emms
12172 instructions from inside the compiler, we can't allow the use of MMX
12173 registers unless the user explicitly asks for it. So we do *not* define
12174 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
12175 we have builtins invoked by mmintrin.h that gives us license to emit
12176 these sorts of instructions. */
12177
12178 static rtx
12179 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
12180 {
12181 machine_mode tmode = TYPE_MODE (type);
12182 machine_mode inner_mode = GET_MODE_INNER (tmode);
12183 int i, n_elt = GET_MODE_NUNITS (tmode);
12184 rtvec v = rtvec_alloc (n_elt);
12185
12186 gcc_assert (VECTOR_MODE_P (tmode));
12187 gcc_assert (call_expr_nargs (exp) == n_elt);
12188
12189 for (i = 0; i < n_elt; ++i)
12190 {
12191 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
12192 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
12193 }
12194
12195 if (!target || !register_operand (target, tmode))
12196 target = gen_reg_rtx (tmode);
12197
12198 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
12199 return target;
12200 }
12201
12202 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12203 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
12204 had a language-level syntax for referencing vector elements. */
12205
12206 static rtx
12207 ix86_expand_vec_ext_builtin (tree exp, rtx target)
12208 {
12209 machine_mode tmode, mode0;
12210 tree arg0, arg1;
12211 int elt;
12212 rtx op0;
12213
12214 arg0 = CALL_EXPR_ARG (exp, 0);
12215 arg1 = CALL_EXPR_ARG (exp, 1);
12216
12217 op0 = expand_normal (arg0);
12218 elt = get_element_number (TREE_TYPE (arg0), arg1);
12219
12220 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12221 mode0 = TYPE_MODE (TREE_TYPE (arg0));
12222 gcc_assert (VECTOR_MODE_P (mode0));
12223
12224 op0 = force_reg (mode0, op0);
12225
12226 if (optimize || !target || !register_operand (target, tmode))
12227 target = gen_reg_rtx (tmode);
12228
12229 ix86_expand_vector_extract (true, target, op0, elt);
12230
12231 return target;
12232 }
12233
12234 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
12235 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
12236 a language-level syntax for referencing vector elements. */
12237
12238 static rtx
12239 ix86_expand_vec_set_builtin (tree exp)
12240 {
12241 machine_mode tmode, mode1;
12242 tree arg0, arg1, arg2;
12243 int elt;
12244 rtx op0, op1, target;
12245
12246 arg0 = CALL_EXPR_ARG (exp, 0);
12247 arg1 = CALL_EXPR_ARG (exp, 1);
12248 arg2 = CALL_EXPR_ARG (exp, 2);
12249
12250 tmode = TYPE_MODE (TREE_TYPE (arg0));
12251 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
12252 gcc_assert (VECTOR_MODE_P (tmode));
12253
12254 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
12255 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
12256 elt = get_element_number (TREE_TYPE (arg0), arg2);
12257
12258 if (GET_MODE (op1) != mode1)
12259 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
12260
12261 op0 = force_reg (tmode, op0);
12262 op1 = force_reg (mode1, op1);
12263
12264 /* OP0 is the source of these builtin functions and shouldn't be
12265 modified. Create a copy, use it and return it as target. */
12266 target = gen_reg_rtx (tmode);
12267 emit_move_insn (target, op0);
12268 ix86_expand_vector_set (true, target, op1, elt);
12269
12270 return target;
12271 }
12272
12273 /* Return true if the necessary isa options for this builtin exist,
12274 else false.
12275 fcode = DECL_MD_FUNCTION_CODE (fndecl); */
12276 bool
12277 ix86_check_builtin_isa_match (unsigned int fcode,
12278 HOST_WIDE_INT* pbisa,
12279 HOST_WIDE_INT* pbisa2)
12280 {
12281 HOST_WIDE_INT isa = ix86_isa_flags;
12282 HOST_WIDE_INT isa2 = ix86_isa_flags2;
12283 HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
12284 HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
12285 /* The general case is we require all the ISAs specified in bisa{,2}
12286 to be enabled.
12287 The exceptions are:
12288 OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
12289 OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
12290 OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
12291 (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
12292 OPTION_MASK_ISA2_AVXVNNI
12293 where for each such pair it is sufficient if either of the ISAs is
12294 enabled, plus if it is ored with other options also those others.
12295 OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
12296 if (((bisa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12297 == (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A))
12298 && (isa & (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A)) != 0)
12299 isa |= (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A);
12300
12301 if (((bisa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12302 == (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32))
12303 && (isa & (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32)) != 0)
12304 isa |= (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32);
12305
12306 if (((bisa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12307 == (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4))
12308 && (isa & (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4)) != 0)
12309 isa |= (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4);
12310
12311 if ((((bisa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12312 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12313 || (bisa2 & OPTION_MASK_ISA2_AVXVNNI) != 0)
12314 && (((isa & (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12315 == (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL))
12316 || (isa2 & OPTION_MASK_ISA2_AVXVNNI) != 0))
12317 {
12318 isa |= OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL;
12319 isa2 |= OPTION_MASK_ISA2_AVXVNNI;
12320 }
12321
12322 if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
12323 /* __builtin_ia32_maskmovq requires MMX registers. */
12324 && fcode != IX86_BUILTIN_MASKMOVQ)
12325 {
12326 bisa &= ~OPTION_MASK_ISA_MMX;
12327 bisa |= OPTION_MASK_ISA_SSE2;
12328 }
12329
12330 if (pbisa)
12331 *pbisa = bisa;
12332 if (pbisa2)
12333 *pbisa2 = bisa2;
12334
12335 return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
12336 }
12337
12338 /* Expand an expression EXP that calls a built-in function,
12339 with result going to TARGET if that's convenient
12340 (and in mode MODE if that's convenient).
12341 SUBTARGET may be used as the target for computing one of EXP's operands.
12342 IGNORE is nonzero if the value is to be ignored. */
12343
12344 rtx
12345 ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
12346 machine_mode mode, int ignore)
12347 {
12348 size_t i;
12349 enum insn_code icode, icode2;
12350 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12351 tree arg0, arg1, arg2, arg3, arg4;
12352 rtx op0, op1, op2, op3, op4, pat, pat2, insn;
12353 machine_mode mode0, mode1, mode2, mode3, mode4;
12354 unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
12355 HOST_WIDE_INT bisa, bisa2;
12356
12357 /* For CPU builtins that can be folded, fold first and expand the fold. */
12358 switch (fcode)
12359 {
12360 case IX86_BUILTIN_CPU_INIT:
12361 {
12362 /* Make it call __cpu_indicator_init in libgcc. */
12363 tree call_expr, fndecl, type;
12364 type = build_function_type_list (integer_type_node, NULL_TREE);
12365 fndecl = build_fn_decl ("__cpu_indicator_init", type);
12366 call_expr = build_call_expr (fndecl, 0);
12367 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
12368 }
12369 case IX86_BUILTIN_CPU_IS:
12370 case IX86_BUILTIN_CPU_SUPPORTS:
12371 {
12372 tree arg0 = CALL_EXPR_ARG (exp, 0);
12373 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
12374 gcc_assert (fold_expr != NULL_TREE);
12375 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
12376 }
12377 }
12378
12379 if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
12380 {
12381 bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
12382 if (TARGET_ABI_X32)
12383 bisa |= OPTION_MASK_ABI_X32;
12384 else
12385 bisa |= OPTION_MASK_ABI_64;
12386 char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
12387 (enum fpmath_unit) 0,
12388 (enum prefer_vector_width) 0,
12389 PVW_NONE, PVW_NONE,
12390 false, add_abi_p);
12391 if (!opts)
12392 error ("%qE needs unknown isa option", fndecl);
12393 else
12394 {
12395 gcc_assert (opts != NULL);
12396 error ("%qE needs isa option %s", fndecl, opts);
12397 free (opts);
12398 }
12399 return expand_call (exp, target, ignore);
12400 }
12401
12402 switch (fcode)
12403 {
12404 case IX86_BUILTIN_MASKMOVQ:
12405 case IX86_BUILTIN_MASKMOVDQU:
12406 icode = (fcode == IX86_BUILTIN_MASKMOVQ
12407 ? CODE_FOR_mmx_maskmovq
12408 : CODE_FOR_sse2_maskmovdqu);
12409 /* Note the arg order is different from the operand order. */
12410 arg1 = CALL_EXPR_ARG (exp, 0);
12411 arg2 = CALL_EXPR_ARG (exp, 1);
12412 arg0 = CALL_EXPR_ARG (exp, 2);
12413 op0 = expand_normal (arg0);
12414 op1 = expand_normal (arg1);
12415 op2 = expand_normal (arg2);
12416 mode0 = insn_data[icode].operand[0].mode;
12417 mode1 = insn_data[icode].operand[1].mode;
12418 mode2 = insn_data[icode].operand[2].mode;
12419
12420 op0 = ix86_zero_extend_to_Pmode (op0);
12421 op0 = gen_rtx_MEM (mode1, op0);
12422
12423 if (!insn_data[icode].operand[0].predicate (op0, mode0))
12424 op0 = copy_to_mode_reg (mode0, op0);
12425 if (!insn_data[icode].operand[1].predicate (op1, mode1))
12426 op1 = copy_to_mode_reg (mode1, op1);
12427 if (!insn_data[icode].operand[2].predicate (op2, mode2))
12428 op2 = copy_to_mode_reg (mode2, op2);
12429 pat = GEN_FCN (icode) (op0, op1, op2);
12430 if (! pat)
12431 return 0;
12432 emit_insn (pat);
12433 return 0;
12434
12435 case IX86_BUILTIN_LDMXCSR:
12436 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
12437 target = assign_386_stack_local (SImode, SLOT_TEMP);
12438 emit_move_insn (target, op0);
12439 emit_insn (gen_sse_ldmxcsr (target));
12440 return 0;
12441
12442 case IX86_BUILTIN_STMXCSR:
12443 target = assign_386_stack_local (SImode, SLOT_TEMP);
12444 emit_insn (gen_sse_stmxcsr (target));
12445 return copy_to_mode_reg (SImode, target);
12446
12447 case IX86_BUILTIN_CLFLUSH:
12448 arg0 = CALL_EXPR_ARG (exp, 0);
12449 op0 = expand_normal (arg0);
12450 icode = CODE_FOR_sse2_clflush;
12451 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12452 op0 = ix86_zero_extend_to_Pmode (op0);
12453
12454 emit_insn (gen_sse2_clflush (op0));
12455 return 0;
12456
12457 case IX86_BUILTIN_CLWB:
12458 arg0 = CALL_EXPR_ARG (exp, 0);
12459 op0 = expand_normal (arg0);
12460 icode = CODE_FOR_clwb;
12461 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12462 op0 = ix86_zero_extend_to_Pmode (op0);
12463
12464 emit_insn (gen_clwb (op0));
12465 return 0;
12466
12467 case IX86_BUILTIN_CLFLUSHOPT:
12468 arg0 = CALL_EXPR_ARG (exp, 0);
12469 op0 = expand_normal (arg0);
12470 icode = CODE_FOR_clflushopt;
12471 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12472 op0 = ix86_zero_extend_to_Pmode (op0);
12473
12474 emit_insn (gen_clflushopt (op0));
12475 return 0;
12476
12477 case IX86_BUILTIN_MONITOR:
12478 case IX86_BUILTIN_MONITORX:
12479 arg0 = CALL_EXPR_ARG (exp, 0);
12480 arg1 = CALL_EXPR_ARG (exp, 1);
12481 arg2 = CALL_EXPR_ARG (exp, 2);
12482 op0 = expand_normal (arg0);
12483 op1 = expand_normal (arg1);
12484 op2 = expand_normal (arg2);
12485 if (!REG_P (op0))
12486 op0 = ix86_zero_extend_to_Pmode (op0);
12487 if (!REG_P (op1))
12488 op1 = copy_to_mode_reg (SImode, op1);
12489 if (!REG_P (op2))
12490 op2 = copy_to_mode_reg (SImode, op2);
12491
12492 emit_insn (fcode == IX86_BUILTIN_MONITOR
12493 ? gen_sse3_monitor (Pmode, op0, op1, op2)
12494 : gen_monitorx (Pmode, op0, op1, op2));
12495 return 0;
12496
12497 case IX86_BUILTIN_MWAIT:
12498 arg0 = CALL_EXPR_ARG (exp, 0);
12499 arg1 = CALL_EXPR_ARG (exp, 1);
12500 op0 = expand_normal (arg0);
12501 op1 = expand_normal (arg1);
12502 if (!REG_P (op0))
12503 op0 = copy_to_mode_reg (SImode, op0);
12504 if (!REG_P (op1))
12505 op1 = copy_to_mode_reg (SImode, op1);
12506 emit_insn (gen_sse3_mwait (op0, op1));
12507 return 0;
12508
12509 case IX86_BUILTIN_MWAITX:
12510 arg0 = CALL_EXPR_ARG (exp, 0);
12511 arg1 = CALL_EXPR_ARG (exp, 1);
12512 arg2 = CALL_EXPR_ARG (exp, 2);
12513 op0 = expand_normal (arg0);
12514 op1 = expand_normal (arg1);
12515 op2 = expand_normal (arg2);
12516 if (!REG_P (op0))
12517 op0 = copy_to_mode_reg (SImode, op0);
12518 if (!REG_P (op1))
12519 op1 = copy_to_mode_reg (SImode, op1);
12520 if (!REG_P (op2))
12521 op2 = copy_to_mode_reg (SImode, op2);
12522 emit_insn (gen_mwaitx (op0, op1, op2));
12523 return 0;
12524
12525 case IX86_BUILTIN_UMONITOR:
12526 arg0 = CALL_EXPR_ARG (exp, 0);
12527 op0 = expand_normal (arg0);
12528
12529 op0 = ix86_zero_extend_to_Pmode (op0);
12530 emit_insn (gen_umonitor (Pmode, op0));
12531 return 0;
12532
12533 case IX86_BUILTIN_UMWAIT:
12534 case IX86_BUILTIN_TPAUSE:
12535 arg0 = CALL_EXPR_ARG (exp, 0);
12536 arg1 = CALL_EXPR_ARG (exp, 1);
12537 op0 = expand_normal (arg0);
12538 op1 = expand_normal (arg1);
12539
12540 if (!REG_P (op0))
12541 op0 = copy_to_mode_reg (SImode, op0);
12542
12543 op1 = force_reg (DImode, op1);
12544
12545 if (TARGET_64BIT)
12546 {
12547 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
12548 NULL, 1, OPTAB_DIRECT);
12549 switch (fcode)
12550 {
12551 case IX86_BUILTIN_UMWAIT:
12552 icode = CODE_FOR_umwait_rex64;
12553 break;
12554 case IX86_BUILTIN_TPAUSE:
12555 icode = CODE_FOR_tpause_rex64;
12556 break;
12557 default:
12558 gcc_unreachable ();
12559 }
12560
12561 op2 = gen_lowpart (SImode, op2);
12562 op1 = gen_lowpart (SImode, op1);
12563 pat = GEN_FCN (icode) (op0, op1, op2);
12564 }
12565 else
12566 {
12567 switch (fcode)
12568 {
12569 case IX86_BUILTIN_UMWAIT:
12570 icode = CODE_FOR_umwait;
12571 break;
12572 case IX86_BUILTIN_TPAUSE:
12573 icode = CODE_FOR_tpause;
12574 break;
12575 default:
12576 gcc_unreachable ();
12577 }
12578 pat = GEN_FCN (icode) (op0, op1);
12579 }
12580
12581 if (!pat)
12582 return 0;
12583
12584 emit_insn (pat);
12585
12586 if (target == 0
12587 || !register_operand (target, QImode))
12588 target = gen_reg_rtx (QImode);
12589
12590 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12591 const0_rtx);
12592 emit_insn (gen_rtx_SET (target, pat));
12593
12594 return target;
12595
12596 case IX86_BUILTIN_TESTUI:
12597 emit_insn (gen_testui ());
12598
12599 if (target == 0
12600 || !register_operand (target, QImode))
12601 target = gen_reg_rtx (QImode);
12602
12603 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
12604 const0_rtx);
12605 emit_insn (gen_rtx_SET (target, pat));
12606
12607 return target;
12608
12609 case IX86_BUILTIN_CLZERO:
12610 arg0 = CALL_EXPR_ARG (exp, 0);
12611 op0 = expand_normal (arg0);
12612 if (!REG_P (op0))
12613 op0 = ix86_zero_extend_to_Pmode (op0);
12614 emit_insn (gen_clzero (Pmode, op0));
12615 return 0;
12616
12617 case IX86_BUILTIN_CLDEMOTE:
12618 arg0 = CALL_EXPR_ARG (exp, 0);
12619 op0 = expand_normal (arg0);
12620 icode = CODE_FOR_cldemote;
12621 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
12622 op0 = ix86_zero_extend_to_Pmode (op0);
12623
12624 emit_insn (gen_cldemote (op0));
12625 return 0;
12626
12627 case IX86_BUILTIN_LOADIWKEY:
12628 {
12629 arg0 = CALL_EXPR_ARG (exp, 0);
12630 arg1 = CALL_EXPR_ARG (exp, 1);
12631 arg2 = CALL_EXPR_ARG (exp, 2);
12632 arg3 = CALL_EXPR_ARG (exp, 3);
12633
12634 op0 = expand_normal (arg0);
12635 op1 = expand_normal (arg1);
12636 op2 = expand_normal (arg2);
12637 op3 = expand_normal (arg3);
12638
12639 if (!REG_P (op0))
12640 op0 = copy_to_mode_reg (V2DImode, op0);
12641 if (!REG_P (op1))
12642 op1 = copy_to_mode_reg (V2DImode, op1);
12643 if (!REG_P (op2))
12644 op2 = copy_to_mode_reg (V2DImode, op2);
12645 if (!REG_P (op3))
12646 op3 = copy_to_mode_reg (SImode, op3);
12647
12648 emit_insn (gen_loadiwkey (op0, op1, op2, op3));
12649
12650 return 0;
12651 }
12652
12653 case IX86_BUILTIN_AESDEC128KLU8:
12654 icode = CODE_FOR_aesdec128klu8;
12655 goto aesdecenc_expand;
12656
12657 case IX86_BUILTIN_AESDEC256KLU8:
12658 icode = CODE_FOR_aesdec256klu8;
12659 goto aesdecenc_expand;
12660
12661 case IX86_BUILTIN_AESENC128KLU8:
12662 icode = CODE_FOR_aesenc128klu8;
12663 goto aesdecenc_expand;
12664
12665 case IX86_BUILTIN_AESENC256KLU8:
12666 icode = CODE_FOR_aesenc256klu8;
12667
12668 aesdecenc_expand:
12669
12670 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
12671 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
12672 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12673
12674 op0 = expand_normal (arg0);
12675 op1 = expand_normal (arg1);
12676 op2 = expand_normal (arg2);
12677
12678 if (!address_operand (op0, V2DImode))
12679 {
12680 op0 = convert_memory_address (Pmode, op0);
12681 op0 = copy_addr_to_reg (op0);
12682 }
12683 op0 = gen_rtx_MEM (V2DImode, op0);
12684
12685 if (!REG_P (op1))
12686 op1 = copy_to_mode_reg (V2DImode, op1);
12687
12688 if (!address_operand (op2, VOIDmode))
12689 {
12690 op2 = convert_memory_address (Pmode, op2);
12691 op2 = copy_addr_to_reg (op2);
12692 }
12693 op2 = gen_rtx_MEM (BLKmode, op2);
12694
12695 emit_insn (GEN_FCN (icode) (op1, op1, op2));
12696
12697 if (target == 0)
12698 target = gen_reg_rtx (QImode);
12699
12700 /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
12701 error occurs. Then the output should be cleared for safety. */
12702 rtx_code_label *ok_label;
12703 rtx tmp;
12704
12705 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12706 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12707 ok_label = gen_label_rtx ();
12708 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12709 true, ok_label);
12710 /* Usually the runtime error seldom occur, so predict OK path as
12711 hotspot to optimize it as fallthrough block. */
12712 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12713
12714 emit_insn (gen_rtx_SET (op1, const0_rtx));
12715
12716 emit_label (ok_label);
12717 emit_insn (gen_rtx_SET (target, pat));
12718 emit_insn (gen_rtx_SET (op0, op1));
12719
12720 return target;
12721
12722 case IX86_BUILTIN_AESDECWIDE128KLU8:
12723 icode = CODE_FOR_aesdecwide128klu8;
12724 goto wideaesdecenc_expand;
12725
12726 case IX86_BUILTIN_AESDECWIDE256KLU8:
12727 icode = CODE_FOR_aesdecwide256klu8;
12728 goto wideaesdecenc_expand;
12729
12730 case IX86_BUILTIN_AESENCWIDE128KLU8:
12731 icode = CODE_FOR_aesencwide128klu8;
12732 goto wideaesdecenc_expand;
12733
12734 case IX86_BUILTIN_AESENCWIDE256KLU8:
12735 icode = CODE_FOR_aesencwide256klu8;
12736
12737 wideaesdecenc_expand:
12738
12739 rtx xmm_regs[8];
12740 rtx op;
12741
12742 arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
12743 arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
12744 arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
12745
12746 op0 = expand_normal (arg0);
12747 op1 = expand_normal (arg1);
12748 op2 = expand_normal (arg2);
12749
12750 if (GET_MODE (op1) != Pmode)
12751 op1 = convert_to_mode (Pmode, op1, 1);
12752
12753 if (!address_operand (op2, VOIDmode))
12754 {
12755 op2 = convert_memory_address (Pmode, op2);
12756 op2 = copy_addr_to_reg (op2);
12757 }
12758 op2 = gen_rtx_MEM (BLKmode, op2);
12759
12760 for (i = 0; i < 8; i++)
12761 {
12762 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12763
12764 op = gen_rtx_MEM (V2DImode,
12765 plus_constant (Pmode, op1, (i * 16)));
12766
12767 emit_move_insn (xmm_regs[i], op);
12768 }
12769
12770 emit_insn (GEN_FCN (icode) (op2));
12771
12772 if (target == 0)
12773 target = gen_reg_rtx (QImode);
12774
12775 tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
12776 pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
12777 ok_label = gen_label_rtx ();
12778 emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
12779 true, ok_label);
12780 predict_jump (REG_BR_PROB_BASE * 90 / 100);
12781
12782 for (i = 0; i < 8; i++)
12783 emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
12784
12785 emit_label (ok_label);
12786 emit_insn (gen_rtx_SET (target, pat));
12787
12788 if (GET_MODE (op0) != Pmode)
12789 op0 = convert_to_mode (Pmode, op0, 1);
12790
12791 for (i = 0; i < 8; i++)
12792 {
12793 op = gen_rtx_MEM (V2DImode,
12794 plus_constant (Pmode, op0, (i * 16)));
12795 emit_move_insn (op, xmm_regs[i]);
12796 }
12797
12798 return target;
12799
12800 case IX86_BUILTIN_ENCODEKEY128U32:
12801 {
12802 rtx op, xmm_regs[7];
12803
12804 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12805 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
12806 arg2 = CALL_EXPR_ARG (exp, 2); // void *h
12807
12808 op0 = expand_normal (arg0);
12809 op1 = expand_normal (arg1);
12810 op2 = expand_normal (arg2);
12811
12812 if (!REG_P (op0))
12813 op0 = copy_to_mode_reg (SImode, op0);
12814
12815 if (GET_MODE (op2) != Pmode)
12816 op2 = convert_to_mode (Pmode, op2, 1);
12817
12818 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12819 emit_move_insn (op, op1);
12820
12821 for (i = 0; i < 3; i++)
12822 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12823
12824 if (target == 0)
12825 target = gen_reg_rtx (SImode);
12826
12827 emit_insn (gen_encodekey128u32 (target, op0));
12828
12829 for (i = 0; i < 3; i++)
12830 {
12831 op = gen_rtx_MEM (V2DImode,
12832 plus_constant (Pmode, op2, (i * 16)));
12833 emit_move_insn (op, xmm_regs[i]);
12834 }
12835
12836 return target;
12837 }
12838 case IX86_BUILTIN_ENCODEKEY256U32:
12839 {
12840 rtx op, xmm_regs[7];
12841
12842 arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
12843 arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
12844 arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
12845 arg3 = CALL_EXPR_ARG (exp, 3); // void *h
12846
12847 op0 = expand_normal (arg0);
12848 op1 = expand_normal (arg1);
12849 op2 = expand_normal (arg2);
12850 op3 = expand_normal (arg3);
12851
12852 if (!REG_P (op0))
12853 op0 = copy_to_mode_reg (SImode, op0);
12854
12855 if (GET_MODE (op3) != Pmode)
12856 op3 = convert_to_mode (Pmode, op3, 1);
12857
12858 /* Force to use xmm0, xmm1 for keylow, keyhi*/
12859 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
12860 emit_move_insn (op, op1);
12861 op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
12862 emit_move_insn (op, op2);
12863
12864 for (i = 0; i < 4; i++)
12865 xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
12866
12867 if (target == 0)
12868 target = gen_reg_rtx (SImode);
12869
12870 emit_insn (gen_encodekey256u32 (target, op0));
12871
12872 for (i = 0; i < 4; i++)
12873 {
12874 op = gen_rtx_MEM (V2DImode,
12875 plus_constant (Pmode, op3, (i * 16)));
12876 emit_move_insn (op, xmm_regs[i]);
12877 }
12878
12879 return target;
12880 }
12881
12882 case IX86_BUILTIN_VEC_INIT_V2SI:
12883 case IX86_BUILTIN_VEC_INIT_V4HI:
12884 case IX86_BUILTIN_VEC_INIT_V8QI:
12885 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
12886
12887 case IX86_BUILTIN_VEC_EXT_V2DF:
12888 case IX86_BUILTIN_VEC_EXT_V2DI:
12889 case IX86_BUILTIN_VEC_EXT_V4SF:
12890 case IX86_BUILTIN_VEC_EXT_V4SI:
12891 case IX86_BUILTIN_VEC_EXT_V8HI:
12892 case IX86_BUILTIN_VEC_EXT_V2SI:
12893 case IX86_BUILTIN_VEC_EXT_V4HI:
12894 case IX86_BUILTIN_VEC_EXT_V16QI:
12895 return ix86_expand_vec_ext_builtin (exp, target);
12896
12897 case IX86_BUILTIN_VEC_SET_V2DI:
12898 case IX86_BUILTIN_VEC_SET_V4SF:
12899 case IX86_BUILTIN_VEC_SET_V4SI:
12900 case IX86_BUILTIN_VEC_SET_V8HI:
12901 case IX86_BUILTIN_VEC_SET_V4HI:
12902 case IX86_BUILTIN_VEC_SET_V16QI:
12903 return ix86_expand_vec_set_builtin (exp);
12904
12905 case IX86_BUILTIN_NANQ:
12906 case IX86_BUILTIN_NANSQ:
12907 return expand_call (exp, target, ignore);
12908
12909 case IX86_BUILTIN_RDPID:
12910
12911 op0 = gen_reg_rtx (word_mode);
12912
12913 if (TARGET_64BIT)
12914 {
12915 insn = gen_rdpid_rex64 (op0);
12916 op0 = convert_to_mode (SImode, op0, 1);
12917 }
12918 else
12919 insn = gen_rdpid (op0);
12920
12921 emit_insn (insn);
12922
12923 if (target == 0
12924 || !register_operand (target, SImode))
12925 target = gen_reg_rtx (SImode);
12926
12927 emit_move_insn (target, op0);
12928 return target;
12929
12930 case IX86_BUILTIN_2INTERSECTD512:
12931 case IX86_BUILTIN_2INTERSECTQ512:
12932 case IX86_BUILTIN_2INTERSECTD256:
12933 case IX86_BUILTIN_2INTERSECTQ256:
12934 case IX86_BUILTIN_2INTERSECTD128:
12935 case IX86_BUILTIN_2INTERSECTQ128:
12936 arg0 = CALL_EXPR_ARG (exp, 0);
12937 arg1 = CALL_EXPR_ARG (exp, 1);
12938 arg2 = CALL_EXPR_ARG (exp, 2);
12939 arg3 = CALL_EXPR_ARG (exp, 3);
12940 op0 = expand_normal (arg0);
12941 op1 = expand_normal (arg1);
12942 op2 = expand_normal (arg2);
12943 op3 = expand_normal (arg3);
12944
12945 if (!address_operand (op0, VOIDmode))
12946 {
12947 op0 = convert_memory_address (Pmode, op0);
12948 op0 = copy_addr_to_reg (op0);
12949 }
12950 if (!address_operand (op1, VOIDmode))
12951 {
12952 op1 = convert_memory_address (Pmode, op1);
12953 op1 = copy_addr_to_reg (op1);
12954 }
12955
12956 switch (fcode)
12957 {
12958 case IX86_BUILTIN_2INTERSECTD512:
12959 mode4 = P2HImode;
12960 icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
12961 break;
12962 case IX86_BUILTIN_2INTERSECTQ512:
12963 mode4 = P2QImode;
12964 icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
12965 break;
12966 case IX86_BUILTIN_2INTERSECTD256:
12967 mode4 = P2QImode;
12968 icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
12969 break;
12970 case IX86_BUILTIN_2INTERSECTQ256:
12971 mode4 = P2QImode;
12972 icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
12973 break;
12974 case IX86_BUILTIN_2INTERSECTD128:
12975 mode4 = P2QImode;
12976 icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
12977 break;
12978 case IX86_BUILTIN_2INTERSECTQ128:
12979 mode4 = P2QImode;
12980 icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
12981 break;
12982 default:
12983 gcc_unreachable ();
12984 }
12985
12986 mode2 = insn_data[icode].operand[1].mode;
12987 mode3 = insn_data[icode].operand[2].mode;
12988 if (!insn_data[icode].operand[1].predicate (op2, mode2))
12989 op2 = copy_to_mode_reg (mode2, op2);
12990 if (!insn_data[icode].operand[2].predicate (op3, mode3))
12991 op3 = copy_to_mode_reg (mode3, op3);
12992
12993 op4 = gen_reg_rtx (mode4);
12994 emit_insn (GEN_FCN (icode) (op4, op2, op3));
12995 mode0 = mode4 == P2HImode ? HImode : QImode;
12996 emit_move_insn (gen_rtx_MEM (mode0, op0),
12997 gen_lowpart (mode0, op4));
12998 emit_move_insn (gen_rtx_MEM (mode0, op1),
12999 gen_highpart (mode0, op4));
13000
13001 return 0;
13002
13003 case IX86_BUILTIN_RDPMC:
13004 case IX86_BUILTIN_RDTSC:
13005 case IX86_BUILTIN_RDTSCP:
13006 case IX86_BUILTIN_XGETBV:
13007
13008 op0 = gen_reg_rtx (DImode);
13009 op1 = gen_reg_rtx (DImode);
13010
13011 if (fcode == IX86_BUILTIN_RDPMC)
13012 {
13013 arg0 = CALL_EXPR_ARG (exp, 0);
13014 op2 = expand_normal (arg0);
13015 if (!register_operand (op2, SImode))
13016 op2 = copy_to_mode_reg (SImode, op2);
13017
13018 insn = (TARGET_64BIT
13019 ? gen_rdpmc_rex64 (op0, op1, op2)
13020 : gen_rdpmc (op0, op2));
13021 emit_insn (insn);
13022 }
13023 else if (fcode == IX86_BUILTIN_XGETBV)
13024 {
13025 arg0 = CALL_EXPR_ARG (exp, 0);
13026 op2 = expand_normal (arg0);
13027 if (!register_operand (op2, SImode))
13028 op2 = copy_to_mode_reg (SImode, op2);
13029
13030 insn = (TARGET_64BIT
13031 ? gen_xgetbv_rex64 (op0, op1, op2)
13032 : gen_xgetbv (op0, op2));
13033 emit_insn (insn);
13034 }
13035 else if (fcode == IX86_BUILTIN_RDTSC)
13036 {
13037 insn = (TARGET_64BIT
13038 ? gen_rdtsc_rex64 (op0, op1)
13039 : gen_rdtsc (op0));
13040 emit_insn (insn);
13041 }
13042 else
13043 {
13044 op2 = gen_reg_rtx (SImode);
13045
13046 insn = (TARGET_64BIT
13047 ? gen_rdtscp_rex64 (op0, op1, op2)
13048 : gen_rdtscp (op0, op2));
13049 emit_insn (insn);
13050
13051 arg0 = CALL_EXPR_ARG (exp, 0);
13052 op4 = expand_normal (arg0);
13053 if (!address_operand (op4, VOIDmode))
13054 {
13055 op4 = convert_memory_address (Pmode, op4);
13056 op4 = copy_addr_to_reg (op4);
13057 }
13058 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
13059 }
13060
13061 if (target == 0
13062 || !register_operand (target, DImode))
13063 target = gen_reg_rtx (DImode);
13064
13065 if (TARGET_64BIT)
13066 {
13067 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
13068 op1, 1, OPTAB_DIRECT);
13069 op0 = expand_simple_binop (DImode, IOR, op0, op1,
13070 op0, 1, OPTAB_DIRECT);
13071 }
13072
13073 emit_move_insn (target, op0);
13074 return target;
13075
13076 case IX86_BUILTIN_ENQCMD:
13077 case IX86_BUILTIN_ENQCMDS:
13078 case IX86_BUILTIN_MOVDIR64B:
13079
13080 arg0 = CALL_EXPR_ARG (exp, 0);
13081 arg1 = CALL_EXPR_ARG (exp, 1);
13082 op0 = expand_normal (arg0);
13083 op1 = expand_normal (arg1);
13084
13085 op0 = ix86_zero_extend_to_Pmode (op0);
13086 if (!address_operand (op1, VOIDmode))
13087 {
13088 op1 = convert_memory_address (Pmode, op1);
13089 op1 = copy_addr_to_reg (op1);
13090 }
13091 op1 = gen_rtx_MEM (XImode, op1);
13092
13093 if (fcode == IX86_BUILTIN_MOVDIR64B)
13094 {
13095 emit_insn (gen_movdir64b (Pmode, op0, op1));
13096 return 0;
13097 }
13098 else
13099 {
13100 if (target == 0
13101 || !register_operand (target, SImode))
13102 target = gen_reg_rtx (SImode);
13103
13104 emit_move_insn (target, const0_rtx);
13105 target = gen_rtx_SUBREG (QImode, target, 0);
13106
13107 int unspecv = (fcode == IX86_BUILTIN_ENQCMD
13108 ? UNSPECV_ENQCMD
13109 : UNSPECV_ENQCMDS);
13110 icode = code_for_enqcmd (unspecv, Pmode);
13111 emit_insn (GEN_FCN (icode) (op0, op1));
13112
13113 emit_insn
13114 (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
13115 gen_rtx_fmt_ee (EQ, QImode,
13116 gen_rtx_REG (CCZmode, FLAGS_REG),
13117 const0_rtx)));
13118 return SUBREG_REG (target);
13119 }
13120
13121 case IX86_BUILTIN_FXSAVE:
13122 case IX86_BUILTIN_FXRSTOR:
13123 case IX86_BUILTIN_FXSAVE64:
13124 case IX86_BUILTIN_FXRSTOR64:
13125 case IX86_BUILTIN_FNSTENV:
13126 case IX86_BUILTIN_FLDENV:
13127 mode0 = BLKmode;
13128 switch (fcode)
13129 {
13130 case IX86_BUILTIN_FXSAVE:
13131 icode = CODE_FOR_fxsave;
13132 break;
13133 case IX86_BUILTIN_FXRSTOR:
13134 icode = CODE_FOR_fxrstor;
13135 break;
13136 case IX86_BUILTIN_FXSAVE64:
13137 icode = CODE_FOR_fxsave64;
13138 break;
13139 case IX86_BUILTIN_FXRSTOR64:
13140 icode = CODE_FOR_fxrstor64;
13141 break;
13142 case IX86_BUILTIN_FNSTENV:
13143 icode = CODE_FOR_fnstenv;
13144 break;
13145 case IX86_BUILTIN_FLDENV:
13146 icode = CODE_FOR_fldenv;
13147 break;
13148 default:
13149 gcc_unreachable ();
13150 }
13151
13152 arg0 = CALL_EXPR_ARG (exp, 0);
13153 op0 = expand_normal (arg0);
13154
13155 if (!address_operand (op0, VOIDmode))
13156 {
13157 op0 = convert_memory_address (Pmode, op0);
13158 op0 = copy_addr_to_reg (op0);
13159 }
13160 op0 = gen_rtx_MEM (mode0, op0);
13161
13162 pat = GEN_FCN (icode) (op0);
13163 if (pat)
13164 emit_insn (pat);
13165 return 0;
13166
13167 case IX86_BUILTIN_XSETBV:
13168 arg0 = CALL_EXPR_ARG (exp, 0);
13169 arg1 = CALL_EXPR_ARG (exp, 1);
13170 op0 = expand_normal (arg0);
13171 op1 = expand_normal (arg1);
13172
13173 if (!REG_P (op0))
13174 op0 = copy_to_mode_reg (SImode, op0);
13175
13176 op1 = force_reg (DImode, op1);
13177
13178 if (TARGET_64BIT)
13179 {
13180 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13181 NULL, 1, OPTAB_DIRECT);
13182
13183 icode = CODE_FOR_xsetbv_rex64;
13184
13185 op2 = gen_lowpart (SImode, op2);
13186 op1 = gen_lowpart (SImode, op1);
13187 pat = GEN_FCN (icode) (op0, op1, op2);
13188 }
13189 else
13190 {
13191 icode = CODE_FOR_xsetbv;
13192
13193 pat = GEN_FCN (icode) (op0, op1);
13194 }
13195 if (pat)
13196 emit_insn (pat);
13197 return 0;
13198
13199 case IX86_BUILTIN_XSAVE:
13200 case IX86_BUILTIN_XRSTOR:
13201 case IX86_BUILTIN_XSAVE64:
13202 case IX86_BUILTIN_XRSTOR64:
13203 case IX86_BUILTIN_XSAVEOPT:
13204 case IX86_BUILTIN_XSAVEOPT64:
13205 case IX86_BUILTIN_XSAVES:
13206 case IX86_BUILTIN_XRSTORS:
13207 case IX86_BUILTIN_XSAVES64:
13208 case IX86_BUILTIN_XRSTORS64:
13209 case IX86_BUILTIN_XSAVEC:
13210 case IX86_BUILTIN_XSAVEC64:
13211 arg0 = CALL_EXPR_ARG (exp, 0);
13212 arg1 = CALL_EXPR_ARG (exp, 1);
13213 op0 = expand_normal (arg0);
13214 op1 = expand_normal (arg1);
13215
13216 if (!address_operand (op0, VOIDmode))
13217 {
13218 op0 = convert_memory_address (Pmode, op0);
13219 op0 = copy_addr_to_reg (op0);
13220 }
13221 op0 = gen_rtx_MEM (BLKmode, op0);
13222
13223 op1 = force_reg (DImode, op1);
13224
13225 if (TARGET_64BIT)
13226 {
13227 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
13228 NULL, 1, OPTAB_DIRECT);
13229 switch (fcode)
13230 {
13231 case IX86_BUILTIN_XSAVE:
13232 icode = CODE_FOR_xsave_rex64;
13233 break;
13234 case IX86_BUILTIN_XRSTOR:
13235 icode = CODE_FOR_xrstor_rex64;
13236 break;
13237 case IX86_BUILTIN_XSAVE64:
13238 icode = CODE_FOR_xsave64;
13239 break;
13240 case IX86_BUILTIN_XRSTOR64:
13241 icode = CODE_FOR_xrstor64;
13242 break;
13243 case IX86_BUILTIN_XSAVEOPT:
13244 icode = CODE_FOR_xsaveopt_rex64;
13245 break;
13246 case IX86_BUILTIN_XSAVEOPT64:
13247 icode = CODE_FOR_xsaveopt64;
13248 break;
13249 case IX86_BUILTIN_XSAVES:
13250 icode = CODE_FOR_xsaves_rex64;
13251 break;
13252 case IX86_BUILTIN_XRSTORS:
13253 icode = CODE_FOR_xrstors_rex64;
13254 break;
13255 case IX86_BUILTIN_XSAVES64:
13256 icode = CODE_FOR_xsaves64;
13257 break;
13258 case IX86_BUILTIN_XRSTORS64:
13259 icode = CODE_FOR_xrstors64;
13260 break;
13261 case IX86_BUILTIN_XSAVEC:
13262 icode = CODE_FOR_xsavec_rex64;
13263 break;
13264 case IX86_BUILTIN_XSAVEC64:
13265 icode = CODE_FOR_xsavec64;
13266 break;
13267 default:
13268 gcc_unreachable ();
13269 }
13270
13271 op2 = gen_lowpart (SImode, op2);
13272 op1 = gen_lowpart (SImode, op1);
13273 pat = GEN_FCN (icode) (op0, op1, op2);
13274 }
13275 else
13276 {
13277 switch (fcode)
13278 {
13279 case IX86_BUILTIN_XSAVE:
13280 icode = CODE_FOR_xsave;
13281 break;
13282 case IX86_BUILTIN_XRSTOR:
13283 icode = CODE_FOR_xrstor;
13284 break;
13285 case IX86_BUILTIN_XSAVEOPT:
13286 icode = CODE_FOR_xsaveopt;
13287 break;
13288 case IX86_BUILTIN_XSAVES:
13289 icode = CODE_FOR_xsaves;
13290 break;
13291 case IX86_BUILTIN_XRSTORS:
13292 icode = CODE_FOR_xrstors;
13293 break;
13294 case IX86_BUILTIN_XSAVEC:
13295 icode = CODE_FOR_xsavec;
13296 break;
13297 default:
13298 gcc_unreachable ();
13299 }
13300 pat = GEN_FCN (icode) (op0, op1);
13301 }
13302
13303 if (pat)
13304 emit_insn (pat);
13305 return 0;
13306
13307 case IX86_BUILTIN_LDTILECFG:
13308 case IX86_BUILTIN_STTILECFG:
13309 arg0 = CALL_EXPR_ARG (exp, 0);
13310 op0 = expand_normal (arg0);
13311
13312 if (!address_operand (op0, VOIDmode))
13313 {
13314 op0 = convert_memory_address (Pmode, op0);
13315 op0 = copy_addr_to_reg (op0);
13316 }
13317 op0 = gen_rtx_MEM (XImode, op0);
13318 if (fcode == IX86_BUILTIN_LDTILECFG)
13319 icode = CODE_FOR_ldtilecfg;
13320 else
13321 icode = CODE_FOR_sttilecfg;
13322 pat = GEN_FCN (icode) (op0);
13323 emit_insn (pat);
13324 return 0;
13325
13326 case IX86_BUILTIN_LLWPCB:
13327 arg0 = CALL_EXPR_ARG (exp, 0);
13328 op0 = expand_normal (arg0);
13329
13330 if (!register_operand (op0, Pmode))
13331 op0 = ix86_zero_extend_to_Pmode (op0);
13332 emit_insn (gen_lwp_llwpcb (Pmode, op0));
13333 return 0;
13334
13335 case IX86_BUILTIN_SLWPCB:
13336 if (!target
13337 || !register_operand (target, Pmode))
13338 target = gen_reg_rtx (Pmode);
13339 emit_insn (gen_lwp_slwpcb (Pmode, target));
13340 return target;
13341
13342 case IX86_BUILTIN_LWPVAL32:
13343 case IX86_BUILTIN_LWPVAL64:
13344 case IX86_BUILTIN_LWPINS32:
13345 case IX86_BUILTIN_LWPINS64:
13346 mode = ((fcode == IX86_BUILTIN_LWPVAL32
13347 || fcode == IX86_BUILTIN_LWPINS32)
13348 ? SImode : DImode);
13349
13350 if (fcode == IX86_BUILTIN_LWPVAL32
13351 || fcode == IX86_BUILTIN_LWPVAL64)
13352 icode = code_for_lwp_lwpval (mode);
13353 else
13354 icode = code_for_lwp_lwpins (mode);
13355
13356 arg0 = CALL_EXPR_ARG (exp, 0);
13357 arg1 = CALL_EXPR_ARG (exp, 1);
13358 arg2 = CALL_EXPR_ARG (exp, 2);
13359 op0 = expand_normal (arg0);
13360 op1 = expand_normal (arg1);
13361 op2 = expand_normal (arg2);
13362 mode0 = insn_data[icode].operand[0].mode;
13363
13364 if (!insn_data[icode].operand[0].predicate (op0, mode0))
13365 op0 = copy_to_mode_reg (mode0, op0);
13366 if (!insn_data[icode].operand[1].predicate (op1, SImode))
13367 op1 = copy_to_mode_reg (SImode, op1);
13368
13369 if (!CONST_INT_P (op2))
13370 {
13371 error ("the last argument must be a 32-bit immediate");
13372 return const0_rtx;
13373 }
13374
13375 emit_insn (GEN_FCN (icode) (op0, op1, op2));
13376
13377 if (fcode == IX86_BUILTIN_LWPINS32
13378 || fcode == IX86_BUILTIN_LWPINS64)
13379 {
13380 if (target == 0
13381 || !nonimmediate_operand (target, QImode))
13382 target = gen_reg_rtx (QImode);
13383
13384 pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13385 const0_rtx);
13386 emit_insn (gen_rtx_SET (target, pat));
13387
13388 return target;
13389 }
13390 else
13391 return 0;
13392
13393 case IX86_BUILTIN_BEXTRI32:
13394 case IX86_BUILTIN_BEXTRI64:
13395 mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
13396
13397 arg0 = CALL_EXPR_ARG (exp, 0);
13398 arg1 = CALL_EXPR_ARG (exp, 1);
13399 op0 = expand_normal (arg0);
13400 op1 = expand_normal (arg1);
13401
13402 if (!CONST_INT_P (op1))
13403 {
13404 error ("last argument must be an immediate");
13405 return const0_rtx;
13406 }
13407 else
13408 {
13409 unsigned char lsb_index = UINTVAL (op1);
13410 unsigned char length = UINTVAL (op1) >> 8;
13411
13412 unsigned char bitsize = GET_MODE_BITSIZE (mode);
13413
13414 icode = code_for_tbm_bextri (mode);
13415
13416 mode1 = insn_data[icode].operand[1].mode;
13417 if (!insn_data[icode].operand[1].predicate (op0, mode1))
13418 op0 = copy_to_mode_reg (mode1, op0);
13419
13420 mode0 = insn_data[icode].operand[0].mode;
13421 if (target == 0
13422 || !register_operand (target, mode0))
13423 target = gen_reg_rtx (mode0);
13424
13425 if (length == 0 || lsb_index >= bitsize)
13426 {
13427 emit_move_insn (target, const0_rtx);
13428 return target;
13429 }
13430
13431 if (length + lsb_index > bitsize)
13432 length = bitsize - lsb_index;
13433
13434 op1 = GEN_INT (length);
13435 op2 = GEN_INT (lsb_index);
13436
13437 emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
13438 return target;
13439 }
13440
13441 case IX86_BUILTIN_RDRAND16_STEP:
13442 mode = HImode;
13443 goto rdrand_step;
13444
13445 case IX86_BUILTIN_RDRAND32_STEP:
13446 mode = SImode;
13447 goto rdrand_step;
13448
13449 case IX86_BUILTIN_RDRAND64_STEP:
13450 mode = DImode;
13451
13452 rdrand_step:
13453 arg0 = CALL_EXPR_ARG (exp, 0);
13454 op1 = expand_normal (arg0);
13455 if (!address_operand (op1, VOIDmode))
13456 {
13457 op1 = convert_memory_address (Pmode, op1);
13458 op1 = copy_addr_to_reg (op1);
13459 }
13460
13461 op0 = gen_reg_rtx (mode);
13462 emit_insn (gen_rdrand (mode, op0));
13463
13464 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13465
13466 op1 = force_reg (SImode, const1_rtx);
13467
13468 /* Emit SImode conditional move. */
13469 if (mode == HImode)
13470 {
13471 if (TARGET_ZERO_EXTEND_WITH_AND
13472 && optimize_function_for_speed_p (cfun))
13473 {
13474 op2 = force_reg (SImode, const0_rtx);
13475
13476 emit_insn (gen_movstricthi
13477 (gen_lowpart (HImode, op2), op0));
13478 }
13479 else
13480 {
13481 op2 = gen_reg_rtx (SImode);
13482
13483 emit_insn (gen_zero_extendhisi2 (op2, op0));
13484 }
13485 }
13486 else if (mode == SImode)
13487 op2 = op0;
13488 else
13489 op2 = gen_rtx_SUBREG (SImode, op0, 0);
13490
13491 if (target == 0
13492 || !register_operand (target, SImode))
13493 target = gen_reg_rtx (SImode);
13494
13495 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
13496 const0_rtx);
13497 emit_insn (gen_rtx_SET (target,
13498 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
13499 return target;
13500
13501 case IX86_BUILTIN_RDSEED16_STEP:
13502 mode = HImode;
13503 goto rdseed_step;
13504
13505 case IX86_BUILTIN_RDSEED32_STEP:
13506 mode = SImode;
13507 goto rdseed_step;
13508
13509 case IX86_BUILTIN_RDSEED64_STEP:
13510 mode = DImode;
13511
13512 rdseed_step:
13513 arg0 = CALL_EXPR_ARG (exp, 0);
13514 op1 = expand_normal (arg0);
13515 if (!address_operand (op1, VOIDmode))
13516 {
13517 op1 = convert_memory_address (Pmode, op1);
13518 op1 = copy_addr_to_reg (op1);
13519 }
13520
13521 op0 = gen_reg_rtx (mode);
13522 emit_insn (gen_rdseed (mode, op0));
13523
13524 emit_move_insn (gen_rtx_MEM (mode, op1), op0);
13525
13526 op2 = gen_reg_rtx (QImode);
13527
13528 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
13529 const0_rtx);
13530 emit_insn (gen_rtx_SET (op2, pat));
13531
13532 if (target == 0
13533 || !register_operand (target, SImode))
13534 target = gen_reg_rtx (SImode);
13535
13536 emit_insn (gen_zero_extendqisi2 (target, op2));
13537 return target;
13538
    case IX86_BUILTIN_SBB32:
      icode = CODE_FOR_subborrowsi;
      icode2 = CODE_FOR_subborrowsi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_SBB64:
      icode = CODE_FOR_subborrowdi;
      icode2 = CODE_FOR_subborrowdi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX32:
      icode = CODE_FOR_addcarrysi;
      icode2 = CODE_FOR_addcarrysi_0;
      mode0 = SImode;
      mode1 = DImode;
      mode2 = CCCmode;
      goto handlecarry;

    case IX86_BUILTIN_ADDCARRYX64:
      icode = CODE_FOR_addcarrydi;
      icode2 = CODE_FOR_addcarrydi_0;
      mode0 = DImode;
      mode1 = TImode;
      mode2 = CCCmode;

    handlecarry:
      /* Common expansion for _addcarryx / _subborrow builtins.
	 MODE0 is the operand width, MODE1 the double-width mode used by
	 the carry-consuming pattern, MODE2 the CC mode the zero-carry-in
	 fast path (ICODE2) sets.  */
      arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1.  */
      arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2.  */
      arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out.  */

      op1 = expand_normal (arg0);
      /* Only force the carry-in into a QImode reg when it is actually
	 consumed; a literal zero takes the ICODE2 path below.  */
      if (!integer_zerop (arg0))
	op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));

      op2 = expand_normal (arg1);
      if (!register_operand (op2, mode0))
	op2 = copy_to_mode_reg (mode0, op2);

      op3 = expand_normal (arg2);
      if (!register_operand (op3, mode0))
	op3 = copy_to_mode_reg (mode0, op3);

      op4 = expand_normal (arg3);
      if (!address_operand (op4, VOIDmode))
	{
	  op4 = convert_memory_address (Pmode, op4);
	  op4 = copy_addr_to_reg (op4);
	}

      op0 = gen_reg_rtx (mode0);
      if (integer_zerop (arg0))
	{
	  /* If arg0 is 0, optimize right away into add or sub
	     instruction that sets CCCmode flags.  */
	  op1 = gen_rtx_REG (mode2, FLAGS_REG);
	  emit_insn (GEN_FCN (icode2) (op0, op2, op3));
	}
      else
	{
	  /* Generate CF from input operand: op1 + 255 carries iff
	     op1 was nonzero.  */
	  emit_insn (gen_addqi3_cconly_overflow (op1, constm1_rtx));

	  /* Generate instruction that consumes CF.  */
	  op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  pat = gen_rtx_LTU (mode1, op1, const0_rtx);
	  pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
	  emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
	}

      /* Return current CF value.  */
      if (target == 0)
	target = gen_reg_rtx (QImode);

      pat = gen_rtx_LTU (QImode, op1, const0_rtx);
      emit_insn (gen_rtx_SET (target, pat));

      /* Store the result.  */
      emit_move_insn (gen_rtx_MEM (mode0, op4), op0);

      return target;
    case IX86_BUILTIN_READ_FLAGS:
      /* __builtin_ia32_readeflags: pushf + pop into the result.  */
      if (ignore)
	return const0_rtx;

      emit_insn (gen_push (gen_rtx_REG (word_mode, FLAGS_REG)));

      /* With -O we always use a pseudo so the pop can be scheduled;
	 otherwise reuse TARGET only when it is a valid word_mode
	 destination.  */
      if (optimize
	  || target == NULL_RTX
	  || !nonimmediate_operand (target, word_mode)
	  || GET_MODE (target) != word_mode)
	target = gen_reg_rtx (word_mode);

      emit_insn (gen_pop (target));
      return target;

    case IX86_BUILTIN_WRITE_FLAGS:
      /* __builtin_ia32_writeeflags: push the argument + popf.  */

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      /* The operand must survive until the pop; disallow eliminable
	 registers whose offsets the push/pop pair would change.  */
      if (!general_no_elim_operand (op0, word_mode))
	op0 = copy_to_mode_reg (word_mode, op0);

      emit_insn (gen_push (op0));
      emit_insn (gen_pop (gen_rtx_REG (word_mode, FLAGS_REG)));
      return 0;
13652
    /* ktest/kortest builtins: ICODE selects the mask-register compare
       pattern, MODE3 selects which flag is returned (CCCmode = carry,
       i.e. *C variants; CCZmode = zero, i.e. *Z variants).  */
    case IX86_BUILTIN_KTESTC8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ8:
      icode = CODE_FOR_ktestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ16:
      icode = CODE_FOR_ktesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ32:
      icode = CODE_FOR_ktestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KTESTC64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KTESTZ64:
      icode = CODE_FOR_ktestdi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ8:
      icode = CODE_FOR_kortestqi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ16:
      icode = CODE_FOR_kortesthi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ32:
      icode = CODE_FOR_kortestsi;
      mode3 = CCZmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTC64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCCmode;
      goto kortest;

    case IX86_BUILTIN_KORTESTZ64:
      icode = CODE_FOR_kortestdi;
      mode3 = CCZmode;

    kortest:
      arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1.  */
      arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2.  */
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);

      mode0 = insn_data[icode].operand[0].mode;
      mode1 = insn_data[icode].operand[1].mode;

      /* Narrow each source to the pattern's mask mode; force into a
	 register first so gen_lowpart is valid on non-VOIDmode rtx.  */
      if (GET_MODE (op0) != VOIDmode)
	op0 = force_reg (GET_MODE (op0), op0);

      op0 = gen_lowpart (mode0, op0);

      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);

      if (GET_MODE (op1) != VOIDmode)
	op1 = force_reg (GET_MODE (op1), op1);

      op1 = gen_lowpart (mode1, op1);

      if (!insn_data[icode].operand[1].predicate (op1, mode1))
	op1 = copy_to_mode_reg (mode1, op1);

      target = gen_reg_rtx (QImode);

      /* Emit kortest.  */
      emit_insn (GEN_FCN (icode) (op0, op1));
      /* And use setcc to return result from flags.  */
      ix86_expand_setcc (target, EQ,
			 gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
      return target;
13765
    /* Gather/scatter/prefetch dispatch: each case only records the insn
       code for the matching pattern and jumps to the shared expander
       (gather_gen, scatter_gen or vec_prefetch_gen).  The *ALT* variants
       reuse the plain pattern's icode; the shared expanders fix up the
       mismatched index/source vector widths.  Note the GATHERPF cases are
       interleaved with the SCATTERALT cases; order here is irrelevant.  */
    case IX86_BUILTIN_GATHERSIV2DF:
      icode = CODE_FOR_avx2_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DF:
      icode = CODE_FOR_avx2_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DF:
      icode = CODE_FOR_avx2_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SF:
      icode = CODE_FOR_avx2_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SF:
      icode = CODE_FOR_avx2_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SF:
      icode = CODE_FOR_avx2_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV2DI:
      icode = CODE_FOR_avx2_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV2DI:
      icode = CODE_FOR_avx2_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4DI:
      icode = CODE_FOR_avx2_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV4SI:
      icode = CODE_FOR_avx2_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERSIV8SI:
      icode = CODE_FOR_avx2_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV4SI:
      icode = CODE_FOR_avx2_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DF:
      icode = CODE_FOR_avx2_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SF:
      icode = CODE_FOR_avx2_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTSIV4DI:
      icode = CODE_FOR_avx2_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHERALTDIV8SI:
      icode = CODE_FOR_avx2_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SF:
      icode = CODE_FOR_avx512f_gathersiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DF:
      icode = CODE_FOR_avx512f_gatherdiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV16SI:
      icode = CODE_FOR_avx512f_gathersiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8DI:
      icode = CODE_FOR_avx512f_gatherdiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DF:
      icode = CODE_FOR_avx512f_gathersiv8df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SF:
      icode = CODE_FOR_avx512f_gatherdiv16sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV8DI:
      icode = CODE_FOR_avx512f_gathersiv8di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV16SI:
      icode = CODE_FOR_avx512f_gatherdiv16si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DF:
      icode = CODE_FOR_avx512vl_gathersiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DF:
      icode = CODE_FOR_avx512vl_gatherdiv2df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DF:
      icode = CODE_FOR_avx512vl_gatherdiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SF:
      icode = CODE_FOR_avx512vl_gathersiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SF:
      icode = CODE_FOR_avx512vl_gathersiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SF:
      icode = CODE_FOR_avx512vl_gatherdiv4sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV2DI:
      icode = CODE_FOR_avx512vl_gathersiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV2DI:
      icode = CODE_FOR_avx512vl_gatherdiv2di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4DI:
      icode = CODE_FOR_avx512vl_gatherdiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV4SI:
      icode = CODE_FOR_avx512vl_gathersiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3SIV8SI:
      icode = CODE_FOR_avx512vl_gathersiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV4SI:
      icode = CODE_FOR_avx512vl_gatherdiv4si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3DIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DF:
      icode = CODE_FOR_avx512vl_gathersiv4df;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SF:
      icode = CODE_FOR_avx512vl_gatherdiv8sf;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTSIV4DI:
      icode = CODE_FOR_avx512vl_gathersiv4di;
      goto gather_gen;
    case IX86_BUILTIN_GATHER3ALTDIV8SI:
      icode = CODE_FOR_avx512vl_gatherdiv8si;
      goto gather_gen;
    case IX86_BUILTIN_SCATTERSIV16SF:
      icode = CODE_FOR_avx512f_scattersiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DF:
      icode = CODE_FOR_avx512f_scatterdiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV16SI:
      icode = CODE_FOR_avx512f_scattersiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8DI:
      icode = CODE_FOR_avx512f_scatterdiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SF:
      icode = CODE_FOR_avx512vl_scattersiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SF:
      icode = CODE_FOR_avx512vl_scattersiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DF:
      icode = CODE_FOR_avx512vl_scatterdiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DF:
      icode = CODE_FOR_avx512vl_scatterdiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV8SI:
      icode = CODE_FOR_avx512vl_scattersiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4SI:
      icode = CODE_FOR_avx512vl_scattersiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV4DI:
      icode = CODE_FOR_avx512vl_scatterdiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERDIV2DI:
      icode = CODE_FOR_avx512vl_scatterdiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPD:
      icode = CODE_FOR_avx512pf_gatherpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DF:
      icode = CODE_FOR_avx512f_scattersiv8df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SF:
      icode = CODE_FOR_avx512f_scatterdiv16sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV8DI:
      icode = CODE_FOR_avx512f_scattersiv8di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV16SI:
      icode = CODE_FOR_avx512f_scatterdiv16si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DF:
      icode = CODE_FOR_avx512vl_scattersiv4df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SF:
      icode = CODE_FOR_avx512vl_scatterdiv8sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV4DI:
      icode = CODE_FOR_avx512vl_scattersiv4di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV8SI:
      icode = CODE_FOR_avx512vl_scatterdiv8si;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DF:
      icode = CODE_FOR_avx512vl_scattersiv2df;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SF:
      icode = CODE_FOR_avx512vl_scatterdiv4sf;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTSIV2DI:
      icode = CODE_FOR_avx512vl_scattersiv2di;
      goto scatter_gen;
    case IX86_BUILTIN_SCATTERALTDIV4SI:
      icode = CODE_FOR_avx512vl_scatterdiv4si;
      goto scatter_gen;
    case IX86_BUILTIN_GATHERPFDPS:
      icode = CODE_FOR_avx512pf_gatherpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPD:
      icode = CODE_FOR_avx512pf_gatherpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_GATHERPFQPS:
      icode = CODE_FOR_avx512pf_gatherpfv8disf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPD:
      icode = CODE_FOR_avx512pf_scatterpfv8sidf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFDPS:
      icode = CODE_FOR_avx512pf_scatterpfv16sisf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPD:
      icode = CODE_FOR_avx512pf_scatterpfv8didf;
      goto vec_prefetch_gen;
    case IX86_BUILTIN_SCATTERPFQPS:
      icode = CODE_FOR_avx512pf_scatterpfv8disf;
      goto vec_prefetch_gen;
14054
    gather_gen:
      /* Shared expander for all gather builtins.  Args are
	 (merge-src, base pointer, index vector, mask, scale); the insn
	 operand order differs, see the mode assignments below.  HALF and
	 GEN are also used by scatter_gen for the *ALT* width fixups.  */
      rtx half;
      rtx (*gen) (rtx, rtx);

      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      /* Note the arg order is different from the operand order.  */
      mode0 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[3].mode;
      mode3 = insn_data[icode].operand[4].mode;
      mode4 = insn_data[icode].operand[5].mode;

      /* Expand into SUBTARGET; for the DIV* cases below only the low
	 half of it is copied into TARGET afterwards.  */
      if (target == NULL_RTX
	  || GET_MODE (target) != insn_data[icode].operand[0].mode
	  || !insn_data[icode].operand[0].predicate (target,
						     GET_MODE (target)))
	subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
      else
	subtarget = target;

      /* ALT variants pair a wider index vector with a narrower data
	 vector (or vice versa); drop the unused high half here.  */
      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3ALTSIV8DF:
	case IX86_BUILTIN_GATHER3ALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTSIV4DF:
	case IX86_BUILTIN_GATHER3ALTSIV4DI:
	case IX86_BUILTIN_GATHERALTSIV4DF:
	case IX86_BUILTIN_GATHERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV16SF:
	case IX86_BUILTIN_GATHER3ALTDIV16SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  /* Only 8 of the 16 mask bits apply to the narrowed result.  */
	  op3 = lowpart_subreg (QImode, op3, HImode);
	  break;
	case IX86_BUILTIN_GATHER3ALTDIV8SF:
	case IX86_BUILTIN_GATHER3ALTDIV8SI:
	case IX86_BUILTIN_GATHERALTDIV8SF:
	case IX86_BUILTIN_GATHERALTDIV8SI:
	  half = gen_reg_rtx (mode0);
	  if (mode0 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op0, GET_MODE (op0)))
	    op0 = copy_to_mode_reg (GET_MODE (op0), op0);
	  emit_insn (gen (half, op0));
	  op0 = half;
	  /* AVX2 gathers use a vector mask; narrow it the same way.  */
	  if (VECTOR_MODE_P (GET_MODE (op3)))
	    {
	      half = gen_reg_rtx (mode0);
	      if (!nonimmediate_operand (op3, GET_MODE (op3)))
		op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	      emit_insn (gen (half, op3));
	      op3 = half;
	    }
	  break;
	default:
	  break;
	}

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op1 = ix86_zero_extend_to_Pmode (op1);

      if (!insn_data[icode].operand[1].predicate (op0, mode0))
	op0 = copy_to_mode_reg (mode0, op0);
      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
	op1 = copy_to_mode_reg (Pmode, op1);
      if (!insn_data[icode].operand[3].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      op3 = fixup_modeless_constant (op3, mode3);

      if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
	{
	  if (!insn_data[icode].operand[4].predicate (op3, mode3))
	    op3 = copy_to_mode_reg (mode3, op3);
	}
      else
	{
	  /* Mask has a different mode (e.g. integer vs. k-mask);
	     reinterpret its low part.  */
	  op3 = copy_to_reg (op3);
	  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
	}
      if (!insn_data[icode].operand[5].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      /* Optimize.  If mask is known to have all high bits set,
	 replace op0 with pc_rtx to signal that the instruction
	 overwrites the whole destination and doesn't use its
	 previous contents.  */
      if (optimize)
	{
	  if (TREE_CODE (arg3) == INTEGER_CST)
	    {
	      if (integer_all_onesp (arg3))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == VECTOR_CST)
	    {
	      unsigned int negative = 0;
	      for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
		{
		  tree cst = VECTOR_CST_ELT (arg3, i);
		  if (TREE_CODE (cst) == INTEGER_CST
		      && tree_int_cst_sign_bit (cst))
		    negative++;
		  else if (TREE_CODE (cst) == REAL_CST
			   && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
		    negative++;
		}
	      /* Vector masks only test the sign bit of each element,
		 so all-negative means all lanes enabled.  */
	      if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
		op0 = pc_rtx;
	    }
	  else if (TREE_CODE (arg3) == SSA_NAME
		   && TREE_CODE (TREE_TYPE (arg3)) == VECTOR_TYPE)
	    {
	      /* Recognize also when mask is like:
		 __v2df src = _mm_setzero_pd ();
		 __v2df mask = _mm_cmpeq_pd (src, src);
		 or
		 __v8sf src = _mm256_setzero_ps ();
		 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
		 as that is a cheaper way to load all ones into
		 a register than having to load a constant from
		 memory.  */
	      gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
	      if (is_gimple_call (def_stmt))
		{
		  tree fndecl = gimple_call_fndecl (def_stmt);
		  if (fndecl
		      && fndecl_built_in_p (fndecl, BUILT_IN_MD))
		    switch (DECL_MD_FUNCTION_CODE (fndecl))
		      {
		      case IX86_BUILTIN_CMPPD:
		      case IX86_BUILTIN_CMPPS:
		      case IX86_BUILTIN_CMPPD256:
		      case IX86_BUILTIN_CMPPS256:
			if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
			  break;
			/* FALLTHRU */
		      case IX86_BUILTIN_CMPEQPD:
		      case IX86_BUILTIN_CMPEQPS:
			if (initializer_zerop (gimple_call_arg (def_stmt, 0))
			    && initializer_zerop (gimple_call_arg (def_stmt,
								   1)))
			  op0 = pc_rtx;
			break;
		      default:
			break;
		      }
		}
	    }
	}

      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;
      emit_insn (pat);

      /* DIV* gathers produce fewer elements than the pattern's full
	 destination; copy the meaningful low half into TARGET.  */
      switch (fcode)
	{
	case IX86_BUILTIN_GATHER3DIV16SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SFmode);
	  emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV16SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V8SImode);
	  emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SF:
	case IX86_BUILTIN_GATHERDIV8SF:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SFmode);
	  emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
	  break;
	case IX86_BUILTIN_GATHER3DIV8SI:
	case IX86_BUILTIN_GATHERDIV8SI:
	  if (target == NULL_RTX)
	    target = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
	  break;
	default:
	  target = subtarget;
	  break;
	}
      return target;
14274
    scatter_gen:
      /* Shared expander for all scatter builtins.  Args are
	 (base pointer, writemask, index vector, source vector, scale),
	 matching the insn operand order.  */
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      arg2 = CALL_EXPR_ARG (exp, 2);
      arg3 = CALL_EXPR_ARG (exp, 3);
      arg4 = CALL_EXPR_ARG (exp, 4);
      op0 = expand_normal (arg0);
      op1 = expand_normal (arg1);
      op2 = expand_normal (arg2);
      op3 = expand_normal (arg3);
      op4 = expand_normal (arg4);
      mode1 = insn_data[icode].operand[1].mode;
      mode2 = insn_data[icode].operand[2].mode;
      mode3 = insn_data[icode].operand[3].mode;
      mode4 = insn_data[icode].operand[4].mode;

      /* Scatter instruction stores operand op3 to memory with
	 indices from op2 and scale from op4 under writemask op1.
	 If index operand op2 has more elements then source operand
	 op3 one need to use only its low half.  And vice versa.  */
      switch (fcode)
	{
	case IX86_BUILTIN_SCATTERALTSIV8DF:
	case IX86_BUILTIN_SCATTERALTSIV8DI:
	  half = gen_reg_rtx (V8SImode);
	  if (!nonimmediate_operand (op2, V16SImode))
	    op2 = copy_to_mode_reg (V16SImode, op2);
	  emit_insn (gen_vec_extract_lo_v16si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV16SF:
	case IX86_BUILTIN_SCATTERALTDIV16SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V8SFmode)
	    gen = gen_vec_extract_lo_v16sf;
	  else
	    gen = gen_vec_extract_lo_v16si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV4DF:
	case IX86_BUILTIN_SCATTERALTSIV4DI:
	  half = gen_reg_rtx (V4SImode);
	  if (!nonimmediate_operand (op2, V8SImode))
	    op2 = copy_to_mode_reg (V8SImode, op2);
	  emit_insn (gen_vec_extract_lo_v8si (half, op2));
	  op2 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTDIV8SF:
	case IX86_BUILTIN_SCATTERALTDIV8SI:
	  half = gen_reg_rtx (mode3);
	  if (mode3 == V4SFmode)
	    gen = gen_vec_extract_lo_v8sf;
	  else
	    gen = gen_vec_extract_lo_v8si;
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  emit_insn (gen (half, op3));
	  op3 = half;
	  break;
	case IX86_BUILTIN_SCATTERALTSIV2DF:
	case IX86_BUILTIN_SCATTERALTSIV2DI:
	  /* No narrowing needed; just legitimize the index operand.  */
	  if (!nonimmediate_operand (op2, V4SImode))
	    op2 = copy_to_mode_reg (V4SImode, op2);
	  break;
	case IX86_BUILTIN_SCATTERALTDIV4SF:
	case IX86_BUILTIN_SCATTERALTDIV4SI:
	  if (!nonimmediate_operand (op3, GET_MODE (op3)))
	    op3 = copy_to_mode_reg (GET_MODE (op3), op3);
	  break;
	default:
	  break;
	}

      /* Force memory operand only with base register here.  But we
	 don't want to do it on memory operand for other builtin
	 functions.  */
      op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));

      if (!insn_data[icode].operand[0].predicate (op0, Pmode))
	op0 = copy_to_mode_reg (Pmode, op0);

      op1 = fixup_modeless_constant (op1, mode1);

      if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
	{
	  if (!insn_data[icode].operand[1].predicate (op1, mode1))
	    op1 = copy_to_mode_reg (mode1, op1);
	}
      else
	{
	  /* Writemask has a different mode; reinterpret its low part.  */
	  op1 = copy_to_reg (op1);
	  op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
	}

      if (!insn_data[icode].operand[2].predicate (op2, mode2))
	op2 = copy_to_mode_reg (mode2, op2);

      if (!insn_data[icode].operand[3].predicate (op3, mode3))
	op3 = copy_to_mode_reg (mode3, op3);

      if (!insn_data[icode].operand[4].predicate (op4, mode4))
	{
	  error ("the last argument must be scale 1, 2, 4, 8");
	  return const0_rtx;
	}

      pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
      if (! pat)
	return const0_rtx;

      emit_insn (pat);
      return 0;
14390
14391 vec_prefetch_gen:
14392 arg0 = CALL_EXPR_ARG (exp, 0);
14393 arg1 = CALL_EXPR_ARG (exp, 1);
14394 arg2 = CALL_EXPR_ARG (exp, 2);
14395 arg3 = CALL_EXPR_ARG (exp, 3);
14396 arg4 = CALL_EXPR_ARG (exp, 4);
14397 op0 = expand_normal (arg0);
14398 op1 = expand_normal (arg1);
14399 op2 = expand_normal (arg2);
14400 op3 = expand_normal (arg3);
14401 op4 = expand_normal (arg4);
14402 mode0 = insn_data[icode].operand[0].mode;
14403 mode1 = insn_data[icode].operand[1].mode;
14404 mode3 = insn_data[icode].operand[3].mode;
14405 mode4 = insn_data[icode].operand[4].mode;
14406
14407 op0 = fixup_modeless_constant (op0, mode0);
14408
14409 if (GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
14410 {
14411 if (!insn_data[icode].operand[0].predicate (op0, mode0))
14412 op0 = copy_to_mode_reg (mode0, op0);
14413 }
14414 else
14415 {
14416 op0 = copy_to_reg (op0);
14417 op0 = lowpart_subreg (mode0, op0, GET_MODE (op0));
14418 }
14419
14420 if (!insn_data[icode].operand[1].predicate (op1, mode1))
14421 op1 = copy_to_mode_reg (mode1, op1);
14422
14423 /* Force memory operand only with base register here. But we
14424 don't want to do it on memory operand for other builtin
14425 functions. */
14426 op2 = force_reg (Pmode, convert_to_mode (Pmode, op2, 1));
14427
14428 if (!insn_data[icode].operand[2].predicate (op2, Pmode))
14429 op2 = copy_to_mode_reg (Pmode, op2);
14430
14431 if (!insn_data[icode].operand[3].predicate (op3, mode3))
14432 {
14433 error ("the forth argument must be scale 1, 2, 4, 8");
14434 return const0_rtx;
14435 }
14436
14437 if (!insn_data[icode].operand[4].predicate (op4, mode4))
14438 {
14439 error ("incorrect hint operand");
14440 return const0_rtx;
14441 }
14442
14443 pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
14444 if (! pat)
14445 return const0_rtx;
14446
14447 emit_insn (pat);
14448
14449 return 0;
14450
    case IX86_BUILTIN_XABORT:
      /* _xabort: the status must be a compile-time 8-bit immediate
	 because it is encoded in the instruction.  */
      icode = CODE_FOR_xabort;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      mode0 = insn_data[icode].operand[0].mode;
      if (!insn_data[icode].operand[0].predicate (op0, mode0))
	{
	  error ("the argument to %<xabort%> intrinsic must "
		 "be an 8-bit immediate");
	  return const0_rtx;
	}
      emit_insn (gen_xabort (op0));
      return 0;
14464
    case IX86_BUILTIN_RDSSPD:
    case IX86_BUILTIN_RDSSPQ:
      /* CET _rdssp[dq]: read the shadow stack pointer.  The source
	 operand is forced to zero so a non-CET CPU (where rdssp is a
	 NOP) yields 0.  */
      mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);

      if (target == 0
	  || !register_operand (target, mode))
	target = gen_reg_rtx (mode);

      op0 = force_reg (mode, const0_rtx);

      emit_insn (gen_rdssp (mode, target, op0));
      return target;

    case IX86_BUILTIN_INCSSPD:
    case IX86_BUILTIN_INCSSPQ:
      /* CET _incssp[dq]: increment the shadow stack pointer.  */
      mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);

      op0 = force_reg (mode, op0);

      emit_insn (gen_incssp (mode, op0));
      return 0;

    case IX86_BUILTIN_HRESET:
      /* __builtin_ia32_hreset: history reset, EAX holds the hint.  */
      icode = CODE_FOR_hreset;
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      op0 = force_reg (SImode, op0);
      emit_insn (gen_hreset (op0));
      return 0;
14497
    case IX86_BUILTIN_RSTORSSP:
    case IX86_BUILTIN_CLRSSBSY:
      /* CET _rstorssp / _clrssbsy: both take a pointer to a shadow
	 stack token and operate on the DImode memory it addresses.  */
      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      icode = (fcode == IX86_BUILTIN_RSTORSSP
	       ? CODE_FOR_rstorssp
	       : CODE_FOR_clrssbsy);

      if (!address_operand (op0, VOIDmode))
	{
	  op0 = convert_memory_address (Pmode, op0);
	  op0 = copy_addr_to_reg (op0);
	}
      emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
      return 0;

    case IX86_BUILTIN_WRSSD:
    case IX86_BUILTIN_WRSSQ:
    case IX86_BUILTIN_WRUSSD:
    case IX86_BUILTIN_WRUSSQ:
      /* CET _wrss[dq] / _wruss[dq]: write a value to the shadow stack
	 (wruss writes user shadow stack from supervisor mode).  */
      mode = ((fcode == IX86_BUILTIN_WRSSD
	       || fcode == IX86_BUILTIN_WRUSSD)
	      ? SImode : DImode);

      arg0 = CALL_EXPR_ARG (exp, 0);
      op0 = expand_normal (arg0);
      arg1 = CALL_EXPR_ARG (exp, 1);
      op1 = expand_normal (arg1);

      op0 = force_reg (mode, op0);

      if (!address_operand (op1, VOIDmode))
	{
	  op1 = convert_memory_address (Pmode, op1);
	  op1 = copy_addr_to_reg (op1);
	}
      op1 = gen_rtx_MEM (mode, op1);

      icode = ((fcode == IX86_BUILTIN_WRSSD
		|| fcode == IX86_BUILTIN_WRSSQ)
	       ? code_for_wrss (mode)
	       : code_for_wruss (mode));
      emit_insn (GEN_FCN (icode) (op0, op1));

      return 0;
14543
    default:
      break;
    }

  /* Builtins not handled by the explicit cases above are expanded via
     the generic descriptor tables; each IX86_BUILTIN__BDESC_* range
     indexes into the corresponding bdesc_* array.  */
  if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
					       target);
    }

  if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
      && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
    {
      i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
      return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
					       target);
    }
14563
14564 if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
14565 && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
14566 {
14567 i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
14568 rtx (*fcn) (rtx, rtx, rtx, rtx) = NULL;
14569 rtx (*fcn_mask) (rtx, rtx, rtx, rtx, rtx);
14570 rtx (*fcn_maskz) (rtx, rtx, rtx, rtx, rtx, rtx);
14571 int masked = 1;
14572 machine_mode mode, wide_mode, nar_mode;
14573
14574 nar_mode = V4SFmode;
14575 mode = V16SFmode;
14576 wide_mode = V64SFmode;
14577 fcn_mask = gen_avx5124fmaddps_4fmaddps_mask;
14578 fcn_maskz = gen_avx5124fmaddps_4fmaddps_maskz;
14579
14580 switch (fcode)
14581 {
14582 case IX86_BUILTIN_4FMAPS:
14583 fcn = gen_avx5124fmaddps_4fmaddps;
14584 masked = 0;
14585 goto v4fma_expand;
14586
14587 case IX86_BUILTIN_4DPWSSD:
14588 nar_mode = V4SImode;
14589 mode = V16SImode;
14590 wide_mode = V64SImode;
14591 fcn = gen_avx5124vnniw_vp4dpwssd;
14592 masked = 0;
14593 goto v4fma_expand;
14594
14595 case IX86_BUILTIN_4DPWSSDS:
14596 nar_mode = V4SImode;
14597 mode = V16SImode;
14598 wide_mode = V64SImode;
14599 fcn = gen_avx5124vnniw_vp4dpwssds;
14600 masked = 0;
14601 goto v4fma_expand;
14602
14603 case IX86_BUILTIN_4FNMAPS:
14604 fcn = gen_avx5124fmaddps_4fnmaddps;
14605 masked = 0;
14606 goto v4fma_expand;
14607
14608 case IX86_BUILTIN_4FNMAPS_MASK:
14609 fcn_mask = gen_avx5124fmaddps_4fnmaddps_mask;
14610 fcn_maskz = gen_avx5124fmaddps_4fnmaddps_maskz;
14611 goto v4fma_expand;
14612
14613 case IX86_BUILTIN_4DPWSSD_MASK:
14614 nar_mode = V4SImode;
14615 mode = V16SImode;
14616 wide_mode = V64SImode;
14617 fcn_mask = gen_avx5124vnniw_vp4dpwssd_mask;
14618 fcn_maskz = gen_avx5124vnniw_vp4dpwssd_maskz;
14619 goto v4fma_expand;
14620
14621 case IX86_BUILTIN_4DPWSSDS_MASK:
14622 nar_mode = V4SImode;
14623 mode = V16SImode;
14624 wide_mode = V64SImode;
14625 fcn_mask = gen_avx5124vnniw_vp4dpwssds_mask;
14626 fcn_maskz = gen_avx5124vnniw_vp4dpwssds_maskz;
14627 goto v4fma_expand;
14628
14629 case IX86_BUILTIN_4FMAPS_MASK:
14630 {
14631 tree args[4];
14632 rtx ops[4];
14633 rtx wide_reg;
14634 rtx accum;
14635 rtx addr;
14636 rtx mem;
14637
14638 v4fma_expand:
14639 wide_reg = gen_reg_rtx (wide_mode);
14640 for (i = 0; i < 4; i++)
14641 {
14642 args[i] = CALL_EXPR_ARG (exp, i);
14643 ops[i] = expand_normal (args[i]);
14644
14645 emit_move_insn (gen_rtx_SUBREG (mode, wide_reg, i * 64),
14646 ops[i]);
14647 }
14648
14649 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14650 accum = force_reg (mode, accum);
14651
14652 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14653 addr = force_reg (Pmode, addr);
14654
14655 mem = gen_rtx_MEM (nar_mode, addr);
14656
14657 target = gen_reg_rtx (mode);
14658
14659 emit_move_insn (target, accum);
14660
14661 if (! masked)
14662 emit_insn (fcn (target, accum, wide_reg, mem));
14663 else
14664 {
14665 rtx merge, mask;
14666 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14667
14668 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14669
14670 if (CONST_INT_P (mask))
14671 mask = fixup_modeless_constant (mask, HImode);
14672
14673 mask = force_reg (HImode, mask);
14674
14675 if (GET_MODE (mask) != HImode)
14676 mask = gen_rtx_SUBREG (HImode, mask, 0);
14677
14678 /* If merge is 0 then we're about to emit z-masked variant. */
14679 if (const0_operand (merge, mode))
14680 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14681 /* If merge is the same as accum then emit merge-masked variant. */
14682 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14683 {
14684 merge = force_reg (mode, merge);
14685 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14686 }
14687 /* Merge with something unknown might happen if we z-mask w/ -O0. */
14688 else
14689 {
14690 target = gen_reg_rtx (mode);
14691 emit_move_insn (target, merge);
14692 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14693 }
14694 }
14695 return target;
14696 }
14697
14698 case IX86_BUILTIN_4FNMASS:
14699 fcn = gen_avx5124fmaddps_4fnmaddss;
14700 masked = 0;
14701 goto s4fma_expand;
14702
14703 case IX86_BUILTIN_4FMASS:
14704 fcn = gen_avx5124fmaddps_4fmaddss;
14705 masked = 0;
14706 goto s4fma_expand;
14707
14708 case IX86_BUILTIN_4FNMASS_MASK:
14709 fcn_mask = gen_avx5124fmaddps_4fnmaddss_mask;
14710 fcn_maskz = gen_avx5124fmaddps_4fnmaddss_maskz;
14711 goto s4fma_expand;
14712
14713 case IX86_BUILTIN_4FMASS_MASK:
14714 {
14715 tree args[4];
14716 rtx ops[4];
14717 rtx wide_reg;
14718 rtx accum;
14719 rtx addr;
14720 rtx mem;
14721
14722 fcn_mask = gen_avx5124fmaddps_4fmaddss_mask;
14723 fcn_maskz = gen_avx5124fmaddps_4fmaddss_maskz;
14724
14725 s4fma_expand:
14726 mode = V4SFmode;
14727 wide_reg = gen_reg_rtx (V64SFmode);
14728 for (i = 0; i < 4; i++)
14729 {
14730 rtx tmp;
14731 args[i] = CALL_EXPR_ARG (exp, i);
14732 ops[i] = expand_normal (args[i]);
14733
14734 tmp = gen_reg_rtx (SFmode);
14735 emit_move_insn (tmp, gen_rtx_SUBREG (SFmode, ops[i], 0));
14736
14737 emit_move_insn (gen_rtx_SUBREG (V16SFmode, wide_reg, i * 64),
14738 gen_rtx_SUBREG (V16SFmode, tmp, 0));
14739 }
14740
14741 accum = expand_normal (CALL_EXPR_ARG (exp, 4));
14742 accum = force_reg (V4SFmode, accum);
14743
14744 addr = expand_normal (CALL_EXPR_ARG (exp, 5));
14745 addr = force_reg (Pmode, addr);
14746
14747 mem = gen_rtx_MEM (V4SFmode, addr);
14748
14749 target = gen_reg_rtx (V4SFmode);
14750
14751 emit_move_insn (target, accum);
14752
14753 if (! masked)
14754 emit_insn (fcn (target, accum, wide_reg, mem));
14755 else
14756 {
14757 rtx merge, mask;
14758 merge = expand_normal (CALL_EXPR_ARG (exp, 6));
14759
14760 mask = expand_normal (CALL_EXPR_ARG (exp, 7));
14761
14762 if (CONST_INT_P (mask))
14763 mask = fixup_modeless_constant (mask, QImode);
14764
14765 mask = force_reg (QImode, mask);
14766
14767 if (GET_MODE (mask) != QImode)
14768 mask = gen_rtx_SUBREG (QImode, mask, 0);
14769
14770 /* If merge is 0 then we're about to emit z-masked variant. */
14771 if (const0_operand (merge, mode))
14772 emit_insn (fcn_maskz (target, accum, wide_reg, mem, merge, mask));
14773 /* If merge is the same as accum then emit merge-masked
14774 variant. */
14775 else if (CALL_EXPR_ARG (exp, 6) == CALL_EXPR_ARG (exp, 4))
14776 {
14777 merge = force_reg (mode, merge);
14778 emit_insn (fcn_mask (target, wide_reg, mem, merge, mask));
14779 }
14780 /* Merge with something unknown might happen if we z-mask
14781 w/ -O0. */
14782 else
14783 {
14784 target = gen_reg_rtx (mode);
14785 emit_move_insn (target, merge);
14786 emit_insn (fcn_mask (target, wide_reg, mem, target, mask));
14787 }
14788 }
14789 return target;
14790 }
14791 case IX86_BUILTIN_RDPID:
14792 return ix86_expand_special_args_builtin (bdesc_args + i, exp,
14793 target);
14794 case IX86_BUILTIN_FABSQ:
14795 case IX86_BUILTIN_COPYSIGNQ:
14796 if (!TARGET_SSE)
14797 /* Emit a normal call if SSE isn't available. */
14798 return expand_call (exp, target, ignore);
14799 /* FALLTHRU */
14800 default:
14801 return ix86_expand_args_builtin (bdesc_args + i, exp, target);
14802 }
14803 }
14804
14805 if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
14806 && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
14807 {
14808 i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
14809 return ix86_expand_sse_comi (bdesc_comi + i, exp, target);
14810 }
14811
14812 if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
14813 && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
14814 {
14815 i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
14816 return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
14817 }
14818
14819 if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
14820 && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
14821 {
14822 i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
14823 return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
14824 }
14825
14826 if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
14827 && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
14828 {
14829 i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
14830 return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
14831 }
14832
14833 if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
14834 && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
14835 {
14836 i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
14837 const struct builtin_description *d = bdesc_multi_arg + i;
14838 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
14839 (enum ix86_builtin_func_type)
14840 d->flag, d->comparison);
14841 }
14842
14843 if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
14844 && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
14845 {
14846 i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
14847 return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
14848 target);
14849 }
14850
14851 gcc_unreachable ();
14852 }
14853
14854 /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
14855 fill target with val via vec_duplicate. */
14856
static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register.  */

      start_sequence ();
      reg = force_reg (innermode, val);
      if (GET_MODE (reg) != innermode)
	reg = gen_lowpart (innermode, reg);
      /* Patch the already-emitted insn in place rather than emitting a
	 new one: replace its source with a duplicate of the register.  */
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = get_insns ();
      end_sequence ();
      /* Any insns produced while forcing VAL into a register must be
	 placed before the (patched) duplicate insn.  */
      if (seq)
	emit_insn_before (seq, insn);

      /* The register form is expected to always be recognizable.  */
      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  /* NOTE(review): the function can only return true — failure to
     recognize the register form triggers the assert above.  */
  return true;
}
14890
14891 /* Get a vector mode of the same size as the original but with elements
14892 twice as wide. This is only guaranteed to apply to integral vectors. */
14893
14894 static machine_mode
14895 get_mode_wider_vector (machine_mode o)
14896 {
14897 /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
14898 machine_mode n = GET_MODE_WIDER_MODE (o).require ();
14899 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
14900 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
14901 return n;
14902 }
14903
14904 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
14905 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
14906
14907 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
14908 with all elements equal to VAR. Return true if successful. */
14909
bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V4DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V2DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V16SFmode:
    case E_V8DFmode:
      /* These modes have a direct vec_duplicate pattern.  */
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  /* Truncate a full SImode view of VAL to HImode and
	     duplicate it; matched by the pshufw-based patterns.  */
	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V2HImode:
      if (TARGET_SSE2)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V8QImode:
    case E_V4QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
    case E_V8HFmode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	  /* Shared with the V16QImode case below: broadcast via a
	     one-operand permutation of a vector whose low element
	     is VAL.  */
	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  if (mode == V8HFmode)
	    {
	      tmp1 = force_reg (HFmode, val);
	      tmp2 = gen_reg_rtx (mode);
	      emit_insn (gen_vec_setv8hf_0 (tmp2, CONST0_RTX (mode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }
	  else
	    {
	      /* Extend to SImode using a paradoxical SUBREG.  */
	      tmp1 = gen_reg_rtx (SImode);
	      emit_move_insn (tmp1, gen_lowpart (SImode, val));

	      /* Insert the SImode value as
		 low element of a V4SImode vector.  */
	      tmp2 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }

	  emit_move_insn (dperm.op0, tmp1);
	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);

	/* Duplicate VAL into both halves of a WSMODE scalar, either
	   with an insert into the high part or with shift-and-ior.  */
	if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
	  emit_insn (gen_insv_1 (wsmode, val, val));
	else
	  {
	    x = expand_simple_binop (wsmode, ASHIFT, val,
				     GEN_INT (GET_MODE_BITSIZE (smode)),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
				       OPTAB_LIB_WIDEN);
	  }

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	gcc_assert (ok);
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return ok;
      }

    case E_V16HImode:
    case E_V16HFmode:
    case E_V32QImode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Without AVX2, broadcast into the half-width vector and
	     concatenate it with itself.  */
	  machine_mode hvmode = (mode == V16HImode ? V8HImode
				 : mode == V16HFmode ? V8HFmode
				 : V16QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V32HImode:
    case E_V32HFmode:
    case E_V64QImode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Same half-and-concat strategy for the 512-bit modes.  */
	  machine_mode hvmode = (mode == V32HImode ? V16HImode
				 : mode == V32HFmode ? V16HFmode
				 : V32QImode);
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  gcc_assert (ok);

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
15100
15101 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15102 whose ONE_VAR element is VAR, and other elements are zero. Return true
15103 if successful. */
15104
15105 static bool
15106 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
15107 rtx target, rtx var, int one_var)
15108 {
15109 machine_mode vsimode;
15110 rtx new_target;
15111 rtx x, tmp;
15112 bool use_vector_set = false;
15113 rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
15114
15115 switch (mode)
15116 {
15117 case E_V2DImode:
15118 /* For SSE4.1, we normally use vector set. But if the second
15119 element is zero and inter-unit moves are OK, we use movq
15120 instead. */
15121 use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
15122 && !(TARGET_INTER_UNIT_MOVES_TO_VEC
15123 && one_var == 0));
15124 break;
15125 case E_V16QImode:
15126 case E_V4SImode:
15127 case E_V4SFmode:
15128 use_vector_set = TARGET_SSE4_1;
15129 break;
15130 case E_V8HImode:
15131 use_vector_set = TARGET_SSE2;
15132 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15133 ? gen_vec_setv8hi_0 : NULL;
15134 break;
15135 case E_V8QImode:
15136 use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
15137 break;
15138 case E_V4HImode:
15139 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
15140 break;
15141 case E_V4QImode:
15142 use_vector_set = TARGET_SSE4_1;
15143 break;
15144 case E_V32QImode:
15145 use_vector_set = TARGET_AVX;
15146 break;
15147 case E_V16HImode:
15148 use_vector_set = TARGET_AVX;
15149 gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
15150 ? gen_vec_setv16hi_0 : NULL;
15151 break;
15152 case E_V8SImode:
15153 use_vector_set = TARGET_AVX;
15154 gen_vec_set_0 = gen_vec_setv8si_0;
15155 break;
15156 case E_V8SFmode:
15157 use_vector_set = TARGET_AVX;
15158 gen_vec_set_0 = gen_vec_setv8sf_0;
15159 break;
15160 case E_V4DFmode:
15161 use_vector_set = TARGET_AVX;
15162 gen_vec_set_0 = gen_vec_setv4df_0;
15163 break;
15164 case E_V4DImode:
15165 /* Use ix86_expand_vector_set in 64bit mode only. */
15166 use_vector_set = TARGET_AVX && TARGET_64BIT;
15167 gen_vec_set_0 = gen_vec_setv4di_0;
15168 break;
15169 case E_V16SImode:
15170 use_vector_set = TARGET_AVX512F && one_var == 0;
15171 gen_vec_set_0 = gen_vec_setv16si_0;
15172 break;
15173 case E_V16SFmode:
15174 use_vector_set = TARGET_AVX512F && one_var == 0;
15175 gen_vec_set_0 = gen_vec_setv16sf_0;
15176 break;
15177 case E_V8DFmode:
15178 use_vector_set = TARGET_AVX512F && one_var == 0;
15179 gen_vec_set_0 = gen_vec_setv8df_0;
15180 break;
15181 case E_V8DImode:
15182 /* Use ix86_expand_vector_set in 64bit mode only. */
15183 use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
15184 gen_vec_set_0 = gen_vec_setv8di_0;
15185 break;
15186 case E_V8HFmode:
15187 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15188 gen_vec_set_0 = gen_vec_setv8hf_0;
15189 break;
15190 case E_V16HFmode:
15191 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15192 gen_vec_set_0 = gen_vec_setv16hf_0;
15193 break;
15194 case E_V32HFmode:
15195 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15196 gen_vec_set_0 = gen_vec_setv32hf_0;
15197 break;
15198 case E_V32HImode:
15199 use_vector_set = TARGET_AVX512FP16 && one_var == 0;
15200 gen_vec_set_0 = gen_vec_setv32hi_0;
15201 default:
15202 break;
15203 }
15204
15205 if (use_vector_set)
15206 {
15207 if (gen_vec_set_0 && one_var == 0)
15208 {
15209 var = force_reg (GET_MODE_INNER (mode), var);
15210 emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
15211 return true;
15212 }
15213 emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
15214 var = force_reg (GET_MODE_INNER (mode), var);
15215 ix86_expand_vector_set (mmx_ok, target, var, one_var);
15216 return true;
15217 }
15218
15219 switch (mode)
15220 {
15221 case E_V2SFmode:
15222 case E_V2SImode:
15223 if (!mmx_ok)
15224 return false;
15225 /* FALLTHRU */
15226
15227 case E_V2DFmode:
15228 case E_V2DImode:
15229 if (one_var != 0)
15230 return false;
15231 var = force_reg (GET_MODE_INNER (mode), var);
15232 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
15233 emit_insn (gen_rtx_SET (target, x));
15234 return true;
15235
15236 case E_V4SFmode:
15237 case E_V4SImode:
15238 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
15239 new_target = gen_reg_rtx (mode);
15240 else
15241 new_target = target;
15242 var = force_reg (GET_MODE_INNER (mode), var);
15243 x = gen_rtx_VEC_DUPLICATE (mode, var);
15244 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
15245 emit_insn (gen_rtx_SET (new_target, x));
15246 if (one_var != 0)
15247 {
15248 /* We need to shuffle the value to the correct position, so
15249 create a new pseudo to store the intermediate result. */
15250
15251 /* With SSE2, we can use the integer shuffle insns. */
15252 if (mode != V4SFmode && TARGET_SSE2)
15253 {
15254 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
15255 const1_rtx,
15256 GEN_INT (one_var == 1 ? 0 : 1),
15257 GEN_INT (one_var == 2 ? 0 : 1),
15258 GEN_INT (one_var == 3 ? 0 : 1)));
15259 if (target != new_target)
15260 emit_move_insn (target, new_target);
15261 return true;
15262 }
15263
15264 /* Otherwise convert the intermediate result to V4SFmode and
15265 use the SSE1 shuffle instructions. */
15266 if (mode != V4SFmode)
15267 {
15268 tmp = gen_reg_rtx (V4SFmode);
15269 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
15270 }
15271 else
15272 tmp = new_target;
15273
15274 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
15275 const1_rtx,
15276 GEN_INT (one_var == 1 ? 0 : 1),
15277 GEN_INT (one_var == 2 ? 0+4 : 1+4),
15278 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
15279
15280 if (mode != V4SFmode)
15281 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
15282 else if (tmp != target)
15283 emit_move_insn (target, tmp);
15284 }
15285 else if (target != new_target)
15286 emit_move_insn (target, new_target);
15287 return true;
15288
15289 case E_V8HImode:
15290 case E_V16QImode:
15291 vsimode = V4SImode;
15292 goto widen;
15293 case E_V4HImode:
15294 case E_V8QImode:
15295 if (!mmx_ok)
15296 return false;
15297 vsimode = V2SImode;
15298 goto widen;
15299 widen:
15300 if (one_var != 0)
15301 return false;
15302
15303 /* Zero extend the variable element to SImode and recurse. */
15304 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
15305
15306 x = gen_reg_rtx (vsimode);
15307 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
15308 var, one_var))
15309 gcc_unreachable ();
15310
15311 emit_move_insn (target, gen_lowpart (mode, x));
15312 return true;
15313
15314 default:
15315 return false;
15316 }
15317 }
15318
15319 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
15320 consisting of the values in VALS. It is known that all elements
15321 except ONE_VAR are constants. Return true if successful. */
15322
static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  /* Build a copy of VALS with the variable slot zeroed, yielding a
     CONST_VECTOR that can be loaded directly.  */
  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V8HFmode:
    case E_V16HFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
      /* Handled by the load-constant + vec_set tail below.  */
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;
    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;
    case E_V4QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V2HImode;
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      /* ONE_VAR ^ 1 is the index of the byte paired with VAR inside
	 the same HImode element.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  /* VAR is the high byte: shift it up, mask the constant low
	     byte.  */
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  /* VAR is the low byte: shift the constant neighbor up.  */
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  /* Load the constant vector, then overwrite the one variable slot.  */
  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
15413
15414 /* A subroutine of ix86_expand_vector_init_general. Use vector
15415 concatenate to handle the most general case: all values variable,
15416 and none identical. */
15417
static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
  rtx half[2];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      /* Base case: OPS holds two half-width values; map MODE to the
	 mode of each half and emit a single VEC_CONCAT.  */
      switch (mode)
	{
	case E_V32HFmode:
	  half_mode = V16HFmode;
	  break;
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V16HFmode:
	  half_mode = V8HFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	case E_V2DImode:
	  half_mode = DImode;
	  break;
	case E_V2SImode:
	  half_mode = SImode;
	  break;
	case E_V2DFmode:
	  half_mode = DFmode;
	  break;
	case E_V2SFmode:
	  half_mode = SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (!register_operand (ops[1], half_mode))
	ops[1] = force_reg (half_mode, ops[1]);
      if (!register_operand (ops[0], half_mode))
	ops[0] = force_reg (half_mode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      switch (mode)
	{
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    half:
      /* Recursive case: build each half of the vector from N/2
	 elements via ix86_expand_vector_init, then concatenate the
	 two halves.  */
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      for (j = 1; j != -1; j--)
	{
	  half[j] = gen_reg_rtx (half_mode);
	  switch (n >> 1)
	    {
	    case 2:
	      v = gen_rtvec (2, ops[i-1], ops[i]);
	      i -= 2;
	      break;
	    case 4:
	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 4;
	      break;
	    case 8:
	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 8;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  ix86_expand_vector_init (false, half[j],
				   gen_rtx_PARALLEL (half_mode, v));
	}

      ix86_expand_vector_init_concat (mode, target, half, 2);
      break;

    default:
      gcc_unreachable ();
    }
}
15581
15582 /* A subroutine of ix86_expand_vector_init_general. Use vector
15583 interleave to handle the most general case: all values variable,
15584 and none identical. */
15585
15586 static void
15587 ix86_expand_vector_init_interleave (machine_mode mode,
15588 rtx target, rtx *ops, int n)
15589 {
15590 machine_mode first_imode, second_imode, third_imode, inner_mode;
15591 int i, j;
15592 rtx op, op0, op1;
15593 rtx (*gen_load_even) (rtx, rtx, rtx);
15594 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
15595 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
15596
15597 switch (mode)
15598 {
15599 case E_V8HFmode:
15600 gen_load_even = gen_vec_interleave_lowv8hf;
15601 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15602 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15603 inner_mode = HFmode;
15604 first_imode = V4SImode;
15605 second_imode = V2DImode;
15606 third_imode = VOIDmode;
15607 break;
15608 case E_V8HImode:
15609 gen_load_even = gen_vec_setv8hi;
15610 gen_interleave_first_low = gen_vec_interleave_lowv4si;
15611 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15612 inner_mode = HImode;
15613 first_imode = V4SImode;
15614 second_imode = V2DImode;
15615 third_imode = VOIDmode;
15616 break;
15617 case E_V16QImode:
15618 gen_load_even = gen_vec_setv16qi;
15619 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
15620 gen_interleave_second_low = gen_vec_interleave_lowv4si;
15621 inner_mode = QImode;
15622 first_imode = V8HImode;
15623 second_imode = V4SImode;
15624 third_imode = V2DImode;
15625 break;
15626 default:
15627 gcc_unreachable ();
15628 }
15629
15630 for (i = 0; i < n; i++)
15631 {
15632 op = ops [i + i];
15633 if (inner_mode == HFmode)
15634 {
15635 rtx even, odd;
15636 /* Use vpuncklwd to pack 2 HFmode. */
15637 op0 = gen_reg_rtx (V8HFmode);
15638 even = lowpart_subreg (V8HFmode, force_reg (HFmode, op), HFmode);
15639 odd = lowpart_subreg (V8HFmode,
15640 force_reg (HFmode, ops[i + i + 1]),
15641 HFmode);
15642 emit_insn (gen_load_even (op0, even, odd));
15643 }
15644 else
15645 {
15646 /* Extend the odd elment to SImode using a paradoxical SUBREG. */
15647 op0 = gen_reg_rtx (SImode);
15648 emit_move_insn (op0, gen_lowpart (SImode, op));
15649
15650 /* Insert the SImode value as low element of V4SImode vector. */
15651 op1 = gen_reg_rtx (V4SImode);
15652 op0 = gen_rtx_VEC_MERGE (V4SImode,
15653 gen_rtx_VEC_DUPLICATE (V4SImode,
15654 op0),
15655 CONST0_RTX (V4SImode),
15656 const1_rtx);
15657 emit_insn (gen_rtx_SET (op1, op0));
15658
15659 /* Cast the V4SImode vector back to a vector in orignal mode. */
15660 op0 = gen_reg_rtx (mode);
15661 emit_move_insn (op0, gen_lowpart (mode, op1));
15662
15663 /* Load even elements into the second position. */
15664 emit_insn (gen_load_even (op0,
15665 force_reg (inner_mode,
15666 ops[i + i + 1]),
15667 const1_rtx));
15668 }
15669
15670 /* Cast vector to FIRST_IMODE vector. */
15671 ops[i] = gen_reg_rtx (first_imode);
15672 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
15673 }
15674
15675 /* Interleave low FIRST_IMODE vectors. */
15676 for (i = j = 0; i < n; i += 2, j++)
15677 {
15678 op0 = gen_reg_rtx (first_imode);
15679 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
15680
15681 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
15682 ops[j] = gen_reg_rtx (second_imode);
15683 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
15684 }
15685
15686 /* Interleave low SECOND_IMODE vectors. */
15687 switch (second_imode)
15688 {
15689 case E_V4SImode:
15690 for (i = j = 0; i < n / 2; i += 2, j++)
15691 {
15692 op0 = gen_reg_rtx (second_imode);
15693 emit_insn (gen_interleave_second_low (op0, ops[i],
15694 ops[i + 1]));
15695
15696 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
15697 vector. */
15698 ops[j] = gen_reg_rtx (third_imode);
15699 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
15700 }
15701 second_imode = V2DImode;
15702 gen_interleave_second_low = gen_vec_interleave_lowv2di;
15703 /* FALLTHRU */
15704
15705 case E_V2DImode:
15706 op0 = gen_reg_rtx (second_imode);
15707 emit_insn (gen_interleave_second_low (op0, ops[0],
15708 ops[1]));
15709
15710 /* Cast the SECOND_IMODE vector back to a vector in the original
15711 mode. */
15712 emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
15713 break;
15714
15715 default:
15716 gcc_unreachable ();
15717 }
15718 }
15719
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.

   MMX_OK permits the use of MMX instructions; MODE is the vector mode,
   TARGET the destination register, and VALS a PARALLEL holding one rtx
   per element.  The strategy chosen depends on the element width:
   wide elements go through pairwise concatenation, narrow (QI/HI/HF)
   elements through interleaving of 128-bit pieces, and anything else
   falls through to building the vector word-by-word in GPRs.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      /* Elements of at least 32 bits: build the vector by recursive
	 pairwise concatenation.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      /* View the two TImode elements as pairs of DImode halves and
	 concatenate those into a V4DImode value.  */
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      /* Likewise for four TImode elements, concatenated in two levels
	 up to V8DImode.  */
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    case E_V16HFmode:
      half_mode = V8HFmode;
      goto half;

    half:
      /* 256-bit QI/HI/HF vectors: interleave-initialize each 128-bit
	 half, then concatenate the halves.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    case E_V32HFmode:
      quarter_mode = V8HFmode;
      half_mode = V16HFmode;
      goto quarter;

    quarter:
      /* 512-bit QI/HI/HF vectors: interleave-initialize each 128-bit
	 quarter, pair the quarters into 256-bit halves, then
	 concatenate the halves.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;
      /* FALLTHRU */

    case E_V8HFmode:

      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HImode:
    case E_V8QImode:

    case E_V2HImode:
    case E_V4QImode:
      /* Handled by the word-building fallback below.  */
      break;

    default:
      gcc_unreachable ();
    }

  /* Fallback: assemble each word of the vector in a GPR by shifting
     and OR-ing the elements together, then move the words into the
     vector register.  */
  {
    int i, j, n_elts, n_words, n_elt_per_word;
    machine_mode tmp_mode, inner_mode;
    rtx words[4], shift;

    tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;

    inner_mode = GET_MODE_INNER (mode);
    n_elts = GET_MODE_NUNITS (mode);
    n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
    n_elt_per_word = n_elts / n_words;
    shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

    for (i = 0; i < n_words; ++i)
      {
	rtx word = NULL_RTX;

	/* Elements are combined from the highest-indexed downwards so
	   the first element ends up in the low bits of the word.  */
	for (j = 0; j < n_elt_per_word; ++j)
	  {
	    rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	    elt = convert_modes (tmp_mode, inner_mode, elt, true);

	    if (j == 0)
	      word = elt;
	    else
	      {
		word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
					    NULL_RTX, 1, OPTAB_LIB_WIDEN);
		word = expand_simple_binop (tmp_mode, IOR, word, elt,
					    NULL_RTX, 1, OPTAB_LIB_WIDEN);
	      }
	  }

	words[i] = word;
      }

    if (n_words == 1)
      emit_move_insn (target, gen_lowpart (mode, words[0]));
    else if (n_words == 2)
      {
	rtx tmp = gen_reg_rtx (mode);
	/* Clobber first so the partial writes below don't look like
	   uses of an uninitialized register.  */
	emit_clobber (tmp);
	emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
	emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
	emit_move_insn (target, tmp);
      }
    else if (n_words == 4)
      {
	/* Recurse: initialize a V4SI vector from the four words and
	   bit-cast the result.  */
	rtx tmp = gen_reg_rtx (V4SImode);
	gcc_assert (tmp_mode == SImode);
	vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	emit_move_insn (target, gen_lowpart (mode, tmp));
      }
    else
      gcc_unreachable ();
  }
}
15933
/* Initialize vector TARGET via VALS.  Suppress the use of MMX
   instructions unless MMX_OK is true.

   VALS is a PARALLEL whose entries are either one scalar rtx per
   element, or (when XVECLEN differs from the element count) two
   half-width vector pieces to be concatenated.  Dispatches to the
   duplicate / one-nonzero / one-var / general helpers depending on
   how many entries are non-constant.  */

void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      /* Only the case of exactly two half-width vector pieces is
	 supported here.  */
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  /* For narrow (or TImode) elements, reinterpret the pieces as
	     SImode (resp. DImode) vectors so the concat expander has a
	     mode it can work with; the result is bit-cast back.  */
	  if (inner_mode == QImode
	      || inner_mode == HImode
	      || inner_mode == TImode
	      || inner_mode == HFmode)
	    {
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
	      n_bits /= GET_MODE_SIZE (elt_mode);
	      mode = mode_for_vector (elt_mode, n_bits).require ();
	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  /* Classify the elements: count the non-constant ones, remember the
     position of the last one, and track whether all are equal and/or
     all constant zero.  */
  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
	return;
    }

  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
16021
/* Set element IDX (a variable, not a compile-time constant) of vector
   TARGET to VAL.  Implemented as
     V setg (V v, int idx, T val)
     {
       V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
       V valv = (V){val, val, val, val, val, val, val, val};
       V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
       v = (v & ~mask) | (valv & mask);
       return v;
     }.  */
void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  rtx vec[64];
  machine_mode mode = GET_MODE (target);
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv,idxv,constv,idx_tmp;
  bool ok = false;

  /* 512-bits vector byte/word broadcast and comparison only available
     under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
     when without TARGET_AVX512BW. */
  if ((mode == V32HImode || mode == V32HFmode || mode == V64QImode)
      && !TARGET_AVX512BW)
    {
      gcc_assert (TARGET_AVX512F);
      rtx vhi, vlo, idx_hi;
      machine_mode half_mode;
      rtx (*extract_hi)(rtx, rtx);
      rtx (*extract_lo)(rtx, rtx);

      if (mode == V32HImode)
	{
	  half_mode = V16HImode;
	  extract_hi = gen_vec_extract_hi_v32hi;
	  extract_lo = gen_vec_extract_lo_v32hi;
	}
      else if (mode == V32HFmode)
	{
	  half_mode = V16HFmode;
	  extract_hi = gen_vec_extract_hi_v32hf;
	  extract_lo = gen_vec_extract_lo_v32hf;
	}
      else
	{
	  half_mode = V32QImode;
	  extract_hi = gen_vec_extract_hi_v64qi;
	  extract_lo = gen_vec_extract_lo_v64qi;
	}

      vhi = gen_reg_rtx (half_mode);
      vlo = gen_reg_rtx (half_mode);
      idx_hi = gen_reg_rtx (GET_MODE (idx));
      emit_insn (extract_hi (vhi, target));
      emit_insn (extract_lo (vlo, target));
      /* idx_hi = idx - n_elts/2, the index relative to the high half.  */
      vec[0] = idx_hi;
      vec[1] = idx;
      vec[2] = GEN_INT (n_elts/2);
      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
      /* Recurse on both halves unconditionally: the blend below only
	 modifies an element whose index matches one of 0..n/2-1, so the
	 half for which the (possibly wrapped-around) index is out of
	 range is left unchanged.  */
      ix86_expand_vector_set_var (vhi, val, idx_hi);
      ix86_expand_vector_set_var (vlo, val, idx);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
      return;
    }

  /* Integer vector comparison is used for the mask, so map FP vector
     modes to the integer vector mode of the same layout.  */
  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
    {
      switch (mode)
	{
	case E_V2DFmode:
	  cmp_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  cmp_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  cmp_mode = V8DImode;
	  break;
	case E_V2SFmode:
	  cmp_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  cmp_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  cmp_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  cmp_mode = V16SImode;
	  break;
	case E_V8HFmode:
	  cmp_mode = V8HImode;
	  break;
	case E_V16HFmode:
	  cmp_mode = V16HImode;
	  break;
	case E_V32HFmode:
	  cmp_mode = V32HImode;
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  /* constv = {0, 1, ..., n_elts-1}, the per-lane index constants.  */
  for (int i = 0; i != n_elts; i++)
    vec[i] = GEN_INT (i);
  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
  valv = gen_reg_rtx (mode);
  idxv = gen_reg_rtx (cmp_mode);
  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);

  /* Broadcast VAL and the index, then blend VAL into TARGET where the
     broadcast index equals the lane number.  */
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  mode, valv, val);
  gcc_assert (ok);
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  cmp_mode, idxv, idx_tmp);
  gcc_assert (ok);
  vec[0] = target;
  vec[1] = valv;
  vec[2] = target;
  vec[3] = gen_rtx_EQ (mode, idxv, constv);
  vec[4] = idxv;
  vec[5] = constv;
  ok = ix86_expand_int_vcond (vec);
  gcc_assert (ok);
}
16148
/* Set element ELT (a compile-time constant) of vector TARGET to VAL.
   MMX_OK permits the use of MMX instructions.  Depending on the mode
   and available ISA this uses a vec_merge, an AVX512 mask blend, a
   shuffle sequence, extract/modify/insert on a sub-vector, or as a
   last resort a round trip through a stack temporary.  */

void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  bool blendm_const = false;
  rtx tmp;
  /* lo/hi 128-bit extract and insert patterns for the 256-bit modes,
     indexed by J set in the mode switch below.  */
  static rtx (*gen_extract[7][2]) (rtx, rtx)
    = {
	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
	{ gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf }
      };
  static rtx (*gen_insert[7][2]) (rtx, rtx, rtx)
    = {
	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
	{ gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
      };
  int i, j, n;
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
	{
	  /* Extract the other element and rebuild via VEC_CONCAT.  */
	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
	  if (elt == 0)
	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
	  else
	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
	  emit_insn (gen_rtx_SET (target, tmp));
	  return;
	}
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
	break;

      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
	 preserve the rest of the vector for combiner:

	 (vec_merge:V2DF
	   (vec_duplicate:V2DF (reg:DF))
	   (reg:V2DF)
	   (const_int 1))
       */
      if (elt == 0)
	goto do_vec_merge;

      {
	rtx op0, op1;

	/* For the two element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */

	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

	if (elt == 0)
	  op0 = val, op1 = tmp;
	else
	  op0 = tmp, op1 = val;

	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
	emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Pre-SSE4.1: emulate the insertion with SHUFPS sequences.  */
      switch (elt)
	{
	case 0:
	  use_vec_merge = true;
	  break;

	case 1:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* target = A A B B */
	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
	  /* target = X A B B */
	  ix86_expand_vector_set (false, target, val, 0);
	  /* target = A X C D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const1_rtx, const0_rtx,
					  GEN_INT (2+4), GEN_INT (3+4)));
	  return;

	case 2:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (0+4), GEN_INT (3+4)));
	  return;

	case 3:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B C X */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (2+4), GEN_INT (0+4)));
	  return;

	default:
	  gcc_unreachable ();
	}
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      if (TARGET_SSE2)
	{
	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
	     store into element 0, then shuffle them back.  */

	  rtx order[4];

	  order[0] = GEN_INT (elt);
	  order[1] = const1_rtx;
	  order[2] = const2_rtx;
	  order[3] = GEN_INT (3);
	  order[elt] = const0_rtx;

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));

	  ix86_expand_vector_set (false, target, val, 0);

	  /* The same permutation swaps 0 and ELT back again.  */
	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  rtx t = gen_reg_rtx (V4SFmode);
	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
	  emit_move_insn (target, gen_lowpart (mode, t));
	}
      return;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V2HImode:
      use_vec_merge = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
    case E_V4QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    /* 256-bit modes: J indexes the gen_extract/gen_insert tables,
       N is the element count of one 128-bit half.  */
    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HFmode:
      /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
      if (TARGET_AVX2 && elt != 0)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx2_pblendph_1;
	  blendm_const = true;
	  break;
	}
      else
	{
	  half_mode = V8HFmode;
	  j = 6;
	  n = 8;
	  goto half;
	}

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

    half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    /* 512-bit modes: use an AVX512 masked blend when available.  */
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8df;
	}
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8di;
	}
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16sf;
	}
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16si;
	}
      break;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hf;
	}
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hi;
	}
      else if (TARGET_AVX512F)
	{
	  /* No 16-bit blend without AVX512BW; modify a 128-bit
	     quarter instead.  */
	  half_mode = E_V8HImode;
	  n = 8;
	  goto quarter;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  mmode = DImode;
	  gen_blendm = gen_avx512bw_blendmv64qi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V16QImode;
	  n = 16;
	  goto quarter;
	}
      break;

    quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
	/* Extract the quarter via a masked 128-bit extract on the
	   V16SI view of the vector.  */
	tmp = gen_reg_rtx (V4SImode);
	rtx tmp2 = gen_lowpart (V16SImode, target);
	rtx mask = gen_reg_rtx (QImode);

	/* All-ones mask: every lane of the extract is written.  */
	emit_move_insn (mask, constm1_rtx);
	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
						   tmp, mask));

	tmp2 = gen_reg_rtx (half_mode);
	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
	tmp = tmp2;

	/* Put val in tmp at elt.  */
	ix86_expand_vector_set (false, tmp, val, elt);

	/* Put it back.  */
	tmp2 = gen_reg_rtx (V16SImode);
	rtx tmp3 = gen_lowpart (V16SImode, target);
	mask = gen_reg_rtx (HImode);
	emit_move_insn (mask, constm1_rtx);
	tmp = gen_lowpart (V4SImode, tmp);
	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
						  tmp3, mask));
	emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      /* AVX512 masked blend: broadcast VAL, then blend it into TARGET
	 under a single-bit mask selecting lane ELT.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
      /* The avx512*_blendm<mode> expanders have different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and second input operand otherwise,
	 in {sse,avx}*_*blend* the first input operand is used for elements
	 where the mask is clear and second input operand otherwise.  */
      if (!blendm_const)
	merge_mask = force_reg (mmode, merge_mask);
      emit_insn (gen_blendm (target, target, tmp, merge_mask));
    }
  else if (use_vec_merge)
    {
do_vec_merge:
      if (!nonimmediate_operand (val, inner_mode))
	val = force_reg (inner_mode, val);
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
			       GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      /* Last resort: spill to the stack, store the element, reload.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
16575
/* Extract element ELT of vector VEC into scalar TARGET.  MMX_OK
   permits the use of MMX instructions.  Uses a vec_select where the
   target supports it, otherwise shuffles the wanted element into
   lane 0 first (or recurses on a lo/hi sub-vector for the wider
   modes), falling back to a stack temporary as a last resort.  */

void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      /* Pre-SSE4.1: move the wanted element into lane 0 first,
	 then extract lane 0.  */
      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
					  GEN_INT (elt), GEN_INT (elt),
					  GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  /* UNPCKHPS puts element 2 in lane 0.  */
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  /* Same idea as V4SF, with integer shuffles.  */
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V2HImode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      /* Without PEXTRB, element 0 can still be had cheaply by
	 extracting the low SImode lane and taking its low byte.  */
      if (!use_vec_extr
	  && TARGET_SSE2
	  && elt == 0
	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
	{
	  tmp = gen_reg_rtx (SImode);
	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
				      0);
	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
	  return;
	}
      break;
    case E_V4QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    /* Wider modes: extract the lo or hi sub-vector containing ELT and
       recurse with the index reduced modulo the sub-vector length.  */
    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HFmode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HFmode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  rtx reg = gen_reg_rtx (SImode);
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  emit_move_insn (reg, tmp);
	  tmp = gen_lowpart (inner_mode, reg);
	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
	  SUBREG_PROMOTED_SET (tmp, 1);
	}

      emit_move_insn (target, tmp);
    }
  else
    {
      /* Last resort: spill the vector and load the element back.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
16908
16909 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
16910 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
16911 The upper bits of DEST are undefined, though they shouldn't cause
16912 exceptions (some bits from src or all zeros are ok). */
16913
static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  /* Emit one shuffle/shift moving the upper I/2 bits of SRC down to the
     low half.  TEM holds the pattern to emit; D is its destination,
     which may be a scratch pseudo in a different mode when the required
     shift insn only exists in that mode — the result is copied back to
     DEST at the end.  */
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      /* I == 128: move the high 64 bits down with MOVHLPS; otherwise
	 (I == 64) pick element 1 into the low slots via SHUFPS.  */
      if (i == 128)
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      /* Duplicate the high double into the low position.  */
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    case E_V4QImode:
      /* Logical right shift of the whole 32-bit vector by I/2 bits.  */
      d = gen_reg_rtx (V1SImode);
      tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V4HImode:
      /* Logical right shift of the whole 64-bit vector by I/2 bits.  */
      d = gen_reg_rtx (V1DImode);
      tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      /* All 128-bit modes: shift the full vector right by I/2 bits
	 through a V1TImode view.  */
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      /* I == 256: swap the two 128-bit lanes; otherwise an in-lane
	 SHUFPS with a selector picking the upper elements.  */
      if (i == 256)
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      /* Same scheme as V8SFmode using the double-precision shuffles.  */
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  /* Swap the 128-bit lanes with VPERM2I128, operating on a
	     V4DImode view of the operands.  */
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  /* Shift both 128-bit halves right by I/2 bits in-lane.  */
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;
    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
      /* Sub-64-bit moves need the AVX512BW per-lane V4TImode shift;
	 larger moves share the lane shuffles below.  */
      if (i < 64)
	{
	  d = gen_reg_rtx (V4TImode);
	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
					GEN_INT (i / 2));
	  break;
	}
      /* FALLTHRU */
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	/* Move whole 128-bit lanes down on a V16SImode view; the lane
	   selectors differ for I == 512 vs I == 256, and the upper
	   lanes of the result are don't-care.  */
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	/* I == 128 or 64: per-lane PSHUFD moving the wanted element of
	   each 128-bit lane into its slot 0.  */
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (0xF),
				    GEN_INT (0xF),
				    GEN_INT (0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  /* When a scratch in another mode was used, copy it back to DEST.  */
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
17038
17039 /* Expand a vector reduction. FN is the binary pattern to reduce;
17040 DEST is the destination; IN is the input vector. */
17041
17042 void
17043 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
17044 {
17045 rtx half, dst, vec = in;
17046 machine_mode mode = GET_MODE (in);
17047 int i;
17048
17049 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
17050 if (TARGET_SSE4_1
17051 && mode == V8HImode
17052 && fn == gen_uminv8hi3)
17053 {
17054 emit_insn (gen_sse4_1_phminposuw (dest, in));
17055 return;
17056 }
17057
17058 for (i = GET_MODE_BITSIZE (mode);
17059 i > GET_MODE_UNIT_BITSIZE (mode);
17060 i >>= 1)
17061 {
17062 half = gen_reg_rtx (mode);
17063 emit_reduc_half (half, vec, i);
17064 if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
17065 dst = dest;
17066 else
17067 dst = gen_reg_rtx (mode);
17068 emit_insn (fn (dst, half, vec));
17069 vec = dst;
17070 }
17071 }
17072
17073 /* Output code to perform a conditional jump to LABEL, if C2 flag in
17074 FP status register is set. */
17075
17076 void
17077 ix86_emit_fp_unordered_jump (rtx label)
17078 {
17079 rtx reg = gen_reg_rtx (HImode);
17080 rtx_insn *insn;
17081 rtx temp;
17082
17083 emit_insn (gen_x86_fnstsw_1 (reg));
17084
17085 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
17086 {
17087 emit_insn (gen_x86_sahf_1 (reg));
17088
17089 temp = gen_rtx_REG (CCmode, FLAGS_REG);
17090 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
17091 }
17092 else
17093 {
17094 emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
17095
17096 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
17097 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
17098 }
17099
17100 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
17101 gen_rtx_LABEL_REF (VOIDmode, label),
17102 pc_rtx);
17103 insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
17104 predict_jump (REG_BR_PROB_BASE * 10 / 100);
17105 JUMP_LABEL (insn) = label;
17106 }
17107
/* Output code to perform a sinh XFmode calculation.  */
17109
void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  /* Computes sinh(x) = sign(x) * 0.5 * (e/(e+1) + e) with
     e = expm1(|x|); using expm1 keeps accuracy near zero.  */
  rtx e1 = gen_reg_rtx (XFmode);	/* expm1 (|op1|).  */
  rtx e2 = gen_reg_rtx (XFmode);	/* scratch / unsigned result.  */
  rtx scratch = gen_reg_rtx (HImode);	/* fxam status word.  */
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1); the 0x02 test extracts the sign flag fxam
     left in the upper status byte.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2: branch over the negation when the sign
     bit is clear, so positive inputs keep the positive magnitude.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
17156
/* Output code to perform a cosh XFmode calculation.  */
17158
17159 void
17160 ix86_emit_i387_cosh (rtx op0, rtx op1)
17161 {
17162 rtx e1 = gen_reg_rtx (XFmode);
17163 rtx e2 = gen_reg_rtx (XFmode);
17164 rtx half = const_double_from_real_value (dconsthalf, XFmode);
17165 rtx cst1;
17166
17167 /* e1 = exp (op1) */
17168 emit_insn (gen_expxf2 (e1, op1));
17169
17170 /* e2 = e1 + 1.0 / e1 */
17171 cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17172 emit_insn (gen_divxf3 (e2, cst1, e1));
17173 emit_insn (gen_addxf3 (e2, e1, e2));
17174
17175 /* op0 = 0.5 * e2 */
17176 half = force_reg (XFmode, half);
17177 emit_insn (gen_mulxf3 (op0, e2, half));
17178 }
17179
/* Output code to perform a tanh XFmode calculation.  */
17181
void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  /* Computes tanh(x) via e / (e + 2) with e = expm1(-|2x|), then fixes
     the sign; expm1 keeps accuracy for small |x|.  */
  rtx e1 = gen_reg_rtx (XFmode);	/* expm1 (-|2*op1|).  */
  rtx e2 = gen_reg_rtx (XFmode);	/* scratch / result.  */
  rtx scratch = gen_reg_rtx (HImode);	/* fxam status word.  */
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1); the 0x02 test extracts the sign flag fxam
     left in the upper status byte.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2: e2 already carries the correct (negative)
     sign for negative inputs, so the negation is skipped for those.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
17226
17227 /* Output code to perform an asinh XFmode calculation. */
17228
void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  /* Computes asinh(x) = sign(x) * log1p (x*x / (sqrt(x*x + 1) + 1) + |x|),
     an algebraically stable form of log (x + sqrt(x*x + 1)).  */
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);	/* fxam status word.  */
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1); the 0x02 test extracts the sign flag fxam
     left in the upper status byte.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2: branch over the negation unless the
     input was negative.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
17279
17280 /* Output code to perform an acosh XFmode calculation. */
17281
17282 void
17283 ix86_emit_i387_acosh (rtx op0, rtx op1)
17284 {
17285 rtx e1 = gen_reg_rtx (XFmode);
17286 rtx e2 = gen_reg_rtx (XFmode);
17287 rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
17288
17289 /* e2 = sqrt (op1 + 1.0) */
17290 emit_insn (gen_addxf3 (e2, op1, cst1));
17291 emit_insn (gen_sqrtxf2 (e2, e2));
17292
17293 /* e1 = sqrt (op1 - 1.0) */
17294 emit_insn (gen_subxf3 (e1, op1, cst1));
17295 emit_insn (gen_sqrtxf2 (e1, e1));
17296
17297 /* e1 = e1 * e2 */
17298 emit_insn (gen_mulxf3 (e1, e1, e2));
17299
17300 /* e1 = e1 + op1 */
17301 emit_insn (gen_addxf3 (e1, e1, op1));
17302
17303 /* op0 = log (e1) */
17304 emit_insn (gen_logxf2 (op0, e1));
17305 }
17306
17307 /* Output code to perform an atanh XFmode calculation. */
17308
void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  /* Computes atanh(x) from 0.5 * log1p (-2|x| / (|x| + 1)), which
     equals -atanh(|x|); the sign is then fixed up below.  */
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);	/* fxam status word.  */
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1); the 0x02 test extracts the sign flag fxam
     left in the upper status byte.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2: e2 is -2*atanh(|op1|), already the right
     sign for negative inputs, so those skip the negation.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
17358
17359 /* Output code to perform a log1p XFmode calculation. */
17360
void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  /* Computes log(1 + op1).  FYL2XP1 is only valid for small arguments
     (|x| below 1 - sqrt(2)/2 per the x87 specification), so larger
     inputs fall back to FYL2X on 1 + op1.  */
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  /* Threshold constant 1 - sqrt(2)/2 ~= 0.29289.  */
  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  /* if (|op1| >= threshold) goto label1 (the fallback path).  */
  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  /* Fast path: res = ln(2) * log2(1 + op1) via fyl2xp1.  */
  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  /* Fallback: res = ln(2) * log2(op1 + 1.0) via fyl2x.  */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
17404
17405 /* Emit code for round calculation. */
/* Emit code for round calculation: round(a) = sgn(a) * floor(|a| + 0.5).
   OP1 may be SFmode, DFmode or XFmode; OP0 may additionally be an
   integer mode (HI/SI/DI), in which case lfloor converts directly.  */
void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);	/* fxam status word.  */
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  /* Widen the input to XFmode so the arithmetic below happens in
     extended precision.  */
  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  /* Select the floor and negation expanders for the result mode.  */
  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
	/* Floor in XFmode, then narrow to the output mode via a
	   no-op truncation unspec.  */
	tmp = gen_reg_rtx (XFmode);

	emit_insn (floor_insn (tmp, e2));
	emit_insn (gen_rtx_SET (res,
				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
						UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a); the 0x02 test extracts the sign flag fxam
     left in the upper status byte.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res: branch over the negation when the
     input was non-negative.  */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
17517
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide
   [https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division].  */
17520
17521 void
17522 ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
17523 {
17524 rtx x0, x1, e0, e1;
17525
17526 x0 = gen_reg_rtx (mode);
17527 e0 = gen_reg_rtx (mode);
17528 e1 = gen_reg_rtx (mode);
17529 x1 = gen_reg_rtx (mode);
17530
17531 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
17532
17533 b = force_reg (mode, b);
17534
17535 /* x0 = rcp(b) estimate */
17536 if (mode == V16SFmode || mode == V8DFmode)
17537 {
17538 if (TARGET_AVX512ER)
17539 {
17540 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17541 UNSPEC_RCP28)));
17542 /* res = a * x0 */
17543 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x0)));
17544 return;
17545 }
17546 else
17547 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17548 UNSPEC_RCP14)));
17549 }
17550 else
17551 emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
17552 UNSPEC_RCP)));
17553
17554 /* e0 = x0 * b */
17555 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
17556
17557 /* e0 = x0 * e0 */
17558 emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
17559
17560 /* e1 = x0 + x0 */
17561 emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
17562
17563 /* x1 = e1 - e0 */
17564 emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
17565
17566 /* res = a * x1 */
17567 emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
17568 }
17569
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
17572
void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  /* Approximate sqrt (RECIP false) or rsqrt (RECIP true) of A using a
     reciprocal-square-root estimate plus one Newton-Raphson step.  */
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  /* AVX512ER's 28-bit estimates are accurate enough to skip the
     refinement step entirely.  */
  if (TARGET_AVX512ER && mode == V16SFmode)
    {
      if (recip)
	/* res = rsqrt28(a) estimate */
	emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						     UNSPEC_RSQRT28)));
      else
	{
	  /* x0 = rsqrt28(a) estimate */
	  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
						      UNSPEC_RSQRT28)));
	  /* res = rcp28(x0) estimate */
	  emit_insn (gen_rtx_SET (res, gen_rtx_UNSPEC (mode, gen_rtvec (1, x0),
						       UNSPEC_RCP28)));
	}
      return;
    }

  /* Constants -3.0 and -0.5 used in the refinement formula below.  */
  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX(mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  /* AND x0 with an all-ones-where-nonzero mask so the estimate
	     becomes 0.0 exactly where the input is 0.0.  */
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  /* With FMA, fold the multiply and the -3.0 add into one insn.  */
  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.  (MTHREE holds -3.0, hence the PLUS.)  */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
17681
17682 /* Expand fabs (OP0) and return a new rtx that holds the result. The
17683 mask for masking out the sign-bit is stored in *SMASK, if that is
17684 non-null. */
17685
17686 static rtx
17687 ix86_expand_sse_fabs (rtx op0, rtx *smask)
17688 {
17689 machine_mode vmode, mode = GET_MODE (op0);
17690 rtx xa, mask;
17691
17692 xa = gen_reg_rtx (mode);
17693 if (mode == SFmode)
17694 vmode = V4SFmode;
17695 else if (mode == DFmode)
17696 vmode = V2DFmode;
17697 else
17698 vmode = mode;
17699 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
17700 if (!VECTOR_MODE_P (mode))
17701 {
17702 /* We need to generate a scalar mode mask in this case. */
17703 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17704 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17705 mask = gen_reg_rtx (mode);
17706 emit_insn (gen_rtx_SET (mask, tmp));
17707 }
17708 emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
17709
17710 if (smask)
17711 *smask = mask;
17712
17713 return xa;
17714 }
17715
17716 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
17717 swapping the operands if SWAP_OPERANDS is true. The expanded
17718 code is a forward jump to a newly created label in case the
17719 comparison is true. The generated label rtx is returned. */
17720 static rtx_code_label *
17721 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
17722 bool swap_operands)
17723 {
17724 bool unordered_compare = ix86_unordered_fp_compare (code);
17725 rtx_code_label *label;
17726 rtx tmp, reg;
17727
17728 if (swap_operands)
17729 std::swap (op0, op1);
17730
17731 label = gen_label_rtx ();
17732 tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
17733 if (unordered_compare)
17734 tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
17735 reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
17736 emit_insn (gen_rtx_SET (reg, tmp));
17737 tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
17738 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
17739 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
17740 tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
17741 JUMP_LABEL (tmp) = label;
17742
17743 return label;
17744 }
17745
17746 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
17747 using comparison code CODE. Operands are swapped for the comparison if
17748 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
17749 static rtx
17750 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
17751 bool swap_operands)
17752 {
17753 rtx (*insn)(rtx, rtx, rtx, rtx);
17754 machine_mode mode = GET_MODE (op0);
17755 rtx mask = gen_reg_rtx (mode);
17756
17757 if (swap_operands)
17758 std::swap (op0, op1);
17759
17760 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
17761
17762 emit_insn (insn (mask, op0, op1,
17763 gen_rtx_fmt_ee (code, mode, op0, op1)));
17764 return mask;
17765 }
17766
17767 /* Expand copysign from SIGN to the positive value ABS_VALUE
17768 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
17769 the sign-bit. */
17770
17771 static void
17772 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
17773 {
17774 machine_mode mode = GET_MODE (sign);
17775 rtx sgn = gen_reg_rtx (mode);
17776 if (mask == NULL_RTX)
17777 {
17778 machine_mode vmode;
17779
17780 if (mode == SFmode)
17781 vmode = V4SFmode;
17782 else if (mode == DFmode)
17783 vmode = V2DFmode;
17784 else
17785 vmode = mode;
17786
17787 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
17788 if (!VECTOR_MODE_P (mode))
17789 {
17790 /* We need to generate a scalar mode mask in this case. */
17791 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
17792 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
17793 mask = gen_reg_rtx (mode);
17794 emit_insn (gen_rtx_SET (mask, tmp));
17795 }
17796 }
17797 else
17798 mask = gen_rtx_NOT (mode, mask);
17799 emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
17800 emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
17801 }
17802
17803 /* Expand SSE sequence for computing lround from OP1 storing
17804 into OP0. */
17805
17806 void
17807 ix86_expand_lround (rtx op0, rtx op1)
17808 {
17809 /* C code for the stuff we're doing below:
17810 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
17811 return (long)tmp;
17812 */
17813 machine_mode mode = GET_MODE (op1);
17814 const struct real_format *fmt;
17815 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
17816 rtx adj;
17817
17818 /* load nextafter (0.5, 0.0) */
17819 fmt = REAL_MODE_FORMAT (mode);
17820 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
17821 real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
17822
17823 /* adj = copysign (0.5, op1) */
17824 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
17825 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
17826
17827 /* adj = op1 + adj */
17828 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
17829
17830 /* op0 = (imode)adj */
17831 expand_fix (op0, adj, 0);
17832 }
17833
/* Expand SSE2 sequence for computing lfloor or lceil
   from OPERAND1 storing into OPERAND0.  */
17836
17837 void
17838 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
17839 {
17840 /* C code for the stuff we're doing below (for do_floor):
17841 xi = (long)op1;
17842 xi -= (double)xi > op1 ? 1 : 0;
17843 return xi;
17844 */
17845 machine_mode fmode = GET_MODE (op1);
17846 machine_mode imode = GET_MODE (op0);
17847 rtx ireg, freg, tmp;
17848 rtx_code_label *label;
17849
17850 /* reg = (long)op1 */
17851 ireg = gen_reg_rtx (imode);
17852 expand_fix (ireg, op1, 0);
17853
17854 /* freg = (double)reg */
17855 freg = gen_reg_rtx (fmode);
17856 expand_float (freg, ireg, 0);
17857
17858 /* ireg = (freg > op1) ? ireg - 1 : ireg */
17859 label = ix86_expand_sse_compare_and_jump (UNLE,
17860 freg, op1, !do_floor);
17861 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
17862 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
17863 emit_move_insn (ireg, tmp);
17864
17865 emit_label (label);
17866 LABEL_NUSES (label) = 1;
17867
17868 emit_move_insn (op0, ireg);
17869 }
17870
17871 /* Generate and return a rtx of mode MODE for 2**n where n is the number
17872 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
17873
17874 static rtx
17875 ix86_gen_TWO52 (machine_mode mode)
17876 {
17877 const struct real_format *fmt;
17878 REAL_VALUE_TYPE TWO52r;
17879 rtx TWO52;
17880
17881 fmt = REAL_MODE_FORMAT (mode);
17882 real_2expN (&TWO52r, fmt->p - 1, mode);
17883 TWO52 = const_double_from_real_value (TWO52r, mode);
17884 TWO52 = force_reg (mode, TWO52);
17885
17886 return TWO52;
17887 }
17888
17889 /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
17890
void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values >= 2**52 are already integral (or NaN) and pass through.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      /* With dynamic rounding, operate on the signed value so the
	 add/sub below rounds in the correct direction.  */
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  /* Adding and subtracting 2**52 rounds XA to an integer in the
     current rounding mode, since values of that magnitude carry no
     fractional bits.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
17943
/* Expand SSE2 sequence for computing floor (DO_FLOOR true) or ceil
   (DO_FLOOR false) from OPERAND1 storing into OPERAND0.  Uses the
   cvttsd2si-style fix/float round trip, so it requires a DImode
   conversion for DFmode (i.e. a 64-bit target).  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1); MASK gets the sign-bit constant for the
     copysign below.  */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Large magnitudes are already integral; skip the computation.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x — truncating fix followed by float gives
     round-toward-zero.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0).  The compare mask
     is all-ones where the adjustment is needed; ANDing with 1.0
     yields the per-element correction, added or subtracted depending
     on floor vs ceil (ceil swaps the compare operands).  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
18010
/* Expand SSE2 sequence for computing floor (DO_FLOOR true) or ceil
   (DO_FLOOR false) from OPERAND1 storing into OPERAND0 without relying
   on DImode truncation via cvttsd2siq that is only available on 64bit
   targets.  Rounding is instead done with the 2**52 add/subtract
   trick on the absolute value.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1); MASK receives the sign-bit constant.  */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Large magnitudes are already integral; skip the computation.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; rounds |x| to integer in the current
     rounding mode.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0).  The all-ones
     compare mask ANDed with 1.0 gives the per-element correction;
     ceil swaps the compare operands and adds instead.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
18081
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  Uses the truncating
   fix/float round trip, so for DFmode it needs the 64-bit
   cvttsd2siq conversion.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1); MASK receives the sign-bit constant used by
     the copysign below.  */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Magnitudes >= 2**52 are already integral; pass through.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x — the fix conversion truncates toward zero,
     which is exactly trunc's semantics.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* Restore the sign so that trunc (-0.5) is -0.0, not +0.0.  */
  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
18127
18128 /* Expand SSE sequence for computing trunc from OPERAND1 storing
18129 into OPERAND0 without relying on DImode truncation via cvttsd2siq
18130 that is only available on 64bit targets. */
18131 void
18132 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
18133 {
18134 machine_mode mode = GET_MODE (operand0);
18135 rtx xa, xa2, TWO52, tmp, one, res, mask;
18136 rtx_code_label *label;
18137
18138 /* C code for SSE variant we expand below.
18139 double xa = fabs (x), x2;
18140 if (!isless (xa, TWO52))
18141 return x;
18142 xa2 = xa + TWO52 - TWO52;
18143 Compensate:
18144 if (xa2 > xa)
18145 xa2 -= 1.0;
18146 x2 = copysign (xa2, x);
18147 return x2;
18148 */
18149
18150 TWO52 = ix86_gen_TWO52 (mode);
18151
18152 /* Temporary for holding the result, initialized to the input
18153 operand to ease control flow. */
18154 res =copy_to_reg (operand1);
18155
18156 /* xa = abs (operand1) */
18157 xa = ix86_expand_sse_fabs (res, &mask);
18158
18159 /* if (!isless (xa, TWO52)) goto label; */
18160 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
18161
18162 /* xa2 = xa + TWO52 - TWO52; */
18163 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
18164 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
18165
18166 /* generate 1.0 */
18167 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
18168
18169 /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
18170 tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
18171 emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
18172 tmp = expand_simple_binop (mode, MINUS,
18173 xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
18174 /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
18175 if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
18176 tmp = ix86_expand_sse_fabs (tmp, NULL);
18177
18178 /* res = copysign (xa2, operand1) */
18179 ix86_sse_copysign_to_positive (res, tmp, res, mask);
18180
18181 emit_label (label);
18182 LABEL_NUSES (label) = 1;
18183
18184 emit_move_insn (operand0, res);
18185 }
18186
/* Expand SSE sequence for computing round (nearest, halfway away from
   zero) from OPERAND1 storing into OPERAND0.  Uses the truncating
   fix/float round trip on |x| + pred_half.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  /* xa = abs (operand1); MASK receives the sign-bit constant.  */
  xa = ix86_expand_sse_fabs (res, &mask);
  /* Magnitudes >= 2**52 are already integral; pass through.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0) — the largest value strictly below 0.5.
     Adding exactly 0.5 could round values just under 0.5 up to 1 via
     the subsequent truncation; the predecessor of 0.5 avoids that.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa — truncation yields the rounded value.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) restores the original sign.  */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
18235
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  Rounds |x| with the
   2**52 trick and then compensates in both directions.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1); MASK receives the sign-bit constant.  */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Magnitudes >= 2**52 are already integral; pass through.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; rounds |x| to integer in the current
     rounding mode.  */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; the rounding error, in [-0.5, 0.5].  */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* Compensate.  Each compare mask is all-ones where the adjustment
     applies; AND with 1.0 turns it into a per-element correction.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) restores the original sign.  */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
18305
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.
   OP0 and OP1 must both be SFmode or both DFmode.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  /* Generator functions selected by mode below.  */
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  switch (mode)
    {
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0) — the predecessor of 0.5 avoids values
     just below 0.5 being pushed up to 1 by the addition.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) via the SSE4.1 round insn with ROUND_TRUNC.  */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
18353
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  The placeholder operands are swapped in and out
   around each use; GTY keeps the cached insn alive across GC.  */

static GTY(()) rtx_insn *vselect_insn;
18359
/* Initialize vselect_insn: build the placeholder SET once, emitted
   inside a discarded sequence so it never lands in a real insn
   stream.  */

static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  /* Allocate the selector PARALLEL at maximum length; expand_vselect
     later shrinks it per use via PUT_NUM_ELEM.  */
  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  /* The V2DF/V4DF modes are placeholders; callers overwrite them with
     PUT_MODE before recognition.  */
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
18378
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.
   PERM has NELT entries.  If TESTING_P, only test recognizability
   and emit nothing.  Works by temporarily patching the cached
   vselect_insn, recognizing it, and restoring it afterwards.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* Shrink the cached PARALLEL to NELT entries and fill in PERM.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  /* Swap in the real operands; remember the placeholder vconcat so it
     can be restored below.  */
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  /* Emit a copy, not the cached insn itself, so the cache survives.  */
  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  /* Restore the placeholders and invalidate the memoized code so the
     next caller re-recognizes from scratch.  */
  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
18412
/* Similar, but generate a vec_concat from op0 and op1 as well, so the
   permutation selects from the 2*NELT elements of both operands.
   Fails if no vector mode twice the width of OP0's mode exists.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  /* Patch the cached VEC_CONCAT with the real operands, delegate to
     expand_vselect, then restore the placeholders.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
18438
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd, i.e. a merge that replaces only the first
   element of one operand with the first element of the other.  */
static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  /* Only the float vector modes that movss/movsd (or their MMX-with-SSE
     counterpart) handle.  */
  if (!(TARGET_SSE && vmode == V4SFmode)
      && !(TARGET_MMX_WITH_SSE && vmode == V2SFmode)
      && !(TARGET_SSE2 && vmode == V2DFmode))
    return false;

  /* Only the first element is changed.  Element 0 must come from the
     start of one operand (index 0 or NELT) and the remaining elements
     must continue consecutively from the other operand.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  /* VEC_MERGE with mask 1 takes element 0 from the second rtx operand
     and the rest from the first.  */
  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
18475
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  A blend
   keeps every element in its lane, taking it from either operand
   according to an immediate (or, for pblendvb, a vector) mask.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  /* ISA gate: 64-byte vectors need AVX512F (and AVX512BW for sub-dword
     elements), 32-byte need AVX2 (or AVX for the float modes), and
     4/8/16-byte need SSE4.1.  */
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
			     || GET_MODE_SIZE (vmode) == 8
			     || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
	return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  /* Build the immediate mask, one bit (or bit group) per destination
     element; narrow-element modes are retried in a wider mode via
     do_subreg when the pattern allows it.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      for (i = 0; i < nelt; ++i)
	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      /* No DImode blend; use pblendw with 4 mask bits per element.  */
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8HImode;
      goto do_subreg;

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
	 an immediate argument, rather than pblendvb with a vector
	 argument.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  {
	  use_pblendvb:
	    /* Byte-granular blend: build a constant vector of 0/-1
	       selector bytes for pblendvb.  */
	    for (i = 0; i < nelt; ++i)
	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

	  finish_pblendvb:
	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
	    vperm = force_reg (vmode, vperm);

	    if (GET_MODE_SIZE (vmode) == 4)
	      emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 8)
	      emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 16)
	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
	    else
	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }

      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      /* Redo the blend in the wider mode chosen above.  */
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      for (i = 0; i < 8; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      for (i = 0; i < 4; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
	 with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
	if (d->perm[i] + 2 != d->perm[i + 2])
	  break;
      if (i < 32)
	{
	  /* See if bytes move the same in both lanes.  If yes,
	     vpblendw with immediate can be used.  */
	  for (i = 0; i < 16; i += 2)
	    if (d->perm[i] + 16 != d->perm[i + 16])
	      goto use_pblendvb;

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i * 2] >= 32) << i;
	  vmode = V16HImode;
	  goto do_subreg;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  break;
      if (i < 16)
	{
	  /* See if words move the same in both lanes.  If not,
	     vpblendvb must be used.  */
	  for (i = 0; i < 8; i++)
	    if (d->perm[i] + 8 != d->perm[i + 8])
	      {
		/* Use vpblendvb.  */
		for (i = 0; i < 32; ++i)
		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

		vmode = V32QImode;
		nelt = 32;
		target = gen_reg_rtx (vmode);
		op0 = gen_lowpart (vmode, op0);
		op1 = gen_lowpart (vmode, op1);
		goto finish_pblendvb;
	      }

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i] >= 16) << i;
	  break;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  /* The AVX512 forms take the mask in a mask register of integer mode
     MMODE rather than as an immediate.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
18733
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  /* Only the one-operand V8SFmode case on AVX is handled here.  */
  if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  /* Build the V8SI selector; vpermilps only reads the low 2 bits of
     each element, so reduce every index into 0..3 lane-local form.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= 8;
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));

  return true;
}
18781
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  On success
   fill *ND with the widened permutation (widening recursively as far
   as possible, up to DImode elements) and return true.  D and ND may
   be the same object.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
			      struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  /* Map each element mode to the vector mode with elements twice as
     wide and half as many.  */
  switch (d->vmode)
    {
    case E_V8QImode: mode = V4HImode; break;
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V4HImode: mode = V2SImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  /* Widening is possible only if every even/odd index pair selects an
     aligned, consecutive pair of source elements.  */
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  /* Keep widening while the element mode is narrower than DImode.  */
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  /* When writing into a separate descriptor, also rewrap the operands
     and target in the new mode.  */
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
	{
	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
	}
      /* While testing, avoid creating a real pseudo register.  */
      if (d->testing_p)
	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
	nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
18834
18835 /* Return true if permutation D can be performed as VMODE permutation
18836 instead. */
18837
18838 static bool
18839 valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
18840 {
18841 unsigned int i, j, chunk;
18842
18843 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
18844 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
18845 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
18846 return false;
18847
18848 if (GET_MODE_NUNITS (vmode) >= d->nelt)
18849 return true;
18850
18851 chunk = d->nelt / GET_MODE_NUNITS (vmode);
18852 for (i = 0; i < d->nelt; i += chunk)
18853 if (d->perm[i] & (chunk - 1))
18854 return false;
18855 else
18856 for (j = 1; j < chunk; ++j)
18857 if (d->perm[i] + j != d->perm[i + j])
18858 return false;
18859
18860 return true;
18861 }
18862
18863 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
18864 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
18865
18866 static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  /* Try to implement the permutation D with a single variable-mask
     shuffle instruction: XOP vpperm for two-operand cases,
     SSSE3/AVX2/AVX512BW vpshufb for one-operand byte shuffles, or the
     variable dword permutes vpermd/vpermps.  For 32-byte two-operand
     vectors only the 128-bit-lane shuffle vperm2i128 is attempted.
     Returns true on success; emits no insns when d->testing_p.  */
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode;
  struct expand_vec_perm_d nd;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  if (!d->one_operand_p)
    /* Two distinct input operands: only XOP vpperm can select bytes
       from the concatenation of two vectors, so sizes 4/8/16 require
       TARGET_XOP; size 32 is only handled via vperm2i128.  */
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_XOP)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_XOP)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_XOP)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	if (valid_perm_using_mode_p (V2TImode, d))
	  {
	    if (d->testing_p)
	      return true;

	    /* Use vperm2i128 insn.  The pattern uses
	       V4DImode instead of V2TImode.  */
	    target = d->target;
	    if (d->vmode != V4DImode)
	      target = gen_reg_rtx (V4DImode);
	    op0 = gen_lowpart (V4DImode, d->op0);
	    op1 = gen_lowpart (V4DImode, d->op1);
	    /* Build the vperm2i128 immediate: low nibble selects the
	       source lane for the low half, bits 4-5 for the high.  */
	    rperm[0]
	      = GEN_INT ((d->perm[0] / (nelt / 2))
			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }
	/* FALLTHRU */

      default:
	return false;
      }
  else
    /* Single input operand.  */
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* V4DImode should be already handled through
	   expand_vselect by vpermq instruction.  */
	gcc_assert (d->vmode != V4DImode);

	vmode = V32QImode;
	if (d->vmode == V8SImode
	    || d->vmode == V16HImode
	    || d->vmode == V32QImode)
	  {
	    /* First see if vpermq can be used for
	       V8SImode/V16HImode/V32QImode.  */
	    if (valid_perm_using_mode_p (V4DImode, d))
	      {
		/* Map each quarter of the permutation down to a
		   64-bit element index in the range 0..3.  */
		for (i = 0; i < 4; i++)
		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V4DImode);
		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				    perm, 4, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V8SImode, d))
	      vmode = V8SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V8SFmode)
	  vmode = V8SImode;

	if (vmode == V32QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (nelt / 2))
		return false;
	  }
	break;

      case 64:
	if (!TARGET_AVX512BW)
	  return false;

	/* If vpermq didn't work, vpshufb won't work either.  */
	if (d->vmode == V8DFmode || d->vmode == V8DImode)
	  return false;

	vmode = V64QImode;
	if (d->vmode == V16SImode
	    || d->vmode == V32HImode
	    || d->vmode == V64QImode)
	  {
	    /* First see if vpermq can be used for
	       V16SImode/V32HImode/V64QImode.  */
	    if (valid_perm_using_mode_p (V8DImode, d))
	      {
		/* Map each eighth of the permutation down to a
		   64-bit element index in the range 0..7.  */
		for (i = 0; i < 8; i++)
		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V8DImode);
		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				    perm, 8, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V16SImode, d))
	      vmode = V16SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V16SFmode)
	  vmode = V16SImode;

	if (vmode == V64QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
		return false;
	  }
	break;

      default:
	return false;
      }

  if (d->testing_p)
    return true;

  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Build the selector constant.  For vpermd/vpermps (V8SI/V16SI) it
     holds one dword index per element; otherwise one byte index per
     byte of the vector, masked to the valid source range.  */
  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      else if (vmode == V64QImode)
	/* AVX512 vpshufb shuffles within 128-bit lanes (nelt / 4).  */
	mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
	/* AVX2 vpshufb shuffles within 128-bit lanes (nelt / 2).  */
	mask = nelt / 2 - 1;
      else
	mask = nelt - 1;

      /* Expand each element index into ELTSZ consecutive byte
	 indices.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  machine_mode vpmode = vmode;

  /* From here on NELT counts bytes of the shuffle vector.  */
  nelt = GET_MODE_SIZE (vmode);

  /* Emulate narrow modes with V16QI instructions.  */
  if (nelt < 16)
    {
      /* Selector byte with the top bit set makes pshufb/pperm write
	 zero to that destination byte.  */
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
	 account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
	{
	  for (i = 0; i < nelt; ++i)
	    {
	      unsigned ival = UINTVAL (rperm[i]);
	      if (ival >= nelt)
		rperm[i] = GEN_INT (ival + 16 - nelt);
	    }
	}

      /* Fill inactive elements in the top positions with zeros.  */
      for (i = nelt; i < 16; ++i)
	rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);

  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  if (d->one_operand_p)
    {
      /* Pick the insn emitter matching the working mode chosen
	 above.  */
      rtx (*gen) (rtx, rtx, rtx);

      if (vmode == V4QImode)
	gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
	gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
	gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
	gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
	gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
	gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
	gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
	gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
	gen = gen_avx512f_permvarv16si;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      /* Two-operand case: XOP vpperm variants only.  */
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      if (vmode == V4QImode)
	gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
	gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
	gen = gen_xop_pperm;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
19177
19178 /* Try to expand one-operand permutation with constant mask. */
19179
19180 static bool
19181 ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
19182 {
19183 machine_mode mode = GET_MODE (d->op0);
19184 machine_mode maskmode = mode;
19185 unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
19186 rtx (*gen) (rtx, rtx, rtx) = NULL;
19187 rtx target, op0, mask;
19188 rtx vec[64];
19189
19190 if (!rtx_equal_p (d->op0, d->op1))
19191 return false;
19192
19193 if (!TARGET_AVX512F)
19194 return false;
19195
19196 /* Accept VNxHImode and VNxQImode now. */
19197 if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
19198 return false;
19199
19200 /* vpermw. */
19201 if (!TARGET_AVX512BW && inner_size == 2)
19202 return false;
19203
19204 /* vpermb. */
19205 if (!TARGET_AVX512VBMI && inner_size == 1)
19206 return false;
19207
19208 switch (mode)
19209 {
19210 case E_V16SImode:
19211 gen = gen_avx512f_permvarv16si;
19212 break;
19213 case E_V16SFmode:
19214 gen = gen_avx512f_permvarv16sf;
19215 maskmode = V16SImode;
19216 break;
19217 case E_V8DImode:
19218 gen = gen_avx512f_permvarv8di;
19219 break;
19220 case E_V8DFmode:
19221 gen = gen_avx512f_permvarv8df;
19222 maskmode = V8DImode;
19223 break;
19224 case E_V32HImode:
19225 gen = gen_avx512bw_permvarv32hi;
19226 break;
19227 case E_V16HImode:
19228 gen = gen_avx512vl_permvarv16hi;
19229 break;
19230 case E_V8HImode:
19231 gen = gen_avx512vl_permvarv8hi;
19232 break;
19233 case E_V64QImode:
19234 gen = gen_avx512bw_permvarv64qi;
19235 break;
19236 case E_V32QImode:
19237 gen = gen_avx512vl_permvarv32qi;
19238 break;
19239 case E_V16QImode:
19240 gen = gen_avx512vl_permvarv16qi;
19241 break;
19242
19243 default:
19244 return false;
19245 }
19246
19247 if (d->testing_p)
19248 return true;
19249
19250 target = d->target;
19251 op0 = d->op0;
19252 for (int i = 0; i < d->nelt; ++i)
19253 vec[i] = GEN_INT (d->perm[i]);
19254 mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
19255 emit_insn (gen (target, op0, force_reg (maskmode, mask)));
19256 return true;
19257 }
19258
19259 static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
19260
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
   in a single instruction.  Strategies are attempted cheapest-first:
   plain moves and broadcasts, VEC_SELECT, SEL+CONCAT forms, blends,
   vpermil, pshufb-style shuffles, palignr, the AVX512 variable
   permutes, and finally a retry in a canonicalized integer mode.  */

static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not.  */
  if (d->one_operand_p)
    {
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      /* Reduce indices modulo NELT and classify the permutation.  */
      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      if (identity_perm)
	{
	  /* The permutation is a no-op; a plain move suffices.  */
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}.  */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates.  */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Finally, try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      /* Flip which operand each index refers to (add/subtract NELT)
	 and retry with op1/op0 concatenated in the opposite order.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{w,b,s,d} instructions  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}
19450
19451 /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
19452 in terms of a pair of pshuflw + pshufhw instructions. */
19453
19454 static bool
19455 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
19456 {
19457 unsigned char perm2[MAX_VECT_LEN];
19458 unsigned i;
19459 bool ok;
19460
19461 if (d->vmode != V8HImode || !d->one_operand_p)
19462 return false;
19463
19464 /* The two permutations only operate in 64-bit lanes. */
19465 for (i = 0; i < 4; ++i)
19466 if (d->perm[i] >= 4)
19467 return false;
19468 for (i = 4; i < 8; ++i)
19469 if (d->perm[i] < 4)
19470 return false;
19471
19472 if (d->testing_p)
19473 return true;
19474
19475 /* Emit the pshuflw. */
19476 memcpy (perm2, d->perm, 4);
19477 for (i = 4; i < 8; ++i)
19478 perm2[i] = i;
19479 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
19480 gcc_assert (ok);
19481
19482 /* Emit the pshufhw. */
19483 memcpy (perm2 + 4, d->perm + 4, 4);
19484 for (i = 0; i < 4; ++i)
19485 perm2[i] = i;
19486 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
19487 gcc_assert (ok);
19488
19489 return true;
19490 }
19491
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSSE3 palignr instruction.  This succeeds
   when all of the elements in PERM fit within one vector and we merely
   need to shift them down so that a single vector permutation has a
   chance to succeed.  If SINGLE_INSN_ONLY_P, succeed if only
   the vpalignr instruction itself can perform the requested permutation.  */

static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  /* Compute the span [MIN, MAX] of the requested element indices,
     and [MINSWAP, MAXSWAP] for the same permutation with the two
     operands swapped (index bit NELT flipped).  */
  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  /* 256-bit palignr works per 128-bit lane: fold the index
	     into lane-relative form before taking the span.  */
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      /* The indices do not fit a single shifted window; retry with
	 the operands swapped, unless that fails too.  */
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      /* Swap the operands and flip bit NELT of each index so DCOPY
	 describes the same permutation.  */
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  /* Rebase the permutation relative to MIN.  If the rebased result
     is the identity, the palignr alone performs the permutation.  */
  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  /* Emit the byte-alignment shift of MIN elements.  */
  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (TImode);
      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, dcopy.op1),
				      gen_lowpart (TImode, dcopy.op0), shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
19624
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using the SSE4_1 pblendv instruction.  Potentially
   reduces permutation from 2 pshufb and or to 1 pshufb and pblendv.  */

static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
			     || GET_MODE_SIZE (vmode) == 8
			     || GET_MODE_SIZE (vmode) == 16))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  WHICH gets bit 1 for a displaced element taken
     from op0 and bit 2 for one taken from op1.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  /* Reduce indices modulo NELT: all displaced elements come from one
     operand, selected above.  */
  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  /* For 16-byte vectors expand_vec_perm_1 must succeed (pshufb et al.);
     for other sizes a failure just means this strategy is unusable.  */
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  /* Blend selector: element I is taken from the second operand
     exactly when the original permutation selected it from there.  */
  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}
19708
19709 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
19710
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a two vector permutation into a single vector permutation by using
   an interleave operation to merge the vectors.  The interleave
   (DREMAP) gathers all needed elements into one vector; REMAP records
   where each original index lands so the follow-up one-operand
   permutation (DFINAL) can be derived.  */

static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8
      || GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      /* For 32-byte modes allow even d->one_operand_p.
	 The lack of cross-lane shuffling in some instructions
	 might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded as
	 3 insn sequence.  While that is one insn longer,
	 it doesn't need a memory operand and in the common
	 case that both interleave low and high permutations
	 with the same operands are adjacent needs 4 insns
	 for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
	return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  /* 0xff marks slots never produced by the interleave; hitting one
     in the final remap would trip the gcc_assert below.  */
  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low,
	 and similarly for interleave high.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	}
      else
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low, and similarly
	 for interleave high.  If the elements are from mis-matched halves, we
	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h1 | h4)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i;
	      remap[i + nelt + nelt2] = i + nelt2;
	      dremap.perm[i] = i;
	      dremap.perm[i + nelt2] = i + nelt + nelt2;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 0;
	      dremap.perm[1] = 3;
	    }
	}
      else if ((contents & (h2 | h3)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i;
	      remap[i + nelt] = i + nelt2;
	      dremap.perm[i] = i + nelt2;
	      dremap.perm[i + nelt2] = i + nelt;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 1;
	      dremap.perm[1] = 2;
	    }
	}
      else
	return false;
    }
  else
    {
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
	q[i] = q[0] << (nelt4 * i);
      /* Record which 128-bit halves of the two inputs are actually
	 referenced.  */
      for (i = 0; i < 4; ++i)
	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
	  {
	    nonzero_halves[nzcnt] = i;
	    ++nzcnt;
	  }

      if (nzcnt == 1)
	{
	  gcc_assert (d->one_operand_p);
	  nonzero_halves[1] = nonzero_halves[0];
	  same_halves = true;
	}
      else if (d->one_operand_p)
	{
	  gcc_assert (nonzero_halves[0] == 0);
	  gcc_assert (nonzero_halves[1] == 1);
	}

      if (nzcnt <= 2)
	{
	  if (d->perm[0] / nelt2 == nonzero_halves[1])
	    {
	      /* Attempt to increase the likelihood that dfinal
		 shuffle will be intra-lane.  */
	      std::swap (nonzero_halves[0], nonzero_halves[1]);
	    }

	  /* vperm2f128 or vperm2i128.  */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
	      remap[i + nonzero_halves[0] * nelt2] = i;
	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
	    }

	  if (d->vmode != V8SFmode
	      && d->vmode != V4DFmode
	      && d->vmode != V8SImode)
	    {
	      /* Express the lane permutation in V8SImode, for which a
		 vperm2 pattern exists.  */
	      dremap.vmode = V8SImode;
	      dremap.nelt = 8;
	      for (i = 0; i < 4; ++i)
		{
		  dremap.perm[i] = i + nonzero_halves[0] * 4;
		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
		}
	    }
	}
      else if (d->one_operand_p)
	return false;
      else if (TARGET_AVX2
	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
	{
	  /* vpunpckl* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      remap[i + nelt2] = i * 2 + nelt2;
	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	      dremap.perm[i * 2 + nelt2] = i + nelt2;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
	    }
	}
      else if (TARGET_AVX2
	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
	{
	  /* vpunpckh* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i + nelt4] = i * 2;
	      remap[i + nelt + nelt4] = i * 2 + 1;
	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i + nelt4;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
	    }
	}
      else
	return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
	 same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
	{
	  gcc_assert (e < nelt2);
	  dfinal.perm[i] = e + nelt2;
	}
      else
	dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  /* Emit the interleave first, then the deferred final shuffle.  */
  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}
20028
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   a single vector cross-lane permutation into vpermq followed
   by any of the single insn permutations.  */

static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  if (!(TARGET_AVX2
	&& (d->vmode == V32QImode || d->vmode == V16HImode)
	&& d->one_operand_p))
    return false;

  /* For each destination half, record (as a bitmask) which source
     quarters (64-bit chunks) it draws elements from.  */
  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  /* Each destination half may use at most two source quarters, since
     vpermq can place only two quarters into each 128-bit lane.  */
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
	  return false;
    }

  if (d->testing_p)
    return true;

  /* First gather the needed quarters with a V4DImode permutation.  */
  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0)
	  dremap.perm[2 * i + cnt++] = j;
      /* Pad unused slots with quarter 0.  */
      for (; cnt < 2; ++cnt)
	dremap.perm[2 * i + cnt] = 0;
    }

  /* Then rewrite the original permutation relative to the gathered
     quarters, which makes it intra-lane.  */
  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      if (i == nelt2)
	j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
	;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
	dfinal.perm[i] |= nelt4;
      else
	gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}
20107
20108 static bool canonicalize_perm (struct expand_vec_perm_d *d);
20109
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  /* Only 32-byte vectors; integer modes additionally need AVX2 for
     vperm2i128.  */
  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
	{
	  /* The second shuffle for e.g. V4DFmode has
	     0123 and ABCD operands.
	     Ignore AB23, as 23 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (1 << 2)) continue;
	  /* And 01CD, as 01 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 0) continue;
	  /* And 4567, as then the vperm2[fi]128 doesn't change
	     anything on the original 4567 second operand.  */
	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
	}
      else
	{
	  /* The second shuffle for e.g. V4DFmode has
	     4567 and ABCD operands.
	     Ignore AB67, as 67 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (3 << 2)) continue;
	  /* And 45CD, as 45 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 2) continue;
	  /* And 0123, as then the vperm2[fi]128 doesn't change
	     anything on the original 0123 first operand.  */
	  if ((perm & 0xf) == (1 << 2)) continue;
	}

      /* Build the in-lane second shuffle assuming this vperm2[fi]128
	 immediate; bail out of the inner loop if some element can be
	 satisfied by neither operand.  */
      for (i = 0; i < nelt; i++)
	{
	  j = d->perm[i] / nelt2;
	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
	  else
	    break;
	}

      if (i == nelt)
	{
	  /* Probe whether the candidate second shuffle is a single
	     insn, without emitting anything.  */
	  start_sequence ();
	  ok = expand_vec_perm_1 (&dsecond);
	  end_sequence ();
	}
      else
	ok = false;

      if (ok)
	{
	  if (d->testing_p)
	    return true;

	  /* Found a usable second shuffle.  dfirst will be
	     vperm2f128 on d->op0 and d->op1.  */
	  dsecond.testing_p = false;
	  dfirst = *d;
	  dfirst.target = gen_reg_rtx (d->vmode);
	  for (i = 0; i < nelt; i++)
	    dfirst.perm[i] = (i & (nelt2 - 1))
			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

	  canonicalize_perm (&dfirst);
	  ok = expand_vec_perm_1 (&dfirst);
	  gcc_assert (ok);

	  /* And dsecond is some single insn shuffle, taking
	     d->op0 and result of vperm2f128 (if perm < 16) or
	     d->op1 and result of vperm2f128 (otherwise).  */
	  if (perm >= 16)
	    dsecond.op0 = dsecond.op1;
	  dsecond.op1 = dfirst.target;

	  ok = expand_vec_perm_1 (&dsecond);
	  gcc_assert (ok);

	  return true;
	}

      /* For one operand, the only useful vperm2f128 permutation is 0x01
	 aka lanes swap.  */
      if (d->one_operand_p)
	return false;
    }

  return false;
}
20226
20227 /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
20228 a two vector permutation using 2 intra-lane interleave insns
20229 and cross-lane shuffle for 32-byte vectors. */
20230
20231 static bool
20232 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
20233 {
20234 unsigned i, nelt;
20235 rtx (*gen) (rtx, rtx, rtx);
20236
20237 if (d->one_operand_p)
20238 return false;
20239 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
20240 ;
20241 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
20242 ;
20243 else
20244 return false;
20245
20246 nelt = d->nelt;
20247 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
20248 return false;
20249 for (i = 0; i < nelt; i += 2)
20250 if (d->perm[i] != d->perm[0] + i / 2
20251 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
20252 return false;
20253
20254 if (d->testing_p)
20255 return true;
20256
20257 switch (d->vmode)
20258 {
20259 case E_V32QImode:
20260 if (d->perm[0])
20261 gen = gen_vec_interleave_highv32qi;
20262 else
20263 gen = gen_vec_interleave_lowv32qi;
20264 break;
20265 case E_V16HImode:
20266 if (d->perm[0])
20267 gen = gen_vec_interleave_highv16hi;
20268 else
20269 gen = gen_vec_interleave_lowv16hi;
20270 break;
20271 case E_V8SImode:
20272 if (d->perm[0])
20273 gen = gen_vec_interleave_highv8si;
20274 else
20275 gen = gen_vec_interleave_lowv8si;
20276 break;
20277 case E_V4DImode:
20278 if (d->perm[0])
20279 gen = gen_vec_interleave_highv4di;
20280 else
20281 gen = gen_vec_interleave_lowv4di;
20282 break;
20283 case E_V8SFmode:
20284 if (d->perm[0])
20285 gen = gen_vec_interleave_highv8sf;
20286 else
20287 gen = gen_vec_interleave_lowv8sf;
20288 break;
20289 case E_V4DFmode:
20290 if (d->perm[0])
20291 gen = gen_vec_interleave_highv4df;
20292 else
20293 gen = gen_vec_interleave_lowv4df;
20294 break;
20295 default:
20296 gcc_unreachable ();
20297 }
20298
20299 emit_insn (gen (d->target, d->op0, d->op1));
20300 return true;
20301 }
20302
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a single vector permutation using a single intra-lane vector
   permutation, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only for AVX1 (with AVX2 other strategies are preferred), float
     256-bit modes, single operand.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  /* Build the intra-lane permutation dfirst: each requested element
     is placed in the lane it already lives in; 0xff marks positions
     not yet assigned.  msk collects the positions that must come from
     the lane-swapped copy.  */
  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      if (j != i)
	msk |= (1 << i);
    }
  /* Fill the unconstrained slots with the identity.  */
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  /* Probe whether dfirst is a single insn; keep the insns aside so
     nothing is emitted on failure or when only testing.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  /* dsecond swaps the two 128-bit lanes of the dfirst result.  */
  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  /* Blend the in-lane and lane-swapped results per msk.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
20370
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two single vector permutations and
   {,v}{,p}unpckl{ps,pd,bw,wd,dq}.  If two_insn, succeed only if one
   of dfirst or dsecond is identity permutation.  */

static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  bool ident1 = true, ident2 = true;

  if (d->one_operand_p)
    return false;

  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (!TARGET_SSE)
	return false;
      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
	return false;
      /* 256-bit unpck insns interleave within each 128-bit lane.  */
      lane = nelt2;
    }
  else
    return false;

  /* The permutation must alternate strictly between the two operands
     (starting with either one).  */
  for (i = 1; i < nelt; i++)
    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
      return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  /* Compute the per-operand pre-shuffles that move each requested
     element to the slot the final unpck will read it from; each index
     is written to both the low and the high unpck source position of
     its (half-)lane.  Track whether either pre-shuffle is the
     identity.  */
  for (i = 0; i < nelt; i++)
    if (d->perm[i] >= nelt)
      {
	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident2 = false;
	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
	  = d->perm[i] - nelt;
      }
    else
      {
	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident1 = false;
	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
      /* If the permutation starts with op1 elements, the unpck
	 operands must be swapped.  */
      if (d->perm[0] >= nelt)
	std::swap (dfinal.op0, dfinal.op1);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Probe each non-identity pre-shuffle; hold the insns until we know
     the whole expansion succeeds.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  /* Final step is the interleave-low pattern (per 128-bit lane for
     32-byte modes).  */
  for (i = 0; i < nelt; i++)
    {
      dfinal.perm[i] = i / 2;
      if (i >= lane)
	dfinal.perm[i] += lane / 2;
      if ((i & 1) != 0)
	dfinal.perm[i] += nelt;
    }
  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
			       dfinal.perm, dfinal.nelt, false);
  gcc_assert (ok);
  return true;
}
20489
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
   the permutation using two single vector permutations and the SSE4_1 pblendv
   instruction.  If two_insn, succeed only if one of dfirst or dsecond is
   identity permutation.  */

static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  machine_mode vmode = d->vmode;
  bool ident1 = true, ident2 = true;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
			     || GET_MODE_SIZE (vmode) == 8
			     || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  /* Split the permutation into an op0-only shuffle (dfirst) and an
     op1-only shuffle (dsecond); 0xff marks don't-care positions.
     Track whether either shuffle is the identity.  */
  for (i = 0; i < nelt; ++i)
    if (d->perm[i] >= nelt)
      {
	dfirst.perm[i] = 0xff;
	dsecond.perm[i] = d->perm[i] - nelt;
	if (d->perm[i] != i + nelt)
	  ident2 = false;
      }
    else
      {
	dsecond.perm[i] = 0xff;
	dfirst.perm[i] = d->perm[i];
	if (d->perm[i] != i)
	  ident1 = false;
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  /* For now.  Ideally treat 0xff as a wildcard.  For 32-byte modes
     mirror the other lane's index when available (keeps the shuffle
     in-lane), otherwise fall back to the identity.  */
  for (i = 0; i < nelt; ++i)
    if (dfirst.perm[i] == 0xff)
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dfirst.perm[i] = i;
      }
    else
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dsecond.perm[i] = i;
      }

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Probe each non-identity shuffle; keep the insns aside until the
     whole expansion is known to succeed.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = get_insns ();
      end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  /* The final blend selects, per position, the pre-shuffled op0 or
     op1 element.  */
  for (i = 0; i < nelt; ++i)
    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);

  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vec_perm_blend (&dfinal);
  gcc_assert (ok);
  return true;
}
20608
20609 /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
20610 permutation using two vperm2f128, followed by a vshufpd insn blending
20611 the two vectors together. */
20612
20613 static bool
20614 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
20615 {
20616 struct expand_vec_perm_d dfirst, dsecond, dthird;
20617 bool ok;
20618
20619 if (!TARGET_AVX || (d->vmode != V4DFmode))
20620 return false;
20621
20622 if (d->testing_p)
20623 return true;
20624
20625 dfirst = *d;
20626 dsecond = *d;
20627 dthird = *d;
20628
20629 dfirst.perm[0] = (d->perm[0] & ~1);
20630 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
20631 dfirst.perm[2] = (d->perm[2] & ~1);
20632 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
20633 dsecond.perm[0] = (d->perm[1] & ~1);
20634 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
20635 dsecond.perm[2] = (d->perm[3] & ~1);
20636 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
20637 dthird.perm[0] = (d->perm[0] % 2);
20638 dthird.perm[1] = (d->perm[1] % 2) + 4;
20639 dthird.perm[2] = (d->perm[2] % 2) + 2;
20640 dthird.perm[3] = (d->perm[3] % 2) + 6;
20641
20642 dfirst.target = gen_reg_rtx (dfirst.vmode);
20643 dsecond.target = gen_reg_rtx (dsecond.vmode);
20644 dthird.op0 = dfirst.target;
20645 dthird.op1 = dsecond.target;
20646 dthird.one_operand_p = false;
20647
20648 canonicalize_perm (&dfirst);
20649 canonicalize_perm (&dsecond);
20650
20651 ok = expand_vec_perm_1 (&dfirst)
20652 && expand_vec_perm_1 (&dsecond)
20653 && expand_vec_perm_1 (&dthird);
20654
20655 gcc_assert (ok);
20656
20657 return true;
20658 }
20659
20660 static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
20661
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
   a two vector permutation using two intra-lane vector
   permutations, vperm2f128 swapping the lanes and vblend* insn blending
   the non-swapped and swapped vectors together.  */

static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only for AVX1 (with AVX2 other strategies win), float 256-bit
     modes, two operands.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  /* Partition the requested elements: dfirst gets those already in
     the right lane, dsecond (to be lane-swapped afterwards) the
     cross-lane ones.  0xff marks unassigned positions.  which1/which2
     record (as a bitmask: 1 = op0, 2 = op1) which operands each
     partition draws from; msk collects the blend mask.  */
  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  msk |= (1U << i);
	}
    }
  /* If everything lands in one partition some simpler strategy
     applies; don't bother here.  */
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  /* Fill unassigned slots with the identity of whichever operand the
     partition exclusively uses (op1 identity if only op1 is used).  */
  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }
  /* Probe both sub-permutations recursively, keeping the insns aside
     until both are known to succeed.  */
  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = get_insns ();
  end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  /* Swap the two 128-bit lanes of the dsecond result.  */
  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  /* Blend the in-lane and lane-swapped results per msk.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
20759
/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
   permutation with two pshufb insns and an ior.  We should have already
   failed all two instruction sequences.  */

static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;
  machine_mode mode;
  rtx (*gen) (rtx, rtx, rtx);

  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
			&& GET_MODE_SIZE (d->vmode) != 8
			&& GET_MODE_SIZE (d->vmode) != 4))
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  /* Select the byte-shuffle expander matching the vector width.  */
  switch (GET_MODE_SIZE (d->vmode))
    {
    case 4:
      mode = V4QImode;
      gen = gen_mmx_pshufbv4qi3;
      break;
    case 8:
      mode = V8QImode;
      gen = gen_mmx_pshufbv8qi3;
      break;
    case 16:
      mode = V16QImode;
      gen = gen_ssse3_pshufbv16qi3;
      break;
    default:
      gcc_unreachable ();
    }

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, k, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}

      /* For narrower-than-16-byte modes fill the tail of both masks
	 with the zeroing value.  */
      for (k = i*eltsz + j; k < 16; ++k)
	rperm[0][k] = rperm[1][k] = m128;
    }

  /* pshufb op0 with the first mask...  */
  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op0);
  emit_insn (gen (l, op, vperm));

  /* ... pshufb op1 with the second mask ...  */
  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op1);
  emit_insn (gen (h, op, vperm));

  /* ... and merge the two zero-filled halves with an ior.  */
  op = d->target;
  if (d->vmode != mode)
    op = gen_reg_rtx (mode);
  ix86_emit_vec_binop (IOR, mode, op, l, h);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
20847
/* Implement arbitrary permutation of one V32QImode and V16QImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      /* e is the in-lane byte-group index; which is 0 for same-lane
	 elements and the byte offset of a lane (16) otherwise.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  /* vpshufb with the cross-lane mask (rperm[1]).  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp via vpermq.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  /* vpshufb with the same-lane mask (rperm[0]).  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* Combine the two zero-filled halves with vpor.  */
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
20918
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode and V16QImode operand
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */

static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* Accept only extract-even (perm[i] == 2*i) or extract-odd
     (perm[i] == 2*i+1) patterns.  */
  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      /* xorv moves the middle two quarters' bytes to the positions the
	 later vpermq { 0, 2, 1, 3 } shuffle expects.  */
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  /* vpshufb op0 with the first mask...  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* ... vpshufb op1 with the second mask ...  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* ... and merge the zero-filled halves with vpor.  */
  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
20998
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
   operands with two "and" and "pack" or two "shift" and "pack" insns.
   We should have already failed all two instruction sequences.  */

static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  /* Per mode: c is the low-half mask, s the element shift count,
     half_mode the double-width mode the operands are viewed in, and
     end_perm whether the 256-bit pack result needs a final vpermq
     fixup (AVX2 packs operate per 128-bit lane).  */
  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
	 are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  /* For even extraction mask off the high halves of each double-width
     element; for odd extraction shift them down instead.  Either way
     the subsequent unsigned-saturating pack selects the wanted
     elements.  */
  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
      t = force_reg (half_mode, t);
      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
    }
  else
    {
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx,
				      const2_rtx,
				      const1_rtx,
				      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
21137
21138 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
21139 and extract-odd permutations of two V64QI operands
21140 with two "shifts", two "truncs" and one "concat" insns for "odd"
21141 and two "truncs" and one concat insn for "even."
21142 Have already failed all two instruction sequences. */
21143
21144 static bool
21145 expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
21146 {
21147 rtx t1, t2, t3, t4;
21148 unsigned i, odd, nelt = d->nelt;
21149
21150 if (!TARGET_AVX512BW
21151 || d->one_operand_p
21152 || d->vmode != V64QImode)
21153 return false;
21154
21155 /* Check that permutation is even or odd. */
21156 odd = d->perm[0];
21157 if (odd > 1)
21158 return false;
21159
21160 for (i = 1; i < nelt; ++i)
21161 if (d->perm[i] != 2 * i + odd)
21162 return false;
21163
21164 if (d->testing_p)
21165 return true;
21166
21167
21168 if (odd)
21169 {
21170 t1 = gen_reg_rtx (V32HImode);
21171 t2 = gen_reg_rtx (V32HImode);
21172 emit_insn (gen_lshrv32hi3 (t1,
21173 gen_lowpart (V32HImode, d->op0),
21174 GEN_INT (8)));
21175 emit_insn (gen_lshrv32hi3 (t2,
21176 gen_lowpart (V32HImode, d->op1),
21177 GEN_INT (8)));
21178 }
21179 else
21180 {
21181 t1 = gen_lowpart (V32HImode, d->op0);
21182 t2 = gen_lowpart (V32HImode, d->op1);
21183 }
21184
21185 t3 = gen_reg_rtx (V32QImode);
21186 t4 = gen_reg_rtx (V32QImode);
21187 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
21188 emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
21189 emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
21190
21191 return true;
21192 }
21193
21194 /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
21195 and extract-odd permutations. */
21196
static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  /* NOTE: in d->testing_p mode the cases below "break" early without
     emitting anything; reaching the final "return true" then just
     reports that the mode is supported.  */
  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	/* 0xdd selects odd elements, 0x88 even ones, per shufps lane.  */
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
	return false;
      break;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry the same permutation on the operands
	     bit-cast to V4DF, where the vperm2f128 path above applies.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry on the operands bit-cast to V8SF and
	     use the shufps/vperm2f128 path above.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
21454
21455 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
21456 extract-even and extract-odd permutations. */
21457
21458 static bool
21459 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
21460 {
21461 unsigned i, odd, nelt = d->nelt;
21462
21463 odd = d->perm[0];
21464 if (odd != 0 && odd != 1)
21465 return false;
21466
21467 for (i = 1; i < nelt; ++i)
21468 if (d->perm[i] != 2 * i + odd)
21469 return false;
21470
21471 if (d->vmode == E_V32HImode
21472 && d->testing_p
21473 && !TARGET_AVX512BW)
21474 return false;
21475
21476 return expand_vec_perm_even_odd_1 (d, odd);
21477 }
21478
21479 /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
21480 permutations. We assume that expand_vec_perm_1 has already failed. */
21481
static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  /* ELT is the element to replicate; NELT2 tracks the half-vector
     boundary so we can pick the high or low interleave insn.  */
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
    case E_V4HImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
	return true;

      /* Duplicate the wanted byte into an adjacent pair, picking the
	 high or low interleave depending on which half ELT is in.  */
      if (elt >= nelt2)
	{
	  gen = gen_mmx_punpckhbw_low;
	  elt -= nelt2;
	}
      else
	gen = gen_mmx_punpcklbw_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      /* Finish by broadcasting the 16-bit pair with a 2-element select.  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
      /* This can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
				      : gen_mmx_punpckhwd;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
				    : gen_mmx_punpcklwd;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V2SImode);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				     : gen_vec_interleave_lowv8hi;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
	return true;

      if (elt >= nelt2)
	{
	  gen = gen_vec_interleave_highv8hf;
	  elt -= nelt2;
	}
      else
	gen = gen_vec_interleave_lowv8hf;
      nelt2 /= 2;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));

      /* One interleave doubles the element width, so finish with a
	 4-element select in V4SImode.  */
      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}
21654
21655 /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
21656 broadcast permutations. */
21657
21658 static bool
21659 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
21660 {
21661 unsigned i, elt, nelt = d->nelt;
21662
21663 if (!d->one_operand_p)
21664 return false;
21665
21666 elt = d->perm[0];
21667 for (i = 1; i < nelt; ++i)
21668 if (d->perm[i] != elt)
21669 return false;
21670
21671 return expand_vec_perm_broadcast_1 (d);
21672 }
21673
21674 /* Implement arbitrary permutations of two V64QImode operands
21675 with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  /* Only the 64-byte AVX512BW case is handled by this sequence.  */
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  /* The five-insn sequence below always succeeds, so a dry run needs
     no code generation.  */
  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  /* Set up two word-sized (V32HI) sub-permutations over the same two
     inputs; each gets refined bytewise with vpshufb afterwards.  */
  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      /* Word index holding the requested byte goes into the word-level
	 selector; the byte-level masks pick the right half of it.  */
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  /* Odd destination byte: comes from ds[1]'s result; zero this
	     lane (-1) in ds[0]'s vpshufb mask.  */
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  /* Even destination byte: comes from ds[0]'s result.  */
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  /* Expand the two word permutations (vperm[it]2w).  */
  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  /* Byte-select within each intermediate result ...  */
  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  /* ... and merge the disjoint halves with a single vpor.  */
  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
21745
21746 /* Implement arbitrary permutation of two V32QImode and V16QImode operands
21747 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
21748 all the shorter instruction sequences. */
21749
static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  /* Only 256-bit two-operand byte/word shuffles with AVX2.  */
  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* The sequence below always works, so no dry-run codegen needed.  */
  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      /* Default every mask byte to "produce zero" (bit 7 set).  */
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      /* E: element index within a 128-bit lane; XLANE: nonzero byte
	 offset when source and destination are in different lanes;
	 WHICH: mask 0/1 for op0, 2/3 for op1, odd for cross-lane.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  /* Emit the cross-lane vpshufb's (masks 1 and 3), when needed.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X] with vpermq {2,3,0,1}.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  /* Emit the same-lane vpshufb's (masks 0 and 2), when needed.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  /* Combine the per-operand same-lane and cross-lane pieces; each
     vpshufb result has zeros where the other contributes bytes.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  /* Final merge of the op0 and op1 contributions.  */
  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
  return true;
}
21860
21861 /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
21862 taken care of, perform the expansion in D and return true on success. */
21863
static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Strategies are ordered by expected insn count: each helper either
     matches D and expands it (or, with d->testing_p, merely confirms
     feasibility) and returns true, or declines with false.  */

  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, true))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, true))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, false))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, false))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
21969
21970 /* If a permutation only uses one operand, make it clear. Returns true
21971 if the permutation references both operands. */
21972
21973 static bool
21974 canonicalize_perm (struct expand_vec_perm_d *d)
21975 {
21976 int i, which, nelt = d->nelt;
21977
21978 for (i = which = 0; i < nelt; ++i)
21979 which |= (d->perm[i] < nelt ? 1 : 2);
21980
21981 d->one_operand_p = true;
21982 switch (which)
21983 {
21984 default:
21985 gcc_unreachable();
21986
21987 case 3:
21988 if (!rtx_equal_p (d->op0, d->op1))
21989 {
21990 d->one_operand_p = false;
21991 break;
21992 }
21993 /* The elements of PERM do not suggest that only the first operand
21994 is used, but both operands are identical. Allow easier matching
21995 of the permutation by folding the permutation into the single
21996 input vector. */
21997 /* FALLTHRU */
21998
21999 case 2:
22000 for (i = 0; i < nelt; ++i)
22001 d->perm[i] &= nelt - 1;
22002 d->op0 = d->op1;
22003 break;
22004
22005 case 1:
22006 d->op1 = d->op0;
22007 break;
22008 }
22009
22010 return (which == 3);
22011 }
22012
22013 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22014
bool
ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
			       rtx op1, const vec_perm_indices &sel)
{
  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
			       GET_MODE_NUNITS (vmode)).require ();
      if (target)
	target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
	op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
	op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  /* A null TARGET means the caller only asks whether the permutation
     is supported, not for actual code.  */
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
	return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
	return false;
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
	return true;
      break;
    case E_V4QImode:
      if (!TARGET_SSE2)
	return false;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  /* Copy the selector, keeping an untouched backup in PERM and noting
     which operands it references (bit 0: op0, bit 1: op1).  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
	      || d.vmode == V4SImode || d.vmode == V2SImode
	      || d.vmode == V4HImode || d.vmode == V2HImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      /* Expand into a throwaway sequence so nothing is emitted into
	 the instruction stream during the dry run.  */
      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
	{
	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
	  std::swap (dzero.op0, dzero.op1);
	  for (i = 0; i < nelt; ++i)
	    dzero.perm[i] ^= nelt;
	}
      else
	d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
				  dzero.perm, nelt, dzero.testing_p))
	return true;
    }

  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
22232
22233 void
22234 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
22235 {
22236 struct expand_vec_perm_d d;
22237 unsigned i, nelt;
22238
22239 d.target = targ;
22240 d.op0 = op0;
22241 d.op1 = op1;
22242 d.vmode = GET_MODE (targ);
22243 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22244 d.one_operand_p = false;
22245 d.testing_p = false;
22246
22247 for (i = 0; i < nelt; ++i)
22248 d.perm[i] = i * 2 + odd;
22249
22250 /* We'll either be able to implement the permutation directly... */
22251 if (expand_vec_perm_1 (&d))
22252 return;
22253
22254 /* ... or we use the special-case patterns. */
22255 expand_vec_perm_even_odd_1 (&d, odd);
22256 }
22257
22258 static void
22259 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
22260 {
22261 struct expand_vec_perm_d d;
22262 unsigned i, nelt, base;
22263 bool ok;
22264
22265 d.target = targ;
22266 d.op0 = op0;
22267 d.op1 = op1;
22268 d.vmode = GET_MODE (targ);
22269 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
22270 d.one_operand_p = false;
22271 d.testing_p = false;
22272
22273 base = high_p ? nelt / 2 : 0;
22274 for (i = 0; i < nelt / 2; ++i)
22275 {
22276 d.perm[i * 2] = i + base;
22277 d.perm[i * 2 + 1] = i + base + nelt;
22278 }
22279
22280 /* Note that for AVX this isn't one instruction. */
22281 ok = ix86_expand_vec_perm_const_1 (&d);
22282 gcc_assert (ok);
22283 }
22284
22285 /* This function is similar as ix86_expand_vecop_qihi,
22286 but optimized under AVX512BW by using vpmovwb.
22287 For example, optimize vector MUL generation like
22288
22289 vpmovzxbw ymm2, xmm0
22290 vpmovzxbw ymm3, xmm1
22291 vpmullw ymm4, ymm2, ymm3
22292 vpmovwb xmm0, ymm4
22293
22294 it would take less instructions than ix86_expand_vecop_qihi.
22295 Return true if success. */
22296
22297 static bool
22298 ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
22299 {
22300 machine_mode himode, qimode = GET_MODE (dest);
22301 rtx hop1, hop2, hdest;
22302 rtx (*gen_extend)(rtx, rtx);
22303 rtx (*gen_truncate)(rtx, rtx);
22304 bool uns_p = (code == ASHIFTRT) ? false : true;
22305
22306 /* There's no V64HImode multiplication instruction. */
22307 if (qimode == E_V64QImode)
22308 return false;
22309
22310 /* vpmovwb only available under AVX512BW. */
22311 if (!TARGET_AVX512BW)
22312 return false;
22313 if ((qimode == V8QImode || qimode == V16QImode)
22314 && !TARGET_AVX512VL)
22315 return false;
22316 /* Not generate zmm instruction when prefer 128/256 bit vector width. */
22317 if (qimode == V32QImode
22318 && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
22319 return false;
22320
22321 switch (qimode)
22322 {
22323 case E_V8QImode:
22324 himode = V8HImode;
22325 gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
22326 gen_truncate = gen_truncv8hiv8qi2;
22327 break;
22328 case E_V16QImode:
22329 himode = V16HImode;
22330 gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
22331 gen_truncate = gen_truncv16hiv16qi2;
22332 break;
22333 case E_V32QImode:
22334 himode = V32HImode;
22335 gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
22336 gen_truncate = gen_truncv32hiv32qi2;
22337 break;
22338 default:
22339 gcc_unreachable ();
22340 }
22341
22342 hop1 = gen_reg_rtx (himode);
22343 hop2 = gen_reg_rtx (himode);
22344 hdest = gen_reg_rtx (himode);
22345 emit_insn (gen_extend (hop1, op1));
22346 emit_insn (gen_extend (hop2, op2));
22347 emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
22348 hop1, hop2)));
22349 emit_insn (gen_truncate (dest, hdest));
22350 return true;
22351 }
22352
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if success.

   The shift is done in the wider HImode, which lets bits cross byte
   boundaries; the stray bits that leaked in from the neighboring byte
   are then masked off, and for arithmetic right shifts the sign bits
   are recreated with an xor/sub sign-extension trick.  */
static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when shift amount greater equal 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
  /* Record sign bit: after the logical shift, the original sign bit of
     each byte sits at position 8 - shift_amount - 1.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  /* Select the HImode shift pattern and the QImode fixup patterns
     matching the vector width.  */
  switch (qimode)
    {
    case V16QImode:
      himode = V8HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv8hi3
	 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case V32QImode:
      himode = V16HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv16hi3
	 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case V64QImode:
      himode = V32HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv32hi3
	 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  /* View the QImode operand as a HImode vector so the shift can be
     done with the wider patterns.  */
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant, QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest
     i.e. sign-extend each byte via (x ^ m) - m, where m has only the
     shifted sign-bit position set.  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant, QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }
  return true;
}
22452
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  Used for operations (MULT and variable
   shifts) with no native QImode vector instruction: the byte elements
   are widened into word elements, operated on there, and the low bytes
   of the results are permuted back together into DEST.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  struct expand_vec_perm_d d;
  bool ok, full_interleave;
  bool uns_p = false;
  int i;

  /* A shift by a byte constant can be done with masking tricks instead
     of widening; try that first.  */
  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  /* With AVX512BW a widen/op/vpmovwb sequence may be shorter.  */
  if (TARGET_AVX512BW
      && VECTOR_MODE_P (GET_MODE (op2))
      && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  /* Select the interleave patterns for this vector width.  */
  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      gen_il = gen_vec_interleave_lowv16qi;
      gen_ih = gen_vec_interleave_highv16qi;
      break;
    case E_V32QImode:
      himode = V16HImode;
      gen_il = gen_avx2_interleave_lowv32qi;
      gen_ih = gen_avx2_interleave_highv32qi;
      break;
    case E_V64QImode:
      himode = V32HImode;
      gen_il = gen_avx512bw_interleave_lowv64qi;
      gen_ih = gen_avx512bw_interleave_highv64qi;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      full_interleave = qimode == V16QImode;
      break;

    case ASHIFT:
    case LSHIFTRT:
      uns_p = true;
      /* FALLTHRU */
    case ASHIFTRT:
      /* Widen op1; for ASHIFTRT the sign must be preserved, so the
	 unpack sign-extends (uns_p stays false).  */
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl */
      if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
	{
	  /* Per-element shift counts: widen them the same way.  */
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	op2_l = op2_h = op2;

      full_interleave = true;
      break;
    default:
      gcc_unreachable ();
    }

  /* Perform vashr/vlshr/vashl.  */
  if (code != MULT
      && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
    {
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  /* Performance mult/ashr/lshr/ashl.  */
  else
    {
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* For SSE2, we used an full interleave, so the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);

  /* Attach a REG_EQUAL note describing the whole QImode operation so
     later RTL passes can simplify around it.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_fmt_ee (code, qimode, op1, op2));
}
22609
22610 /* Helper function of ix86_expand_mul_widen_evenodd. Return true
22611 if op is CONST_VECTOR with all odd elements equal to their
22612 preceding element. */
22613
22614 static bool
22615 const_vector_equal_evenodd_p (rtx op)
22616 {
22617 machine_mode mode = GET_MODE (op);
22618 int i, nunits = GET_MODE_NUNITS (mode);
22619 if (GET_CODE (op) != CONST_VECTOR
22620 || nunits != CONST_VECTOR_NUNITS (op))
22621 return false;
22622 for (i = 0; i < nunits; i += 2)
22623 if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
22624 return false;
22625 return true;
22626 }
22627
/* Expand a widening multiply into DEST from the even (ODD_P == false)
   or odd (ODD_P == true) SImode elements of OP1 and OP2, producing
   double-width elements.  UNS_P selects unsigned vs. signed multiply.  */

void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      /* The shift is redundant for a constant vector whose odd elements
	 already equal their even neighbors.  */
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  /* Use the widening even-multiply pattern matching the width and
     signedness, when one exists.  */
  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
22722
/* Expand a widening multiply into DEST from the low (HIGH_P == false)
   or high (HIGH_P == true) halves of OP1 and OP2, widened to the
   element size of DEST.  UNS_P selects unsigned vs. signed multiply.  */

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  /* The interleave already selected the half; take evens now.  */
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      /* HImode has real highpart-multiply insns: compute the low and
	 high product halves and interleave the requested half.  */
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V64QImode:
      /* Widen the requested halves of both operands and multiply in
	 the wider mode.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
22812
22813 void
22814 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
22815 {
22816 rtx res_1, res_2, res_3, res_4;
22817
22818 res_1 = gen_reg_rtx (V4SImode);
22819 res_2 = gen_reg_rtx (V4SImode);
22820 res_3 = gen_reg_rtx (V2DImode);
22821 res_4 = gen_reg_rtx (V2DImode);
22822 ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
22823 ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
22824
22825 /* Move the results in element 2 down to element 1; we don't care
22826 what goes in elements 2 and 3. Then we can merge the parts
22827 back together with an interleave.
22828
22829 Note that two other sequences were tried:
22830 (1) Use interleaves at the start instead of psrldq, which allows
22831 us to use a single shufps to merge things back at the end.
22832 (2) Use shufps here to combine the two vectors, then pshufd to
22833 put the elements in the correct order.
22834 In both cases the cost of the reformatting stall was too high
22835 and the overall sequence slower. */
22836
22837 emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
22838 const0_rtx, const2_rtx,
22839 const0_rtx, const0_rtx));
22840 emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
22841 const0_rtx, const2_rtx,
22842 const0_rtx, const0_rtx));
22843 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
22844
22845 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
22846 }
22847
/* Expand a V2DI/V4DI/V8DI multiply OP0 = OP1 * OP2.  A native insn is
   used when AVX512DQ (or XOP for V2DI) provides one; otherwise the
   DImode products are assembled from 32x32->64 widening multiplies.  */

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      /* Select the widening even-multiply pattern for this width.  */
      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();


      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  /* Record the full multiply as a REG_EQUAL note for later passes.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
22944
22945 /* Return 1 if control tansfer instruction INSN
22946 should be encoded with notrack prefix. */
22947
22948 bool
22949 ix86_notrack_prefixed_insn_p (rtx_insn *insn)
22950 {
22951 if (!insn || !((flag_cf_protection & CF_BRANCH)))
22952 return false;
22953
22954 if (CALL_P (insn))
22955 {
22956 rtx call = get_call_rtx_from (insn);
22957 gcc_assert (call != NULL_RTX);
22958 rtx addr = XEXP (call, 0);
22959
22960 /* Do not emit 'notrack' if it's not an indirect call. */
22961 if (MEM_P (addr)
22962 && GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
22963 return false;
22964 else
22965 return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
22966 }
22967
22968 if (JUMP_P (insn) && !flag_cet_switch)
22969 {
22970 rtx target = JUMP_LABEL (insn);
22971 if (target == NULL_RTX || ANY_RETURN_P (target))
22972 return false;
22973
22974 /* Check the jump is a switch table. */
22975 rtx_insn *label = as_a<rtx_insn *> (target);
22976 rtx_insn *table = next_insn (label);
22977 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
22978 return false;
22979 else
22980 return true;
22981 }
22982 return false;
22983 }
22984
22985 /* Calculate integer abs() using only SSE2 instructions. */
22986
22987 void
22988 ix86_expand_sse2_abs (rtx target, rtx input)
22989 {
22990 machine_mode mode = GET_MODE (target);
22991 rtx tmp0, tmp1, x;
22992
22993 switch (mode)
22994 {
22995 case E_V2DImode:
22996 case E_V4DImode:
22997 /* For 64-bit signed integer X, with SSE4.2 use
22998 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
22999 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
23000 32 and use logical instead of arithmetic right shift (which is
23001 unimplemented) and subtract. */
23002 if (TARGET_SSE4_2)
23003 {
23004 tmp0 = gen_reg_rtx (mode);
23005 tmp1 = gen_reg_rtx (mode);
23006 emit_move_insn (tmp1, CONST0_RTX (mode));
23007 if (mode == E_V2DImode)
23008 emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
23009 else
23010 emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
23011 }
23012 else
23013 {
23014 tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
23015 GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
23016 - 1), NULL, 0, OPTAB_DIRECT);
23017 tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
23018 }
23019
23020 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23021 NULL, 0, OPTAB_DIRECT);
23022 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23023 target, 0, OPTAB_DIRECT);
23024 break;
23025
23026 case E_V4SImode:
23027 /* For 32-bit signed integer X, the best way to calculate the absolute
23028 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
23029 tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
23030 GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
23031 NULL, 0, OPTAB_DIRECT);
23032 tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
23033 NULL, 0, OPTAB_DIRECT);
23034 x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
23035 target, 0, OPTAB_DIRECT);
23036 break;
23037
23038 case E_V8HImode:
23039 /* For 16-bit signed integer X, the best way to calculate the absolute
23040 value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
23041 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23042
23043 x = expand_simple_binop (mode, SMAX, tmp0, input,
23044 target, 0, OPTAB_DIRECT);
23045 break;
23046
23047 case E_V16QImode:
23048 /* For 8-bit signed integer X, the best way to calculate the absolute
23049 value of X is min ((unsigned char) X, (unsigned char) (-X)),
23050 as SSE2 provides the PMINUB insn. */
23051 tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
23052
23053 x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
23054 target, 0, OPTAB_DIRECT);
23055 break;
23056
23057 default:
23058 gcc_unreachable ();
23059 }
23060
23061 if (x != target)
23062 emit_move_insn (target, x);
23063 }
23064
23065 /* Expand an extract from a vector register through pextr insn.
23066 Return true if successful. */
23067
23068 bool
23069 ix86_expand_pextr (rtx *operands)
23070 {
23071 rtx dst = operands[0];
23072 rtx src = operands[1];
23073
23074 unsigned int size = INTVAL (operands[2]);
23075 unsigned int pos = INTVAL (operands[3]);
23076
23077 if (SUBREG_P (dst))
23078 {
23079 /* Reject non-lowpart subregs. */
23080 if (SUBREG_BYTE (dst) > 0)
23081 return false;
23082 dst = SUBREG_REG (dst);
23083 }
23084
23085 if (SUBREG_P (src))
23086 {
23087 pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
23088 src = SUBREG_REG (src);
23089 }
23090
23091 switch (GET_MODE (src))
23092 {
23093 case E_V16QImode:
23094 case E_V8HImode:
23095 case E_V4SImode:
23096 case E_V2DImode:
23097 case E_V1TImode:
23098 {
23099 machine_mode srcmode, dstmode;
23100 rtx d, pat;
23101
23102 if (!int_mode_for_size (size, 0).exists (&dstmode))
23103 return false;
23104
23105 switch (dstmode)
23106 {
23107 case E_QImode:
23108 if (!TARGET_SSE4_1)
23109 return false;
23110 srcmode = V16QImode;
23111 break;
23112
23113 case E_HImode:
23114 if (!TARGET_SSE2)
23115 return false;
23116 srcmode = V8HImode;
23117 break;
23118
23119 case E_SImode:
23120 if (!TARGET_SSE4_1)
23121 return false;
23122 srcmode = V4SImode;
23123 break;
23124
23125 case E_DImode:
23126 gcc_assert (TARGET_64BIT);
23127 if (!TARGET_SSE4_1)
23128 return false;
23129 srcmode = V2DImode;
23130 break;
23131
23132 default:
23133 return false;
23134 }
23135
23136 /* Reject extractions from misaligned positions. */
23137 if (pos & (size-1))
23138 return false;
23139
23140 if (GET_MODE (dst) == dstmode)
23141 d = dst;
23142 else
23143 d = gen_reg_rtx (dstmode);
23144
23145 /* Construct insn pattern. */
23146 pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
23147 pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
23148
23149 /* Let the rtl optimizers know about the zero extension performed. */
23150 if (dstmode == QImode || dstmode == HImode)
23151 {
23152 pat = gen_rtx_ZERO_EXTEND (SImode, pat);
23153 d = gen_lowpart (SImode, d);
23154 }
23155
23156 emit_insn (gen_rtx_SET (d, pat));
23157
23158 if (d != dst)
23159 emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
23160 return true;
23161 }
23162
23163 default:
23164 return false;
23165 }
23166 }
23167
/* Expand an insert into a vector register through pinsr insn.
   Return true if successful.  operands[0] is the destination vector,
   operands[3] the value to insert, operands[1] the bit width and
   operands[2] the bit position of the insertion.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  /* A subreg of the destination just offsets the bit position.  */
  if (SUBREG_P (dst))
    {
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	/* Pick the pinsr variant matching the insertion width and check
	   that the required ISA extension is available.  */
	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		/* A non-lowpart source subreg needs an extraction first;
		   reuse the pextr expander for that.  */
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* pinsr takes the element index as a one-hot immediate.  */
	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));
	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
23279
23280 /* All CPUs prefer to avoid cross-lane operations so perform reductions
23281 upper against lower halves up to SSE reg size. */
23282
23283 machine_mode
23284 ix86_split_reduction (machine_mode mode)
23285 {
23286 /* Reduce lowpart against highpart until we reach SSE reg width to
23287 avoid cross-lane operations. */
23288 switch (mode)
23289 {
23290 case E_V8DImode:
23291 case E_V4DImode:
23292 return V2DImode;
23293 case E_V16SImode:
23294 case E_V8SImode:
23295 return V4SImode;
23296 case E_V32HImode:
23297 case E_V16HImode:
23298 return V8HImode;
23299 case E_V64QImode:
23300 case E_V32QImode:
23301 return V16QImode;
23302 case E_V16SFmode:
23303 case E_V8SFmode:
23304 return V4SFmode;
23305 case E_V8DFmode:
23306 case E_V4DFmode:
23307 return V2DFmode;
23308 default:
23309 return mode;
23310 }
23311 }
23312
23313 /* Generate call to __divmoddi4. */
23314
23315 void
23316 ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
23317 rtx op0, rtx op1,
23318 rtx *quot_p, rtx *rem_p)
23319 {
23320 rtx rem = assign_386_stack_local (mode, SLOT_TEMP);
23321
23322 rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
23323 mode, op0, mode, op1, mode,
23324 XEXP (rem, 0), Pmode);
23325 *quot_p = quot;
23326 *rem_p = rem;
23327 }
23328
/* Expand an atomic fetch-and-CODE (or CODE-and-fetch) on MEM with operand
   VAL as a compare-and-swap loop, storing the result in TARGET.  If AFTER
   is false, TARGET receives the pre-operation memory value
   (atomic_fetch_op); if AFTER is true, it receives the post-operation
   value (atomic_op_fetch).  DOUBLEWORD selects the double-word cmpxchg
   expansion in ix86_expand_cmpxchg_loop.  CODE == NOT encodes a NAND-style
   operation: VAL is first ANDed in and the result then complemented.  */

void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
				  enum rtx_code code, bool after,
				  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  /* Load the initial memory value once before the loop; the cmpxchg
     loop refreshes OLD_MEM on each failed attempt.  */
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      /* NAND: new = ~(old & val).  */
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
				     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
				   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  /* Pass NULL so ix86_expand_cmpxchg_loop allocates the success flag
     register itself; we only need the loop-back behavior here.  */
  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
					  SImode),
			    doubleword, loop_label);
}
23370
/* Relax cmpxchg instruction, param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it will be relaxed to an atomic load + compare, and skip
   cmpxchg instruction if mem != exp_input.

   On success (mem matched EXP_INPUT and NEW_INPUT was stored),
   *PTARGET_BOOL is set nonzero; TARGET_VAL receives the value observed
   in memory.  If *PTARGET_BOOL is NULL on entry, a fresh QImode register
   is allocated for it.  DOUBLEWORD selects the two-half cmpxchg expansion
   (cmpxchg8b/16b-style), comparing and passing high/low parts in HMODE.
   If LOOP_LABEL is non-NULL, a failed attempt emits a PAUSE and jumps
   back to it; otherwise failure just falls through with the freshly
   loaded memory value.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
			  rtx mem, rtx exp_input, rtx new_input,
			  rtx mem_model, bool doubleword,
			  rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  /* Allocate the success flag if the caller did not supply one.  */
  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  /* Pick the cmpxchg expander; doubleword variants split the operand
     into two HMODE halves.  */
  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
	{
	  gendw = gen_atomic_compare_and_swapdi_doubleword;
	  hmode = SImode;
	}
      else
	gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value; on mismatch skip the cmpxchg
     entirely (jump to CMP_LABEL) to avoid a needless locked operation.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
			     GET_MODE (exp_input), 1, cmp_label,
			     profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
		      gen_lowpart (hmode, new_input),
		      gen_highpart (hmode, new_input),
		      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      /* No retry loop: on mismatch just hand back the observed memory
	 value and compute the success flag from ZF.  */
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
    }
  else
    {
      /* Retry loop: capture the cmpxchg result, loop back on failure,
	 fall through to DONE_LABEL on success.  */
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
			       GET_MODE (target_bool), 1, loop_label,
			       profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      /* If mem is not expected, pause and loop back.  */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_barrier ();
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}
23488
23489 #include "gt-i386-expand.h"
23490