Home | History | Annotate | Line # | Download | only in riscv
      1 /* Subroutines used for code generation for RISC-V 'V' Extension for
      2    GNU compiler.
      3    Copyright (C) 2022-2024 Free Software Foundation, Inc.
      4    Contributed by Juzhe Zhong (juzhe.zhong (at) rivai.ai), RiVAI Technologies Ltd.
      5 
      6    This file is part of GCC.
      7 
      8    GCC is free software; you can redistribute it and/or modify it
      9    under the terms of the GNU General Public License as published by
     10    the Free Software Foundation; either version 3, or (at your option)
     11    any later version.
     12 
     13    GCC is distributed in the hope that it will be useful, but
     14    WITHOUT ANY WARRANTY; without even the implied warranty of
     15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16    General Public License for more details.
     17 
     18    You should have received a copy of the GNU General Public License
     19    along with GCC; see the file COPYING3.  If not see
     20    <http://www.gnu.org/licenses/>.  */
     21 
     22 #define IN_TARGET_CODE 1
     23 
     24 /* We have a maximum of 11 operands for RVV instruction patterns according to
     25    the vector.md.  */
     26 #define RVV_INSN_OPERANDS_MAX 11
     27 
     28 #include "config.h"
     29 #include "system.h"
     30 #include "coretypes.h"
     31 #include "tm.h"
     32 #include "backend.h"
     33 #include "rtl.h"
     34 #include "insn-config.h"
     35 #include "insn-attr.h"
     36 #include "recog.h"
     37 #include "alias.h"
     38 #include "tree.h"
     39 #include "stringpool.h"
     40 #include "attribs.h"
     41 #include "explow.h"
     42 #include "memmodel.h"
     43 #include "emit-rtl.h"
     44 #include "tm_p.h"
     45 #include "target.h"
     46 #include "targhooks.h"
     47 #include "expr.h"
     48 #include "optabs.h"
     49 #include "tm-constrs.h"
     50 #include "rtx-vector-builder.h"
     51 #include "targhooks.h"
     52 #include "predict.h"
     53 
     54 using namespace riscv_vector;
     55 
     56 namespace riscv_vector {
     57 
     58 /* Return true if NUNTIS <=31 so that we can use immediate AVL in vsetivli.  */
     59 bool
     60 imm_avl_p (machine_mode mode)
     61 {
     62   poly_uint64 nunits = GET_MODE_NUNITS (mode);
     63 
     64   return nunits.is_constant ()
     65 	   /* The vsetivli can only hold register 0~31.  */
     66 	   ? (IN_RANGE (nunits.to_constant (), 0, 31))
     67 	   /* Only allowed in VLS-VLMAX mode.  */
     68 	   : false;
     69 }
     70 
     71 /* Return true if LEN is equal to NUNITS that out of the range [0, 31].  */
     72 static bool
     73 is_vlmax_len_p (machine_mode mode, rtx len)
     74 {
     75   poly_int64 value;
     76   return poly_int_rtx_p (len, &value)
     77 	 && known_eq (value, GET_MODE_NUNITS (mode));
     78 }
     79 
     80 /* Helper functions for insn_flags && insn_types */
     81 
     82 /* Return true if caller need pass mask operand for insn pattern with
     83    INSN_FLAGS. */
     84 
     85 static bool
     86 need_mask_operand_p (unsigned insn_flags)
     87 {
     88   return (insn_flags & HAS_MASK_P)
     89 	 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P));
     90 }
     91 
     92 template <int MAX_OPERANDS> class insn_expander
     93 {
     94 public:
     95   insn_expander () = delete;
     96 
     97   insn_expander (unsigned insn_flags, bool vlmax_p)
     98     : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p),
     99       m_vl_op (NULL_RTX)
    100   {
    101     check_insn_flags ();
    102   }
    103 
    104   void check_insn_flags () const
    105   {
    106     if (m_insn_flags & USE_ONE_TRUE_MASK_P)
    107       /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P.  */
    108       gcc_assert ((m_insn_flags & HAS_MASK_P));
    109 
    110     if (m_insn_flags & USE_ALL_TRUES_MASK_P)
    111       /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P.  */
    112       gcc_assert ((m_insn_flags & HAS_MASK_P));
    113 
    114     /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive.  */
    115     gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P)
    116 		  && (m_insn_flags & USE_ALL_TRUES_MASK_P)));
    117 
    118     if (m_insn_flags & USE_VUNDEF_MERGE_P)
    119       /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P.  */
    120       gcc_assert ((m_insn_flags & HAS_MERGE_P));
    121 
    122     /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive.  */
    123     gcc_assert (
    124       !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P)));
    125 
    126     /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive.  */
    127     gcc_assert (
    128       !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P)));
    129 
    130     /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually
    131        exclusive.  */
    132     gcc_assert (
    133       !((m_insn_flags & NULLARY_OP_P)
    134 	&& ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P)
    135 	    || (m_insn_flags & TERNARY_OP_P))));
    136     gcc_assert (
    137       !((m_insn_flags & UNARY_OP_P)
    138 	&& ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P)
    139 	    || (m_insn_flags & TERNARY_OP_P))));
    140     gcc_assert (
    141       !((m_insn_flags & BINARY_OP_P)
    142 	&& ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
    143 	    || (m_insn_flags & TERNARY_OP_P))));
    144     gcc_assert (
    145       !((m_insn_flags & TERNARY_OP_P)
    146 	&& ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P)
    147 	    || (m_insn_flags & BINARY_OP_P))));
    148   }
    149 
    150   void set_vl (rtx vl) { m_vl_op = vl; }
    151 
    152   void add_output_operand (rtx x, machine_mode mode)
    153   {
    154     create_output_operand (&m_ops[m_opno++], x, mode);
    155     gcc_assert (m_opno <= MAX_OPERANDS);
    156   }
    157   void add_input_operand (rtx x, machine_mode mode)
    158   {
    159     create_input_operand (&m_ops[m_opno++], x, mode);
    160     gcc_assert (m_opno <= MAX_OPERANDS);
    161   }
    162   void add_all_one_mask_operand (machine_mode mask_mode)
    163   {
    164     add_input_operand (CONSTM1_RTX (mask_mode), mask_mode);
    165   }
    166   void add_first_one_true_mask_operand (machine_mode mask_mode)
    167   {
    168     add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode);
    169   }
    170   void add_vundef_operand (machine_mode dest_mode)
    171   {
    172     add_input_operand (RVV_VUNDEF (dest_mode), dest_mode);
    173   }
    174   void add_policy_operand ()
    175   {
    176     if (m_insn_flags & TU_POLICY_P)
    177       {
    178 	rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode);
    179 	add_input_operand (tail_policy_rtx, Pmode);
    180       }
    181     else if (m_insn_flags & TDEFAULT_POLICY_P)
    182       {
    183 	rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode);
    184 	add_input_operand (tail_policy_rtx, Pmode);
    185       }
    186 
    187     if (m_insn_flags & MU_POLICY_P)
    188       {
    189 	rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode);
    190 	add_input_operand (mask_policy_rtx, Pmode);
    191       }
    192     else if (m_insn_flags & MDEFAULT_POLICY_P)
    193       {
    194 	rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode);
    195 	add_input_operand (mask_policy_rtx, Pmode);
    196       }
    197   }
    198   void add_avl_type_operand (avl_type type)
    199   {
    200     add_input_operand (gen_int_mode (type, Pmode), Pmode);
    201   }
    202 
    203   void
    204   add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode)
    205   {
    206     rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
    207     add_input_operand (frm_rtx, Pmode);
    208   }
    209 
    210   void
    211   add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode)
    212   {
    213     rtx frm_rtx = gen_int_mode (rounding_mode, Pmode);
    214     add_input_operand (frm_rtx, Pmode);
    215   }
    216 
    217   /* Return the vtype mode based on insn_flags.
    218      vtype mode mean the mode vsetvl insn set. */
    219   machine_mode
    220   get_vtype_mode (rtx *ops)
    221   {
    222     machine_mode vtype_mode;
    223     if (m_insn_flags & VTYPE_MODE_FROM_OP1_P)
    224       vtype_mode = GET_MODE (ops[1]);
    225     else
    226       vtype_mode = GET_MODE (ops[0]);
    227     return vtype_mode;
    228   }
    229 
    230   void emit_insn (enum insn_code icode, rtx *ops)
    231   {
    232     int opno = 0;
    233     int num_ops;
    234     /* It's true if any operand is memory operand.  */
    235     bool any_mem_p = false;
    236 
    237     machine_mode vtype_mode = get_vtype_mode (ops);
    238     machine_mode mask_mode = get_mask_mode (vtype_mode);
    239 
    240     /* Add dest operand.  */
    241     if (m_insn_flags & HAS_DEST_P)
    242       {
    243 	rtx op = ops[opno++];
    244 	any_mem_p |= MEM_P (op);
    245 	add_output_operand (op, GET_MODE (op));
    246       }
    247 
    248     /* Add mask operand.  */
    249     if (m_insn_flags & USE_ONE_TRUE_MASK_P)
    250       add_first_one_true_mask_operand (mask_mode);
    251     else if (m_insn_flags & USE_ALL_TRUES_MASK_P)
    252       add_all_one_mask_operand (mask_mode);
    253     else if (m_insn_flags & HAS_MASK_P)
    254       {
    255 	machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
    256 	gcc_assert (mode != VOIDmode);
    257 	add_input_operand (ops[opno++], mode);
    258       }
    259 
    260     /* Add merge operand.  */
    261     if (m_insn_flags & USE_VUNDEF_MERGE_P)
    262       /* Same as dest operand.  */
    263       add_vundef_operand (GET_MODE (ops[0]));
    264     else if (m_insn_flags & HAS_MERGE_P)
    265       {
    266 	machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
    267 	gcc_assert (mode != VOIDmode);
    268 	add_input_operand (ops[opno++], mode);
    269       }
    270 
    271     if (m_insn_flags & NULLARY_OP_P)
    272       num_ops = 0;
    273     else if (m_insn_flags & UNARY_OP_P)
    274       num_ops = 1;
    275     else if (m_insn_flags & BINARY_OP_P)
    276       num_ops = 2;
    277     else if (m_insn_flags & TERNARY_OP_P)
    278       num_ops = 3;
    279     else
    280       gcc_unreachable ();
    281 
    282     /* Add the remain operands.  */
    283     for (; num_ops; num_ops--, opno++)
    284       {
    285 	any_mem_p |= MEM_P (ops[opno]);
    286 	machine_mode mode = insn_data[(int) icode].operand[m_opno].mode;
    287 	/* 'create_input_operand doesn't allow VOIDmode.
    288 	   According to vector.md, we may have some patterns that do not have
    289 	   explicit machine mode specifying the operand. Such operands are
    290 	   always Pmode.  */
    291 	if (mode == VOIDmode)
    292 	  mode = Pmode;
    293 	else
    294 	  /* Early assertion ensures same mode since maybe_legitimize_operand
    295 	     will check this.  */
    296 	  gcc_assert (GET_MODE (ops[opno]) == VOIDmode
    297 		      || GET_MODE (ops[opno]) == mode);
    298 
    299 	add_input_operand (ops[opno], mode);
    300       }
    301 
    302     /* Add vl operand.  */
    303     rtx len = m_vl_op;
    304     bool vls_p = false;
    305     if (m_vlmax_p)
    306       {
    307 	if (riscv_v_ext_vls_mode_p (vtype_mode))
    308 	  {
    309 	    /* VLS modes always set VSETVL by
    310 	       "vsetvl zero, rs1/imm".  */
    311 	    poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode);
    312 	    len = gen_int_mode (nunits, Pmode);
    313 	    vls_p = true;
    314 	  }
    315 	else if (can_create_pseudo_p ())
    316 	  {
    317 	    len = gen_reg_rtx (Pmode);
    318 	    emit_vlmax_vsetvl (vtype_mode, len);
    319 	  }
    320       }
    321 
    322     gcc_assert (len != NULL_RTX);
    323     add_input_operand (len, Pmode);
    324 
    325     /* Add tail and mask policy operands.  */
    326     add_policy_operand ();
    327 
    328     /* Add avl_type operand.  */
    329     add_avl_type_operand (
    330       vls_p ? avl_type::VLS
    331 	    : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX));
    332 
    333     /* Add rounding mode operand.  */
    334     if (m_insn_flags & FRM_DYN_P)
    335       add_rounding_mode_operand (FRM_DYN);
    336     else if (m_insn_flags & FRM_RUP_P)
    337       add_rounding_mode_operand (FRM_RUP);
    338     else if (m_insn_flags & FRM_RDN_P)
    339       add_rounding_mode_operand (FRM_RDN);
    340     else if (m_insn_flags & FRM_RMM_P)
    341       add_rounding_mode_operand (FRM_RMM);
    342     else if (m_insn_flags & FRM_RNE_P)
    343       add_rounding_mode_operand (FRM_RNE);
    344     else if (m_insn_flags & VXRM_RNU_P)
    345       add_rounding_mode_operand (VXRM_RNU);
    346     else if (m_insn_flags & VXRM_RDN_P)
    347       add_rounding_mode_operand (VXRM_RDN);
    348 
    349     gcc_assert (insn_data[(int) icode].n_operands == m_opno);
    350     expand (icode, any_mem_p);
    351   }
    352 
    353   void expand (enum insn_code icode, bool temporary_volatile_p = false)
    354   {
    355     if (temporary_volatile_p)
    356       {
    357 	temporary_volatile_ok v (true);
    358 	expand_insn (icode, m_opno, m_ops);
    359       }
    360     else
    361       expand_insn (icode, m_opno, m_ops);
    362   }
    363 
    364 private:
    365   unsigned m_insn_flags;
    366   int m_opno;
    367   bool m_vlmax_p;
    368   rtx m_vl_op;
    369   expand_operand m_ops[MAX_OPERANDS];
    370 };
    371 
    372 /* Emit an RVV insn with a vector length that equals the number of units of the
    373    vector mode.  For VLA modes this corresponds to VLMAX.
    374 
    375    Unless the vector length can be encoded in the vsetivl[i] instruction this
    376    function must only be used as long as we can create pseudo registers. This is
    377    because it will set a pseudo register to VLMAX using vsetvl and use this as
    378    definition for the vector length.  */
    379 void
    380 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops)
    381 {
    382   insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
    383   gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops)));
    384 
    385   e.emit_insn ((enum insn_code) icode, ops);
    386 }
    387 
    388 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo
    389    registers anymore.  This function, however, takes a predefined vector length
    390    from the value in VL. */
    391 void
    392 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
    393 {
    394   gcc_assert (!can_create_pseudo_p ());
    395   machine_mode mode = GET_MODE (ops[0]);
    396 
    397   if (imm_avl_p (mode))
    398     {
    399       /* Even though VL is a real hardreg already allocated since
    400 	 it is post-RA now, we still gain benefits that we emit
    401 	 vsetivli zero, imm instead of vsetvli VL, zero which is
    402 	 we can be more flexible in post-RA instruction scheduling.  */
    403       insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
    404       e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode));
    405       e.emit_insn ((enum insn_code) icode, ops);
    406     }
    407   else
    408     {
    409       insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true);
    410       e.set_vl (vl);
    411       e.emit_insn ((enum insn_code) icode, ops);
    412     }
    413 }
    414 
    415 /* Emit an RVV insn with a predefined vector length.  Contrary to
    416    emit_vlmax_insn the instruction's vector length is not deduced from its mode
    417    but taken from  the value in VL.  */
    418 void
    419 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl)
    420 {
    421   insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false);
    422   e.set_vl (vl);
    423   e.emit_insn ((enum insn_code) icode, ops);
    424 }
    425 
    426 class rvv_builder : public rtx_vector_builder
    427 {
    428 public:
    429   rvv_builder () : rtx_vector_builder () {}
    430   rvv_builder (machine_mode mode, unsigned int npatterns,
    431 	       unsigned int nelts_per_pattern)
    432     : rtx_vector_builder (mode, npatterns, nelts_per_pattern)
    433   {
    434     m_inner_mode = GET_MODE_INNER (mode);
    435     m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode);
    436     m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode);
    437     m_mask_mode = get_mask_mode (mode);
    438 
    439     gcc_assert (
    440       int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode));
    441     m_int_mode
    442       = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require ();
    443   }
    444 
    445   bool can_duplicate_repeating_sequence_p ();
    446   bool is_repeating_sequence ();
    447   rtx get_merged_repeating_sequence ();
    448 
    449   bool repeating_sequence_use_merge_profitable_p ();
    450   bool combine_sequence_use_slideup_profitable_p ();
    451   bool combine_sequence_use_merge_profitable_p ();
    452   rtx get_merge_scalar_mask (unsigned int, machine_mode) const;
    453 
    454   bool single_step_npatterns_p () const;
    455   bool npatterns_all_equal_p () const;
    456   bool interleaved_stepped_npatterns_p () const;
    457   bool npatterns_vid_diff_repeated_p () const;
    458 
    459   machine_mode new_mode () const { return m_new_mode; }
    460   scalar_mode inner_mode () const { return m_inner_mode; }
    461   scalar_int_mode inner_int_mode () const { return m_inner_int_mode; }
    462   machine_mode mask_mode () const { return m_mask_mode; }
    463   machine_mode int_mode () const { return m_int_mode; }
    464   unsigned int inner_bits_size () const { return m_inner_bits_size; }
    465   unsigned int inner_bytes_size () const { return m_inner_bytes_size; }
    466 
    467 private:
    468   scalar_mode m_inner_mode;
    469   scalar_int_mode m_inner_int_mode;
    470   machine_mode m_new_mode;
    471   scalar_int_mode m_new_inner_mode;
    472   machine_mode m_mask_mode;
    473   machine_mode m_int_mode;
    474   unsigned int m_inner_bits_size;
    475   unsigned int m_inner_bytes_size;
    476 };
    477 
    478 /* Return true if the vector duplicated by a super element which is the fusion
    479    of consecutive elements.
    480 
    481      v = { a, b, a, b } super element = ab, v = { ab, ab }  */
    482 bool
    483 rvv_builder::can_duplicate_repeating_sequence_p ()
    484 {
    485   poly_uint64 new_size = exact_div (full_nelts (), npatterns ());
    486   unsigned int new_inner_size = m_inner_bits_size * npatterns ();
    487   if (m_inner_mode == Pmode
    488       || !int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode)
    489       || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD
    490       || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode))
    491     return false;
    492   if (full_nelts ().is_constant ())
    493     return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ());
    494   return nelts_per_pattern () == 1;
    495 }
    496 
    497 /* Return true if the vector is a simple sequence with one pattern and all
    498    elements the same.  */
    499 bool
    500 rvv_builder::is_repeating_sequence ()
    501 {
    502   if (npatterns () > 1)
    503     return false;
    504   if (full_nelts ().is_constant ())
    505     return repeating_sequence_p (0, full_nelts ().to_constant (), 1);
    506   return nelts_per_pattern () == 1;
    507 }
    508 
    509 /* Return true if it is a repeating sequence that using
    510    merge approach has better codegen than using default
    511    approach (slide1down).
    512 
    513    Sequence A:
    514      {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
    515 
    516    nelts = 16
    517    npatterns = 2
    518 
    519    for merging a we need mask 101010....
    520    for merging b we need mask 010101....
    521 
   For each element in the npattern, we need to build a mask in a scalar
   register.  Mostly we need 3 instructions (aka COST = 3), which consist of
   2 scalar instructions and 1 scalar move to the v0 register.  Finally we
   need a vector merge to merge them.
    526 
    527    lui		a5, #imm
    528    add		a5, #imm
    529    vmov.s.x	v0, a5
    530    vmerge.vxm	v9, v9, a1, v0
    531 
    532    So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8.
    533    If we use slide1down, the COST = nelts = 16 > 8 (COST of merge).
    534    So return true in this case as it is profitable.
    535 
    536    Sequence B:
    537      {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h}
    538 
    539    nelts = 16
    540    npatterns = 8
    541 
   COST of merge approach = (3 + 1) * npatterns = 32
    543    COST of slide1down approach = nelts = 16
    544    Return false in this case as it is NOT profitable in merge approach.
    545 */
    546 bool
    547 rvv_builder::repeating_sequence_use_merge_profitable_p ()
    548 {
    549   if (inner_bytes_size () > UNITS_PER_WORD)
    550     return false;
    551 
    552   unsigned int nelts = full_nelts ().to_constant ();
    553 
    554   if (!repeating_sequence_p (0, nelts, npatterns ()))
    555     return false;
    556 
    557   unsigned int merge_cost = 1;
    558   unsigned int build_merge_mask_cost = 3;
    559   unsigned int slide1down_cost = nelts;
    560 
    561   return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost;
    562 }
    563 
    564 /* Return true if it's worthwhile to use slideup combine 2 vectors.  */
    565 bool
    566 rvv_builder::combine_sequence_use_slideup_profitable_p ()
    567 {
    568   int nelts = full_nelts ().to_constant ();
    569   int leading_ndups = this->count_dups (0, nelts - 1, 1);
    570   int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
    571 
    572   /* ??? Current heuristic we do is we do combine 2 vectors
    573      by slideup when:
    574        1. # of leading same elements is equal to # of trailing same elements.
    575        2. Both of above are equal to nelts / 2.
    576      Otherwise, it is not profitable.  */
    577   return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2;
    578 }
    579 
    580 /* Return true if it's worthwhile to use merge combine vector with a scalar.  */
    581 bool
    582 rvv_builder::combine_sequence_use_merge_profitable_p ()
    583 {
    584   int nelts = full_nelts ().to_constant ();
    585   int leading_ndups = this->count_dups (0, nelts - 1, 1);
    586   int trailing_ndups = this->count_dups (nelts - 1, -1, -1);
    587   int nregs = riscv_get_v_regno_alignment (int_mode ());
    588 
    589   if (leading_ndups + trailing_ndups != nelts)
    590     return false;
    591 
    592   /* Leading elements num > 255 which exceeds the maximum value
    593      of QImode, we will need to use HImode.  */
    594   machine_mode mode;
    595   if (leading_ndups > 255 || nregs > 2)
    596     {
    597       if (!get_vector_mode (HImode, nelts).exists (&mode))
    598 	return false;
    599       /* We will need one more AVL/VL toggling vsetvl instruction.  */
    600       return leading_ndups > 4 && trailing_ndups > 4;
    601     }
    602 
    603   /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a }
    604      consume 3 slide instructions.  */
    605   return leading_ndups > 3 && trailing_ndups > 3;
    606 }
    607 
    608 /* Merge the repeating sequence into a single element and return the RTX.  */
    609 rtx
    610 rvv_builder::get_merged_repeating_sequence ()
    611 {
    612   scalar_int_mode mode = Pmode;
    613   rtx target = gen_reg_rtx (mode);
    614   emit_move_insn (target, const0_rtx);
    615   rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode);
    616   /* { a, b, a, b }: Generate duplicate element = b << bits | a.  */
    617   for (unsigned int i = 0; i < npatterns (); i++)
    618     {
    619       unsigned int loc = m_inner_bits_size * i;
    620       rtx shift = gen_int_mode (loc, mode);
    621       rtx ele = gen_lowpart (mode, elt (i));
    622       rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false,
    623 				     OPTAB_DIRECT);
    624       rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false,
    625 				      OPTAB_DIRECT);
    626       rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false,
    627 				      OPTAB_DIRECT);
    628       emit_move_insn (target, tmp3);
    629     }
    630   if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD)
    631     return gen_lowpart (m_new_inner_mode, target);
    632   return target;
    633 }
    634 
    635 /* Get the mask for merge approach.
    636 
    637    Consider such following case:
    638      {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}
    639    To merge "a", the mask should be 1010....
    640    To merge "b", the mask should be 0101....
    641 */
    642 rtx
    643 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern,
    644 				    machine_mode inner_mode) const
    645 {
    646   unsigned HOST_WIDE_INT mask = 0;
    647   unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern);
    648   /* Here we construct a mask pattern that will later be broadcast
    649      to a vector register.  The maximum broadcast size for vmv.v.x/vmv.s.x
    650      is determined by the length of a vector element (ELEN) and not by
    651      XLEN so make sure we do not exceed it.  One example is -march=zve32*
    652      which mandates ELEN == 32 but can be combined with -march=rv64
    653      with XLEN == 64.  */
    654   unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32;
    655 
    656   gcc_assert (elen % npatterns () == 0);
    657 
    658   int limit = elen / npatterns ();
    659 
    660   for (int i = 0; i < limit; i++)
    661     mask |= base_mask << (i * npatterns ());
    662 
    663   return gen_int_mode (mask, inner_mode);
    664 }
    665 
    666 /* Return true if the variable-length vector is single step.
    667    Single step means step all patterns in NPATTERNS are equal.
    668    Consider this following case:
    669 
    670      CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
    671        { 0, 2, 2, 4, 4, 6, ... }
    672      First pattern: step1 = 2 - 0 = 2
    673 		    step2 = 4 - 2 = 2
    674      Second pattern: step1 = 4 - 2 = 2
    675 		     step2 = 6 - 4 = 2
    676      Since all steps of NPATTERNS are equal step = 2.
    677      Return true in this case.
    678 
    679      CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3.
    680        { 0, 1, 2, 4, 4, 7, ... }
    681      First pattern: step1 = 2 - 0 = 2
    682 		    step2 = 4 - 2 = 2
    683      Second pattern: step1 = 4 - 1 = 3
    684 		     step2 = 7 - 4 = 3
    685      Since not all steps are equal, return false.  */
    686 bool
    687 rvv_builder::single_step_npatterns_p () const
    688 {
    689   if (nelts_per_pattern () != 3)
    690     return false;
    691 
    692   poly_int64 step
    693     = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0));
    694   for (unsigned int i = 0; i < npatterns (); i++)
    695     {
    696       poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
    697       poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
    698       poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
    699       poly_int64 diff1 = ele1 - ele0;
    700       poly_int64 diff2 = ele2 - ele1;
    701       if (maybe_ne (step, diff1) || maybe_ne (step, diff2))
    702 	return false;
    703     }
    704   return true;
    705 }
    706 
    707 /* Return true if the diff between const vector and vid sequence
    708    is repeated. For example as below cases:
    709    The diff means the const vector - vid.
    710      CASE 1:
    711      CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... }
    712      VID         : {0, 1, 2, 3, 4, 5, 6, 7, ... }
    713      DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... }
    714      The diff sequence {3, 1,-1,-3} is repeated in the npattern and
    715      return TRUE for case 1.
    716 
    717      CASE 2:
    718      CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...}
    719      VID         : { 0, 1, 2, 3, 4, 5, 6, 7, ... }
     DIFF(MINUS) : {-4, 3,-5, 2,-6, 1,-7, 0, ... }
     The diff sequence {-4, 3} is not repeated in the npattern and
     return FALSE for case 2.  */
    723 bool
    724 rvv_builder::npatterns_vid_diff_repeated_p () const
    725 {
    726   if (nelts_per_pattern () != 3)
    727     return false;
    728   else if (npatterns () == 0)
    729     return false;
    730 
    731   for (unsigned i = 0; i < npatterns (); i++)
    732     {
    733       poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i;
    734       poly_int64 diff_1
    735 	= rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i;
    736 
    737       if (maybe_ne (diff_0, diff_1))
    738 	return false;
    739     }
    740 
    741   return true;
    742 }
    743 
    744 /* Return true if the permutation consists of two
    745    interleaved patterns with a constant step each.
    746    TODO: We currently only support NPATTERNS = 2.  */
    747 bool
    748 rvv_builder::interleaved_stepped_npatterns_p () const
    749 {
    750   if (npatterns () != 2 || nelts_per_pattern () != 3)
    751     return false;
    752   for (unsigned int i = 0; i < npatterns (); i++)
    753     {
    754       poly_int64 ele0 = rtx_to_poly_int64 (elt (i));
    755       poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i));
    756       poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i));
    757       poly_int64 diff1 = ele1 - ele0;
    758       poly_int64 diff2 = ele2 - ele1;
    759       if (maybe_ne (diff1, diff2))
    760 	return false;
    761     }
    762   return true;
    763 }
    764 
    765 /* Return true if all elements of NPATTERNS are equal.
    766 
    767    E.g. NPATTERNS = 4:
    768      { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... }
    769    E.g. NPATTERNS = 8:
    770      { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... }
    771    We only check ele[0] ~ ele[NPATTERNS - 1] whether they are the same.
    772    We don't need to check the elements[n] with n >= NPATTERNS since
    773    they don't belong to the same pattern.
    774 */
    775 bool
    776 rvv_builder::npatterns_all_equal_p () const
    777 {
    778   poly_int64 ele0 = rtx_to_poly_int64 (elt (0));
    779   for (unsigned int i = 1; i < npatterns (); i++)
    780     {
    781       poly_int64 ele = rtx_to_poly_int64 (elt (i));
    782       if (!known_eq (ele, ele0))
    783 	return false;
    784     }
    785   return true;
    786 }
    787 
    788 static unsigned
    789 get_sew (machine_mode mode)
    790 {
    791   unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
    792 		       ? 8
    793 		       : GET_MODE_BITSIZE (GET_MODE_INNER (mode));
    794   return sew;
    795 }
    796 
    797 /* Return true if X is a const_vector with all duplicate elements, which is in
    798    the range between MINVAL and MAXVAL.  */
    799 bool
    800 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval,
    801 			       HOST_WIDE_INT maxval)
    802 {
    803   rtx elt;
    804   return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt)
    805 	  && IN_RANGE (INTVAL (elt), minval, maxval));
    806 }
    807 
    808 /* Return true if VEC is a constant in which every element is in the range
    809    [MINVAL, MAXVAL].  The elements do not need to have the same value.
    810 
    811    This function also exists in aarch64, we may unify it in middle-end in the
    812    future.  */
    813 
    814 static bool
    815 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval)
    816 {
    817   if (!CONST_VECTOR_P (vec)
    818       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
    819     return false;
    820 
    821   int nunits;
    822   if (!CONST_VECTOR_STEPPED_P (vec))
    823     nunits = const_vector_encoded_nelts (vec);
    824   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
    825     return false;
    826 
    827   for (int i = 0; i < nunits; i++)
    828     {
    829       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
    830       poly_int64 value;
    831       if (!poly_int_rtx_p (vec_elem, &value)
    832 	  || maybe_lt (value, minval)
    833 	  || maybe_gt (value, maxval))
    834 	return false;
    835     }
    836   return true;
    837 }
    838 
    839 /* Return a const vector of VAL. The VAL can be either const_int or
    840    const_poly_int.  */
    841 
    842 static rtx
    843 gen_const_vector_dup (machine_mode mode, poly_int64 val)
    844 {
    845   scalar_mode smode = GET_MODE_INNER (mode);
    846   rtx c = gen_int_mode (val, smode);
    847   if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode))
    848     {
    849       /* When VAL is const_poly_int value, we need to explicitly broadcast
    850 	 it into a vector using RVV broadcast instruction.  */
    851       return expand_vector_broadcast (mode, c);
    852     }
    853    return gen_const_vec_duplicate (mode, c);
    854 }
    855 
    856 /* Emit a vlmax vsetvl instruction.  This should only be used when
    857    optimization is disabled or after vsetvl insertion pass.  */
    858 void
    859 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl)
    860 {
    861   unsigned int sew = get_sew (vmode);
    862   emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode),
    863 			 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx,
    864 			 const0_rtx));
    865 }
    866 
    867 void
    868 emit_vlmax_vsetvl (machine_mode vmode, rtx vl)
    869 {
    870   unsigned int sew = get_sew (vmode);
    871   enum vlmul_type vlmul = get_vlmul (vmode);
    872   unsigned int ratio = calculate_ratio (sew, vlmul);
    873 
    874   if (!optimize)
    875     emit_hard_vlmax_vsetvl (vmode, vl);
    876   else
    877     emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode)));
    878 }
    879 
    880 /* Calculate SEW/LMUL ratio.  */
    881 unsigned int
    882 calculate_ratio (unsigned int sew, enum vlmul_type vlmul)
    883 {
    884   unsigned int ratio;
    885   switch (vlmul)
    886     {
    887     case LMUL_1:
    888       ratio = sew;
    889       break;
    890     case LMUL_2:
    891       ratio = sew / 2;
    892       break;
    893     case LMUL_4:
    894       ratio = sew / 4;
    895       break;
    896     case LMUL_8:
    897       ratio = sew / 8;
    898       break;
    899     case LMUL_F8:
    900       ratio = sew * 8;
    901       break;
    902     case LMUL_F4:
    903       ratio = sew * 4;
    904       break;
    905     case LMUL_F2:
    906       ratio = sew * 2;
    907       break;
    908     default:
    909       gcc_unreachable ();
    910     }
    911   return ratio;
    912 }
    913 
    914 /* SCALABLE means that the vector-length is agnostic (run-time invariant and
    915    compile-time unknown). ZVL meands that the vector-length is specific
    916    (compile-time known by march like zvl*b). Both SCALABLE and ZVL are doing
    917    auto-vectorization using VLMAX vsetvl configuration.  */
    918 static bool
    919 autovec_use_vlmax_p (void)
    920 {
    921   return rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE
    922 	  || rvv_vector_bits == RVV_VECTOR_BITS_ZVL;
    923 }
    924 
    925 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel
    926    is a const duplicate vector. Otherwise, emit vrgather.vv.  */
    927 static void
    928 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel)
    929 {
    930   rtx elt;
    931   insn_code icode;
    932   machine_mode data_mode = GET_MODE (target);
    933   machine_mode sel_mode = GET_MODE (sel);
    934   if (const_vec_duplicate_p (sel, &elt))
    935     {
    936       icode = code_for_pred_gather_scalar (data_mode);
    937       sel = elt;
    938     }
    939   else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
    940     icode = code_for_pred_gatherei16 (data_mode);
    941   else
    942     icode = code_for_pred_gather (data_mode);
    943   rtx ops[] = {target, op, sel};
    944   emit_vlmax_insn (icode, BINARY_OP, ops);
    945 }
    946 
    947 static void
    948 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask)
    949 {
    950   rtx elt;
    951   insn_code icode;
    952   machine_mode data_mode = GET_MODE (target);
    953   machine_mode sel_mode = GET_MODE (sel);
    954   if (const_vec_duplicate_p (sel, &elt))
    955     {
    956       icode = code_for_pred_gather_scalar (data_mode);
    957       sel = elt;
    958     }
    959   else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode)))
    960     icode = code_for_pred_gatherei16 (data_mode);
    961   else
    962     icode = code_for_pred_gather (data_mode);
    963   rtx ops[] = {target, mask, target, op, sel};
    964   emit_vlmax_insn (icode, BINARY_OP_TAMU, ops);
    965 }
    966 
    967 /* According to RVV ISA spec (16.5.1. Synthesizing vdecompress):
    968    https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc
    969 
    970   There is no inverse vdecompress provided, as this operation can be readily
    971   synthesized using iota and a masked vrgather:
    972 
    973       Desired functionality of 'vdecompress'
    974 	7 6 5 4 3 2 1 0     # vid
    975 
    976 	      e d c b a     # packed vector of 5 elements
    977 	1 0 0 1 1 1 0 1     # mask vector of 8 elements
    978 	p q r s t u v w     # destination register before vdecompress
    979 
    980 	e q r d c b v a     # result of vdecompress
    981        # v0 holds mask
    982        # v1 holds packed data
    983        # v11 holds input expanded vector and result
    984        viota.m v10, v0                 # Calc iota from mask in v0
    985        vrgather.vv v11, v1, v10, v0.t  # Expand into destination
    986      p q r s t u v w  # v11 destination register
    987 	   e d c b a  # v1 source vector
    988      1 0 0 1 1 1 0 1  # v0 mask vector
    989 
    990      4 4 4 3 2 1 1 0  # v10 result of viota.m
    991      e q r d c b v a  # v11 destination after vrgather using viota.m under mask
    992 */
    993 static void
    994 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
    995 {
    996   machine_mode data_mode = GET_MODE (target);
    997   machine_mode sel_mode = related_int_vector_mode (data_mode).require ();
    998   if (GET_MODE_INNER (data_mode) == QImode)
    999     sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require ();
   1000 
   1001   rtx sel = gen_reg_rtx (sel_mode);
   1002   rtx iota_ops[] = {sel, mask};
   1003   emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops);
   1004   emit_vlmax_gather_insn (target, op0, sel);
   1005   emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
   1006 }
   1007 
   1008 /* Emit merge instruction.  */
   1009 
   1010 static machine_mode
   1011 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder,
   1012 					 machine_mode mask_bit_mode)
   1013 {
   1014   unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant ();
   1015   unsigned mask_scalar_size = mask_precision > builder.inner_bits_size ()
   1016     ? builder.inner_bits_size () : mask_precision;
   1017 
   1018   scalar_mode inner_mode;
   1019   unsigned minimal_bits_size;
   1020 
   1021   switch (mask_scalar_size)
   1022     {
   1023       case 8:
   1024 	inner_mode = QImode;
   1025 	minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8.  */
   1026 	break;
   1027       case 16:
   1028 	inner_mode = HImode;
   1029 	minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4.  */
   1030 	break;
   1031       case 32:
   1032 	inner_mode = SImode;
   1033 	minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2.  */
   1034 	break;
   1035       case 64:
   1036 	inner_mode = DImode;
   1037 	minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1.  */
   1038 	break;
   1039       default:
   1040 	gcc_unreachable ();
   1041 	break;
   1042     }
   1043 
   1044   gcc_assert (mask_precision % mask_scalar_size == 0);
   1045 
   1046   uint64_t dup_nunit = mask_precision > mask_scalar_size
   1047     ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size;
   1048 
   1049   return get_vector_mode (inner_mode, dup_nunit).require ();
   1050 }
   1051 
   1052 /* Expand series const vector.  If VID is NULL_RTX, we use vid.v
   1053    instructions to generate sequence for VID:
   1054 
   1055      VID = { 0, 1, 2, 3, ... }
   1056 
   1057    Otherwise, we use the VID argument directly.  */
   1058 
   1059 void
   1060 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid)
   1061 {
   1062   machine_mode mode = GET_MODE (dest);
   1063   poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1;
   1064   poly_int64 value;
   1065   rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode);
   1066 
   1067   /* VECT_IV = BASE + I * STEP.  */
   1068 
   1069   /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v.  */
   1070   bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx)
   1071 		   && poly_int_rtx_p (base, &value)
   1072 		   && known_eq (nunits_m1, value);
   1073   if (!vid)
   1074     {
   1075       vid = gen_reg_rtx (mode);
   1076       rtx op[] = {vid};
   1077       emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op);
   1078     }
   1079 
   1080   rtx step_adj;
   1081   if (reverse_p)
   1082     {
   1083       /* Special case:
   1084 	   {nunits - 1, nunits - 2, ... , 0}.
   1085 	   nunits can be either const_int or const_poly_int.
   1086 
   1087 	 Code sequence:
   1088 	   vid.v v
   1089 	   vrsub nunits - 1, v.  */
   1090       rtx ops[]
   1091 	= {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))};
   1092       insn_code icode = code_for_pred_sub_reverse_scalar (mode);
   1093       emit_vlmax_insn (icode, BINARY_OP, ops);
   1094     }
   1095   else
   1096     {
   1097       /* Step 2: Generate I * STEP.
   1098 	 - STEP is 1, we don't emit any instructions.
   1099 	 - STEP is power of 2, we use vsll.vi/vsll.vx.
   1100 	 - STEP is non-power of 2, we use vmul.vx.  */
   1101       if (rtx_equal_p (step, const1_rtx))
   1102 	step_adj = vid;
   1103       else
   1104 	{
   1105 	  step_adj = gen_reg_rtx (mode);
   1106 	  if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step)))
   1107 	    {
   1108 	      /* Emit logical left shift operation.  */
   1109 	      int shift = exact_log2 (INTVAL (step));
   1110 	      rtx shift_amount = gen_int_mode (shift, Pmode);
   1111 	      insn_code icode = code_for_pred_scalar (ASHIFT, mode);
   1112 	      rtx ops[] = {step_adj, vid, shift_amount};
   1113 	      emit_vlmax_insn (icode, BINARY_OP, ops);
   1114 	    }
   1115 	  else
   1116 	    {
   1117 	      insn_code icode = code_for_pred_scalar (MULT, mode);
   1118 	      rtx ops[] = {step_adj, vid, step};
   1119 	      emit_vlmax_insn (icode, BINARY_OP, ops);
   1120 	    }
   1121 	}
   1122 
   1123       /* Step 3: Generate BASE + I * STEP.
   1124 	  - BASE is 0, use result of vid.
   1125 	  - BASE is not 0, we use vadd.vx/vadd.vi.  */
   1126       if (rtx_equal_p (base, const0_rtx))
   1127 	emit_move_insn (result, step_adj);
   1128       else
   1129 	{
   1130 	  insn_code icode = code_for_pred_scalar (PLUS, mode);
   1131 	  rtx ops[] = {result, step_adj, base};
   1132 	  emit_vlmax_insn (icode, BINARY_OP, ops);
   1133 	}
   1134     }
   1135 
   1136   if (result != dest)
   1137     emit_move_insn (dest, result);
   1138 }
   1139 
   1140 static void
   1141 expand_const_vector (rtx target, rtx src)
   1142 {
   1143   machine_mode mode = GET_MODE (target);
   1144   if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
   1145     {
   1146       rtx elt;
   1147       gcc_assert (
   1148 	const_vec_duplicate_p (src, &elt)
   1149 	&& (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx)));
   1150       rtx ops[] = {target, src};
   1151       emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops);
   1152       return;
   1153     }
   1154 
   1155   rtx elt;
   1156   if (const_vec_duplicate_p (src, &elt))
   1157     {
   1158       rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode);
   1159       /* Element in range -16 ~ 15 integer or 0.0 floating-point,
   1160 	 we use vmv.v.i instruction.  */
   1161       if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src))
   1162 	{
   1163 	  rtx ops[] = {tmp, src};
   1164 	  emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops);
   1165 	}
   1166       else
   1167 	{
   1168 	  /* Emit vec_duplicate<mode> split pattern before RA so that
   1169 	     we could have a better optimization opportunity in LICM
   1170 	     which will hoist vmv.v.x outside the loop and in fwprop && combine
   1171 	     which will transform 'vv' into 'vx' instruction.
   1172 
   1173 	     The reason we don't emit vec_duplicate<mode> split pattern during
   1174 	     RA since the split stage after RA is a too late stage to generate
   1175 	     RVV instruction which need an additional register (We can't
   1176 	     allocate a new register after RA) for VL operand of vsetvl
   1177 	     instruction (vsetvl a5, zero).  */
   1178 	  if (lra_in_progress)
   1179 	    {
   1180 	      rtx ops[] = {tmp, elt};
   1181 	      emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops);
   1182 	    }
   1183 	  else
   1184 	    {
   1185 	      struct expand_operand ops[2];
   1186 	      enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
   1187 	      gcc_assert (icode != CODE_FOR_nothing);
   1188 	      create_output_operand (&ops[0], tmp, mode);
   1189 	      create_input_operand (&ops[1], elt, GET_MODE_INNER (mode));
   1190 	      expand_insn (icode, 2, ops);
   1191 	      tmp = ops[0].value;
   1192 	    }
   1193 	}
   1194 
   1195       if (tmp != target)
   1196 	emit_move_insn (target, tmp);
   1197       return;
   1198     }
   1199 
   1200   /* Support scalable const series vector.  */
   1201   rtx base, step;
   1202   if (const_vec_series_p (src, &base, &step))
   1203     {
   1204       expand_vec_series (target, base, step);
   1205       return;
   1206     }
   1207 
   1208   /* Handle variable-length vector.  */
   1209   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
   1210   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
   1211   rvv_builder builder (mode, npatterns, nelts_per_pattern);
   1212   for (unsigned int i = 0; i < nelts_per_pattern; i++)
   1213     {
   1214       for (unsigned int j = 0; j < npatterns; j++)
   1215 	builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j));
   1216     }
   1217   builder.finalize ();
   1218 
   1219   if (CONST_VECTOR_DUPLICATE_P (src))
   1220     {
   1221       /* Handle the case with repeating sequence that NELTS_PER_PATTERN = 1
   1222 	 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... }
   1223 	      NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... }
   1224 	The elements within NPATTERNS are not necessary regular.  */
   1225       if (builder.can_duplicate_repeating_sequence_p ())
   1226 	{
   1227 	  /* We handle the case that we can find a vector containter to hold
   1228 	     element bitsize = NPATTERNS * ele_bitsize.
   1229 
   1230 	       NPATTERNS = 8, element width = 8
   1231 		 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
   1232 	       In this case, we can combine NPATTERNS element into a larger
   1233 	       element. Use element width = 64 and broadcast a vector with
   1234 	       all element equal to 0x0706050403020100.  */
   1235 	  rtx ele = builder.get_merged_repeating_sequence ();
   1236 	  rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
   1237 	  emit_move_insn (target, gen_lowpart (mode, dup));
   1238 	}
   1239       else
   1240 	{
   1241 	  /* We handle the case that we can't find a vector containter to hold
   1242 	     element bitsize = NPATTERNS * ele_bitsize.
   1243 
   1244 	       NPATTERNS = 8, element width = 16
   1245 		 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... }
   1246 	       Since NPATTERNS * element width = 128, we can't find a container
   1247 	       to hold it.
   1248 
   1249 	       In this case, we use NPATTERNS merge operations to generate such
   1250 	       vector.  */
   1251 	  unsigned int nbits = npatterns - 1;
   1252 
   1253 	  /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
   1254 	  rtx vid = gen_reg_rtx (builder.int_mode ());
   1255 	  rtx op[] = {vid};
   1256 	  emit_vlmax_insn (code_for_pred_series (builder.int_mode ()),
   1257 			    NULLARY_OP, op);
   1258 
   1259 	  /* Generate vid_repeat = { 0, 1, ... nbits, ... }  */
   1260 	  rtx vid_repeat = gen_reg_rtx (builder.int_mode ());
   1261 	  rtx and_ops[] = {vid_repeat, vid,
   1262 			   gen_int_mode (nbits, builder.inner_int_mode ())};
   1263 	  emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
   1264 			    BINARY_OP, and_ops);
   1265 
   1266 	  rtx tmp = gen_reg_rtx (builder.mode ());
   1267 	  rtx dup_ops[] = {tmp, builder.elt (0)};
   1268 	  emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP,
   1269 			    dup_ops);
   1270 	  for (unsigned int i = 1; i < builder.npatterns (); i++)
   1271 	    {
   1272 	      /* Generate mask according to i.  */
   1273 	      rtx mask = gen_reg_rtx (builder.mask_mode ());
   1274 	      rtx const_vec = gen_const_vector_dup (builder.int_mode (), i);
   1275 	      expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
   1276 
   1277 	      /* Merge scalar to each i.  */
   1278 	      rtx tmp2 = gen_reg_rtx (builder.mode ());
   1279 	      rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask};
   1280 	      insn_code icode = code_for_pred_merge_scalar (builder.mode ());
   1281 	      emit_vlmax_insn (icode, MERGE_OP, merge_ops);
   1282 	      tmp = tmp2;
   1283 	    }
   1284 	  emit_move_insn (target, tmp);
   1285 	}
   1286     }
   1287   else if (CONST_VECTOR_STEPPED_P (src))
   1288     {
   1289       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
   1290       if (builder.single_step_npatterns_p ())
   1291 	{
   1292 	  /* Describe the case by choosing NPATTERNS = 4 as an example.  */
   1293 	  insn_code icode;
   1294 
   1295 	  /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
   1296 	  rtx vid = gen_reg_rtx (builder.mode ());
   1297 	  rtx vid_ops[] = {vid};
   1298 	  icode = code_for_pred_series (builder.mode ());
   1299 	  emit_vlmax_insn (icode, NULLARY_OP, vid_ops);
   1300 
   1301 	  if (builder.npatterns_all_equal_p ())
   1302 	    {
   1303 	      /* Generate the variable-length vector following this rule:
   1304 		 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...}
   1305 		   E.g. { 0, 0, 8, 8, 16, 16, ... } */
   1306 
   1307 	      /* We want to create a pattern where value[idx] = floor (idx /
   1308 		 NPATTERNS). As NPATTERNS is always a power of two we can
   1309 		 rewrite this as = idx & -NPATTERNS.  */
   1310 	      /* Step 2: VID AND -NPATTERNS:
   1311 		 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... }
   1312 	      */
   1313 	      rtx imm
   1314 		= gen_int_mode (-builder.npatterns (), builder.inner_mode ());
   1315 	      rtx tmp1 = gen_reg_rtx (builder.mode ());
   1316 	      rtx and_ops[] = {tmp1, vid, imm};
   1317 	      icode = code_for_pred_scalar (AND, builder.mode ());
   1318 	      emit_vlmax_insn (icode, BINARY_OP, and_ops);
   1319 
   1320 	      /* Step 3: Convert to step size 1.  */
   1321 	      rtx tmp2 = gen_reg_rtx (builder.mode ());
   1322 	      /* log2 (npatterns) to get the shift amount to convert
   1323 		 Eg.  { 0, 0, 0, 0, 4, 4, ... }
   1324 		 into { 0, 0, 0, 0, 1, 1, ... }.  */
   1325 	      HOST_WIDE_INT shift_amt = exact_log2 (builder.npatterns ()) ;
   1326 	      rtx shift = gen_int_mode (shift_amt, builder.inner_mode ());
   1327 	      rtx shift_ops[] = {tmp2, tmp1, shift};
   1328 	      icode = code_for_pred_scalar (ASHIFTRT, builder.mode ());
   1329 	      emit_vlmax_insn (icode, BINARY_OP, shift_ops);
   1330 
   1331 	      /* Step 4: Multiply to step size n.  */
   1332 	      HOST_WIDE_INT step_size =
   1333 		INTVAL (builder.elt (builder.npatterns ()))
   1334 		- INTVAL (builder.elt (0));
   1335 	      rtx tmp3 = gen_reg_rtx (builder.mode ());
   1336 	      if (pow2p_hwi (step_size))
   1337 		{
   1338 		  /* Power of 2 can be handled with a left shift.  */
   1339 		  HOST_WIDE_INT shift = exact_log2 (step_size);
   1340 		  rtx shift_amount = gen_int_mode (shift, Pmode);
   1341 		  insn_code icode = code_for_pred_scalar (ASHIFT, mode);
   1342 		  rtx ops[] = {tmp3, tmp2, shift_amount};
   1343 		  emit_vlmax_insn (icode, BINARY_OP, ops);
   1344 		}
   1345 	      else
   1346 		{
   1347 		  rtx mult_amt = gen_int_mode (step_size, builder.inner_mode ());
   1348 		  insn_code icode = code_for_pred_scalar (MULT, builder.mode ());
   1349 		  rtx ops[] = {tmp3, tmp2, mult_amt};
   1350 		  emit_vlmax_insn (icode, BINARY_OP, ops);
   1351 		}
   1352 
   1353 	      /* Step 5: Add starting value to all elements.  */
   1354 	      HOST_WIDE_INT init_val = INTVAL (builder.elt (0));
   1355 	      if (init_val == 0)
   1356 		emit_move_insn (target, tmp3);
   1357 	      else
   1358 		{
   1359 		  rtx dup = gen_const_vector_dup (builder.mode (), init_val);
   1360 		  rtx add_ops[] = {target, tmp3, dup};
   1361 		  icode = code_for_pred (PLUS, builder.mode ());
   1362 		  emit_vlmax_insn (icode, BINARY_OP, add_ops);
   1363 		}
   1364 	    }
   1365 	  else
   1366 	    {
   1367 	      /* Generate the variable-length vector following this rule:
   1368 		{ a, b, a + step, b + step, a + step*2, b + step*2, ... }  */
   1369 
   1370 	      if (builder.npatterns_vid_diff_repeated_p ())
   1371 		{
   1372 		  /* Case 1: For example as below:
   1373 		     {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... }
   1374 		     We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is
   1375 		     repeated as below after minus vid.
   1376 		     {3, 1, -1, -3, 3, 1, -1, -3...}
   1377 		     Then we can simplify the diff code gen to at most
   1378 		     npatterns().  */
   1379 		  rvv_builder v (builder.mode (), builder.npatterns (), 1);
   1380 
   1381 		  /* Step 1: Generate diff = TARGET - VID.  */
   1382 		  for (unsigned int i = 0; i < v.npatterns (); ++i)
   1383 		    {
   1384 		     poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
   1385 		     v.quick_push (gen_int_mode (diff, v.inner_mode ()));
   1386 		    }
   1387 
   1388 		  /* Step 2: Generate result = VID + diff.  */
   1389 		  rtx vec = v.build ();
   1390 		  rtx add_ops[] = {target, vid, vec};
   1391 		  emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
   1392 				   BINARY_OP, add_ops);
   1393 		}
   1394 	      else
   1395 		{
   1396 		  /* Case 2: For example as below:
   1397 		     { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... }
   1398 		   */
   1399 		  rvv_builder v (builder.mode (), builder.npatterns (), 1);
   1400 
   1401 		  /* Step 1: Generate { a, b, a, b, ... }  */
   1402 		  for (unsigned int i = 0; i < v.npatterns (); ++i)
   1403 		    v.quick_push (builder.elt (i));
   1404 		  rtx new_base = v.build ();
   1405 
   1406 		  /* Step 2: Generate tmp = VID >> LOG2 (NPATTERNS). */
   1407 		  rtx shift_count
   1408 		    = gen_int_mode (exact_log2 (builder.npatterns ()),
   1409 				    builder.inner_mode ());
   1410 		  rtx tmp = expand_simple_binop (builder.mode (), LSHIFTRT,
   1411 						 vid, shift_count, NULL_RTX,
   1412 						 false, OPTAB_DIRECT);
   1413 
   1414 		  /* Step 3: Generate tmp2 = tmp * step. */
   1415 		  rtx tmp2 = gen_reg_rtx (builder.mode ());
   1416 		  rtx step
   1417 		    = simplify_binary_operation (MINUS, builder.inner_mode (),
   1418 						 builder.elt (v.npatterns()),
   1419 						 builder.elt (0));
   1420 		  expand_vec_series (tmp2, const0_rtx, step, tmp);
   1421 
   1422 		  /* Step 4: Generate target = tmp2 + new_base. */
   1423 		  rtx add_ops[] = {target, tmp2, new_base};
   1424 		  emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()),
   1425 				   BINARY_OP, add_ops);
   1426 		}
   1427 	    }
   1428 	}
   1429       else if (builder.interleaved_stepped_npatterns_p ())
   1430 	{
   1431 	  rtx base1 = builder.elt (0);
   1432 	  rtx base2 = builder.elt (1);
   1433 	  poly_int64 step1
   1434 	    = rtx_to_poly_int64 (builder.elt (builder.npatterns ()))
   1435 	      - rtx_to_poly_int64 (base1);
   1436 	  poly_int64 step2
   1437 	    = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1))
   1438 	      - rtx_to_poly_int64 (base2);
   1439 
   1440 	  /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use larger EEW
   1441 	     integer vector mode to generate such vector efficiently.
   1442 
   1443 	     E.g. EEW = 16, { 2, 0, 4, 0, ... }
   1444 
   1445 	     can be interpreted into:
   1446 
   1447 		  EEW = 32, { 2, 4, ... }.
   1448 
   1449 	     This only works as long as the larger type does not overflow
   1450 	     as we can't guarantee a zero value for each second element
   1451 	     of the sequence with smaller EEW.
   1452 	     ??? For now we assume that no overflow happens with positive
   1453 	     steps and forbid negative steps altogether.  */
   1454 	  unsigned int new_smode_bitsize = builder.inner_bits_size () * 2;
   1455 	  scalar_int_mode new_smode;
   1456 	  machine_mode new_mode;
   1457 	  poly_uint64 new_nunits
   1458 	    = exact_div (GET_MODE_NUNITS (builder.mode ()), 2);
   1459 	  if (known_ge (step1, 0) && known_ge (step2, 0)
   1460 	      && int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode)
   1461 	      && get_vector_mode (new_smode, new_nunits).exists (&new_mode))
   1462 	    {
   1463 	      rtx tmp = gen_reg_rtx (new_mode);
   1464 	      base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode);
   1465 	      expand_vec_series (tmp, base1, gen_int_mode (step1, new_smode));
   1466 
   1467 	      if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0))
   1468 		/* { 1, 0, 2, 0, ... }.  */
   1469 		emit_move_insn (target, gen_lowpart (mode, tmp));
   1470 	      else if (known_eq (step2, 0))
   1471 		{
   1472 		  /* { 1, 1, 2, 1, ... }.  */
   1473 		  rtx scalar = expand_simple_binop (
   1474 		    new_smode, ASHIFT,
   1475 		    gen_int_mode (rtx_to_poly_int64 (base2), new_smode),
   1476 		    gen_int_mode (builder.inner_bits_size (), new_smode),
   1477 		    NULL_RTX, false, OPTAB_DIRECT);
   1478 		  rtx tmp2 = gen_reg_rtx (new_mode);
   1479 		  rtx and_ops[] = {tmp2, tmp, scalar};
   1480 		  emit_vlmax_insn (code_for_pred_scalar (AND, new_mode),
   1481 				   BINARY_OP, and_ops);
   1482 		  emit_move_insn (target, gen_lowpart (mode, tmp2));
   1483 		}
   1484 	      else
   1485 		{
   1486 		  /* { 1, 3, 2, 6, ... }.  */
   1487 		  rtx tmp2 = gen_reg_rtx (new_mode);
   1488 		  base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode);
   1489 		  expand_vec_series (tmp2, base2,
   1490 				     gen_int_mode (step2, new_smode));
   1491 		  rtx shifted_tmp2 = expand_simple_binop (
   1492 		    new_mode, ASHIFT, tmp2,
   1493 		    gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX,
   1494 		    false, OPTAB_DIRECT);
   1495 		  rtx tmp3 = gen_reg_rtx (new_mode);
   1496 		  rtx ior_ops[] = {tmp3, tmp, shifted_tmp2};
   1497 		  emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP,
   1498 				   ior_ops);
   1499 		  emit_move_insn (target, gen_lowpart (mode, tmp3));
   1500 		}
   1501 	    }
   1502 	  else
   1503 	    {
   1504 	      rtx vid = gen_reg_rtx (mode);
   1505 	      expand_vec_series (vid, const0_rtx, const1_rtx);
   1506 	      /* Transform into { 0, 0, 1, 1, 2, 2, ... }.  */
   1507 	      rtx shifted_vid
   1508 		= expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx,
   1509 				       NULL_RTX, false, OPTAB_DIRECT);
   1510 	      rtx tmp1 = gen_reg_rtx (mode);
   1511 	      rtx tmp2 = gen_reg_rtx (mode);
   1512 	      expand_vec_series (tmp1, base1,
   1513 				 gen_int_mode (step1, builder.inner_mode ()),
   1514 				 shifted_vid);
   1515 	      expand_vec_series (tmp2, base2,
   1516 				 gen_int_mode (step2, builder.inner_mode ()),
   1517 				 shifted_vid);
   1518 
   1519 	      /* Transform into { 0, 1, 0, 1, 0, 1, ... }.  */
   1520 	      rtx and_vid = gen_reg_rtx (mode);
   1521 	      rtx and_ops[] = {and_vid, vid, const1_rtx};
   1522 	      emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP,
   1523 			       and_ops);
   1524 	      rtx mask = gen_reg_rtx (builder.mask_mode ());
   1525 	      expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode));
   1526 
   1527 	      rtx ops[] = {target, tmp1, tmp2, mask};
   1528 	      emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops);
   1529 	    }
   1530 	}
   1531       else if (npatterns == 1 && nelts_per_pattern == 3)
   1532 	{
   1533 	  /* Generate the following CONST_VECTOR:
   1534 	     { base0, base1, base1 + step, base1 + step * 2, ... }  */
   1535 	  rtx base0 = builder.elt (0);
   1536 	  rtx base1 = builder.elt (1);
   1537 	  rtx base2 = builder.elt (2);
   1538 
   1539 	  rtx step = simplify_binary_operation (MINUS, builder.inner_mode (),
   1540 						base2, base1);
   1541 
   1542 	  /* Step 1 - { base1, base1 + step, base1 + step * 2, ... }  */
   1543 	  rtx tmp = gen_reg_rtx (mode);
   1544 	  expand_vec_series (tmp, base1, step);
   1545 	  /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... }  */
   1546 	  if (!rtx_equal_p (base0, const0_rtx))
   1547 	    base0 = force_reg (builder.inner_mode (), base0);
   1548 
   1549 	  insn_code icode = optab_handler (vec_shl_insert_optab, mode);
   1550 	  gcc_assert (icode != CODE_FOR_nothing);
   1551 	  emit_insn (GEN_FCN (icode) (target, tmp, base0));
   1552 	}
   1553       else
   1554 	/* TODO: We will enable more variable-length vector in the future.  */
   1555 	gcc_unreachable ();
   1556     }
   1557   else
   1558     gcc_unreachable ();
   1559 }
   1560 
   1561 /* Get the frm mode with given CONST_INT rtx, the default mode is
   1562    FRM_DYN.  */
   1563 enum floating_point_rounding_mode
   1564 get_frm_mode (rtx operand)
   1565 {
   1566   gcc_assert (CONST_INT_P (operand));
   1567 
   1568   switch (INTVAL (operand))
   1569     {
   1570     case FRM_RNE:
   1571       return FRM_RNE;
   1572     case FRM_RTZ:
   1573       return FRM_RTZ;
   1574     case FRM_RDN:
   1575       return FRM_RDN;
   1576     case FRM_RUP:
   1577       return FRM_RUP;
   1578     case FRM_RMM:
   1579       return FRM_RMM;
   1580     case FRM_DYN:
   1581       return FRM_DYN;
   1582     default:
   1583       gcc_unreachable ();
   1584     }
   1585 
   1586   gcc_unreachable ();
   1587 }
   1588 
   1589 /* Expand a pre-RA RVV data move from SRC to DEST.
   1590    It expands move for RVV fractional vector modes.
   1591    Return true if the move has already been emitted.  */
   1592 bool
   1593 legitimize_move (rtx dest, rtx *srcp)
   1594 {
   1595   rtx src = *srcp;
   1596   machine_mode mode = GET_MODE (dest);
   1597   if (CONST_VECTOR_P (src))
   1598     {
   1599       expand_const_vector (dest, src);
   1600       return true;
   1601     }
   1602 
   1603   if (riscv_v_ext_vls_mode_p (mode))
   1604     {
   1605       if (GET_MODE_NUNITS (mode).to_constant () <= 31)
   1606 	{
   1607 	  /* For NUNITS <= 31 VLS modes, we don't need extrac
   1608 	     scalar regisers so we apply the naive (set (op0) (op1)) pattern. */
   1609 	  if (can_create_pseudo_p ())
   1610 	    {
   1611 	      /* Need to force register if mem <- !reg.  */
   1612 	      if (MEM_P (dest) && !REG_P (src))
   1613 		*srcp = force_reg (mode, src);
   1614 
   1615 	      return false;
   1616 	    }
   1617 	}
   1618       else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress)
   1619 	{
   1620 	  emit_insn (gen_mov_lra (mode, Pmode, dest, src));
   1621 	  return true;
   1622 	}
   1623     }
   1624   else
   1625     {
   1626       /* In order to decrease the memory traffic, we don't use whole register
   1627        * load/store for the LMUL less than 1 and mask mode, so those case will
   1628        * require one extra general purpose register, but it's not allowed during
   1629        * LRA process, so we have a special move pattern used for LRA, which will
   1630        * defer the expansion after LRA.  */
   1631       if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
   1632 	   || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
   1633 	  && lra_in_progress)
   1634 	{
   1635 	  emit_insn (gen_mov_lra (mode, Pmode, dest, src));
   1636 	  return true;
   1637 	}
   1638 
   1639       if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR)
   1640 	  && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
   1641 	{
   1642 	  /* Need to force register if mem <- !reg.  */
   1643 	  if (MEM_P (dest) && !REG_P (src))
   1644 	    *srcp = force_reg (mode, src);
   1645 
   1646 	  return false;
   1647 	}
   1648     }
   1649 
   1650   if (register_operand (src, mode) && register_operand (dest, mode))
   1651     {
   1652       emit_insn (gen_rtx_SET (dest, src));
   1653       return true;
   1654     }
   1655 
   1656   unsigned insn_flags
   1657     = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP;
   1658   if (!register_operand (src, mode) && !register_operand (dest, mode))
   1659     {
   1660       rtx tmp = gen_reg_rtx (mode);
   1661       if (MEM_P (src))
   1662 	{
   1663 	  rtx ops[] = {tmp, src};
   1664 	  emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
   1665 	}
   1666       else
   1667 	emit_move_insn (tmp, src);
   1668       src = tmp;
   1669     }
   1670 
   1671   if (satisfies_constraint_vu (src))
   1672     return false;
   1673 
   1674   rtx ops[] = {dest, src};
   1675   emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops);
   1676   return true;
   1677 }
   1678 
   1679 /* VTYPE information for machine_mode.  */
   1680 struct mode_vtype_group
   1681 {
   1682   enum vlmul_type vlmul[NUM_MACHINE_MODES];
   1683   uint8_t ratio[NUM_MACHINE_MODES];
   1684   machine_mode subpart_mode[NUM_MACHINE_MODES];
   1685   uint8_t nf[NUM_MACHINE_MODES];
   1686   mode_vtype_group ()
   1687   {
   1688 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO)                                 \
   1689   vlmul[MODE##mode] = VLMUL;                                                   \
   1690   ratio[MODE##mode] = RATIO;
   1691 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO)         \
   1692   subpart_mode[MODE##mode] = SUBPART_MODE##mode;                               \
   1693   nf[MODE##mode] = NF;                                                         \
   1694   vlmul[MODE##mode] = VLMUL;                                                   \
   1695   ratio[MODE##mode] = RATIO;
   1696 #include "riscv-vector-switch.def"
   1697 #undef ENTRY
   1698 #undef TUPLE_ENTRY
   1699   }
   1700 };
   1701 
   1702 static mode_vtype_group mode_vtype_infos;
   1703 
   1704 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR.  */
   1705 enum vlmul_type
   1706 get_vlmul (machine_mode mode)
   1707 {
   1708   /* For VLS modes, the vlmul should be dynamically
   1709      calculated since we need to adjust VLMUL according
   1710      to TARGET_MIN_VLEN.  */
   1711   if (riscv_v_ext_vls_mode_p (mode))
   1712     {
   1713       int size = GET_MODE_BITSIZE (mode).to_constant ();
   1714       int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
   1715       if (size < TARGET_MIN_VLEN)
   1716 	{
   1717 	  int factor = TARGET_MIN_VLEN / size;
   1718 	  if (inner_size == 8)
   1719 	    factor = MIN (factor, 8);
   1720 	  else if (inner_size == 16)
   1721 	    factor = MIN (factor, 4);
   1722 	  else if (inner_size == 32)
   1723 	    factor = MIN (factor, 2);
   1724 	  else if (inner_size == 64)
   1725 	    factor = MIN (factor, 1);
   1726 	  else
   1727 	    gcc_unreachable ();
   1728 
   1729 	  switch (factor)
   1730 	    {
   1731 	    case 1:
   1732 	      return LMUL_1;
   1733 	    case 2:
   1734 	      return LMUL_F2;
   1735 	    case 4:
   1736 	      return LMUL_F4;
   1737 	    case 8:
   1738 	      return LMUL_F8;
   1739 
   1740 	    default:
   1741 	      gcc_unreachable ();
   1742 	    }
   1743 	}
   1744       else
   1745 	{
   1746 	  int factor = size / TARGET_MIN_VLEN;
   1747 	  switch (factor)
   1748 	    {
   1749 	    case 1:
   1750 	      return LMUL_1;
   1751 	    case 2:
   1752 	      return LMUL_2;
   1753 	    case 4:
   1754 	      return LMUL_4;
   1755 	    case 8:
   1756 	      return LMUL_8;
   1757 
   1758 	    default:
   1759 	      gcc_unreachable ();
   1760 	    }
   1761 	}
   1762     }
   1763   return mode_vtype_infos.vlmul[mode];
   1764 }
   1765 
   1766 /* Return the VLMAX rtx of vector mode MODE.  */
   1767 rtx
   1768 get_vlmax_rtx (machine_mode mode)
   1769 {
   1770   gcc_assert (riscv_v_ext_vector_mode_p (mode));
   1771   return gen_int_mode (GET_MODE_NUNITS (mode), Pmode);
   1772 }
   1773 
   1774 /* Return the NF value of the corresponding mode.  */
   1775 unsigned int
   1776 get_nf (machine_mode mode)
   1777 {
   1778   /* We don't allow non-tuple modes go through this function.  */
   1779   gcc_assert (riscv_v_ext_tuple_mode_p (mode));
   1780   return mode_vtype_infos.nf[mode];
   1781 }
   1782 
   1783 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode,
   1784    the subpart mode is RVVM2SImode. This will help to build
   1785    array/struct type in builtins.  */
   1786 machine_mode
   1787 get_subpart_mode (machine_mode mode)
   1788 {
   1789   /* We don't allow non-tuple modes go through this function.  */
   1790   gcc_assert (riscv_v_ext_tuple_mode_p (mode));
   1791   return mode_vtype_infos.subpart_mode[mode];
   1792 }
   1793 
   1794 /* Get ratio according to machine mode.  */
   1795 unsigned int
   1796 get_ratio (machine_mode mode)
   1797 {
   1798   if (riscv_v_ext_vls_mode_p (mode))
   1799     {
   1800       unsigned int sew = get_sew (mode);
   1801       vlmul_type vlmul = get_vlmul (mode);
   1802       switch (vlmul)
   1803 	{
   1804 	case LMUL_1:
   1805 	  return sew;
   1806 	case LMUL_2:
   1807 	  return sew / 2;
   1808 	case LMUL_4:
   1809 	  return sew / 4;
   1810 	case LMUL_8:
   1811 	  return sew / 8;
   1812 	case LMUL_F8:
   1813 	  return sew * 8;
   1814 	case LMUL_F4:
   1815 	  return sew * 4;
   1816 	case LMUL_F2:
   1817 	  return sew * 2;
   1818 
   1819 	default:
   1820 	  gcc_unreachable ();
   1821 	}
   1822     }
   1823   return mode_vtype_infos.ratio[mode];
   1824 }
   1825 
   1826 /* Get ta according to operand[tail_op_idx].  */
   1827 int
   1828 get_ta (rtx ta)
   1829 {
   1830   if (INTVAL (ta) == TAIL_ANY)
   1831     return INVALID_ATTRIBUTE;
   1832   return INTVAL (ta);
   1833 }
   1834 
   1835 /* Get ma according to operand[mask_op_idx].  */
   1836 int
   1837 get_ma (rtx ma)
   1838 {
   1839   if (INTVAL (ma) == MASK_ANY)
   1840     return INVALID_ATTRIBUTE;
   1841   return INTVAL (ma);
   1842 }
   1843 
   1844 /* Get prefer tail policy.  */
   1845 enum tail_policy
   1846 get_prefer_tail_policy ()
   1847 {
   1848   /* TODO: By default, we choose to use TAIL_ANY which allows
   1849      compiler pick up either agnostic or undisturbed. Maybe we
   1850      will have a compile option like -mprefer=agnostic to set
   1851      this value???.  */
   1852   return TAIL_ANY;
   1853 }
   1854 
   1855 /* Get prefer mask policy.  */
   1856 enum mask_policy
   1857 get_prefer_mask_policy ()
   1858 {
   1859   /* TODO: By default, we choose to use MASK_ANY which allows
   1860      compiler pick up either agnostic or undisturbed. Maybe we
   1861      will have a compile option like -mprefer=agnostic to set
   1862      this value???.  */
   1863   return MASK_ANY;
   1864 }
   1865 
   1866 /* Get avl_type rtx.  */
   1867 rtx
   1868 get_avl_type_rtx (enum avl_type type)
   1869 {
   1870   return gen_int_mode (type, Pmode);
   1871 }
   1872 
   1873 /* Return the appropriate mask mode for MODE.  */
   1874 
   1875 machine_mode
   1876 get_mask_mode (machine_mode mode)
   1877 {
   1878   poly_int64 nunits = GET_MODE_NUNITS (mode);
   1879   if (riscv_v_ext_tuple_mode_p (mode))
   1880     {
   1881       unsigned int nf = get_nf (mode);
   1882       nunits = exact_div (nunits, nf);
   1883     }
   1884   return get_vector_mode (BImode, nunits).require ();
   1885 }
   1886 
   1887 /* Return the appropriate M1 mode for MODE.  */
   1888 
   1889 static opt_machine_mode
   1890 get_m1_mode (machine_mode mode)
   1891 {
   1892   scalar_mode smode = GET_MODE_INNER (mode);
   1893   unsigned int bytes = GET_MODE_SIZE (smode);
   1894   poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes);
   1895   return get_vector_mode (smode, m1_nunits);
   1896 }
   1897 
   1898 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE.
   1899    This function is not only used by builtins, but also will be used by
   1900    auto-vectorization in the future.  */
   1901 opt_machine_mode
   1902 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits)
   1903 {
   1904   enum mode_class mclass;
   1905   if (inner_mode == E_BImode)
   1906     mclass = MODE_VECTOR_BOOL;
   1907   else if (FLOAT_MODE_P (inner_mode))
   1908     mclass = MODE_VECTOR_FLOAT;
   1909   else
   1910     mclass = MODE_VECTOR_INT;
   1911   machine_mode mode;
   1912   FOR_EACH_MODE_IN_CLASS (mode, mclass)
   1913     if (inner_mode == GET_MODE_INNER (mode)
   1914 	&& known_eq (nunits, GET_MODE_NUNITS (mode))
   1915 	&& (riscv_v_ext_vector_mode_p (mode)
   1916 	    || riscv_v_ext_vls_mode_p (mode)))
   1917       return mode;
   1918   return opt_machine_mode ();
   1919 }
   1920 
   1921 /* Return the RVV tuple mode if we can find the legal tuple mode for the
   1922    corresponding subpart mode and NF.  */
   1923 opt_machine_mode
   1924 get_tuple_mode (machine_mode subpart_mode, unsigned int nf)
   1925 {
   1926   poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf;
   1927   scalar_mode inner_mode = GET_MODE_INNER (subpart_mode);
   1928   enum mode_class mclass = GET_MODE_CLASS (subpart_mode);
   1929   machine_mode mode;
   1930   FOR_EACH_MODE_IN_CLASS (mode, mclass)
   1931     if (inner_mode == GET_MODE_INNER (mode)
   1932 	&& known_eq (nunits, GET_MODE_NUNITS (mode))
   1933 	&& riscv_v_ext_tuple_mode_p (mode)
   1934 	&& get_subpart_mode (mode) == subpart_mode)
   1935       return mode;
   1936   return opt_machine_mode ();
   1937 }
   1938 
   1939 bool
   1940 simm5_p (rtx x)
   1941 {
   1942   if (!CONST_INT_P (x))
   1943     return false;
   1944   return IN_RANGE (INTVAL (x), -16, 15);
   1945 }
   1946 
   1947 bool
   1948 neg_simm5_p (rtx x)
   1949 {
   1950   if (!CONST_INT_P (x))
   1951     return false;
   1952   return IN_RANGE (INTVAL (x), -15, 16);
   1953 }
   1954 
   1955 bool
   1956 has_vi_variant_p (rtx_code code, rtx x)
   1957 {
   1958   switch (code)
   1959     {
   1960     case PLUS:
   1961     case AND:
   1962     case IOR:
   1963     case XOR:
   1964     case SS_PLUS:
   1965     case US_PLUS:
   1966     case EQ:
   1967     case NE:
   1968     case LE:
   1969     case LEU:
   1970     case GT:
   1971     case GTU:
   1972       return simm5_p (x);
   1973 
   1974     case LT:
   1975     case LTU:
   1976     case GE:
   1977     case GEU:
   1978     case MINUS:
   1979     case SS_MINUS:
   1980       return neg_simm5_p (x);
   1981 
   1982     default:
   1983       return false;
   1984     }
   1985 }
   1986 
   1987 bool
   1988 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl,
   1989 		     machine_mode vector_mode, bool has_vi_variant_p,
   1990 		     void (*emit_vector_func) (rtx *, rtx), enum avl_type type)
   1991 {
   1992   machine_mode scalar_mode = GET_MODE_INNER (vector_mode);
   1993   if (has_vi_variant_p)
   1994     {
   1995       *scalar_op = force_reg (scalar_mode, *scalar_op);
   1996       return false;
   1997     }
   1998 
   1999   if (TARGET_64BIT)
   2000     {
   2001       if (!rtx_equal_p (*scalar_op, const0_rtx))
   2002 	*scalar_op = force_reg (scalar_mode, *scalar_op);
   2003       return false;
   2004     }
   2005 
   2006   if (immediate_operand (*scalar_op, Pmode))
   2007     {
   2008       if (!rtx_equal_p (*scalar_op, const0_rtx))
   2009 	*scalar_op = force_reg (Pmode, *scalar_op);
   2010 
   2011       *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op);
   2012       return false;
   2013     }
   2014 
   2015   if (CONST_INT_P (*scalar_op))
   2016     {
   2017       if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode)))
   2018 	*scalar_op = force_const_mem (scalar_mode, *scalar_op);
   2019       else
   2020 	*scalar_op = force_reg (scalar_mode, *scalar_op);
   2021     }
   2022 
   2023   rtx tmp = gen_reg_rtx (vector_mode);
   2024   rtx ops[] = {tmp, *scalar_op};
   2025   if (type == VLMAX)
   2026     emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops);
   2027   else
   2028     emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops,
   2029 			vl);
   2030   emit_vector_func (operands, tmp);
   2031 
   2032   return true;
   2033 }
   2034 
   2035 /* Get { ... ,0, 0, 0, ..., 0, 0, 0, 1 } mask.  */
   2036 rtx
   2037 gen_scalar_move_mask (machine_mode mode)
   2038 {
   2039   rtx_vector_builder builder (mode, 1, 2);
   2040   builder.quick_push (const1_rtx);
   2041   builder.quick_push (const0_rtx);
   2042   return builder.build ();
   2043 }
   2044 
   2045 static unsigned
   2046 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size)
   2047 {
   2048   // Original equation:
   2049   //   VLMAX = (VectorBits / EltSize) * LMUL
   2050   //   where LMUL = MinSize / TARGET_MIN_VLEN
   2051   // The following equations have been reordered to prevent loss of precision
   2052   // when calculating fractional LMUL.
   2053   return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN;
   2054 }
   2055 
   2056 static unsigned
   2057 get_unknown_min_value (machine_mode mode)
   2058 {
   2059   enum vlmul_type vlmul = get_vlmul (mode);
   2060   switch (vlmul)
   2061     {
   2062     case LMUL_1:
   2063       return TARGET_MIN_VLEN;
   2064     case LMUL_2:
   2065       return TARGET_MIN_VLEN * 2;
   2066     case LMUL_4:
   2067       return TARGET_MIN_VLEN * 4;
   2068     case LMUL_8:
   2069       return TARGET_MIN_VLEN * 8;
   2070     default:
   2071       gcc_unreachable ();
   2072     }
   2073 }
   2074 
   2075 static rtx
   2076 force_vector_length_operand (rtx vl)
   2077 {
   2078   if (CONST_INT_P (vl) && !satisfies_constraint_vl (vl))
   2079     return force_reg (Pmode, vl);
   2080   return vl;
   2081 }
   2082 
   2083 rtx
   2084 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl)
   2085 {
   2086   unsigned int sew = get_sew (vmode);
   2087   rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode);
   2088   rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode);
   2089   return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode),
   2090 				     gen_int_mode (get_vlmul (vmode), Pmode),
   2091 				     tail_policy, mask_policy);
   2092 }
   2093 
   2094 /* GET VL * 2 rtx.  */
   2095 static rtx
   2096 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode)
   2097 {
   2098   rtx i32vl = NULL_RTX;
   2099   if (CONST_INT_P (avl))
   2100     {
   2101       unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode));
   2102       unsigned min_size = get_unknown_min_value (mode);
   2103       unsigned vlen_max = RVV_65536;
   2104       unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size);
   2105       unsigned vlen_min = TARGET_MIN_VLEN;
   2106       unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size);
   2107 
   2108       unsigned HOST_WIDE_INT avl_int = INTVAL (avl);
   2109       if (avl_int <= vlmax_min)
   2110 	i32vl = gen_int_mode (2 * avl_int, Pmode);
   2111       else if (avl_int >= 2 * vlmax_max)
   2112 	{
   2113 	  // Just set i32vl to VLMAX in this situation
   2114 	  i32vl = gen_reg_rtx (Pmode);
   2115 	  emit_insn (
   2116 	    gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX));
   2117 	}
   2118       else
   2119 	{
   2120 	  // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl
   2121 	  // is related to the hardware implementation.
   2122 	  // So let the following code handle
   2123 	}
   2124     }
   2125   if (!i32vl)
   2126     {
   2127       // Using vsetvli instruction to get actually used length which related to
   2128       // the hardware implementation
   2129       rtx i64vl = gen_reg_rtx (Pmode);
   2130       emit_insn (
   2131 	gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl)));
   2132       // scale 2 for 32-bit length
   2133       i32vl = gen_reg_rtx (Pmode);
   2134       emit_insn (
   2135 	gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx)));
   2136     }
   2137 
   2138   return force_vector_length_operand (i32vl);
   2139 }
   2140 
   2141 bool
   2142 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode,
   2143 		     machine_mode demote_mask_mode, rtx *ops)
   2144 {
   2145   rtx scalar_op = ops[4];
   2146   rtx avl = ops[5];
   2147   machine_mode scalar_mode = GET_MODE_INNER (mode);
   2148   if (rtx_equal_p (scalar_op, const0_rtx))
   2149     {
   2150       ops[5] = force_vector_length_operand (ops[5]);
   2151       return false;
   2152     }
   2153 
   2154   if (TARGET_64BIT)
   2155     {
   2156       ops[4] = force_reg (scalar_mode, scalar_op);
   2157       ops[5] = force_vector_length_operand (ops[5]);
   2158       return false;
   2159     }
   2160 
   2161   if (immediate_operand (scalar_op, Pmode))
   2162     {
   2163       ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op));
   2164       ops[5] = force_vector_length_operand (ops[5]);
   2165       return false;
   2166     }
   2167 
   2168   if (CONST_INT_P (scalar_op))
   2169     scalar_op = force_reg (scalar_mode, scalar_op);
   2170 
   2171   rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode);
   2172 
   2173   rtx demote_scalar_op1, demote_scalar_op2;
   2174   if (unspec == UNSPEC_VSLIDE1UP)
   2175     {
   2176       demote_scalar_op1 = gen_highpart (Pmode, scalar_op);
   2177       demote_scalar_op2 = gen_lowpart (Pmode, scalar_op);
   2178     }
   2179   else
   2180     {
   2181       demote_scalar_op1 = gen_lowpart (Pmode, scalar_op);
   2182       demote_scalar_op2 = gen_highpart (Pmode, scalar_op);
   2183     }
   2184 
   2185   rtx temp = gen_reg_rtx (demote_mode);
   2186   rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode);
   2187   rtx ma = gen_int_mode (get_prefer_mask_policy (), Pmode);
   2188   rtx merge = RVV_VUNDEF (demote_mode);
   2189   /* Handle vslide1<ud>_tu.  */
   2190   if (register_operand (ops[2], mode)
   2191       && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))))
   2192     {
   2193       merge = gen_lowpart (demote_mode, ops[2]);
   2194       ta = ops[6];
   2195       ma = ops[7];
   2196     }
   2197 
   2198   emit_insn (gen_pred_slide (unspec, demote_mode, temp,
   2199 			     CONSTM1_RTX (demote_mask_mode), merge,
   2200 			     gen_lowpart (demote_mode, ops[3]),
   2201 			     demote_scalar_op1, vl_x2, ta, ma, ops[8]));
   2202   emit_insn (gen_pred_slide (unspec, demote_mode,
   2203 			     gen_lowpart (demote_mode, ops[0]),
   2204 			     CONSTM1_RTX (demote_mask_mode), merge, temp,
   2205 			     demote_scalar_op2, vl_x2, ta, ma, ops[8]));
   2206 
   2207   if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))
   2208       && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2]))))
   2209     emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1],
   2210 			       force_vector_length_operand (ops[5]), ops[6],
   2211 			       ops[8]));
   2212   return true;
   2213 }
   2214 
   2215 rtx
   2216 gen_avl_for_scalar_move (rtx avl)
   2217 {
   2218   /* AVL for scalar move has different behavior between 0 and large than 0.  */
   2219   if (CONST_INT_P (avl))
   2220     {
   2221       /* So we could just set AVL to 1 for any constant other than 0.  */
   2222       if (rtx_equal_p (avl, const0_rtx))
   2223 	return const0_rtx;
   2224       else
   2225 	return const1_rtx;
   2226     }
   2227   else
   2228     {
   2229       /* For non-constant value, we set any non zero value to 1 by
   2230 	 `sgtu new_avl,input_avl,zero` + `vsetvli`.  */
   2231       rtx tmp = gen_reg_rtx (Pmode);
   2232       emit_insn (
   2233 	gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx)));
   2234       return tmp;
   2235     }
   2236 }
   2237 
   2238 /* Expand tuple modes data movement for.  */
   2239 void
   2240 expand_tuple_move (rtx *ops)
   2241 {
   2242   unsigned int i;
   2243   machine_mode tuple_mode = GET_MODE (ops[0]);
   2244   machine_mode subpart_mode = get_subpart_mode (tuple_mode);
   2245   poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode);
   2246   unsigned int nf = get_nf (tuple_mode);
   2247   bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR);
   2248 
   2249   if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1]))
   2250     {
   2251       rtx val;
   2252       gcc_assert (can_create_pseudo_p ()
   2253 		  && const_vec_duplicate_p (ops[1], &val));
   2254       for (i = 0; i < nf; ++i)
   2255 	{
   2256 	  poly_int64 offset = i * subpart_size;
   2257 	  rtx subreg
   2258 	    = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
   2259 	  rtx dup = gen_const_vec_duplicate (subpart_mode, val);
   2260 	  emit_move_insn (subreg, dup);
   2261 	}
   2262     }
   2263   else if (REG_P (ops[0]) && REG_P (ops[1]))
   2264     {
   2265       for (i = 0; i < nf; ++i)
   2266 	{
   2267 	  int index = i;
   2268 
   2269 	  /* Take NF = 2 and LMUL = 1 for example:
   2270 
   2271 	      - move v8 to v9:
   2272 		 vmv1r v10,v9
   2273 		 vmv1r v9,v8
   2274 
   2275 	      - move v8 to v7:
   2276 		 vmv1r v7,v8
   2277 		 vmv1r v8,v9  */
   2278 	  if (REGNO (ops[0]) > REGNO (ops[1]))
   2279 	    index = nf - 1 - i;
   2280 	  poly_int64 offset = index * subpart_size;
   2281 	  rtx dst_subreg
   2282 	    = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset);
   2283 	  rtx src_subreg
   2284 	    = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset);
   2285 	  emit_insn (gen_rtx_SET (dst_subreg, src_subreg));
   2286 	}
   2287     }
   2288   else
   2289     {
   2290       /* Expand tuple memory data movement.  */
   2291       gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1]));
   2292       rtx offset = gen_int_mode (subpart_size, Pmode);
   2293       if (!subpart_size.is_constant ())
   2294 	{
   2295 	  emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode));
   2296 	  if (fractional_p)
   2297 	    {
   2298 	      unsigned int factor
   2299 		= exact_div (BYTES_PER_RISCV_VECTOR, subpart_size)
   2300 		    .to_constant ();
   2301 	      rtx pat
   2302 		= gen_rtx_ASHIFTRT (Pmode, ops[2],
   2303 				    gen_int_mode (exact_log2 (factor), Pmode));
   2304 	      emit_insn (gen_rtx_SET (ops[2], pat));
   2305 	    }
   2306 
   2307 	  if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR))
   2308 	    {
   2309 	      unsigned int factor
   2310 		= exact_div (subpart_size, BYTES_PER_RISCV_VECTOR)
   2311 		    .to_constant ();
   2312 	      rtx pat
   2313 		= gen_rtx_ASHIFT (Pmode, ops[2],
   2314 				  gen_int_mode (exact_log2 (factor), Pmode));
   2315 	      emit_insn (gen_rtx_SET (ops[2], pat));
   2316 	    }
   2317 	  offset = ops[2];
   2318 	}
   2319 
   2320       /* Non-fractional LMUL has whole register moves that don't require a
   2321 	 vsetvl for VLMAX.  */
   2322       if (fractional_p)
   2323 	emit_vlmax_vsetvl (subpart_mode, ops[4]);
   2324       if (MEM_P (ops[1]))
   2325 	{
   2326 	  /* Load operations.  */
   2327 	  emit_move_insn (ops[3], XEXP (ops[1], 0));
   2328 	  for (i = 0; i < nf; i++)
   2329 	    {
   2330 	      rtx subreg = simplify_gen_subreg (subpart_mode, ops[0],
   2331 						tuple_mode, i * subpart_size);
   2332 	      if (i != 0)
   2333 		{
   2334 		  rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
   2335 		  emit_insn (gen_rtx_SET (ops[3], new_addr));
   2336 		}
   2337 	      rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
   2338 
   2339 	      if (fractional_p)
   2340 		{
   2341 		  rtx operands[] = {subreg, mem};
   2342 		  emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
   2343 					UNARY_OP, operands, ops[4]);
   2344 		}
   2345 	      else
   2346 		emit_move_insn (subreg, mem);
   2347 	    }
   2348 	}
   2349       else
   2350 	{
   2351 	  /* Store operations.  */
   2352 	  emit_move_insn (ops[3], XEXP (ops[0], 0));
   2353 	  for (i = 0; i < nf; i++)
   2354 	    {
   2355 	      rtx subreg = simplify_gen_subreg (subpart_mode, ops[1],
   2356 						tuple_mode, i * subpart_size);
   2357 	      if (i != 0)
   2358 		{
   2359 		  rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset);
   2360 		  emit_insn (gen_rtx_SET (ops[3], new_addr));
   2361 		}
   2362 	      rtx mem = gen_rtx_MEM (subpart_mode, ops[3]);
   2363 
   2364 	      if (fractional_p)
   2365 		{
   2366 		  rtx operands[] = {mem, subreg};
   2367 		  emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode),
   2368 					UNARY_OP, operands, ops[4]);
   2369 		}
   2370 	      else
   2371 		emit_move_insn (mem, subreg);
   2372 	    }
   2373 	}
   2374     }
   2375 }
   2376 
   2377 /* Return the vectorization machine mode for RVV according to LMUL.  */
   2378 machine_mode
   2379 preferred_simd_mode (scalar_mode mode)
   2380 {
   2381   if (autovec_use_vlmax_p ())
   2382     {
   2383       /* We use LMUL = 1 as base bytesize which is BYTES_PER_RISCV_VECTOR and
   2384 	 rvv_max_lmul as multiply factor to calculate the NUNITS to
   2385 	 get the auto-vectorization mode.  */
   2386       poly_uint64 nunits;
   2387       poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
   2388       poly_uint64 scalar_size = GET_MODE_SIZE (mode);
   2389       /* Disable vectorization when we can't find a RVV mode for it.
   2390 	 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize
   2391 	 a double (DFmode) type.  */
   2392       if (!multiple_p (vector_size, scalar_size, &nunits))
   2393 	return word_mode;
   2394       machine_mode rvv_mode;
   2395       if (get_vector_mode (mode, nunits).exists (&rvv_mode))
   2396 	return rvv_mode;
   2397     }
   2398   return word_mode;
   2399 }
   2400 
   2401 /* Subroutine of riscv_vector_expand_vector_init.
   2402    Works as follows:
   2403    (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
   2404    (b) Skip leading elements from BUILDER, which are the same as
   2405        element NELTS_REQD - 1.
   2406    (c) Insert earlier elements in reverse order in TARGET using vslide1down.  */
   2407 
   2408 static void
   2409 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder,
   2410 				 int nelts_reqd)
   2411 {
   2412   machine_mode mode = GET_MODE (target);
   2413   rtx dup = expand_vector_broadcast (mode, builder.elt (0));
   2414   emit_move_insn (target, dup);
   2415   int ndups = builder.count_dups (0, nelts_reqd - 1, 1);
   2416   for (int i = ndups; i < nelts_reqd; i++)
   2417     {
   2418       unsigned int unspec
   2419 	= FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN;
   2420       insn_code icode = code_for_pred_slide (unspec, mode);
   2421       rtx ops[] = {target, target, builder.elt (i)};
   2422       emit_vlmax_insn (icode, BINARY_OP, ops);
   2423     }
   2424 }
   2425 
   2426 /* Use merge approach to initialize the vector with repeating sequence.
   2427    v = {a, b, a, b, a, b, a, b}.
   2428 
   2429    v = broadcast (a).
   2430    mask = 0b01010101....
   2431    v = merge (v, b, mask)
   2432 */
   2433 static void
   2434 expand_vector_init_merge_repeating_sequence (rtx target,
   2435 					     const rvv_builder &builder)
   2436 {
   2437   /* We can't use BIT mode (BI) directly to generate mask = 0b01010...
   2438      since we don't have such instruction in RVV.
   2439      Instead, we should use INT mode (QI/HI/SI/DI) with integer move
   2440      instruction to generate the mask data we want.  */
   2441   machine_mode mask_bit_mode = get_mask_mode (builder.mode ());
   2442   machine_mode mask_int_mode
   2443     = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode);
   2444   uint64_t full_nelts = builder.full_nelts ().to_constant ();
   2445 
   2446   /* Step 1: Broadcast the first pattern.  */
   2447   rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))};
   2448   emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()),
   2449 		    UNARY_OP, ops);
   2450   /* Step 2: Merge the rest iteration of pattern.  */
   2451   for (unsigned int i = 1; i < builder.npatterns (); i++)
   2452     {
   2453       /* Step 2-1: Generate mask register v0 for each merge.  */
   2454       rtx merge_mask
   2455 	= builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode));
   2456       rtx mask = gen_reg_rtx (mask_bit_mode);
   2457       rtx dup = gen_reg_rtx (mask_int_mode);
   2458 
   2459       if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x.  */
   2460 	{
   2461 	  rtx ops[] = {dup, merge_mask};
   2462 	  emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)),
   2463 			       SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode));
   2464 	}
   2465       else /* vmv.v.x.  */
   2466 	{
   2467 	  rtx ops[] = {dup,
   2468 		       force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)};
   2469 	  rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()),
   2470 				 Pmode);
   2471 	  emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP,
   2472 			       ops, vl);
   2473 	}
   2474 
   2475       emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup));
   2476 
   2477       /* Step 2-2: Merge pattern according to the mask.  */
   2478       rtx ops[] = {target, target, builder.elt (i), mask};
   2479       emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)),
   2480 			MERGE_OP, ops);
   2481     }
   2482 }
   2483 
   2484 /* Use slideup approach to combine the vectors.
   2485      v = {a, a, a, a, b, b, b, b}
   2486 
   2487    First:
   2488      v1 = {a, a, a, a, a, a, a, a}
   2489      v2 = {b, b, b, b, b, b, b, b}
   2490      v = slideup (v1, v2, nelt / 2)
   2491 */
   2492 static void
   2493 expand_vector_init_slideup_combine_sequence (rtx target,
   2494 					     const rvv_builder &builder)
   2495 {
   2496   machine_mode mode = GET_MODE (target);
   2497   int nelts = builder.full_nelts ().to_constant ();
   2498   rtx first_elt = builder.elt (0);
   2499   rtx last_elt = builder.elt (nelts - 1);
   2500   rtx low = expand_vector_broadcast (mode, first_elt);
   2501   rtx high = expand_vector_broadcast (mode, last_elt);
   2502   insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode);
   2503   rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)};
   2504   emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops);
   2505 }
   2506 
   2507 /* Use merge approach to merge a scalar into a vector.
   2508      v = {a, a, a, a, a, a, b, b}
   2509 
   2510      v1 = {a, a, a, a, a, a, a, a}
   2511      scalar = b
   2512      mask = {0, 0, 0, 0, 0, 0, 1, 1}
   2513 */
   2514 static void
   2515 expand_vector_init_merge_combine_sequence (rtx target,
   2516 					   const rvv_builder &builder)
   2517 {
   2518   machine_mode mode = GET_MODE (target);
   2519   machine_mode imode = builder.int_mode ();
   2520   machine_mode mmode = builder.mask_mode ();
   2521   int nelts = builder.full_nelts ().to_constant ();
   2522   int leading_ndups = builder.count_dups (0, nelts - 1, 1);
   2523   if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode)
   2524       || riscv_get_v_regno_alignment (imode) > 1)
   2525     imode = get_vector_mode (HImode, nelts).require ();
   2526 
   2527   /* Generate vid = { 0, 1, 2, ..., n }.  */
   2528   rtx vid = gen_reg_rtx (imode);
   2529   expand_vec_series (vid, const0_rtx, const1_rtx);
   2530 
   2531   /* Generate mask.  */
   2532   rtx mask = gen_reg_rtx (mmode);
   2533   insn_code icode = code_for_pred_cmp_scalar (imode);
   2534   rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ());
   2535   rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index);
   2536   /* vmsgtu.vi/vmsgtu.vx.  */
   2537   rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx);
   2538   rtx sel = builder.elt (nelts - 1);
   2539   rtx mask_ops[] = {mask, cmp, vid, index};
   2540   emit_vlmax_insn (icode, COMPARE_OP, mask_ops);
   2541 
   2542   /* Duplicate the first elements.  */
   2543   rtx dup = expand_vector_broadcast (mode, builder.elt (0));
   2544   /* Merge scalar into vector according to mask.  */
   2545   rtx merge_ops[] = {target, dup, sel, mask};
   2546   icode = code_for_pred_merge_scalar (mode);
   2547   emit_vlmax_insn (icode, MERGE_OP, merge_ops);
   2548 }
   2549 
   2550 /* Subroutine of expand_vec_init to handle case
   2551    when all trailing elements of builder are same.
   2552    This works as follows:
   2553    (a) Use expand_insn interface to broadcast last vector element in TARGET.
   2554    (b) Insert remaining elements in TARGET using insr.
   2555 
   2556    ??? The heuristic used is to do above if number of same trailing elements
   2557    is greater than leading_ndups, loosely based on
   2558    heuristic from mostly_zeros_p.  May need fine-tuning.  */
   2559 
   2560 static bool
   2561 expand_vector_init_trailing_same_elem (rtx target,
   2562 				       const rtx_vector_builder &builder,
   2563 				       int nelts_reqd)
   2564 {
   2565   int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
   2566   int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
   2567   machine_mode mode = GET_MODE (target);
   2568 
   2569   if (trailing_ndups > leading_ndups)
   2570     {
   2571       rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
   2572       for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
   2573 	{
   2574 	  unsigned int unspec
   2575 	    = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
   2576 	  insn_code icode = code_for_pred_slide (unspec, mode);
   2577 	  rtx tmp = gen_reg_rtx (mode);
   2578 	  rtx ops[] = {tmp, dup, builder.elt (i)};
   2579 	  emit_vlmax_insn (icode, BINARY_OP, ops);
   2580 	  /* slide1up need source and dest to be different REG.  */
   2581 	  dup = tmp;
   2582 	}
   2583 
   2584       emit_move_insn (target, dup);
   2585       return true;
   2586     }
   2587 
   2588   return false;
   2589 }
   2590 
   2591 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
   2592 
   2593 void
   2594 expand_vec_init (rtx target, rtx vals)
   2595 {
   2596   machine_mode mode = GET_MODE (target);
   2597   int nelts = XVECLEN (vals, 0);
   2598 
   2599   rvv_builder v (mode, nelts, 1);
   2600   for (int i = 0; i < nelts; i++)
   2601     v.quick_push (XVECEXP (vals, 0, i));
   2602   v.finalize ();
   2603 
   2604   /* If the sequence is v = { a, a, a, a } just broadcast an element.  */
   2605   if (v.is_repeating_sequence ())
   2606     {
   2607       machine_mode mode = GET_MODE (target);
   2608       rtx dup = expand_vector_broadcast (mode, v.elt (0));
   2609       emit_move_insn (target, dup);
   2610       return;
   2611     }
   2612 
   2613   if (nelts > 3)
   2614     {
   2615       /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }.  */
   2616       if (v.can_duplicate_repeating_sequence_p ())
   2617 	{
   2618 	  rtx ele = v.get_merged_repeating_sequence ();
   2619 	  rtx dup = expand_vector_broadcast (v.new_mode (), ele);
   2620 	  emit_move_insn (target, gen_lowpart (mode, dup));
   2621 	  return;
   2622 	}
   2623 
   2624       /* Case 2: Optimize repeating sequence cases that Case 1 can
   2625 	 not handle and it is profitable.  For example:
   2626 	 ELEMENT BITSIZE = 64.
   2627 	 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}.
   2628 	 We can't find a vector mode for "ab" which will be combined into
   2629 	 128-bit element to duplicate.  */
   2630       if (v.repeating_sequence_use_merge_profitable_p ())
   2631 	{
   2632 	  expand_vector_init_merge_repeating_sequence (target, v);
   2633 	  return;
   2634 	}
   2635 
   2636       /* Case 3: Optimize combine sequence.
   2637 	 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}.
   2638 	 We can combine:
   2639 	   v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
   2640 	 and
   2641 	   v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}.
   2642 	 by slideup.  */
   2643       if (v.combine_sequence_use_slideup_profitable_p ())
   2644 	{
   2645 	  expand_vector_init_slideup_combine_sequence (target, v);
   2646 	  return;
   2647 	}
   2648 
   2649       /* Case 4: Optimize combine sequence.
   2650 	 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.
   2651 
   2652 	 Generate vector:
   2653 	   v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}.
   2654 
   2655 	 Generate mask:
   2656 	   mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}.
   2657 
   2658 	 Merge b into v by mask:
   2659 	   v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}.  */
   2660       if (v.combine_sequence_use_merge_profitable_p ())
   2661 	{
   2662 	  expand_vector_init_merge_combine_sequence (target, v);
   2663 	  return;
   2664 	}
   2665     }
   2666 
   2667   /* Optimize trailing same elements sequence:
   2668       v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x};  */
   2669   if (!expand_vector_init_trailing_same_elem (target, v, nelts))
   2670     /* Handle common situation by vslide1down. This function can handle any
   2671        situation of vec_init<mode>. Only the cases that are not optimized above
   2672        will fall through here.  */
   2673     expand_vector_init_insert_elems (target, v, nelts);
   2674 }
   2675 
   2676 /* Get insn code for corresponding comparison.  */
   2677 
   2678 static insn_code
   2679 get_cmp_insn_code (rtx_code code, machine_mode mode)
   2680 {
   2681   insn_code icode;
   2682   switch (code)
   2683     {
   2684     case EQ:
   2685     case NE:
   2686     case LE:
   2687     case LEU:
   2688     case GT:
   2689     case GTU:
   2690     case LTGT:
   2691       icode = code_for_pred_cmp (mode);
   2692       break;
   2693     case LT:
   2694     case LTU:
   2695     case GE:
   2696     case GEU:
   2697       if (FLOAT_MODE_P (mode))
   2698 	icode = code_for_pred_cmp (mode);
   2699       else
   2700 	icode = code_for_pred_ltge (mode);
   2701       break;
   2702     default:
   2703       gcc_unreachable ();
   2704     }
   2705   return icode;
   2706 }
   2707 
   2708 /* This hook gives the vectorizer more vector mode options.  We want it to not
   2709    only try modes with the maximum number of units a full vector can hold but
   2710    for example also half the number of units for a smaller elements size.
   2711    Such vectors can be promoted to a full vector of widened elements
   2712    (still with the same number of elements, essentially vectorizing at a
   2713    fixed number of units rather than a fixed number of bytes).  */
   2714 unsigned int
   2715 autovectorize_vector_modes (vector_modes *modes, bool)
   2716 {
   2717   if (autovec_use_vlmax_p ())
   2718     {
   2719       poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
   2720 
   2721       /* Start with a RVV<LMUL>QImode where LMUL is the number of units that
   2722 	 fit a whole vector.
   2723 	 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which
   2724 	 is guided by the extensions we have available (vf2, vf4 and vf8).
   2725 
   2726 	 - full_size: Try using full vectors for all element types.
   2727 	 - full_size / 2:
   2728 	   Try using 16-bit containers for 8-bit elements and full vectors
   2729 	   for wider elements.
   2730 	 - full_size / 4:
   2731 	   Try using 32-bit containers for 8-bit and 16-bit elements and
   2732 	   full vectors for wider elements.
   2733 	 - full_size / 8:
   2734 	   Try using 64-bit containers for all element types.  */
   2735       static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64};
   2736       for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++)
   2737 	{
   2738 	  poly_uint64 units;
   2739 	  machine_mode mode;
   2740 	  if (can_div_trunc_p (full_size, rvv_factors[i], &units)
   2741 	      && get_vector_mode (QImode, units).exists (&mode))
   2742 	    modes->safe_push (mode);
   2743 	}
   2744     }
   2745     /* Push all VLSmodes according to TARGET_MIN_VLEN.  */
   2746     unsigned int i = 0;
   2747     unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8;
   2748     unsigned int size = base_size;
   2749     machine_mode mode;
   2750     while (size > 0 && get_vector_mode (QImode, size).exists (&mode))
   2751      {
   2752 	if (vls_mode_valid_p (mode))
   2753 	  modes->safe_push (mode);
   2754 
   2755 	i++;
   2756 	size = base_size / (1U << i);
   2757      }
   2758   /* Enable LOOP_VINFO comparison in COST model.  */
   2759   return VECT_COMPARE_COSTS;
   2760 }
   2761 
   2762 /* Return true if we can find the related MODE according to default LMUL. */
   2763 static bool
   2764 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode,
   2765 			 poly_uint64 *nunits)
   2766 {
   2767   if (!autovec_use_vlmax_p ())
   2768     return false;
   2769   if (riscv_v_ext_vector_mode_p (vector_mode)
   2770       && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
   2771 		     GET_MODE_SIZE (element_mode), nunits))
   2772     return true;
   2773   if (riscv_v_ext_vls_mode_p (vector_mode)
   2774       && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL,
   2775 		     GET_MODE_SIZE (element_mode), nunits))
   2776     return true;
   2777   return false;
   2778 }
   2779 
   2780 /* If the given VECTOR_MODE is an RVV mode,  first get the largest number
   2781    of units that fit into a full vector at the given ELEMENT_MODE.
   2782    We will have the vectorizer call us with a successively decreasing
   2783    number of units (as specified in autovectorize_vector_modes).
   2784    The starting mode is always the one specified by preferred_simd_mode. */
   2785 opt_machine_mode
   2786 vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
   2787 			poly_uint64 nunits)
   2788 {
   2789   /* TODO: We will support RVV VLS auto-vectorization mode in the future. */
   2790   poly_uint64 min_units;
   2791   if (can_find_related_mode_p (vector_mode, element_mode, &min_units))
   2792     {
   2793       machine_mode rvv_mode;
   2794       if (maybe_ne (nunits, 0U))
   2795 	{
   2796 	  /* If we were given a number of units NUNITS, try to find an
   2797 	     RVV vector mode of inner mode ELEMENT_MODE with the same
   2798 	     number of units.  */
   2799 	  if (multiple_p (min_units, nunits)
   2800 	      && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
   2801 	    return rvv_mode;
   2802 	}
   2803       else
   2804 	{
   2805 	  /* Look for a vector mode with the same number of units as the
   2806 	     VECTOR_MODE we were given.  We keep track of the minimum
   2807 	     number of units so far which determines the smallest necessary
   2808 	     but largest possible, suitable mode for vectorization.  */
   2809 	  min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
   2810 	  if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
   2811 	    return rvv_mode;
   2812 	}
   2813     }
   2814 
   2815   return default_vectorize_related_mode (vector_mode, element_mode, nunits);
   2816 }
   2817 
   2818 /* Expand an RVV comparison.  */
   2819 
   2820 void
   2821 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1, rtx mask,
   2822 		rtx maskoff)
   2823 {
   2824   machine_mode mask_mode = GET_MODE (target);
   2825   machine_mode data_mode = GET_MODE (op0);
   2826   insn_code icode = get_cmp_insn_code (code, data_mode);
   2827 
   2828   if (code == LTGT)
   2829     {
   2830       rtx lt = gen_reg_rtx (mask_mode);
   2831       rtx gt = gen_reg_rtx (mask_mode);
   2832       expand_vec_cmp (lt, LT, op0, op1, mask, maskoff);
   2833       expand_vec_cmp (gt, GT, op0, op1, mask, maskoff);
   2834       icode = code_for_pred (IOR, mask_mode);
   2835       rtx ops[] = {target, lt, gt};
   2836       emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
   2837       return;
   2838     }
   2839 
   2840   rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1);
   2841   if (!mask && !maskoff)
   2842     {
   2843       rtx ops[] = {target, cmp, op0, op1};
   2844       emit_vlmax_insn (icode, COMPARE_OP, ops);
   2845     }
   2846   else
   2847     {
   2848       rtx ops[] = {target, mask, maskoff, cmp, op0, op1};
   2849       emit_vlmax_insn (icode, COMPARE_OP_MU, ops);
   2850     }
   2851 }
   2852 
   2853 /* Expand an RVV floating-point comparison:
   2854 
   2855    If CAN_INVERT_P is true, the caller can also handle inverted results;
   2856    return true if the result is in fact inverted.  */
   2857 
   2858 bool
   2859 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1,
   2860 		      bool can_invert_p)
   2861 {
   2862   machine_mode mask_mode = GET_MODE (target);
   2863   machine_mode data_mode = GET_MODE (op0);
   2864 
   2865   /* If can_invert_p = true:
   2866      It suffices to implement a u>= b as !(a < b) but with the NaNs masked off:
   2867 
   2868        vmfeq.vv    v0, va, va
   2869        vmfeq.vv    v1, vb, vb
   2870        vmand.mm    v0, v0, v1
   2871        vmflt.vv    v0, va, vb, v0.t
   2872        vmnot.m     v0, v0
   2873 
   2874      And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the
   2875      second vmfeq.vv:
   2876 
   2877        vmfeq.vv    v0, va, va
   2878        vmfeq.vv    v0, vb, vb, v0.t
   2879        vmflt.vv    v0, va, vb, v0.t
   2880        vmnot.m     v0, v0
   2881 
   2882      If can_invert_p = false:
   2883 
   2884        # Example of implementing isgreater()
   2885        vmfeq.vv v0, va, va        # Only set where A is not NaN.
   2886        vmfeq.vv v1, vb, vb        # Only set where B is not NaN.
   2887        vmand.mm v0, v0, v1        # Only set where A and B are ordered,
   2888        vmfgt.vv v0, va, vb, v0.t  #  so only set flags on ordered values.
   2889   */
   2890 
   2891   rtx eq0 = gen_reg_rtx (mask_mode);
   2892   rtx eq1 = gen_reg_rtx (mask_mode);
   2893   switch (code)
   2894     {
   2895     case EQ:
   2896     case NE:
   2897     case LT:
   2898     case LE:
   2899     case GT:
   2900     case GE:
   2901     case LTGT:
   2902       /* There is native support for the comparison.  */
   2903       expand_vec_cmp (target, code, op0, op1);
   2904       return false;
   2905     case UNEQ:
   2906     case ORDERED:
   2907     case UNORDERED:
   2908     case UNLT:
   2909     case UNLE:
   2910     case UNGT:
   2911     case UNGE:
   2912       /* vmfeq.vv v0, va, va  */
   2913       expand_vec_cmp (eq0, EQ, op0, op0);
   2914       if (HONOR_SNANS (data_mode))
   2915 	{
   2916 	  /*
   2917 	     vmfeq.vv    v1, vb, vb
   2918 	     vmand.mm    v0, v0, v1
   2919 	  */
   2920 	  expand_vec_cmp (eq1, EQ, op1, op1);
   2921 	  insn_code icode = code_for_pred (AND, mask_mode);
   2922 	  rtx ops[] = {eq0, eq0, eq1};
   2923 	  emit_vlmax_insn (icode, BINARY_MASK_OP, ops);
   2924 	}
   2925       else
   2926 	{
   2927 	  /* vmfeq.vv    v0, vb, vb, v0.t  */
   2928 	  expand_vec_cmp (eq0, EQ, op1, op1, eq0, eq0);
   2929 	}
   2930       break;
   2931     default:
   2932       gcc_unreachable ();
   2933     }
   2934 
   2935   if (code == ORDERED)
   2936     {
   2937       emit_move_insn (target, eq0);
   2938       return false;
   2939     }
   2940 
   2941   /* There is native support for the inverse comparison.  */
   2942   code = reverse_condition_maybe_unordered (code);
   2943   if (code == ORDERED)
   2944     emit_move_insn (target, eq0);
   2945   else
   2946     expand_vec_cmp (eq0, code, op0, op1, eq0, eq0);
   2947 
   2948   if (can_invert_p)
   2949     {
   2950       emit_move_insn (target, eq0);
   2951       return true;
   2952     }
   2953 
   2954   /* We use one_cmpl<mode>2 to make Combine PASS to combine mask instructions
   2955      into: vmand.mm/vmnor.mm/vmnand.mm/vmnor.mm/vmxnor.mm.  */
   2956   emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0)));
   2957   return false;
   2958 }
   2959 
   2960 /* Modulo all SEL indices to ensure they are all in range if [0, MAX_SEL].
   2961    MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is
   2962    2 * nunits - 1.  */
   2963 static rtx
   2964 modulo_sel_indices (rtx op0, rtx op1, rtx sel)
   2965 {
   2966   rtx sel_mod;
   2967   machine_mode sel_mode = GET_MODE (sel);
   2968   poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
   2969   poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1;
   2970   /* If SEL is variable-length CONST_VECTOR, we don't need to modulo it.
   2971      Or if SEL is constant-length within [0, MAX_SEL], no need to modulo the
   2972      indice.  */
   2973   if (CONST_VECTOR_P (sel)
   2974       && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel)))
   2975     sel_mod = sel;
   2976   else
   2977     {
   2978       rtx mod = gen_const_vector_dup (sel_mode, max_sel);
   2979       sel_mod
   2980 	= expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT);
   2981     }
   2982   return sel_mod;
   2983 }
   2984 
   2985 /* Implement vec_perm<mode>.  */
   2986 
   2987 void
   2988 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
   2989 {
   2990   machine_mode data_mode = GET_MODE (target);
   2991   machine_mode sel_mode = GET_MODE (sel);
   2992   poly_uint64 nunits = GET_MODE_NUNITS (sel_mode);
   2993 
   2994   /* Check if the sel only references the first values vector. If each select
   2995      index is in range of [0, nunits - 1]. A single vrgather instructions is
   2996      enough. Since we will use vrgatherei16.vv for variable-length vector,
   2997      it is never out of range and we don't need to modulo the index.  */
   2998   if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1))
   2999     {
   3000       emit_vlmax_gather_insn (target, op0, sel);
   3001       return;
   3002     }
   3003 
   3004   /* Check if all the indices are same.  */
   3005   rtx elt;
   3006   if (const_vec_duplicate_p (sel, &elt))
   3007     {
   3008       poly_uint64 value = rtx_to_poly_int64 (elt);
   3009       rtx op = op0;
   3010       if (maybe_gt (value, nunits - 1))
   3011 	{
   3012 	  sel = gen_const_vector_dup (sel_mode, value - nunits);
   3013 	  op = op1;
   3014 	}
   3015       emit_vlmax_gather_insn (target, op, sel);
   3016     }
   3017 
   3018   /* Note: vec_perm indices are supposed to wrap when they go beyond the
   3019      size of the two value vectors, i.e. the upper bits of the indices
   3020      are effectively ignored.  RVV vrgather instead produces 0 for any
   3021      out-of-range indices, so we need to modulo all the vec_perm indices
   3022      to ensure they are all in range of [0, nunits - 1] when op0 == op1
   3023      or all in range of [0, 2 * nunits - 1] when op0 != op1.  */
   3024   rtx sel_mod = modulo_sel_indices (op0, op1, sel);
   3025 
   3026   /* Check if the two values vectors are the same.  */
   3027   if (rtx_equal_p (op0, op1))
   3028     {
   3029       emit_vlmax_gather_insn (target, op0, sel_mod);
   3030       return;
   3031     }
   3032 
   3033   /* This following sequence is handling the case that:
   3034      __builtin_shufflevector (vec1, vec2, index...), the index can be any
   3035      value in range of [0, 2 * nunits - 1].  */
   3036   machine_mode mask_mode;
   3037   mask_mode = get_mask_mode (data_mode);
   3038   rtx mask = gen_reg_rtx (mask_mode);
   3039   rtx max_sel = gen_const_vector_dup (sel_mode, nunits);
   3040 
   3041   /* Step 1: generate a mask that should select everything >= nunits into the
   3042    * mask.  */
   3043   expand_vec_cmp (mask, GEU, sel_mod, max_sel);
   3044 
   3045   /* Step2: gather every op0 values indexed by sel into target,
   3046 	    we don't need to care about the result of the element
   3047 	    whose index >= nunits.  */
   3048   emit_vlmax_gather_insn (target, op0, sel_mod);
   3049 
   3050   /* Step3: shift the range from (nunits, max_of_mode] to
   3051 	    [0, max_of_mode - nunits].  */
   3052   rtx tmp = gen_reg_rtx (sel_mode);
   3053   rtx ops[] = {tmp, sel_mod, max_sel};
   3054   emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops);
   3055 
   3056   /* Step4: gather those into the previously masked-out elements
   3057 	    of target.  */
   3058   emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask);
   3059 }
   3060 
   3061 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV.  */
   3062 
   3063 /* vec_perm support.  */
   3064 
   3065 struct expand_vec_perm_d
   3066 {
   3067   rtx target, op0, op1;
   3068   vec_perm_indices perm;
   3069   machine_mode vmode;
   3070   machine_mode op_mode;
   3071   bool one_vector_p;
   3072   bool testing_p;
   3073 };
   3074 
   3075 /* Return the appropriate index mode for gather instructions.  */
   3076 opt_machine_mode
   3077 get_gather_index_mode (struct expand_vec_perm_d *d)
   3078 {
   3079   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
   3080   poly_uint64 nunits = GET_MODE_NUNITS (d->vmode);
   3081 
   3082   if (GET_MODE_INNER (d->vmode) == QImode)
   3083     {
   3084       if (nunits.is_constant ())
   3085 	{
   3086 	  /* If indice is LMUL8 CONST_VECTOR and any element value
   3087 	     exceed the range of 0 ~ 255, Forbid such permutation
   3088 	     since we need vector HI mode to hold such indice and
   3089 	     we don't have it.  */
   3090 	  if (!d->perm.all_in_range_p (0, 255)
   3091 	      && !get_vector_mode (HImode, nunits).exists (&sel_mode))
   3092 	    return opt_machine_mode ();
   3093 	}
   3094       else
   3095 	{
   3096 	  /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv.
   3097 	     Otherwise, it could overflow the index range.  */
   3098 	  if (!get_vector_mode (HImode, nunits).exists (&sel_mode))
   3099 	    return opt_machine_mode ();
   3100 	}
   3101     }
   3102   else if (riscv_get_v_regno_alignment (sel_mode) > 1
   3103 	   && GET_MODE_INNER (sel_mode) != HImode)
   3104     sel_mode = get_vector_mode (HImode, nunits).require ();
   3105   return sel_mode;
   3106 }
   3107 
   3108 /* Recognize the patterns that we can use merge operation to shuffle the
   3109    vectors. The value of Each element (index i) in selector can only be
   3110    either i or nunits + i.  We will check the pattern is actually monotonic.
   3111 
   3112    E.g.
   3113    v = VEC_PERM_EXPR (v0, v1, selector),
   3114    selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ...  }
   3115 
   3116    We can transform such pattern into:
   3117 
   3118    v = vcond_mask (v0, v1, mask),
   3119    mask = { 0, 1, 0, 1, 0, 1, ... }.  */
   3120 
   3121 static bool
   3122 shuffle_merge_patterns (struct expand_vec_perm_d *d)
   3123 {
   3124   machine_mode vmode = d->vmode;
   3125   machine_mode sel_mode = related_int_vector_mode (vmode).require ();
   3126   int n_patterns = d->perm.encoding ().npatterns ();
   3127   poly_int64 vec_len = d->perm.length ();
   3128 
   3129   for (int i = 0; i < n_patterns; ++i)
   3130     if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i))
   3131       return false;
   3132 
   3133   /* Check the pattern is monotonic here, otherwise, return false.  */
   3134   for (int i = n_patterns; i < n_patterns * 2; i++)
   3135     if (!d->perm.series_p (i, n_patterns, i, n_patterns)
   3136 	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
   3137       return false;
   3138 
   3139   /* We need to use precomputed mask for such situation and such mask
   3140      can only be computed in compile-time known size modes.  */
   3141   bool indices_fit_selector_p
   3142     = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256);
   3143   if (!indices_fit_selector_p && !vec_len.is_constant ())
   3144     return false;
   3145 
   3146   if (d->testing_p)
   3147     return true;
   3148 
   3149   machine_mode mask_mode = get_mask_mode (vmode);
   3150   rtx mask = gen_reg_rtx (mask_mode);
   3151 
   3152   if (indices_fit_selector_p)
   3153     {
   3154       /* MASK = SELECTOR < NUNTIS ? 1 : 0.  */
   3155       rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
   3156       rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode));
   3157       insn_code icode = code_for_pred_cmp_scalar (sel_mode);
   3158       rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x);
   3159       rtx ops[] = {mask, cmp, sel, x};
   3160       emit_vlmax_insn (icode, COMPARE_OP, ops);
   3161     }
   3162   else
   3163     {
   3164       /* For EEW8 and NUNITS may be larger than 255, we can't use vmsltu
   3165 	 directly to generate the selector mask, instead, we can only use
   3166 	 precomputed mask.
   3167 
   3168 	 E.g. selector = <0, 257, 2, 259> for EEW8 vector with NUNITS = 256, we
   3169 	 don't have a QImode scalar register to hold larger than 255.
   3170 	 We also cannot hold that in a vector QImode register if LMUL = 8, and,
   3171 	 since there is no larger HI mode vector we cannot create a larger
   3172 	 selector.
   3173 
   3174 	 As the mask is a simple {0, 1, ...} pattern and the length is known we
   3175 	 can store it in a scalar register and broadcast it to a mask register.
   3176        */
   3177       gcc_assert (vec_len.is_constant ());
   3178       int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8);
   3179       machine_mode mode = get_vector_mode (QImode, size).require ();
   3180       rtx tmp = gen_reg_rtx (mode);
   3181       rvv_builder v (mode, 1, size);
   3182       for (int i = 0; i < vec_len.to_constant () / 8; i++)
   3183 	{
   3184 	  uint8_t value = 0;
   3185 	  for (int j = 0; j < 8; j++)
   3186 	    {
   3187 	      int index = i * 8 + j;
   3188 	      if (known_lt (d->perm[index], 256))
   3189 		value |= 1 << j;
   3190 	    }
   3191 	  v.quick_push (gen_int_mode (value, QImode));
   3192 	}
   3193       emit_move_insn (tmp, v.build ());
   3194       emit_move_insn (mask, gen_lowpart (mask_mode, tmp));
   3195     }
   3196 
   3197   /* TARGET = MASK ? OP0 : OP1.  */
   3198   /* swap op0 and op1 since the order is opposite to pred_merge.  */
   3199   rtx ops2[] = {d->target, d->op1, d->op0, mask};
   3200   emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2);
   3201   return true;
   3202 }
   3203 
   3204 /* Recognize the consecutive index that we can use a single
   3205    vrgather.v[x|i] to shuffle the vectors.
   3206 
   3207    e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}>
   3208    Use SEW = 32, index = 1 vrgather.vi to get the result.  */
   3209 static bool
   3210 shuffle_consecutive_patterns (struct expand_vec_perm_d *d)
   3211 {
   3212   machine_mode vmode = d->vmode;
   3213   scalar_mode smode = GET_MODE_INNER (vmode);
   3214   poly_int64 vec_len = d->perm.length ();
   3215   HOST_WIDE_INT elt;
   3216 
   3217   if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt))
   3218     return false;
   3219   int vlen = vec_len.to_constant ();
   3220 
   3221   /* Compute the last element index of consecutive pattern from the leading
   3222      consecutive elements.  */
   3223   int last_consecutive_idx = -1;
   3224   int consecutive_num = -1;
   3225   for (int i = 1; i < vlen; i++)
   3226     {
   3227       if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
   3228 	break;
   3229       last_consecutive_idx = i;
   3230       consecutive_num = last_consecutive_idx + 1;
   3231     }
   3232 
   3233   int new_vlen = vlen / consecutive_num;
   3234   if (last_consecutive_idx < 0 || consecutive_num == vlen
   3235       || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen))
   3236     return false;
   3237   /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>.
   3238      All elements of index, index + 1, ... index + consecutive_num - 1 should
   3239      locate at the same vector.  */
   3240   if (maybe_ge (d->perm[0], vec_len)
   3241       != maybe_ge (d->perm[last_consecutive_idx], vec_len))
   3242     return false;
   3243   /* If a vector has 8 elements.  We allow optimizations on consecutive
   3244      patterns e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>.
   3245      Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns
   3246      to be optimized.  */
   3247   if (d->perm[0].to_constant () % consecutive_num != 0)
   3248     return false;
   3249   unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode);
   3250   if (container_bits > 64)
   3251     return false;
   3252   else if (container_bits == 64)
   3253     {
   3254       if (!TARGET_VECTOR_ELEN_64)
   3255 	return false;
   3256       else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64)
   3257 	return false;
   3258     }
   3259 
   3260   /* Check the rest of elements are the same consecutive pattern.  */
   3261   for (int i = consecutive_num; i < vlen; i++)
   3262     if (maybe_ne (d->perm[i], d->perm[i % consecutive_num]))
   3263       return false;
   3264 
   3265   if (FLOAT_MODE_P (smode))
   3266     smode = float_mode_for_size (container_bits).require ();
   3267   else
   3268     smode = int_mode_for_size (container_bits, 0).require ();
   3269   if (!get_vector_mode (smode, new_vlen).exists (&vmode))
   3270     return false;
   3271   machine_mode sel_mode = related_int_vector_mode (vmode).require ();
   3272 
   3273   /* Success! */
   3274   if (d->testing_p)
   3275     return true;
   3276 
   3277   int index = elt / consecutive_num;
   3278   if (index >= new_vlen)
   3279     index = index - new_vlen;
   3280   rtx sel = gen_const_vector_dup (sel_mode, index);
   3281   rtx op = elt >= vlen ? d->op0 : d->op1;
   3282   emit_vlmax_gather_insn (gen_lowpart (vmode, d->target),
   3283 			  gen_lowpart (vmode, op), sel);
   3284   return true;
   3285 }
   3286 
   3287 /* Recognize the patterns that we can use compress operation to shuffle the
   3288    vectors. The perm selector of compress pattern is divided into 2 part:
   3289    The first part is the random index number < NUNITS.
   3290    The second part is consecutive last N index number >= NUNITS.
   3291 
   3292    E.g.
   3293    v = VEC_PERM_EXPR (v0, v1, selector),
   3294    selector = { 0, 2, 6, 7 }
   3295 
   3296    We can transform such pattern into:
   3297 
   3298    op1 = vcompress (op0, mask)
   3299    mask = { 1, 0, 1, 0 }
   3300    v = op1.  */
   3301 
   3302 static bool
   3303 shuffle_compress_patterns (struct expand_vec_perm_d *d)
   3304 {
   3305   machine_mode vmode = d->vmode;
   3306   poly_int64 vec_len = d->perm.length ();
   3307 
   3308   if (!vec_len.is_constant ())
   3309     return false;
   3310 
   3311   int vlen = vec_len.to_constant ();
   3312 
   3313   /* It's not worthwhile the compress pattern has elemenets < 4
   3314      and we can't modulo indices for compress pattern.  */
   3315   if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4)
   3316     return false;
   3317 
   3318   /* Compress pattern doesn't work for one vector.  */
   3319   if (d->one_vector_p)
   3320     return false;
   3321 
   3322   /* Compress point is the point that all elements value with index i >=
   3323      compress point of the selector are all consecutive series increasing and
   3324      each selector value >= NUNTIS. In this case, we could compress all elements
   3325      of i < compress point into the op1.  */
   3326   int compress_point = -1;
   3327   for (int i = 0; i < vlen; i++)
   3328     {
   3329       if (compress_point < 0 && known_ge (d->perm[i], vec_len))
   3330 	{
   3331 	  compress_point = i;
   3332 	  break;
   3333 	}
   3334     }
   3335 
   3336   /* We don't apply compress approach if we can't find the compress point.  */
   3337   if (compress_point < 0)
   3338     return false;
   3339 
   3340   /* We can only apply compress approach when all index values from 0 to
   3341      compress point are increasing.  */
   3342   for (int i = 1; i < compress_point; i++)
   3343     if (maybe_le (d->perm[i], d->perm[i - 1]))
   3344       return false;
   3345 
   3346   /* It must be series increasing from compress point.  */
   3347   for (int i = 1 + compress_point; i < vlen; i++)
   3348     if (maybe_ne (d->perm[i], d->perm[i - 1] + 1))
   3349       return false;
   3350 
   3351   /* Success!  */
   3352   if (d->testing_p)
   3353     return true;
   3354 
   3355   /* Check whether we need to slideup op1 to apply compress approach.
   3356 
   3357        E.g. For index = { 0, 2, 6, 7}, since d->perm[i - 1] = 7 which
   3358 	    is 2 * NUNITS - 1, so we don't need to slide up.
   3359 
   3360 	    For index = { 0, 2, 5, 6}, we need to slide op1 up before
   3361 	    we apply compress approach.  */
   3362   bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1)
   3363 			&& !const_vec_duplicate_p (d->op1);
   3364 
   3365   /* If we leave it directly be handled by general gather,
   3366      the code sequence will be:
   3367 	VECTOR LOAD  selector
   3368 	GEU          mask, selector, NUNITS
   3369 	GATHER       dest, op0, selector
   3370 	SUB          selector, selector, NUNITS
   3371 	GATHER       dest, op1, selector, mask
   3372      Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered
   3373      as COST = 4. So, we consider the general gather handling COST = 9.
   3374      TODO: This cost is not accurate, we can adjust it by tune info.  */
   3375   int general_cost = 9;
   3376 
   3377   /* If we can use compress approach, the code squence will be:
   3378 	MASK LOAD    mask
   3379 	COMPRESS     op1, op0, mask
   3380      If it needs slide up, it will be:
   3381 	MASK LOAD    mask
   3382 	SLIDEUP      op1
   3383 	COMPRESS     op1, op0, mask
   3384      By default, mask load COST = 2.
   3385      TODO: This cost is not accurate, we can adjust it by tune info.  */
   3386   int compress_cost = 4;
   3387 
   3388   if (general_cost <= compress_cost)
   3389     return false;
   3390 
   3391   /* Build a mask that is true when selector element is true.  */
   3392   machine_mode mask_mode = get_mask_mode (vmode);
   3393   rvv_builder builder (mask_mode, vlen, 1);
   3394   for (int i = 0; i < vlen; i++)
   3395     {
   3396       bool is_compress_index = false;
   3397       for (int j = 0; j < compress_point; j++)
   3398 	{
   3399 	  if (known_eq (d->perm[j], i))
   3400 	    {
   3401 	      is_compress_index = true;
   3402 	      break;
   3403 	    }
   3404 	}
   3405       if (is_compress_index)
   3406 	builder.quick_push (CONST1_RTX (BImode));
   3407       else
   3408 	builder.quick_push (CONST0_RTX (BImode));
   3409     }
   3410   rtx mask = force_reg (mask_mode, builder.build ());
   3411 
   3412   rtx merge = d->op1;
   3413   if (need_slideup_p)
   3414     {
   3415       int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1;
   3416       merge = gen_reg_rtx (vmode);
   3417       rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)};
   3418       insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode);
   3419       emit_vlmax_insn (icode, BINARY_OP, ops);
   3420     }
   3421 
   3422   insn_code icode = code_for_pred_compress (vmode);
   3423   rtx ops[] = {d->target, merge, d->op0, mask};
   3424   emit_nonvlmax_insn (icode, COMPRESS_OP_MERGE, ops,
   3425 		      gen_int_mode (vlen, Pmode));
   3426   return true;
   3427 }
   3428 
   3429 /* Recognize decompress patterns:
   3430 
   3431    1. VEC_PERM_EXPR op0 and op1
   3432       with isel = { 0, nunits, 1, nunits + 1, ... }.
   3433       Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
   3434 
   3435    2. VEC_PERM_EXPR op0 and op1
   3436       with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }.
   3437       Slide down op0 and op1 with OFFSET = 1/2 nunits.
   3438       Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }.
   3439 */
   3440 static bool
   3441 shuffle_decompress_patterns (struct expand_vec_perm_d *d)
   3442 {
   3443   poly_uint64 nelt = d->perm.length ();
   3444   machine_mode mask_mode = get_mask_mode (d->vmode);
   3445 
   3446   /* For constant size indices, we dont't need to handle it here.
   3447      Just leave it to vec_perm<mode>.  */
   3448   if (d->perm.length ().is_constant ())
   3449     return false;
   3450 
   3451   poly_uint64 first = d->perm[0];
   3452   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
   3453       || !d->perm.series_p (0, 2, first, 1)
   3454       || !d->perm.series_p (1, 2, first + nelt, 1))
   3455     return false;
   3456 
   3457   /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv.
   3458      Otherwise, it could overflow the index range.  */
   3459   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
   3460   if (GET_MODE_INNER (d->vmode) == QImode
   3461       && !get_vector_mode (HImode, nelt).exists (&sel_mode))
   3462     return false;
   3463 
   3464   /* Success!  */
   3465   if (d->testing_p)
   3466     return true;
   3467 
   3468   rtx op0, op1;
   3469   if (known_eq (first, 0U))
   3470     {
   3471       op0 = d->op0;
   3472       op1 = d->op1;
   3473     }
   3474   else
   3475     {
   3476       op0 = gen_reg_rtx (d->vmode);
   3477       op1 = gen_reg_rtx (d->vmode);
   3478       insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode);
   3479       rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)};
   3480       rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)};
   3481       emit_vlmax_insn (icode, BINARY_OP, ops0);
   3482       emit_vlmax_insn (icode, BINARY_OP, ops1);
   3483     }
   3484   /* Generate { 0, 1, .... } mask.  */
   3485   rtx vid = gen_reg_rtx (sel_mode);
   3486   rtx vid_repeat = gen_reg_rtx (sel_mode);
   3487   expand_vec_series (vid, const0_rtx, const1_rtx);
   3488   rtx and_ops[] = {vid_repeat, vid, const1_rtx};
   3489   emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops);
   3490   rtx const_vec = gen_const_vector_dup (sel_mode, 1);
   3491   rtx mask = gen_reg_rtx (mask_mode);
   3492   expand_vec_cmp (mask, EQ, vid_repeat, const_vec);
   3493   emit_vlmax_decompress_insn (d->target, op0, op1, mask);
   3494   return true;
   3495 }
   3496 
   3497 static bool
   3498 shuffle_bswap_pattern (struct expand_vec_perm_d *d)
   3499 {
   3500   HOST_WIDE_INT diff;
   3501   unsigned i, size, step;
   3502 
   3503   if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
   3504     return false;
   3505 
   3506   step = diff + 1;
   3507   size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
   3508 
   3509   switch (size)
   3510     {
   3511     case 16:
   3512       break;
   3513     case 32:
   3514     case 64:
   3515       /* We will have VEC_PERM_EXPR after rtl expand when invoking
   3516 	 __builtin_bswap. It will generate about 9 instructions in
   3517 	 loop as below, no matter it is bswap16, bswap32 or bswap64.
   3518 	   .L2:
   3519 	 1 vle16.v v4,0(a0)
   3520 	 2 vmv.v.x v2,a7
   3521 	 3 vand.vv v2,v6,v2
   3522 	 4 slli    a2,a5,1
   3523 	 5 vrgatherei16.vv v1,v4,v2
   3524 	 6 sub     a4,a4,a5
   3525 	 7 vse16.v v1,0(a3)
   3526 	 8 add     a0,a0,a2
   3527 	 9 add     a3,a3,a2
   3528 	   bne     a4,zero,.L2
   3529 
   3530 	 But for bswap16 we may have a even simple code gen, which
   3531 	 has only 7 instructions in loop as below.
   3532 	   .L5
   3533 	 1 vle8.v  v2,0(a5)
   3534 	 2 addi    a5,a5,32
   3535 	 3 vsrl.vi v4,v2,8
   3536 	 4 vsll.vi v2,v2,8
   3537 	 5 vor.vv  v4,v4,v2
   3538 	 6 vse8.v  v4,0(a4)
   3539 	 7 addi    a4,a4,32
   3540 	   bne     a5,a6,.L5
   3541 
   3542 	 Unfortunately, the instructions in loop will grow to 13 and 24
   3543 	 for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
   3544 	 for both the bswap64 and bswap32, but take shift and or (7 insn)
   3545 	 for bswap16.
   3546        */
   3547     default:
   3548       return false;
   3549     }
   3550 
   3551   for (i = 0; i < step; i++)
   3552     if (!d->perm.series_p (i, step, diff - i, step))
   3553       return false;
   3554 
   3555   /* Disable when nunits < 4 since the later generic approach
   3556      is more profitable on BSWAP.  */
   3557   if (!known_gt (GET_MODE_NUNITS (d->vmode), 2))
   3558     return false;
   3559 
   3560   if (d->testing_p)
   3561     return true;
   3562 
   3563   machine_mode vhi_mode;
   3564   poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
   3565 
   3566   if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
   3567     return false;
   3568 
   3569   /* Step-1: Move op0 to src with VHI mode.  */
   3570   rtx src = gen_reg_rtx (vhi_mode);
   3571   emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
   3572 
   3573   /* Step-2: Shift right 8 bits to dest.  */
   3574   rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
   3575 			   NULL_RTX, 0, OPTAB_DIRECT);
   3576 
   3577   /* Step-3: Shift left 8 bits to src.  */
   3578   src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
   3579 		      NULL_RTX, 0, OPTAB_DIRECT);
   3580 
   3581   /* Step-4: Logic Or dest and src to dest.  */
   3582   dest = expand_binop (vhi_mode, ior_optab, dest, src,
   3583 		       NULL_RTX, 0, OPTAB_DIRECT);
   3584 
   3585   /* Step-5: Move src to target with VQI mode.  */
   3586   emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
   3587 
   3588   return true;
   3589 }
   3590 
   3591 /* Recognize the pattern that can be shuffled by vec_extract and slide1up
   3592    approach.  */
   3593 
   3594 static bool
   3595 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d)
   3596 {
   3597   poly_int64 nunits = GET_MODE_NUNITS (d->vmode);
   3598 
   3599   /* Recognize { nunits - 1, nunits, nunits + 1, ... }.  */
   3600   if (!d->perm.series_p (0, 2, nunits - 1, 2)
   3601       || !d->perm.series_p (1, 2, nunits, 2))
   3602     return false;
   3603 
   3604   /* Disable when nunits < 4 since the later generic approach
   3605      is more profitable on indice = { nunits - 1, nunits }.  */
   3606   if (!known_gt (nunits, 2))
   3607     return false;
   3608 
   3609   /* Success! */
   3610   if (d->testing_p)
   3611     return true;
   3612 
   3613   /* Extract the last element of the first vector.  */
   3614   scalar_mode smode = GET_MODE_INNER (d->vmode);
   3615   rtx tmp = gen_reg_rtx (smode);
   3616   emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode));
   3617 
   3618   /* Insert the scalar into element 0.  */
   3619   unsigned int unspec
   3620     = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
   3621   insn_code icode = code_for_pred_slide (unspec, d->vmode);
   3622   rtx ops[] = {d->target, d->op1, tmp};
   3623   emit_vlmax_insn (icode, BINARY_OP, ops);
   3624   return true;
   3625 }
   3626 
   3627 static bool
   3628 shuffle_series_patterns (struct expand_vec_perm_d *d)
   3629 {
   3630   if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1)
   3631     return false;
   3632 
   3633   poly_int64 el1 = d->perm[0];
   3634   poly_int64 el2 = d->perm[1];
   3635   poly_int64 el3 = d->perm[2];
   3636 
   3637   poly_int64 step1 = el2 - el1;
   3638   poly_int64 step2 = el3 - el2;
   3639 
   3640   bool need_insert = false;
   3641   bool have_series = false;
   3642 
   3643   /* Check for a full series.  */
   3644   if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1))
   3645     have_series = true;
   3646 
   3647   /* Check for a series starting at the second element.  */
   3648   else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2))
   3649     {
   3650       have_series = true;
   3651       need_insert = true;
   3652     }
   3653 
   3654   if (!have_series)
   3655     return false;
   3656 
   3657   /* Disable shuffle if we can't find an appropriate integer index mode for
   3658      gather.  */
   3659   machine_mode sel_mode;
   3660   if (!get_gather_index_mode (d).exists (&sel_mode))
   3661     return false;
   3662 
   3663   /* Success! */
   3664   if (d->testing_p)
   3665     return true;
   3666 
   3667   /* Create the series.  */
   3668   machine_mode eltmode = Pmode;
   3669   rtx series = gen_reg_rtx (sel_mode);
   3670   expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode),
   3671 		     gen_int_mode (need_insert ? step2 : step1, eltmode));
   3672 
   3673   /* Insert the remaining element if necessary.  */
   3674   if (need_insert)
   3675     {
   3676       insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode);
   3677       rtx ops[]
   3678 	= {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))};
   3679       emit_vlmax_insn (icode, BINARY_OP, ops);
   3680     }
   3681 
   3682   emit_vlmax_gather_insn (d->target, d->op0, series);
   3683 
   3684   return true;
   3685 }
   3686 
   3687 /* Recognize the pattern that can be shuffled by generic approach.  */
   3688 
   3689 static bool
   3690 shuffle_generic_patterns (struct expand_vec_perm_d *d)
   3691 {
   3692   machine_mode sel_mode;
   3693 
   3694   /* We don't enable SLP for non-power of 2 NPATTERNS.  */
   3695   if (!pow2p_hwi (d->perm.encoding().npatterns ()))
   3696     return false;
   3697 
   3698   /* Disable shuffle if we can't find an appropriate integer index mode for
   3699      gather.  */
   3700   if (!get_gather_index_mode (d).exists (&sel_mode))
   3701     return false;
   3702 
   3703   /* Success! */
   3704   if (d->testing_p)
   3705     return true;
   3706 
   3707   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
   3708   /* Some FIXED-VLMAX/VLS vector permutation situations call targethook
   3709      instead of expand vec_perm<mode>, we handle it directly.  */
   3710   expand_vec_perm (d->target, d->op0, d->op1, sel);
   3711   return true;
   3712 }
   3713 
   3714 /* This function recognizes and supports different permutation patterns
   3715    and enable VLA SLP auto-vectorization.  */
   3716 static bool
   3717 expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   3718 {
   3719   gcc_assert (d->op_mode != E_VOIDmode);
   3720 
   3721   /* The pattern matching functions above are written to look for a small
   3722      number to begin the sequence (0, 1, N/2).  If we begin with an index
   3723      from the second operand, we can swap the operands.  */
   3724   poly_int64 nelt = d->perm.length ();
   3725   if (known_ge (d->perm[0], nelt))
   3726     {
   3727       d->perm.rotate_inputs (1);
   3728       std::swap (d->op0, d->op1);
   3729     }
   3730 
   3731   if (known_gt (nelt, 1))
   3732     {
   3733       if (d->vmode == d->op_mode)
   3734 	{
   3735 	  if (shuffle_merge_patterns (d))
   3736 	    return true;
   3737 	  if (shuffle_consecutive_patterns (d))
   3738 	    return true;
   3739 	  if (shuffle_compress_patterns (d))
   3740 	    return true;
   3741 	  if (shuffle_decompress_patterns (d))
   3742 	    return true;
   3743 	  if (shuffle_bswap_pattern (d))
   3744 	    return true;
   3745 	  if (shuffle_extract_and_slide1up_patterns (d))
   3746 	    return true;
   3747 	  if (shuffle_series_patterns (d))
   3748 	    return true;
   3749 	  if (shuffle_generic_patterns (d))
   3750 	    return true;
   3751 	  return false;
   3752 	}
   3753       else
   3754 	return false;
   3755     }
   3756   return false;
   3757 }
   3758 
   3759 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV
   3760  * instructions.  */
   3761 bool
   3762 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target,
   3763 		       rtx op0, rtx op1, const vec_perm_indices &sel)
   3764 {
   3765   /* RVV doesn't have Mask type pack/unpack instructions and we don't use
   3766      mask to do the iteration loop control. Just disable it directly.  */
   3767   if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL)
   3768     return false;
   3769   /* FIXME: Explicitly disable VLA interleave SLP vectorization when we
   3770      may encounter ICE for poly size (1, 1) vectors in loop vectorizer.
   3771      Ideally, middle-end loop vectorizer should be able to disable it
   3772      itself, We can remove the codes here when middle-end code is able
   3773      to disable VLA SLP vectorization for poly size (1, 1) VF.  */
   3774   if (!BYTES_PER_RISCV_VECTOR.is_constant ()
   3775       && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
   3776 		   poly_int64 (16, 16)))
   3777     return false;
   3778 
   3779   struct expand_vec_perm_d d;
   3780 
   3781   /* Check whether the mask can be applied to a single vector.  */
   3782   if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1)))
   3783     d.one_vector_p = true;
   3784   else if (sel.all_from_input_p (0))
   3785     {
   3786       d.one_vector_p = true;
   3787       op1 = op0;
   3788     }
   3789   else if (sel.all_from_input_p (1))
   3790     {
   3791       d.one_vector_p = true;
   3792       op0 = op1;
   3793     }
   3794   else
   3795     d.one_vector_p = false;
   3796 
   3797   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
   3798 		     sel.nelts_per_input ());
   3799   d.vmode = vmode;
   3800   d.op_mode = op_mode;
   3801   d.target = target;
   3802   d.op0 = op0;
   3803   if (op0 == op1)
   3804     d.op1 = d.op0;
   3805   else
   3806     d.op1 = op1;
   3807   d.testing_p = !target;
   3808 
   3809   if (!d.testing_p)
   3810     return expand_vec_perm_const_1 (&d);
   3811 
   3812   rtx_insn *last = get_last_insn ();
   3813   bool ret = expand_vec_perm_const_1 (&d);
   3814   gcc_assert (last == get_last_insn ());
   3815 
   3816   return ret;
   3817 }
   3818 
   3819 /* Generate no side effects vsetvl to get the vector length.  */
   3820 void
   3821 expand_select_vl (rtx *ops)
   3822 {
   3823   poly_int64 nunits = rtx_to_poly_int64 (ops[2]);
   3824   if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits))
   3825     {
   3826       /* If length is known <= VF, we just use the length directly instead
   3827 	 of using vsetvli.
   3828 
   3829 	 E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]);
   3830 	 We move 3 into _255 intead of using explicit vsetvl.  */
   3831       emit_move_insn (ops[0], ops[1]);
   3832       return;
   3833     }
   3834   /* We arbitrary picked QImode as inner scalar mode to get vector mode.
   3835      since vsetvl only demand ratio. We let VSETVL PASS to optimize it.  */
   3836   scalar_int_mode mode = QImode;
   3837   machine_mode rvv_mode = get_vector_mode (mode, nunits).require ();
   3838   emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1]));
   3839 }
   3840 
   3841 /* Expand MASK_LEN_{LOAD,STORE}.  */
   3842 void
   3843 expand_load_store (rtx *ops, bool is_load)
   3844 {
   3845   rtx mask = ops[2];
   3846   rtx len = ops[3];
   3847   machine_mode mode = GET_MODE (ops[0]);
   3848 
   3849   if (is_vlmax_len_p (mode, len))
   3850     {
   3851       /* If the length operand is equal to VF, it is VLMAX load/store.  */
   3852       if (is_load)
   3853 	{
   3854 	  rtx m_ops[] = {ops[0], mask, ops[1]};
   3855 	  emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops);
   3856 	}
   3857       else
   3858 	{
   3859 	  len = gen_reg_rtx (Pmode);
   3860 	  emit_vlmax_vsetvl (mode, len);
   3861 	  emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
   3862 				     get_avl_type_rtx (VLMAX)));
   3863 	}
   3864     }
   3865   else
   3866     {
   3867       if (!satisfies_constraint_vl (len))
   3868 	len = force_reg (Pmode, len);
   3869       if (is_load)
   3870 	{
   3871 	  rtx m_ops[] = {ops[0], mask, ops[1]};
   3872 	  emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops,
   3873 			       len);
   3874 	}
   3875       else
   3876 	emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len,
   3877 				   get_avl_type_rtx (NONVLMAX)));
   3878     }
   3879 }
   3880 
   3881 
   3882 /* Return true if the operation is the floating-point operation need FRM.  */
   3883 static bool
   3884 needs_fp_rounding (unsigned icode, machine_mode mode)
   3885 {
   3886   if (!FLOAT_MODE_P (mode))
   3887     return false;
   3888 
   3889   return icode != maybe_code_for_pred (SMIN, mode)
   3890 	 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode)
   3891 	 && icode != maybe_code_for_pred (SMAX, mode)
   3892 	 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode)
   3893 	 && icode != maybe_code_for_pred (NEG, mode)
   3894 	 && icode != maybe_code_for_pred (ABS, mode)
   3895 	 /* narrower-FP -> FP */
   3896 	 && icode != maybe_code_for_pred_extend (mode)
   3897 	 /* narrower-INT -> FP */
   3898 	 && icode != maybe_code_for_pred_widen (FLOAT, mode)
   3899 	 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode)
   3900 	 /* vfsgnj */
   3901 	 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode)
   3902 	 && icode != maybe_code_for_pred_mov (mode);
   3903 }
   3904 
   3905 /* Subroutine to expand COND_LEN_* patterns.  */
   3906 static void
   3907 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len)
   3908 {
   3909   rtx dest = ops[0];
   3910   rtx mask = ops[1];
   3911   machine_mode mode = GET_MODE (dest);
   3912   machine_mode mask_mode = GET_MODE (mask);
   3913   bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode));
   3914   bool is_vlmax_len = is_vlmax_len_p (mode, len);
   3915 
   3916   unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type;
   3917   /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len,
   3918      dummy mask) into NEG_EXPR in GIMPLE FOLD yet.  So, we do such
   3919      simplification in RISC-V backend and may do that in middle-end in the
   3920      future.  */
   3921   if (is_dummy_mask && is_vlmax_len)
   3922     insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P;
   3923   else if (is_dummy_mask)
   3924     insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P;
   3925   else if (is_vlmax_len)
   3926     insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P;
   3927   else
   3928     insn_flags |= TU_POLICY_P | MU_POLICY_P;
   3929 
   3930   if (needs_fp_rounding (icode, mode))
   3931     insn_flags |= FRM_DYN_P;
   3932 
   3933   if (is_vlmax_len)
   3934     emit_vlmax_insn (icode, insn_flags, ops);
   3935   else
   3936     emit_nonvlmax_insn (icode, insn_flags, ops, len);
   3937 }
   3938 
   3939 /* Return RVV_VUNDEF if the ELSE value is scratch rtx.  */
   3940 static rtx
   3941 get_else_operand (rtx op)
   3942 {
   3943   return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op;
   3944 }
   3945 
   3946 /* Expand unary ops COND_LEN_*.  */
   3947 void
   3948 expand_cond_len_unop (unsigned icode, rtx *ops)
   3949 {
   3950   rtx dest = ops[0];
   3951   rtx mask = ops[1];
   3952   rtx src = ops[2];
   3953   rtx merge = get_else_operand (ops[3]);
   3954   rtx len = ops[4];
   3955 
   3956   rtx cond_ops[] = {dest, mask, merge, src};
   3957   expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
   3958 }
   3959 
   3960 /* Expand unary ops COND_*.  */
   3961 void
   3962 expand_cond_unop (unsigned icode, rtx *ops)
   3963 {
   3964   rtx dest = ops[0];
   3965   rtx mask = ops[1];
   3966   rtx src = ops[2];
   3967   rtx merge = get_else_operand (ops[3]);
   3968   rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
   3969 
   3970   rtx cond_ops[] = {dest, mask, merge, src};
   3971   expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len);
   3972 }
   3973 
   3974 /* Expand binary ops COND_LEN_*.  */
   3975 void
   3976 expand_cond_len_binop (unsigned icode, rtx *ops)
   3977 {
   3978   rtx dest = ops[0];
   3979   rtx mask = ops[1];
   3980   rtx src1 = ops[2];
   3981   rtx src2 = ops[3];
   3982   rtx merge = get_else_operand (ops[4]);
   3983   rtx len = ops[5];
   3984 
   3985   rtx cond_ops[] = {dest, mask, merge, src1, src2};
   3986   expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
   3987 }
   3988 
   3989 /* Expand binary ops COND_*.  */
   3990 void
   3991 expand_cond_binop (unsigned icode, rtx *ops)
   3992 {
   3993   rtx dest = ops[0];
   3994   rtx mask = ops[1];
   3995   rtx src1 = ops[2];
   3996   rtx src2 = ops[3];
   3997   rtx merge = get_else_operand (ops[4]);
   3998   rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
   3999 
   4000   rtx cond_ops[] = {dest, mask, merge, src1, src2};
   4001   expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len);
   4002 }
   4003 
   4004 /* Prepare insn_code for gather_load/scatter_store according to
   4005    the vector mode and index mode.  */
   4006 static insn_code
   4007 prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
   4008 			bool is_load)
   4009 {
   4010   if (!is_load)
   4011     return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);
   4012   else
   4013     {
   4014       unsigned src_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
   4015       unsigned dst_eew_bitsize = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));
   4016       if (dst_eew_bitsize == src_eew_bitsize)
   4017 	return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);
   4018       else if (dst_eew_bitsize > src_eew_bitsize)
   4019 	{
   4020 	  unsigned factor = dst_eew_bitsize / src_eew_bitsize;
   4021 	  switch (factor)
   4022 	    {
   4023 	    case 2:
   4024 	      return code_for_pred_indexed_load_x2_greater_eew (
   4025 		UNSPEC_UNORDERED, vec_mode);
   4026 	    case 4:
   4027 	      return code_for_pred_indexed_load_x4_greater_eew (
   4028 		UNSPEC_UNORDERED, vec_mode);
   4029 	    case 8:
   4030 	      return code_for_pred_indexed_load_x8_greater_eew (
   4031 		UNSPEC_UNORDERED, vec_mode);
   4032 	    default:
   4033 	      gcc_unreachable ();
   4034 	    }
   4035 	}
   4036       else
   4037 	{
   4038 	  unsigned factor = src_eew_bitsize / dst_eew_bitsize;
   4039 	  switch (factor)
   4040 	    {
   4041 	    case 2:
   4042 	      return code_for_pred_indexed_load_x2_smaller_eew (
   4043 		UNSPEC_UNORDERED, vec_mode);
   4044 	    case 4:
   4045 	      return code_for_pred_indexed_load_x4_smaller_eew (
   4046 		UNSPEC_UNORDERED, vec_mode);
   4047 	    case 8:
   4048 	      return code_for_pred_indexed_load_x8_smaller_eew (
   4049 		UNSPEC_UNORDERED, vec_mode);
   4050 	    default:
   4051 	      gcc_unreachable ();
   4052 	    }
   4053 	}
   4054     }
   4055 }
   4056 
   4057 /* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}.  */
   4058 void
   4059 expand_gather_scatter (rtx *ops, bool is_load)
   4060 {
   4061   rtx ptr, vec_offset, vec_reg;
   4062   bool zero_extend_p;
   4063   int scale_log2;
   4064   rtx mask = ops[5];
   4065   rtx len = ops[6];
   4066   if (is_load)
   4067     {
   4068       vec_reg = ops[0];
   4069       ptr = ops[1];
   4070       vec_offset = ops[2];
   4071       zero_extend_p = INTVAL (ops[3]);
   4072       scale_log2 = exact_log2 (INTVAL (ops[4]));
   4073     }
   4074   else
   4075     {
   4076       vec_reg = ops[4];
   4077       ptr = ops[0];
   4078       vec_offset = ops[1];
   4079       zero_extend_p = INTVAL (ops[2]);
   4080       scale_log2 = exact_log2 (INTVAL (ops[3]));
   4081     }
   4082 
   4083   machine_mode vec_mode = GET_MODE (vec_reg);
   4084   machine_mode idx_mode = GET_MODE (vec_offset);
   4085   scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode);
   4086   unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode);
   4087   poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
   4088   bool is_vlmax = is_vlmax_len_p (vec_mode, len);
   4089 
   4090   /* Extend the offset element to address width.  */
   4091   if (inner_offsize < BITS_PER_WORD)
   4092     {
   4093       /* 7.2. Vector Load/Store Addressing Modes.
   4094 	 If the vector offset elements are narrower than XLEN, they are
   4095 	 zero-extended to XLEN before adding to the ptr effective address. If
   4096 	 the vector offset elements are wider than XLEN, the least-significant
   4097 	 XLEN bits are used in the address calculation. An implementation must
   4098 	 raise an illegal instruction exception if the EEW is not supported for
   4099 	 offset elements.
   4100 
   4101 	 RVV spec only refers to the scale_log == 0 case.  */
   4102       if (!zero_extend_p || scale_log2 != 0)
   4103 	{
   4104 	  if (zero_extend_p)
   4105 	    inner_idx_mode
   4106 	      = int_mode_for_size (inner_offsize * 2, 0).require ();
   4107 	  else
   4108 	    inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
   4109 	  machine_mode new_idx_mode
   4110 	    = get_vector_mode (inner_idx_mode, nunits).require ();
   4111 	  rtx tmp = gen_reg_rtx (new_idx_mode);
   4112 	  emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
   4113 				      zero_extend_p ? true : false));
   4114 	  vec_offset = tmp;
   4115 	  idx_mode = new_idx_mode;
   4116 	}
   4117     }
   4118 
   4119   if (scale_log2 != 0)
   4120     {
   4121       rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
   4122 			      gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
   4123 			      OPTAB_DIRECT);
   4124       vec_offset = tmp;
   4125     }
   4126 
   4127   insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load);
   4128   if (is_vlmax)
   4129     {
   4130       if (is_load)
   4131 	{
   4132 	  rtx load_ops[]
   4133 	    = {vec_reg, mask, ptr, vec_offset};
   4134 	  emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops);
   4135 	}
   4136       else
   4137 	{
   4138 	  rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
   4139 	  emit_vlmax_insn (icode, SCATTER_OP_M, store_ops);
   4140 	}
   4141     }
   4142   else
   4143     {
   4144       if (is_load)
   4145 	{
   4146 	  rtx load_ops[]
   4147 	    = {vec_reg, mask, ptr, vec_offset};
   4148 	  emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len);
   4149 	}
   4150       else
   4151 	{
   4152 	  rtx store_ops[] = {mask, ptr, vec_offset, vec_reg};
   4153 	  emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len);
   4154 	}
   4155     }
   4156 }
   4157 
   4158 /* Expand COND_LEN_*.  */
   4159 void
   4160 expand_cond_len_ternop (unsigned icode, rtx *ops)
   4161 {
   4162   rtx dest = ops[0];
   4163   rtx mask = ops[1];
   4164   rtx src1 = ops[2];
   4165   rtx src2 = ops[3];
   4166   rtx src3 = ops[4];
   4167   rtx merge = get_else_operand (ops[5]);
   4168   rtx len = ops[6];
   4169 
   4170   rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
   4171   expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
   4172 }
   4173 
   4174 /* Expand COND_*.  */
   4175 void
   4176 expand_cond_ternop (unsigned icode, rtx *ops)
   4177 {
   4178   rtx dest = ops[0];
   4179   rtx mask = ops[1];
   4180   rtx src1 = ops[2];
   4181   rtx src2 = ops[3];
   4182   rtx src3 = ops[4];
   4183   rtx merge = get_else_operand (ops[5]);
   4184   rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode);
   4185 
   4186   rtx cond_ops[] = {dest, mask, src1, src2, src3, merge};
   4187   expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len);
   4188 }
   4189 
   4190 /* Expand reduction operations.
   4191      Case 1: ops = {scalar_dest, vector_src}
   4192      Case 2: ops = {scalar_dest, vector_src, mask, vl}
   4193 */
   4194 void
   4195 expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe,
   4196 		  unsigned insn_flags, rtx *ops, rtx init)
   4197 {
   4198   rtx scalar_dest = ops[0];
   4199   rtx vector_src = ops[1];
   4200   machine_mode vmode = GET_MODE (vector_src);
   4201   machine_mode vel_mode = GET_MODE (scalar_dest);
   4202   machine_mode m1_mode = get_m1_mode (vel_mode).require ();
   4203   rtx vl_op = NULL_RTX;
   4204   bool need_vl0_safe = false;
   4205   if (need_mask_operand_p (insn_flags))
   4206     {
   4207       vl_op = ops[3];
   4208       need_vl0_safe = !CONST_INT_P (vl_op) && !CONST_POLY_INT_P (vl_op);
   4209     }
   4210 
   4211   rtx m1_tmp = gen_reg_rtx (m1_mode);
   4212   rtx scalar_move_ops[] = {m1_tmp, init};
   4213   insn_code icode = code_for_pred_broadcast (m1_mode);
   4214   if (need_mask_operand_p (insn_flags))
   4215     {
   4216       if (need_vl0_safe)
   4217 	emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx);
   4218       else
   4219 	emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op);
   4220     }
   4221   else
   4222     emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops);
   4223 
   4224   rtx m1_tmp2 = gen_reg_rtx (m1_mode);
   4225   rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp};
   4226 
   4227   if (need_vl0_safe)
   4228     icode = code_for_pred (unspec_for_vl0_safe, vmode);
   4229   else
   4230     icode = code_for_pred (unspec, vmode);
   4231 
   4232   if (need_mask_operand_p (insn_flags))
   4233     {
   4234       rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp};
   4235       emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, vl_op);
   4236     }
   4237   else
   4238     emit_vlmax_insn (icode, insn_flags, reduc_ops);
   4239 
   4240   emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2));
   4241 }
   4242 
   4243 /* Prepare ops for ternary operations.
   4244    It can be called before or after RA.  */
   4245 void
   4246 prepare_ternary_operands (rtx *ops)
   4247 {
   4248   machine_mode mode = GET_MODE (ops[0]);
   4249 
   4250   if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode))
   4251       && (VECTOR_MODE_P (GET_MODE (ops[2]))
   4252 	  && !rtx_equal_p (ops[2], ops[5]))
   4253       && !rtx_equal_p (ops[3], ops[5])
   4254       && !rtx_equal_p (ops[4], ops[5]))
   4255     {
   4256       /* RA will fail to find vector REG and report ICE, so we pre-merge
   4257 	 the ops for LMUL = 8.  */
   4258       if (satisfies_constraint_Wc1 (ops[1]))
   4259 	{
   4260 	  emit_move_insn (ops[0], ops[5]);
   4261 	  emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6],
   4262 				   ops[7], ops[8], ops[9]));
   4263 	}
   4264       else
   4265 	emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5],
   4266 				   ops[4], ops[1], ops[6], ops[7], ops[9]));
   4267       ops[5] = ops[4] = ops[0];
   4268     }
   4269   else
   4270     {
   4271       /* Swap the multiplication ops if the fallback value is the
   4272 	 second of the two.  */
   4273       if (rtx_equal_p (ops[3], ops[5]))
   4274 	std::swap (ops[2], ops[3]);
   4275 
   4276       /* TODO: ??? Maybe we could support splitting FMA (a, 4, b)
   4277 	 into PLUS (ASHIFT (a, 2), b) according to uarchs.  */
   4278     }
   4279   gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode))
   4280 	      || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4]));
   4281 }
   4282 
   4283 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}.  */
   4284 void
   4285 expand_lanes_load_store (rtx *ops, bool is_load)
   4286 {
   4287   rtx mask = ops[2];
   4288   rtx len = ops[3];
   4289   rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0);
   4290   rtx reg = is_load ? ops[0] : ops[1];
   4291   machine_mode mode = GET_MODE (ops[0]);
   4292 
   4293   if (is_vlmax_len_p (mode, len))
   4294     {
   4295       /* If the length operand is equal to VF, it is VLMAX load/store.  */
   4296       if (is_load)
   4297 	{
   4298 	  rtx m_ops[] = {reg, mask, addr};
   4299 	  emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA,
   4300 			    m_ops);
   4301 	}
   4302       else
   4303 	{
   4304 	  len = gen_reg_rtx (Pmode);
   4305 	  emit_vlmax_vsetvl (mode, len);
   4306 	  emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
   4307 						  get_avl_type_rtx (VLMAX)));
   4308 	}
   4309     }
   4310   else
   4311     {
   4312       if (!satisfies_constraint_vl (len))
   4313 	len = force_reg (Pmode, len);
   4314       if (is_load)
   4315 	{
   4316 	  rtx m_ops[] = {reg, mask, addr};
   4317 	  emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode),
   4318 			       UNARY_OP_TAMA, m_ops, len);
   4319 	}
   4320       else
   4321 	emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len,
   4322 						get_avl_type_rtx (NONVLMAX)));
   4323     }
   4324 }
   4325 
   4326 /* Expand LEN_FOLD_EXTRACT_LAST.  */
   4327 void
   4328 expand_fold_extract_last (rtx *ops)
   4329 {
   4330   rtx dst = ops[0];
   4331   rtx default_value = ops[1];
   4332   rtx mask = ops[2];
   4333   rtx anchor = gen_reg_rtx (Pmode);
   4334   rtx index = gen_reg_rtx (Pmode);
   4335   rtx vect = ops[3];
   4336   rtx else_label = gen_label_rtx ();
   4337   rtx end_label = gen_label_rtx ();
   4338   rtx len = ops[4];
   4339   machine_mode mode = GET_MODE (vect);
   4340   machine_mode mask_mode = GET_MODE (mask);
   4341   rtx compress_vect = gen_reg_rtx (mode);
   4342   rtx slide_vect = gen_reg_rtx (mode);
   4343   insn_code icode;
   4344 
   4345   if (is_vlmax_len_p (mode, len))
   4346     len = NULL_RTX;
   4347 
   4348   /* Calculate the number of 1-bit in mask. */
   4349   rtx cpop_ops[] = {anchor, mask};
   4350   if (len)
   4351     emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
   4352 			 cpop_ops, len);
   4353   else
   4354     emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP,
   4355 		      cpop_ops);
   4356 
   4357   riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx);
   4358   emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx)));
   4359   /* Compress the vector.  */
   4360   icode = code_for_pred_compress (mode);
   4361   rtx compress_ops[] = {compress_vect, vect, mask};
   4362   if (len)
   4363     emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len);
   4364   else
   4365     emit_vlmax_insn (icode, COMPRESS_OP, compress_ops);
   4366   /* Emit the slide down to index 0 in a new vector.  */
   4367   rtx slide_ops[] = {slide_vect, compress_vect, index};
   4368   icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode);
   4369   if (len)
   4370     emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len);
   4371   else
   4372     emit_vlmax_insn (icode, BINARY_OP, slide_ops);
   4373   /* Emit v(f)mv.[xf].s.  */
   4374   emit_insn (gen_pred_extract_first (mode, dst, slide_vect));
   4375 
   4376   emit_jump_insn (gen_jump (end_label));
   4377   emit_barrier ();
   4378   emit_label (else_label);
   4379   emit_move_insn (dst, default_value);
   4380   emit_label (end_label);
   4381 }
   4382 
   4383 /* Return true if the LMUL of comparison less than or equal to one.  */
   4384 bool
   4385 cmp_lmul_le_one (machine_mode mode)
   4386 {
   4387   if (riscv_v_ext_vector_mode_p (mode))
   4388     return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
   4389   else if (riscv_v_ext_vls_mode_p (mode))
   4390     return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
   4391   return false;
   4392 }
   4393 
   4394 /* Return true if the LMUL of comparison greater than one.  */
   4395 bool
   4396 cmp_lmul_gt_one (machine_mode mode)
   4397 {
   4398   if (riscv_v_ext_vector_mode_p (mode))
   4399     return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR);
   4400   else if (riscv_v_ext_vls_mode_p (mode))
   4401     return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN);
   4402   return false;
   4403 }
   4404 
   4405 /* Return true if the VLS mode is legal. There are 2 cases here.
   4406 
   4407    1. Enable VLS modes for VLA vectorization since fixed length VLMAX mode
   4408       is the highest priority choice and should not conflict with VLS modes.
   4409    2. Enable VLS modes for some cases in fixed-vlmax, i.e. when the bitsize
   4410       of the VLS mode is smaller than that of the minimal VLA mode.
   4411 
   4412    Take vlen = 2048 as example for case 2.
   4413 
   4414    Note: Below table based on vlen = 2048.
   4415    +----------------------------------------------------+----------------------+
   4416    | VLS mode                                           | VLA mode             |
   4417    +----------------------------------------------------+----------------------+
   4418    | Name       | Precision | Inner Precision | Enabled | Min mode  | Min bits |
   4419    +------------+-----------+-----------------+---------+-----------+----------+
   4420    | V1BI       |     1     |              1  | Yes     | RVVMF64BI |    32    |
   4421    | V2BI       |     2     |              1  | Yes     | RVVMF64BI |    32    |
   4422    | V4BI       |     4     |              1  | Yes     | RVVMF64BI |    32    |
   4423    | V8BI       |     8     |              1  | Yes     | RVVMF64BI |    32    |
   4424    | V16BI      |    16     |              1  | Yes     | RVVMF64BI |    32    |
   4425    | V32BI      |    32     |              1  | NO      | RVVMF64BI |    32    |
   4426    | V64BI      |    64     |              1  | NO      | RVVMF64BI |    32    |
   4427    | ...        |   ...     |            ...  | ...     | RVVMF64BI |    32    |
   4428    | V4096BI    |  4096     |              1  | NO      | RVVMF64BI |    32    |
   4429    +------------+-----------+-----------------+---------+-----------+----------+
   4430    | V1QI       |     8     |              8  | Yes     | RVVMF8QI  |   256    |
   4431    | V2QI       |    16     |              8  | Yes     | RVVMF8QI  |   256    |
   4432    | V4QI       |    32     |              8  | Yes     | RVVMF8QI  |   256    |
   4433    | V8QI       |    64     |              8  | Yes     | RVVMF8QI  |   256    |
   4434    | V16QI      |   128     |              8  | Yes     | RVVMF8QI  |   256    |
   4435    | V32QI      |   256     |              8  | NO      | RVVMF8QI  |   256    |
   4436    | V64QI      |   512     |              8  | NO      | RVVMF8QI  |   256    |
   4437    | ...        |   ...     |              .. | ...     | RVVMF8QI  |   256    |
   4438    | V4096QI    | 32768     |              8  | NO      | RVVMF8QI  |   256    |
   4439    +------------+-----------+-----------------+---------+-----------+----------+
   4440    | V1HI       |    16     |              16 | Yes     | RVVMF4HI  |   512    |
   4441    | V2HI       |    32     |              16 | Yes     | RVVMF4HI  |   512    |
   4442    | V4HI       |    64     |              16 | Yes     | RVVMF4HI  |   512    |
   4443    | V8HI       |   128     |              16 | Yes     | RVVMF4HI  |   512    |
   4444    | V16HI      |   256     |              16 | Yes     | RVVMF4HI  |   512    |
   4445    | V32HI      |   512     |              16 | NO      | RVVMF4HI  |   512    |
   4446    | V64HI      |  1024     |              16 | NO      | RVVMF4HI  |   512    |
   4447    | ...        |   ...     |              .. | ...     | RVVMF4HI  |   512    |
   4448    | V2048HI    | 32768     |              16 | NO      | RVVMF4HI  |   512    |
   4449    +------------+-----------+-----------------+---------+-----------+----------+
   4450    | V1SI/SF    |    32     |              32 | Yes     | RVVMF2SI  |  1024    |
   4451    | V2SI/SF    |    64     |              32 | Yes     | RVVMF2SI  |  1024    |
   4452    | V4SI/SF    |   128     |              32 | Yes     | RVVMF2SI  |  1024    |
   4453    | V8SI/SF    |   256     |              32 | Yes     | RVVMF2SI  |  1024    |
   4454    | V16SI/SF   |   512     |              32 | Yes     | RVVMF2SI  |  1024    |
   4455    | V32SI/SF   |  1024     |              32 | NO      | RVVMF2SI  |  1024    |
   4456    | V64SI/SF   |  2048     |              32 | NO      | RVVMF2SI  |  1024    |
   4457    | ...        |   ...     |              .. | ...     | RVVMF2SI  |  1024    |
   4458    | V1024SI/SF | 32768     |              32 | NO      | RVVMF2SI  |  1024    |
   4459    +------------+-----------+-----------------+---------+-----------+----------+
   4460    | V1DI/DF    |    64     |              64 | Yes     | RVVM1DI   |  2048    |
   4461    | V2DI/DF    |   128     |              64 | Yes     | RVVM1DI   |  2048    |
   4462    | V4DI/DF    |   256     |              64 | Yes     | RVVM1DI   |  2048    |
   4463    | V8DI/DF    |   512     |              64 | Yes     | RVVM1DI   |  2048    |
   4464    | V16DI/DF   |  1024     |              64 | Yes     | RVVM1DI   |  2048    |
   4465    | V32DI/DF   |  2048     |              64 | NO      | RVVM1DI   |  2048    |
   4466    | V64DI/DF   |  4096     |              64 | NO      | RVVM1DI   |  2048    |
   4467    | ...        |   ...     |              .. | ...     | RVVM1DI   |  2048    |
   4468    | V512DI/DF  | 32768     |              64 | NO      | RVVM1DI   |  2048    |
   4469    +------------+-----------+-----------------+---------+-----------+----------+
   4470 
   4471    Then we can have the condition for VLS mode in fixed-vlmax, aka:
   4472      PRECISION (VLSmode) < VLEN / (64 / PRECISION(VLS_inner_mode)).  */
   4473 bool
   4474 vls_mode_valid_p (machine_mode vls_mode)
   4475 {
   4476   if (!TARGET_VECTOR || TARGET_XTHEADVECTOR)
   4477     return false;
   4478 
   4479   if (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE)
   4480     {
   4481       if (GET_MODE_CLASS (vls_mode) != MODE_VECTOR_BOOL
   4482 	  && !ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
   4483 			 GET_MODE_PRECISION (vls_mode)))
   4484 	/* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
   4485 	   BITS_PER_RISCV_VECTOR.
   4486 
   4487 	   e.g. When TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128,128).
   4488 	   We enable VLS modes have fixed size <= 128bit.  Since ordered_p is
   4489 	   false between VLA modes with size = (128, 128) bits and VLS mode
   4490 	   with size = 128 bits, we will end up with multiple ICEs in
   4491 	   middle-end generic codes.  */
   4492 	return false;
   4493       return true;
   4494     }
   4495 
   4496   if (rvv_vector_bits == RVV_VECTOR_BITS_ZVL)
   4497     {
   4498       machine_mode inner_mode = GET_MODE_INNER (vls_mode);
   4499       int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
   4500       int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);
   4501 
   4502       return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
   4503     }
   4504 
   4505   return false;
   4506 }
   4507 
   4508 /* We don't have to convert the floating point to integer when the
   4509    mantissa is zero.  Thus, ther will be a limitation for both the
   4510    single and double precision floating point.  There will be no
   4511    mantissa if the floating point is greater than the limit.
   4512 
   4513    1. Half floating point.
   4514       +-----------+---------------+
   4515       | float     | binary layout |
   4516       +-----------+---------------+
   4517       | 1023.5    | 0x63ff        |
   4518       +-----------+---------------+
   4519       | 1024.0    | 0x6400        |
   4520       +-----------+---------------+
   4521       | 1025.0    | 0x6401        |
   4522       +-----------+---------------+
   4523       | ...       | ...           |
   4524 
   4525       All half floating point will be unchanged for ceil if it is
   4526       greater than and equal to 1024.
   4527 
   4528    2. Single floating point.
   4529       +-----------+---------------+
   4530       | float     | binary layout |
   4531       +-----------+---------------+
   4532       | 8388607.5 | 0x4affffff    |
   4533       +-----------+---------------+
   4534       | 8388608.0 | 0x4b000000    |
   4535       +-----------+---------------+
   4536       | 8388609.0 | 0x4b000001    |
   4537       +-----------+---------------+
   4538       | ...       | ...           |
   4539 
   4540       All single floating point will be unchanged for ceil if it is
   4541       greater than and equal to 8388608.
   4542 
   4543    3. Double floating point.
   4544       +--------------------+--------------------+
   4545       | float              | binary layout      |
   4546       +--------------------+--------------------+
   4547       | 4503599627370495.5 | 0X432fffffffffffff |
   4548       +--------------------+--------------------+
   4549       | 4503599627370496.0 | 0X4330000000000000 |
   4550       +--------------------+--------------------+
   4551       | 4503599627370497.0 | 0X4340000000000000 |
   4552       +--------------------+--------------------+
   4553       | ...                | ...                |
   4554 
   4555       All double floating point will be unchanged for ceil if it is
   4556       greater than and equal to 4503599627370496.
   4557  */
   4558 static rtx
   4559 get_fp_rounding_coefficient (machine_mode inner_mode)
   4560 {
   4561   REAL_VALUE_TYPE real;
   4562 
   4563   if (inner_mode == E_HFmode)
   4564     real_from_integer (&real, inner_mode, 1024, SIGNED);
   4565   else if (inner_mode == E_SFmode)
   4566     real_from_integer (&real, inner_mode, 8388608, SIGNED);
   4567   else if (inner_mode == E_DFmode)
   4568     real_from_integer (&real, inner_mode, 4503599627370496, SIGNED);
   4569   else
   4570     gcc_unreachable ();
   4571 
   4572   return const_double_from_real_value (real, inner_mode);
   4573 }
   4574 
   4575 static rtx
   4576 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar,
   4577 			 machine_mode vec_fp_mode)
   4578 {
   4579   /* Step-1: Prepare the scalar float compare register.  */
   4580   rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode));
   4581   emit_insn (gen_move_insn (fp_reg, fp_scalar));
   4582 
   4583   /* Step-2: Generate the mask.  */
   4584   machine_mode mask_mode = get_mask_mode (vec_fp_mode);
   4585   rtx mask = gen_reg_rtx (mask_mode);
   4586   rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg);
   4587   rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg};
   4588   insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode);
   4589   emit_vlmax_insn (icode, COMPARE_OP, cmp_ops);
   4590 
   4591   return mask;
   4592 }
   4593 
   4594 static void
   4595 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1,
   4596 		   machine_mode vec_mode)
   4597 {
   4598   rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1};
   4599   insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode);
   4600 
   4601   emit_vlmax_insn (icode, BINARY_OP, sgnj_ops);
   4602 }
   4603 
   4604 static void
   4605 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode)
   4606 {
   4607   rtx abs_ops[] = {op_dest, op_src};
   4608   insn_code icode = code_for_pred (ABS, vec_mode);
   4609 
   4610   emit_vlmax_insn (icode, UNARY_OP, abs_ops);
   4611 }
   4612 
   4613 static void
   4614 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask,
   4615 		  insn_type type, machine_mode vec_mode)
   4616 {
   4617   insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
   4618 
   4619   if (type & USE_VUNDEF_MERGE_P)
   4620     {
   4621       rtx cvt_x_ops[] = {op_dest, mask, op_src};
   4622       emit_vlmax_insn (icode, type, cvt_x_ops);
   4623     }
   4624   else
   4625     {
   4626       rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
   4627       emit_vlmax_insn (icode, type, cvt_x_ops);
   4628     }
   4629 }
   4630 
   4631 static void
   4632 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
   4633 		  machine_mode vec_mode)
   4634 {
   4635   rtx ops[] = {op_dest, op_src};
   4636   insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
   4637 
   4638   emit_vlmax_insn (icode, type, ops);
   4639 }
   4640 
   4641 static void
   4642 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
   4643 			 machine_mode vec_mode)
   4644 {
   4645   rtx ops[] = {op_dest, op_src};
   4646   insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
   4647 
   4648   emit_vlmax_insn (icode, type, ops);
   4649 }
   4650 
   4651 static void
   4652 emit_vec_widden_cvt_x_f (rtx op_dest, rtx op_src, insn_type type,
   4653 			 machine_mode vec_mode)
   4654 {
   4655   rtx ops[] = {op_dest, op_src};
   4656   insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode);
   4657 
   4658   emit_vlmax_insn (icode, type, ops);
   4659 }
   4660 
   4661 static void
   4662 emit_vec_widden_cvt_f_f (rtx op_dest, rtx op_src, insn_type type,
   4663 			 machine_mode vec_mode)
   4664 {
   4665   rtx ops[] = {op_dest, op_src};
   4666   insn_code icode = code_for_pred_extend (vec_mode);
   4667 
   4668   emit_vlmax_insn (icode, type, ops);
   4669 }
   4670 
   4671 static void
   4672 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask,
   4673 		  insn_type type, machine_mode vec_mode)
   4674 {
   4675   rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src};
   4676   insn_code icode = code_for_pred (FLOAT, vec_mode);
   4677 
   4678   emit_vlmax_insn (icode, type, cvt_fp_ops);
   4679 }
   4680 
   4681 static void
   4682 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
   4683 		      insn_type type, machine_mode vec_mode)
   4684 {
   4685   insn_code icode = code_for_pred (FIX, vec_mode);
   4686 
   4687   if (type & USE_VUNDEF_MERGE_P)
   4688     {
   4689       rtx cvt_x_ops[] = {op_dest, mask, op_src};
   4690       emit_vlmax_insn (icode, type, cvt_x_ops);
   4691     }
   4692   else
   4693     {
   4694       rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src};
   4695       emit_vlmax_insn (icode, type, cvt_x_ops);
   4696     }
   4697 }
   4698 
   4699 void
   4700 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4701 		 machine_mode vec_int_mode)
   4702 {
   4703   /* Step-1: Get the abs float value for mask generation.  */
   4704   emit_vec_abs (op_0, op_1, vec_fp_mode);
   4705 
   4706   /* Step-2: Generate the mask on const fp.  */
   4707   rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
   4708   rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
   4709 
   4710   /* Step-3: Convert to integer on mask, with rounding up (aka ceil).  */
   4711   rtx tmp = gen_reg_rtx (vec_int_mode);
   4712   emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode);
   4713 
   4714   /* Step-4: Convert to floating-point on mask for the final result.
   4715      To avoid unnecessary frm register access, we use RUP here and it will
   4716      never do the rounding up because the tmp rtx comes from the float
   4717      to int conversion.  */
   4718   emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode);
   4719 
   4720   /* Step-5: Retrieve the sign bit for -0.0.  */
   4721   emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
   4722 }
   4723 
   4724 void
   4725 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4726 		  machine_mode vec_int_mode)
   4727 {
   4728   /* Step-1: Get the abs float value for mask generation.  */
   4729   emit_vec_abs (op_0, op_1, vec_fp_mode);
   4730 
   4731   /* Step-2: Generate the mask on const fp.  */
   4732   rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
   4733   rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
   4734 
   4735   /* Step-3: Convert to integer on mask, with rounding down (aka floor).  */
   4736   rtx tmp = gen_reg_rtx (vec_int_mode);
   4737   emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode);
   4738 
   4739   /* Step-4: Convert to floating-point on mask for the floor result.  */
   4740   emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode);
   4741 
   4742   /* Step-5: Retrieve the sign bit for -0.0.  */
   4743   emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
   4744 }
   4745 
   4746 void
   4747 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4748 		      machine_mode vec_int_mode)
   4749 {
   4750   /* Step-1: Get the abs float value for mask generation.  */
   4751   emit_vec_abs (op_0, op_1, vec_fp_mode);
   4752 
   4753   /* Step-2: Generate the mask on const fp.  */
   4754   rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
   4755   rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
   4756 
   4757   /* Step-3: Backup FP exception flags, nearbyint never raise exceptions. */
   4758   rtx fflags = gen_reg_rtx (SImode);
   4759   emit_insn (gen_riscv_frflags (fflags));
   4760 
   4761   /* Step-4: Convert to integer on mask, with rounding down (aka nearbyint).  */
   4762   rtx tmp = gen_reg_rtx (vec_int_mode);
   4763   emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
   4764 
   4765   /* Step-5: Convert to floating-point on mask for the nearbyint result.  */
   4766   emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
   4767 
   4768   /* Step-6: Restore FP exception flags. */
   4769   emit_insn (gen_riscv_fsflags (fflags));
   4770 
   4771   /* Step-7: Retrieve the sign bit for -0.0.  */
   4772   emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
   4773 }
   4774 
   4775 void
   4776 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4777 		 machine_mode vec_int_mode)
   4778 {
   4779   /* Step-1: Get the abs float value for mask generation.  */
   4780   emit_vec_abs (op_0, op_1, vec_fp_mode);
   4781 
   4782   /* Step-2: Generate the mask on const fp.  */
   4783   rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
   4784   rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
   4785 
   4786   /* Step-3: Convert to integer on mask, with dyn rounding (aka rint).  */
   4787   rtx tmp = gen_reg_rtx (vec_int_mode);
   4788   emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode);
   4789 
   4790   /* Step-4: Convert to floating-point on mask for the rint result.  */
   4791   emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
   4792 
   4793   /* Step-5: Retrieve the sign bit for -0.0.  */
   4794   emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
   4795 }
   4796 
   4797 void
   4798 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4799 		  machine_mode vec_int_mode)
   4800 {
   4801   /* Step-1: Get the abs float value for mask generation.  */
   4802   emit_vec_abs (op_0, op_1, vec_fp_mode);
   4803 
   4804   /* Step-2: Generate the mask on const fp.  */
   4805   rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
   4806   rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
   4807 
   4808   /* Step-3: Convert to integer on mask, rounding to nearest (aka round).  */
   4809   rtx tmp = gen_reg_rtx (vec_int_mode);
   4810   emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode);
   4811 
   4812   /* Step-4: Convert to floating-point on mask for the round result.  */
   4813   emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode);
   4814 
   4815   /* Step-5: Retrieve the sign bit for -0.0.  */
   4816   emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
   4817 }
   4818 
   4819 void
   4820 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4821 		  machine_mode vec_int_mode)
   4822 {
   4823   /* Step-1: Get the abs float value for mask generation.  */
   4824   emit_vec_abs (op_0, op_1, vec_fp_mode);
   4825 
   4826   /* Step-2: Generate the mask on const fp.  */
   4827   rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
   4828   rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
   4829 
   4830   /* Step-3: Convert to integer on mask, rounding to zero (aka truncate).  */
   4831   rtx tmp = gen_reg_rtx (vec_int_mode);
   4832   emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode);
   4833 
   4834   /* Step-4: Convert to floating-point on mask for the rint result.  */
   4835   emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode);
   4836 
   4837   /* Step-5: Retrieve the sign bit for -0.0.  */
   4838   emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
   4839 }
   4840 
   4841 void
   4842 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4843 		      machine_mode vec_int_mode)
   4844 {
   4845   /* Step-1: Get the abs float value for mask generation.  */
   4846   emit_vec_abs (op_0, op_1, vec_fp_mode);
   4847 
   4848   /* Step-2: Generate the mask on const fp.  */
   4849   rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode));
   4850   rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode);
   4851 
   4852   /* Step-3: Convert to integer on mask, rounding to nearest, ties to even.  */
   4853   rtx tmp = gen_reg_rtx (vec_int_mode);
   4854   emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode);
   4855 
   4856   /* Step-4: Convert to floating-point on mask for the rint result.  */
   4857   emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode);
   4858 
   4859   /* Step-5: Retrieve the sign bit for -0.0.  */
   4860   emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode);
   4861 }
   4862 
   4863 /* Handling the rounding from floating-point to int/long/long long.  */
   4864 static void
   4865 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type,
   4866 			      machine_mode vec_fp_mode,
   4867 			      machine_mode vec_int_mode,
   4868 			      machine_mode vec_bridge_mode = E_VOIDmode)
   4869 {
   4870   poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode);
   4871   poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode);
   4872 
   4873   if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI.  */
   4874     emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode);
   4875   else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI.  */
   4876     emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode);
   4877   else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI.  */
   4878     emit_vec_widden_cvt_x_f (op_0, op_1, type, vec_int_mode);
   4879   else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI.  */
   4880     {
   4881       gcc_assert (vec_bridge_mode != E_VOIDmode);
   4882 
   4883       rtx op_sf = gen_reg_rtx (vec_bridge_mode);
   4884 
   4885       /* Step-1: HF => SF, no rounding here.  */
   4886       emit_vec_widden_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode);
   4887       /* Step-2: SF => DI.  */
   4888       emit_vec_widden_cvt_x_f (op_0, op_sf, type, vec_int_mode);
   4889     }
   4890   else
   4891     gcc_unreachable ();
   4892 }
   4893 
   4894 void
   4895 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4896 		  machine_mode vec_int_mode, machine_mode vec_bridge_mode)
   4897 {
   4898   emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode,
   4899 				vec_int_mode, vec_bridge_mode);
   4900 }
   4901 
   4902 void
   4903 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4904 		   machine_mode vec_int_mode, machine_mode vec_bridge_mode)
   4905 {
   4906   emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode,
   4907 				vec_int_mode, vec_bridge_mode);
   4908 }
   4909 
   4910 void
   4911 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4912 		  machine_mode vec_int_mode)
   4913 {
   4914   emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode,
   4915 				vec_int_mode);
   4916 }
   4917 
   4918 void
   4919 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
   4920 		   machine_mode vec_int_mode)
   4921 {
   4922   emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode,
   4923 				vec_int_mode);
   4924 }
   4925 
   4926 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
   4927    well.  */
   4928 void
   4929 expand_popcount (rtx *ops)
   4930 {
   4931   rtx dst = ops[0];
   4932   rtx src = ops[1];
   4933   machine_mode mode = GET_MODE (dst);
   4934   scalar_mode imode = GET_MODE_INNER (mode);
   4935   static const uint64_t m5 = 0x5555555555555555ULL;
   4936   static const uint64_t m3 = 0x3333333333333333ULL;
   4937   static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL;
   4938   static const uint64_t m1 = 0x0101010101010101ULL;
   4939 
   4940   rtx x1 = gen_reg_rtx (mode);
   4941   rtx x2 = gen_reg_rtx (mode);
   4942   rtx x3 = gen_reg_rtx (mode);
   4943   rtx x4 = gen_reg_rtx (mode);
   4944 
   4945   /* x1 = src - (src >> 1) & 0x555...);  */
   4946   rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true,
   4947 			     OPTAB_DIRECT);
   4948 
   4949   rtx and1 = gen_reg_rtx (mode);
   4950   rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)};
   4951   emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
   4952 		   ops1);
   4953 
   4954   x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT);
   4955 
   4956   /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL);
   4957    */
   4958   rtx and2 = gen_reg_rtx (mode);
   4959   rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)};
   4960   emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
   4961 		   ops2);
   4962 
   4963   rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true,
   4964 			     OPTAB_DIRECT);
   4965 
   4966   rtx and22 = gen_reg_rtx (mode);
   4967   rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)};
   4968   emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
   4969 		   ops22);
   4970 
   4971   x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT);
   4972 
   4973   /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL;  */
   4974   rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true,
   4975 			     OPTAB_DIRECT);
   4976 
   4977   rtx plus3
   4978     = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT);
   4979 
   4980   rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)};
   4981   emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP,
   4982 		   ops3);
   4983 
   4984   /* dest = (x3 * 0x0101010101010101ULL) >> 56;  */
   4985   rtx mul4 = gen_reg_rtx (mode);
   4986   rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)};
   4987   emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP,
   4988 		   ops4);
   4989 
   4990   x4 = expand_binop (mode, lshr_optab, mul4,
   4991 		     GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true,
   4992 		     OPTAB_DIRECT);
   4993 
   4994   emit_move_insn (dst, x4);
   4995 }
   4996 
   4997 /* Return true if it is VLMAX AVL TYPE.  */
   4998 bool
   4999 vlmax_avl_type_p (rtx_insn *rinsn)
   5000 {
   5001   extract_insn_cached (rinsn);
   5002   int index = get_attr_avl_type_idx (rinsn);
   5003   if (index == INVALID_ATTRIBUTE)
   5004     return false;
   5005 
   5006   gcc_assert (index < recog_data.n_operands);
   5007 
   5008   rtx avl_type = recog_data.operand[index];
   5009   return INTVAL (avl_type) == VLMAX;
   5010 }
   5011 
   5012 /* Return true if it is an RVV instruction depends on VL global
   5013    status register.  */
   5014 bool
   5015 has_vl_op (rtx_insn *rinsn)
   5016 {
   5017   return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn);
   5018 }
   5019 
   5020 /* Get default tail policy.  */
   5021 static bool
   5022 get_default_ta ()
   5023 {
   5024   /* For the instruction that doesn't require TA, we still need a default value
   5025      to emit vsetvl. We pick up the default value according to prefer policy. */
   5026   return (bool) (get_prefer_tail_policy () & 0x1
   5027 		 || (get_prefer_tail_policy () >> 1 & 0x1));
   5028 }
   5029 
   5030 /* Helper function to get TA operand.  */
   5031 bool
   5032 tail_agnostic_p (rtx_insn *rinsn)
   5033 {
   5034   /* If it doesn't have TA, we return agnostic by default.  */
   5035   extract_insn_cached (rinsn);
   5036   int ta = get_attr_ta (rinsn);
   5037   return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta);
   5038 }
   5039 
   5040 /* Change insn and Assert the change always happens.  */
   5041 void
   5042 validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group)
   5043 {
   5044   bool change_p = validate_change (object, loc, new_rtx, in_group);
   5045   gcc_assert (change_p);
   5046 }
   5047 
   5048 /* Return true if it is NONVLMAX AVL TYPE.  */
   5049 bool
   5050 nonvlmax_avl_type_p (rtx_insn *rinsn)
   5051 {
   5052   extract_insn_cached (rinsn);
   5053   int index = get_attr_avl_type_idx (rinsn);
   5054   if (index == INVALID_ATTRIBUTE)
   5055     return false;
   5056 
   5057   gcc_assert (index < recog_data.n_operands);
   5058 
   5059   rtx avl_type = recog_data.operand[index];
   5060   return INTVAL (avl_type) == NONVLMAX;
   5061 }
   5062 
   5063 /* Return true if RTX is RVV VLMAX AVL.  */
   5064 bool
   5065 vlmax_avl_p (rtx x)
   5066 {
   5067   return x && rtx_equal_p (x, RVV_VLMAX);
   5068 }
   5069 
   5070 /* Helper function to get SEW operand. We always have SEW value for
   5071    all RVV instructions that have VTYPE OP.  */
   5072 uint8_t
   5073 get_sew (rtx_insn *rinsn)
   5074 {
   5075   return get_attr_sew (rinsn);
   5076 }
   5077 
   5078 /* Helper function to get VLMUL operand. We always have VLMUL value for
   5079    all RVV instructions that have VTYPE OP. */
   5080 enum vlmul_type
   5081 get_vlmul (rtx_insn *rinsn)
   5082 {
   5083   return (enum vlmul_type) get_attr_vlmul (rinsn);
   5084 }
   5085 
   5086 /* Count the number of REGNO in RINSN.  */
   5087 int
   5088 count_regno_occurrences (rtx_insn *rinsn, unsigned int regno)
   5089 {
   5090   int count = 0;
   5091   extract_insn (rinsn);
   5092   for (int i = 0; i < recog_data.n_operands; i++)
   5093     if (refers_to_regno_p (regno, recog_data.operand[i]))
   5094       count++;
   5095   return count;
   5096 }
   5097 
   5098 /* Return true if the OP can be directly broadcasted.  */
   5099 bool
   5100 can_be_broadcasted_p (rtx op)
   5101 {
   5102   machine_mode mode = GET_MODE (op);
   5103   /* We don't allow RA (register allocation) reload generate
   5104     (vec_duplicate:DI reg) in RV32 system wheras we allow
   5105     (vec_duplicate:DI mem) in RV32 system.  */
   5106   if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode)
   5107       && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode))
   5108       && !satisfies_constraint_Wdm (op))
   5109     return false;
   5110 
   5111   if (satisfies_constraint_vl (op) || register_operand (op, mode)
   5112       || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode)))
   5113     return true;
   5114 
   5115   return can_create_pseudo_p () && nonmemory_operand (op, mode);
   5116 }
   5117 
   5118 void
   5119 emit_vec_extract (rtx target, rtx src, rtx index)
   5120 {
   5121   machine_mode vmode = GET_MODE (src);
   5122   machine_mode smode = GET_MODE (target);
   5123   class expand_operand ops[3];
   5124   enum insn_code icode
   5125     = convert_optab_handler (vec_extract_optab, vmode, smode);
   5126   gcc_assert (icode != CODE_FOR_nothing);
   5127   create_output_operand (&ops[0], target, smode);
   5128   ops[0].target = 1;
   5129   create_input_operand (&ops[1], src, vmode);
   5130 
   5131   poly_int64 val;
   5132   if (poly_int_rtx_p (index, &val))
   5133     create_integer_operand (&ops[2], val);
   5134   else
   5135     create_input_operand (&ops[2], index, Pmode);
   5136 
   5137   expand_insn (icode, 3, ops);
   5138   if (ops[0].value != target)
   5139     emit_move_insn (target, ops[0].value);
   5140 }
   5141 
   5142 /* Return true if the offset mode is valid mode that we use for gather/scatter
   5143    autovectorization.  */
   5144 bool
   5145 gather_scatter_valid_offset_p (machine_mode mode)
   5146 {
   5147   /* If the element size of offset mode is already >= Pmode size,
   5148      we don't need any extensions.  */
   5149   if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD))
   5150     return true;
   5151 
   5152   /* Since we are very likely extend the offset mode into vector Pmode,
   5153      Disable gather/scatter autovectorization if we can't extend the offset
   5154      mode into vector Pmode.  */
   5155   if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ())
   5156     return false;
   5157   return true;
   5158 }
   5159 
   5160 /* Implement TARGET_ESTIMATED_POLY_VALUE.
   5161    Look into the tuning structure for an estimate.
   5162    KIND specifies the type of requested estimate: min, max or likely.
   5163    For cores with a known VLA width all three estimates are the same.
   5164    For generic VLA tuning we want to distinguish the maximum estimate from
   5165    the minimum and likely ones.
   5166    The likely estimate is the same as the minimum in that case to give a
   5167    conservative behavior of auto-vectorizing with VLA when it is a win
   5168    even for VLA vectorization.
   5169    When VLA width information is available VAL.coeffs[1] is multiplied by
   5170    the number of VLA chunks over the initial VLS bits.  */
   5171 HOST_WIDE_INT
   5172 estimated_poly_value (poly_int64 val, unsigned int kind)
   5173 {
   5174   unsigned int width_source
   5175     = BITS_PER_RISCV_VECTOR.is_constant ()
   5176 	? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant ()
   5177 	: (unsigned int) RVV_VECTOR_BITS_SCALABLE;
   5178 
   5179   /* If there is no core-specific information then the minimum and likely
   5180      values are based on TARGET_MIN_VLEN vectors and the maximum is based on
   5181      the architectural maximum of 65536 bits.  */
   5182   unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1;
   5183   if (width_source == RVV_VECTOR_BITS_SCALABLE)
   5184     switch (kind)
   5185       {
   5186       case POLY_VALUE_MIN:
   5187       case POLY_VALUE_LIKELY:
   5188 	return val.coeffs[0];
   5189 
   5190       case POLY_VALUE_MAX:
   5191 	return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes;
   5192       }
   5193 
   5194   /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the
   5195      lowest as likely.  This could be made more general if future -mtune
   5196      options need it to be.  */
   5197   if (kind == POLY_VALUE_MAX)
   5198     width_source = 1 << floor_log2 (width_source);
   5199   else
   5200     width_source = least_bit_hwi (width_source);
   5201 
   5202   /* If the core provides width information, use that.  */
   5203   HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN;
   5204   return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN;
   5205 }
   5206 
   5207 /* Return true it is whole register-register move.  */
   5208 bool
   5209 whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index)
   5210 {
   5211   /* An operation is a whole-register move if either
   5212      (1) Its vlmax operand equals VLMAX
   5213      (2) Its vl operand equals the number of units of its mode.  */
   5214   if (register_operand (ops[0], mode)
   5215       && register_operand (ops[3], mode)
   5216       && satisfies_constraint_vu (ops[2])
   5217       && satisfies_constraint_Wc1 (ops[1]))
   5218     {
   5219       if (INTVAL (ops[avl_type_index]) == VLMAX)
   5220 	return true;
   5221       /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32
   5222 	 into NON-VLMAX with LEN = NUNITS.  */
   5223       else if (CONST_INT_P (ops[4])
   5224 	       && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode)))
   5225 	return true;
   5226     }
   5227   return false;
   5228 }
   5229 
   5230 /* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.  */
   5231 bool
   5232 splat_to_scalar_move_p (rtx *ops)
   5233 {
   5234   return satisfies_constraint_Wc1 (ops[1])
   5235 	 && satisfies_constraint_vu (ops[2])
   5236 	 && !MEM_P (ops[3])
   5237 	 && satisfies_constraint_c01 (ops[4])
   5238 	 && INTVAL (ops[7]) == NONVLMAX
   5239 	 && known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3])));
   5240 }
   5241 
   5242 } // namespace riscv_vector
   5243