1 /* Subroutines used for code generation for RISC-V 'V' Extension for 2 GNU compiler. 3 Copyright (C) 2022-2024 Free Software Foundation, Inc. 4 Contributed by Juzhe Zhong (juzhe.zhong (at) rivai.ai), RiVAI Technologies Ltd. 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it 9 under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 3, or (at your option) 11 any later version. 12 13 GCC is distributed in the hope that it will be useful, but 14 WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #define IN_TARGET_CODE 1 23 24 /* We have a maximum of 11 operands for RVV instruction patterns according to 25 the vector.md. */ 26 #define RVV_INSN_OPERANDS_MAX 11 27 28 #include "config.h" 29 #include "system.h" 30 #include "coretypes.h" 31 #include "tm.h" 32 #include "backend.h" 33 #include "rtl.h" 34 #include "insn-config.h" 35 #include "insn-attr.h" 36 #include "recog.h" 37 #include "alias.h" 38 #include "tree.h" 39 #include "stringpool.h" 40 #include "attribs.h" 41 #include "explow.h" 42 #include "memmodel.h" 43 #include "emit-rtl.h" 44 #include "tm_p.h" 45 #include "target.h" 46 #include "targhooks.h" 47 #include "expr.h" 48 #include "optabs.h" 49 #include "tm-constrs.h" 50 #include "rtx-vector-builder.h" 51 #include "targhooks.h" 52 #include "predict.h" 53 54 using namespace riscv_vector; 55 56 namespace riscv_vector { 57 58 /* Return true if NUNTIS <=31 so that we can use immediate AVL in vsetivli. */ 59 bool 60 imm_avl_p (machine_mode mode) 61 { 62 poly_uint64 nunits = GET_MODE_NUNITS (mode); 63 64 return nunits.is_constant () 65 /* The vsetivli can only hold register 0~31. 
*/ 66 ? (IN_RANGE (nunits.to_constant (), 0, 31)) 67 /* Only allowed in VLS-VLMAX mode. */ 68 : false; 69 } 70 71 /* Return true if LEN is equal to NUNITS that out of the range [0, 31]. */ 72 static bool 73 is_vlmax_len_p (machine_mode mode, rtx len) 74 { 75 poly_int64 value; 76 return poly_int_rtx_p (len, &value) 77 && known_eq (value, GET_MODE_NUNITS (mode)); 78 } 79 80 /* Helper functions for insn_flags && insn_types */ 81 82 /* Return true if caller need pass mask operand for insn pattern with 83 INSN_FLAGS. */ 84 85 static bool 86 need_mask_operand_p (unsigned insn_flags) 87 { 88 return (insn_flags & HAS_MASK_P) 89 && !(insn_flags & (USE_ONE_TRUE_MASK_P | USE_ALL_TRUES_MASK_P)); 90 } 91 92 template <int MAX_OPERANDS> class insn_expander 93 { 94 public: 95 insn_expander () = delete; 96 97 insn_expander (unsigned insn_flags, bool vlmax_p) 98 : m_insn_flags (insn_flags), m_opno (0), m_vlmax_p (vlmax_p), 99 m_vl_op (NULL_RTX) 100 { 101 check_insn_flags (); 102 } 103 104 void check_insn_flags () const 105 { 106 if (m_insn_flags & USE_ONE_TRUE_MASK_P) 107 /* USE_ONE_TRUE_MASK_P is dependent on HAS_MASK_P. */ 108 gcc_assert ((m_insn_flags & HAS_MASK_P)); 109 110 if (m_insn_flags & USE_ALL_TRUES_MASK_P) 111 /* USE_ALL_TRUES_MASK_P is dependent on HAS_MASK_P. */ 112 gcc_assert ((m_insn_flags & HAS_MASK_P)); 113 114 /* USE_ONE_TRUE_MASK_P and USE_ALL_TRUES_MASK_P are mutually exclusive. */ 115 gcc_assert (!((m_insn_flags & USE_ONE_TRUE_MASK_P) 116 && (m_insn_flags & USE_ALL_TRUES_MASK_P))); 117 118 if (m_insn_flags & USE_VUNDEF_MERGE_P) 119 /* USE_VUNDEF_MERGE_P is dependent on HAS_MERGE_P. */ 120 gcc_assert ((m_insn_flags & HAS_MERGE_P)); 121 122 /* TU_POLICY_P and TDEFAULT_POLICY_P are mutually exclusive. */ 123 gcc_assert ( 124 !((m_insn_flags & TU_POLICY_P) && (m_insn_flags & TDEFAULT_POLICY_P))); 125 126 /* MU_POLICY_P and MDEFAULT_POLICY_P are mutually exclusive. 
*/ 127 gcc_assert ( 128 !((m_insn_flags & MU_POLICY_P) && (m_insn_flags & MDEFAULT_POLICY_P))); 129 130 /* NULLARY_OP_P, UNARY_OP_P, BINARY_OP_P, TERNARY_OP_P are mutually 131 exclusive. */ 132 gcc_assert ( 133 !((m_insn_flags & NULLARY_OP_P) 134 && ((m_insn_flags & UNARY_OP_P) || (m_insn_flags & BINARY_OP_P) 135 || (m_insn_flags & TERNARY_OP_P)))); 136 gcc_assert ( 137 !((m_insn_flags & UNARY_OP_P) 138 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & BINARY_OP_P) 139 || (m_insn_flags & TERNARY_OP_P)))); 140 gcc_assert ( 141 !((m_insn_flags & BINARY_OP_P) 142 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P) 143 || (m_insn_flags & TERNARY_OP_P)))); 144 gcc_assert ( 145 !((m_insn_flags & TERNARY_OP_P) 146 && ((m_insn_flags & NULLARY_OP_P) || (m_insn_flags & UNARY_OP_P) 147 || (m_insn_flags & BINARY_OP_P)))); 148 } 149 150 void set_vl (rtx vl) { m_vl_op = vl; } 151 152 void add_output_operand (rtx x, machine_mode mode) 153 { 154 create_output_operand (&m_ops[m_opno++], x, mode); 155 gcc_assert (m_opno <= MAX_OPERANDS); 156 } 157 void add_input_operand (rtx x, machine_mode mode) 158 { 159 create_input_operand (&m_ops[m_opno++], x, mode); 160 gcc_assert (m_opno <= MAX_OPERANDS); 161 } 162 void add_all_one_mask_operand (machine_mode mask_mode) 163 { 164 add_input_operand (CONSTM1_RTX (mask_mode), mask_mode); 165 } 166 void add_first_one_true_mask_operand (machine_mode mask_mode) 167 { 168 add_input_operand (gen_scalar_move_mask (mask_mode), mask_mode); 169 } 170 void add_vundef_operand (machine_mode dest_mode) 171 { 172 add_input_operand (RVV_VUNDEF (dest_mode), dest_mode); 173 } 174 void add_policy_operand () 175 { 176 if (m_insn_flags & TU_POLICY_P) 177 { 178 rtx tail_policy_rtx = gen_int_mode (TAIL_UNDISTURBED, Pmode); 179 add_input_operand (tail_policy_rtx, Pmode); 180 } 181 else if (m_insn_flags & TDEFAULT_POLICY_P) 182 { 183 rtx tail_policy_rtx = gen_int_mode (get_prefer_tail_policy (), Pmode); 184 add_input_operand (tail_policy_rtx, Pmode); 185 
} 186 187 if (m_insn_flags & MU_POLICY_P) 188 { 189 rtx mask_policy_rtx = gen_int_mode (MASK_UNDISTURBED, Pmode); 190 add_input_operand (mask_policy_rtx, Pmode); 191 } 192 else if (m_insn_flags & MDEFAULT_POLICY_P) 193 { 194 rtx mask_policy_rtx = gen_int_mode (get_prefer_mask_policy (), Pmode); 195 add_input_operand (mask_policy_rtx, Pmode); 196 } 197 } 198 void add_avl_type_operand (avl_type type) 199 { 200 add_input_operand (gen_int_mode (type, Pmode), Pmode); 201 } 202 203 void 204 add_rounding_mode_operand (enum floating_point_rounding_mode rounding_mode) 205 { 206 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode); 207 add_input_operand (frm_rtx, Pmode); 208 } 209 210 void 211 add_rounding_mode_operand (enum fixed_point_rounding_mode rounding_mode) 212 { 213 rtx frm_rtx = gen_int_mode (rounding_mode, Pmode); 214 add_input_operand (frm_rtx, Pmode); 215 } 216 217 /* Return the vtype mode based on insn_flags. 218 vtype mode mean the mode vsetvl insn set. */ 219 machine_mode 220 get_vtype_mode (rtx *ops) 221 { 222 machine_mode vtype_mode; 223 if (m_insn_flags & VTYPE_MODE_FROM_OP1_P) 224 vtype_mode = GET_MODE (ops[1]); 225 else 226 vtype_mode = GET_MODE (ops[0]); 227 return vtype_mode; 228 } 229 230 void emit_insn (enum insn_code icode, rtx *ops) 231 { 232 int opno = 0; 233 int num_ops; 234 /* It's true if any operand is memory operand. */ 235 bool any_mem_p = false; 236 237 machine_mode vtype_mode = get_vtype_mode (ops); 238 machine_mode mask_mode = get_mask_mode (vtype_mode); 239 240 /* Add dest operand. */ 241 if (m_insn_flags & HAS_DEST_P) 242 { 243 rtx op = ops[opno++]; 244 any_mem_p |= MEM_P (op); 245 add_output_operand (op, GET_MODE (op)); 246 } 247 248 /* Add mask operand. 
*/ 249 if (m_insn_flags & USE_ONE_TRUE_MASK_P) 250 add_first_one_true_mask_operand (mask_mode); 251 else if (m_insn_flags & USE_ALL_TRUES_MASK_P) 252 add_all_one_mask_operand (mask_mode); 253 else if (m_insn_flags & HAS_MASK_P) 254 { 255 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode; 256 gcc_assert (mode != VOIDmode); 257 add_input_operand (ops[opno++], mode); 258 } 259 260 /* Add merge operand. */ 261 if (m_insn_flags & USE_VUNDEF_MERGE_P) 262 /* Same as dest operand. */ 263 add_vundef_operand (GET_MODE (ops[0])); 264 else if (m_insn_flags & HAS_MERGE_P) 265 { 266 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode; 267 gcc_assert (mode != VOIDmode); 268 add_input_operand (ops[opno++], mode); 269 } 270 271 if (m_insn_flags & NULLARY_OP_P) 272 num_ops = 0; 273 else if (m_insn_flags & UNARY_OP_P) 274 num_ops = 1; 275 else if (m_insn_flags & BINARY_OP_P) 276 num_ops = 2; 277 else if (m_insn_flags & TERNARY_OP_P) 278 num_ops = 3; 279 else 280 gcc_unreachable (); 281 282 /* Add the remain operands. */ 283 for (; num_ops; num_ops--, opno++) 284 { 285 any_mem_p |= MEM_P (ops[opno]); 286 machine_mode mode = insn_data[(int) icode].operand[m_opno].mode; 287 /* 'create_input_operand doesn't allow VOIDmode. 288 According to vector.md, we may have some patterns that do not have 289 explicit machine mode specifying the operand. Such operands are 290 always Pmode. */ 291 if (mode == VOIDmode) 292 mode = Pmode; 293 else 294 /* Early assertion ensures same mode since maybe_legitimize_operand 295 will check this. */ 296 gcc_assert (GET_MODE (ops[opno]) == VOIDmode 297 || GET_MODE (ops[opno]) == mode); 298 299 add_input_operand (ops[opno], mode); 300 } 301 302 /* Add vl operand. */ 303 rtx len = m_vl_op; 304 bool vls_p = false; 305 if (m_vlmax_p) 306 { 307 if (riscv_v_ext_vls_mode_p (vtype_mode)) 308 { 309 /* VLS modes always set VSETVL by 310 "vsetvl zero, rs1/imm". 
*/ 311 poly_uint64 nunits = GET_MODE_NUNITS (vtype_mode); 312 len = gen_int_mode (nunits, Pmode); 313 vls_p = true; 314 } 315 else if (can_create_pseudo_p ()) 316 { 317 len = gen_reg_rtx (Pmode); 318 emit_vlmax_vsetvl (vtype_mode, len); 319 } 320 } 321 322 gcc_assert (len != NULL_RTX); 323 add_input_operand (len, Pmode); 324 325 /* Add tail and mask policy operands. */ 326 add_policy_operand (); 327 328 /* Add avl_type operand. */ 329 add_avl_type_operand ( 330 vls_p ? avl_type::VLS 331 : (m_vlmax_p ? avl_type::VLMAX : avl_type::NONVLMAX)); 332 333 /* Add rounding mode operand. */ 334 if (m_insn_flags & FRM_DYN_P) 335 add_rounding_mode_operand (FRM_DYN); 336 else if (m_insn_flags & FRM_RUP_P) 337 add_rounding_mode_operand (FRM_RUP); 338 else if (m_insn_flags & FRM_RDN_P) 339 add_rounding_mode_operand (FRM_RDN); 340 else if (m_insn_flags & FRM_RMM_P) 341 add_rounding_mode_operand (FRM_RMM); 342 else if (m_insn_flags & FRM_RNE_P) 343 add_rounding_mode_operand (FRM_RNE); 344 else if (m_insn_flags & VXRM_RNU_P) 345 add_rounding_mode_operand (VXRM_RNU); 346 else if (m_insn_flags & VXRM_RDN_P) 347 add_rounding_mode_operand (VXRM_RDN); 348 349 gcc_assert (insn_data[(int) icode].n_operands == m_opno); 350 expand (icode, any_mem_p); 351 } 352 353 void expand (enum insn_code icode, bool temporary_volatile_p = false) 354 { 355 if (temporary_volatile_p) 356 { 357 temporary_volatile_ok v (true); 358 expand_insn (icode, m_opno, m_ops); 359 } 360 else 361 expand_insn (icode, m_opno, m_ops); 362 } 363 364 private: 365 unsigned m_insn_flags; 366 int m_opno; 367 bool m_vlmax_p; 368 rtx m_vl_op; 369 expand_operand m_ops[MAX_OPERANDS]; 370 }; 371 372 /* Emit an RVV insn with a vector length that equals the number of units of the 373 vector mode. For VLA modes this corresponds to VLMAX. 374 375 Unless the vector length can be encoded in the vsetivl[i] instruction this 376 function must only be used as long as we can create pseudo registers. 
This is 377 because it will set a pseudo register to VLMAX using vsetvl and use this as 378 definition for the vector length. */ 379 void 380 emit_vlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops) 381 { 382 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true); 383 gcc_assert (can_create_pseudo_p () || imm_avl_p (e.get_vtype_mode (ops))); 384 385 e.emit_insn ((enum insn_code) icode, ops); 386 } 387 388 /* Like emit_vlmax_insn but must only be used when we cannot create pseudo 389 registers anymore. This function, however, takes a predefined vector length 390 from the value in VL. */ 391 void 392 emit_vlmax_insn_lra (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl) 393 { 394 gcc_assert (!can_create_pseudo_p ()); 395 machine_mode mode = GET_MODE (ops[0]); 396 397 if (imm_avl_p (mode)) 398 { 399 /* Even though VL is a real hardreg already allocated since 400 it is post-RA now, we still gain benefits that we emit 401 vsetivli zero, imm instead of vsetvli VL, zero which is 402 we can be more flexible in post-RA instruction scheduling. */ 403 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false); 404 e.set_vl (gen_int_mode (GET_MODE_NUNITS (mode), Pmode)); 405 e.emit_insn ((enum insn_code) icode, ops); 406 } 407 else 408 { 409 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, true); 410 e.set_vl (vl); 411 e.emit_insn ((enum insn_code) icode, ops); 412 } 413 } 414 415 /* Emit an RVV insn with a predefined vector length. Contrary to 416 emit_vlmax_insn the instruction's vector length is not deduced from its mode 417 but taken from the value in VL. 
*/ 418 void 419 emit_nonvlmax_insn (unsigned icode, unsigned insn_flags, rtx *ops, rtx vl) 420 { 421 insn_expander<RVV_INSN_OPERANDS_MAX> e (insn_flags, false); 422 e.set_vl (vl); 423 e.emit_insn ((enum insn_code) icode, ops); 424 } 425 426 class rvv_builder : public rtx_vector_builder 427 { 428 public: 429 rvv_builder () : rtx_vector_builder () {} 430 rvv_builder (machine_mode mode, unsigned int npatterns, 431 unsigned int nelts_per_pattern) 432 : rtx_vector_builder (mode, npatterns, nelts_per_pattern) 433 { 434 m_inner_mode = GET_MODE_INNER (mode); 435 m_inner_bits_size = GET_MODE_BITSIZE (m_inner_mode); 436 m_inner_bytes_size = GET_MODE_SIZE (m_inner_mode); 437 m_mask_mode = get_mask_mode (mode); 438 439 gcc_assert ( 440 int_mode_for_size (inner_bits_size (), 0).exists (&m_inner_int_mode)); 441 m_int_mode 442 = get_vector_mode (m_inner_int_mode, GET_MODE_NUNITS (mode)).require (); 443 } 444 445 bool can_duplicate_repeating_sequence_p (); 446 bool is_repeating_sequence (); 447 rtx get_merged_repeating_sequence (); 448 449 bool repeating_sequence_use_merge_profitable_p (); 450 bool combine_sequence_use_slideup_profitable_p (); 451 bool combine_sequence_use_merge_profitable_p (); 452 rtx get_merge_scalar_mask (unsigned int, machine_mode) const; 453 454 bool single_step_npatterns_p () const; 455 bool npatterns_all_equal_p () const; 456 bool interleaved_stepped_npatterns_p () const; 457 bool npatterns_vid_diff_repeated_p () const; 458 459 machine_mode new_mode () const { return m_new_mode; } 460 scalar_mode inner_mode () const { return m_inner_mode; } 461 scalar_int_mode inner_int_mode () const { return m_inner_int_mode; } 462 machine_mode mask_mode () const { return m_mask_mode; } 463 machine_mode int_mode () const { return m_int_mode; } 464 unsigned int inner_bits_size () const { return m_inner_bits_size; } 465 unsigned int inner_bytes_size () const { return m_inner_bytes_size; } 466 467 private: 468 scalar_mode m_inner_mode; 469 scalar_int_mode m_inner_int_mode; 
470 machine_mode m_new_mode; 471 scalar_int_mode m_new_inner_mode; 472 machine_mode m_mask_mode; 473 machine_mode m_int_mode; 474 unsigned int m_inner_bits_size; 475 unsigned int m_inner_bytes_size; 476 }; 477 478 /* Return true if the vector duplicated by a super element which is the fusion 479 of consecutive elements. 480 481 v = { a, b, a, b } super element = ab, v = { ab, ab } */ 482 bool 483 rvv_builder::can_duplicate_repeating_sequence_p () 484 { 485 poly_uint64 new_size = exact_div (full_nelts (), npatterns ()); 486 unsigned int new_inner_size = m_inner_bits_size * npatterns (); 487 if (m_inner_mode == Pmode 488 || !int_mode_for_size (new_inner_size, 0).exists (&m_new_inner_mode) 489 || GET_MODE_SIZE (m_new_inner_mode) > UNITS_PER_WORD 490 || !get_vector_mode (m_new_inner_mode, new_size).exists (&m_new_mode)) 491 return false; 492 if (full_nelts ().is_constant ()) 493 return repeating_sequence_p (0, full_nelts ().to_constant (), npatterns ()); 494 return nelts_per_pattern () == 1; 495 } 496 497 /* Return true if the vector is a simple sequence with one pattern and all 498 elements the same. */ 499 bool 500 rvv_builder::is_repeating_sequence () 501 { 502 if (npatterns () > 1) 503 return false; 504 if (full_nelts ().is_constant ()) 505 return repeating_sequence_p (0, full_nelts ().to_constant (), 1); 506 return nelts_per_pattern () == 1; 507 } 508 509 /* Return true if it is a repeating sequence that using 510 merge approach has better codegen than using default 511 approach (slide1down). 512 513 Sequence A: 514 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} 515 516 nelts = 16 517 npatterns = 2 518 519 for merging a we need mask 101010.... 520 for merging b we need mask 010101.... 521 522 Foreach element in the npattern, we need to build a mask in scalar register. 523 Mostely we need 3 instructions (aka COST = 3), which is consist of 2 scalar 524 instruction and 1 scalar move to v0 register. Finally we need vector merge 525 to merge them. 
526 527 lui a5, #imm 528 add a5, #imm 529 vmov.s.x v0, a5 530 vmerge.vxm v9, v9, a1, v0 531 532 So the overall (roughly) COST of Sequence A = (3 + 1) * npatterns = 8. 533 If we use slide1down, the COST = nelts = 16 > 8 (COST of merge). 534 So return true in this case as it is profitable. 535 536 Sequence B: 537 {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h} 538 539 nelts = 16 540 npatterns = 8 541 542 COST of merge approach = (3 + 1) * npatterns = 24 543 COST of slide1down approach = nelts = 16 544 Return false in this case as it is NOT profitable in merge approach. 545 */ 546 bool 547 rvv_builder::repeating_sequence_use_merge_profitable_p () 548 { 549 if (inner_bytes_size () > UNITS_PER_WORD) 550 return false; 551 552 unsigned int nelts = full_nelts ().to_constant (); 553 554 if (!repeating_sequence_p (0, nelts, npatterns ())) 555 return false; 556 557 unsigned int merge_cost = 1; 558 unsigned int build_merge_mask_cost = 3; 559 unsigned int slide1down_cost = nelts; 560 561 return (build_merge_mask_cost + merge_cost) * npatterns () < slide1down_cost; 562 } 563 564 /* Return true if it's worthwhile to use slideup combine 2 vectors. */ 565 bool 566 rvv_builder::combine_sequence_use_slideup_profitable_p () 567 { 568 int nelts = full_nelts ().to_constant (); 569 int leading_ndups = this->count_dups (0, nelts - 1, 1); 570 int trailing_ndups = this->count_dups (nelts - 1, -1, -1); 571 572 /* ??? Current heuristic we do is we do combine 2 vectors 573 by slideup when: 574 1. # of leading same elements is equal to # of trailing same elements. 575 2. Both of above are equal to nelts / 2. 576 Otherwise, it is not profitable. */ 577 return leading_ndups == trailing_ndups && trailing_ndups == nelts / 2; 578 } 579 580 /* Return true if it's worthwhile to use merge combine vector with a scalar. 
*/ 581 bool 582 rvv_builder::combine_sequence_use_merge_profitable_p () 583 { 584 int nelts = full_nelts ().to_constant (); 585 int leading_ndups = this->count_dups (0, nelts - 1, 1); 586 int trailing_ndups = this->count_dups (nelts - 1, -1, -1); 587 int nregs = riscv_get_v_regno_alignment (int_mode ()); 588 589 if (leading_ndups + trailing_ndups != nelts) 590 return false; 591 592 /* Leading elements num > 255 which exceeds the maximum value 593 of QImode, we will need to use HImode. */ 594 machine_mode mode; 595 if (leading_ndups > 255 || nregs > 2) 596 { 597 if (!get_vector_mode (HImode, nelts).exists (&mode)) 598 return false; 599 /* We will need one more AVL/VL toggling vsetvl instruction. */ 600 return leading_ndups > 4 && trailing_ndups > 4; 601 } 602 603 /* { a, a, a, b, b, ... , b } and { b, b, b, a, a, ... , a } 604 consume 3 slide instructions. */ 605 return leading_ndups > 3 && trailing_ndups > 3; 606 } 607 608 /* Merge the repeating sequence into a single element and return the RTX. */ 609 rtx 610 rvv_builder::get_merged_repeating_sequence () 611 { 612 scalar_int_mode mode = Pmode; 613 rtx target = gen_reg_rtx (mode); 614 emit_move_insn (target, const0_rtx); 615 rtx imm = gen_int_mode ((1ULL << m_inner_bits_size) - 1, mode); 616 /* { a, b, a, b }: Generate duplicate element = b << bits | a. 
*/ 617 for (unsigned int i = 0; i < npatterns (); i++) 618 { 619 unsigned int loc = m_inner_bits_size * i; 620 rtx shift = gen_int_mode (loc, mode); 621 rtx ele = gen_lowpart (mode, elt (i)); 622 rtx tmp = expand_simple_binop (mode, AND, ele, imm, NULL_RTX, false, 623 OPTAB_DIRECT); 624 rtx tmp2 = expand_simple_binop (mode, ASHIFT, tmp, shift, NULL_RTX, false, 625 OPTAB_DIRECT); 626 rtx tmp3 = expand_simple_binop (mode, IOR, tmp2, target, NULL_RTX, false, 627 OPTAB_DIRECT); 628 emit_move_insn (target, tmp3); 629 } 630 if (GET_MODE_SIZE (m_new_inner_mode) < UNITS_PER_WORD) 631 return gen_lowpart (m_new_inner_mode, target); 632 return target; 633 } 634 635 /* Get the mask for merge approach. 636 637 Consider such following case: 638 {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b} 639 To merge "a", the mask should be 1010.... 640 To merge "b", the mask should be 0101.... 641 */ 642 rtx 643 rvv_builder::get_merge_scalar_mask (unsigned int index_in_pattern, 644 machine_mode inner_mode) const 645 { 646 unsigned HOST_WIDE_INT mask = 0; 647 unsigned HOST_WIDE_INT base_mask = (1ULL << index_in_pattern); 648 /* Here we construct a mask pattern that will later be broadcast 649 to a vector register. The maximum broadcast size for vmv.v.x/vmv.s.x 650 is determined by the length of a vector element (ELEN) and not by 651 XLEN so make sure we do not exceed it. One example is -march=zve32* 652 which mandates ELEN == 32 but can be combined with -march=rv64 653 with XLEN == 64. */ 654 unsigned int elen = TARGET_VECTOR_ELEN_64 ? 64 : 32; 655 656 gcc_assert (elen % npatterns () == 0); 657 658 int limit = elen / npatterns (); 659 660 for (int i = 0; i < limit; i++) 661 mask |= base_mask << (i * npatterns ()); 662 663 return gen_int_mode (mask, inner_mode); 664 } 665 666 /* Return true if the variable-length vector is single step. 667 Single step means step all patterns in NPATTERNS are equal. 668 Consider this following case: 669 670 CASE 1: NPATTERNS = 2, NELTS_PER_PATTERN = 3. 
671 { 0, 2, 2, 4, 4, 6, ... } 672 First pattern: step1 = 2 - 0 = 2 673 step2 = 4 - 2 = 2 674 Second pattern: step1 = 4 - 2 = 2 675 step2 = 6 - 4 = 2 676 Since all steps of NPATTERNS are equal step = 2. 677 Return true in this case. 678 679 CASE 2: NPATTERNS = 2, NELTS_PER_PATTERN = 3. 680 { 0, 1, 2, 4, 4, 7, ... } 681 First pattern: step1 = 2 - 0 = 2 682 step2 = 4 - 2 = 2 683 Second pattern: step1 = 4 - 1 = 3 684 step2 = 7 - 4 = 3 685 Since not all steps are equal, return false. */ 686 bool 687 rvv_builder::single_step_npatterns_p () const 688 { 689 if (nelts_per_pattern () != 3) 690 return false; 691 692 poly_int64 step 693 = rtx_to_poly_int64 (elt (npatterns ())) - rtx_to_poly_int64 (elt (0)); 694 for (unsigned int i = 0; i < npatterns (); i++) 695 { 696 poly_int64 ele0 = rtx_to_poly_int64 (elt (i)); 697 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i)); 698 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i)); 699 poly_int64 diff1 = ele1 - ele0; 700 poly_int64 diff2 = ele2 - ele1; 701 if (maybe_ne (step, diff1) || maybe_ne (step, diff2)) 702 return false; 703 } 704 return true; 705 } 706 707 /* Return true if the diff between const vector and vid sequence 708 is repeated. For example as below cases: 709 The diff means the const vector - vid. 710 CASE 1: 711 CONST VECTOR: {3, 2, 1, 0, 7, 6, 5, 4, ... } 712 VID : {0, 1, 2, 3, 4, 5, 6, 7, ... } 713 DIFF(MINUS) : {3, 1,-1,-3, 3, 1,-1,-3, ... } 714 The diff sequence {3, 1,-1,-3} is repeated in the npattern and 715 return TRUE for case 1. 716 717 CASE 2: 718 CONST VECTOR: {-4, 4,-3, 5,-2, 6,-1, 7, ...} 719 VID : { 0, 1, 2, 3, 4, 5, 6, 7, ... } 720 DIFF(MINUS) : {-4, 3,-5,-2,-6, 1,-7, 0, ... } 721 The diff sequence {-4, 3} is not repated in the npattern and 722 return FALSE for case 2. 
*/ 723 bool 724 rvv_builder::npatterns_vid_diff_repeated_p () const 725 { 726 if (nelts_per_pattern () != 3) 727 return false; 728 else if (npatterns () == 0) 729 return false; 730 731 for (unsigned i = 0; i < npatterns (); i++) 732 { 733 poly_int64 diff_0 = rtx_to_poly_int64 (elt (i)) - i; 734 poly_int64 diff_1 735 = rtx_to_poly_int64 (elt (npatterns () + i)) - npatterns () - i; 736 737 if (maybe_ne (diff_0, diff_1)) 738 return false; 739 } 740 741 return true; 742 } 743 744 /* Return true if the permutation consists of two 745 interleaved patterns with a constant step each. 746 TODO: We currently only support NPATTERNS = 2. */ 747 bool 748 rvv_builder::interleaved_stepped_npatterns_p () const 749 { 750 if (npatterns () != 2 || nelts_per_pattern () != 3) 751 return false; 752 for (unsigned int i = 0; i < npatterns (); i++) 753 { 754 poly_int64 ele0 = rtx_to_poly_int64 (elt (i)); 755 poly_int64 ele1 = rtx_to_poly_int64 (elt (npatterns () + i)); 756 poly_int64 ele2 = rtx_to_poly_int64 (elt (npatterns () * 2 + i)); 757 poly_int64 diff1 = ele1 - ele0; 758 poly_int64 diff2 = ele2 - ele1; 759 if (maybe_ne (diff1, diff2)) 760 return false; 761 } 762 return true; 763 } 764 765 /* Return true if all elements of NPATTERNS are equal. 766 767 E.g. NPATTERNS = 4: 768 { 2, 2, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 16, 16, 16, 16, ... } 769 E.g. NPATTERNS = 8: 770 { 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 8, 8, 8, 8, ... } 771 We only check ele[0] ~ ele[NPATTERNS - 1] whether they are the same. 772 We don't need to check the elements[n] with n >= NPATTERNS since 773 they don't belong to the same pattern. 
774 */ 775 bool 776 rvv_builder::npatterns_all_equal_p () const 777 { 778 poly_int64 ele0 = rtx_to_poly_int64 (elt (0)); 779 for (unsigned int i = 1; i < npatterns (); i++) 780 { 781 poly_int64 ele = rtx_to_poly_int64 (elt (i)); 782 if (!known_eq (ele, ele0)) 783 return false; 784 } 785 return true; 786 } 787 788 static unsigned 789 get_sew (machine_mode mode) 790 { 791 unsigned int sew = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL 792 ? 8 793 : GET_MODE_BITSIZE (GET_MODE_INNER (mode)); 794 return sew; 795 } 796 797 /* Return true if X is a const_vector with all duplicate elements, which is in 798 the range between MINVAL and MAXVAL. */ 799 bool 800 const_vec_all_same_in_range_p (rtx x, HOST_WIDE_INT minval, 801 HOST_WIDE_INT maxval) 802 { 803 rtx elt; 804 return (const_vec_duplicate_p (x, &elt) && CONST_INT_P (elt) 805 && IN_RANGE (INTVAL (elt), minval, maxval)); 806 } 807 808 /* Return true if VEC is a constant in which every element is in the range 809 [MINVAL, MAXVAL]. The elements do not need to have the same value. 810 811 This function also exists in aarch64, we may unify it in middle-end in the 812 future. */ 813 814 static bool 815 const_vec_all_in_range_p (rtx vec, poly_int64 minval, poly_int64 maxval) 816 { 817 if (!CONST_VECTOR_P (vec) 818 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT) 819 return false; 820 821 int nunits; 822 if (!CONST_VECTOR_STEPPED_P (vec)) 823 nunits = const_vector_encoded_nelts (vec); 824 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits)) 825 return false; 826 827 for (int i = 0; i < nunits; i++) 828 { 829 rtx vec_elem = CONST_VECTOR_ELT (vec, i); 830 poly_int64 value; 831 if (!poly_int_rtx_p (vec_elem, &value) 832 || maybe_lt (value, minval) 833 || maybe_gt (value, maxval)) 834 return false; 835 } 836 return true; 837 } 838 839 /* Return a const vector of VAL. The VAL can be either const_int or 840 const_poly_int. 
*/ 841 842 static rtx 843 gen_const_vector_dup (machine_mode mode, poly_int64 val) 844 { 845 scalar_mode smode = GET_MODE_INNER (mode); 846 rtx c = gen_int_mode (val, smode); 847 if (!val.is_constant () && GET_MODE_SIZE (smode) > GET_MODE_SIZE (Pmode)) 848 { 849 /* When VAL is const_poly_int value, we need to explicitly broadcast 850 it into a vector using RVV broadcast instruction. */ 851 return expand_vector_broadcast (mode, c); 852 } 853 return gen_const_vec_duplicate (mode, c); 854 } 855 856 /* Emit a vlmax vsetvl instruction. This should only be used when 857 optimization is disabled or after vsetvl insertion pass. */ 858 void 859 emit_hard_vlmax_vsetvl (machine_mode vmode, rtx vl) 860 { 861 unsigned int sew = get_sew (vmode); 862 emit_insn (gen_vsetvl (Pmode, vl, RVV_VLMAX, gen_int_mode (sew, Pmode), 863 gen_int_mode (get_vlmul (vmode), Pmode), const0_rtx, 864 const0_rtx)); 865 } 866 867 void 868 emit_vlmax_vsetvl (machine_mode vmode, rtx vl) 869 { 870 unsigned int sew = get_sew (vmode); 871 enum vlmul_type vlmul = get_vlmul (vmode); 872 unsigned int ratio = calculate_ratio (sew, vlmul); 873 874 if (!optimize) 875 emit_hard_vlmax_vsetvl (vmode, vl); 876 else 877 emit_insn (gen_vlmax_avl (Pmode, vl, gen_int_mode (ratio, Pmode))); 878 } 879 880 /* Calculate SEW/LMUL ratio. */ 881 unsigned int 882 calculate_ratio (unsigned int sew, enum vlmul_type vlmul) 883 { 884 unsigned int ratio; 885 switch (vlmul) 886 { 887 case LMUL_1: 888 ratio = sew; 889 break; 890 case LMUL_2: 891 ratio = sew / 2; 892 break; 893 case LMUL_4: 894 ratio = sew / 4; 895 break; 896 case LMUL_8: 897 ratio = sew / 8; 898 break; 899 case LMUL_F8: 900 ratio = sew * 8; 901 break; 902 case LMUL_F4: 903 ratio = sew * 4; 904 break; 905 case LMUL_F2: 906 ratio = sew * 2; 907 break; 908 default: 909 gcc_unreachable (); 910 } 911 return ratio; 912 } 913 914 /* SCALABLE means that the vector-length is agnostic (run-time invariant and 915 compile-time unknown). 
ZVL meands that the vector-length is specific 916 (compile-time known by march like zvl*b). Both SCALABLE and ZVL are doing 917 auto-vectorization using VLMAX vsetvl configuration. */ 918 static bool 919 autovec_use_vlmax_p (void) 920 { 921 return rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE 922 || rvv_vector_bits == RVV_VECTOR_BITS_ZVL; 923 } 924 925 /* This function emits VLMAX vrgather instruction. Emit vrgather.vx/vi when sel 926 is a const duplicate vector. Otherwise, emit vrgather.vv. */ 927 static void 928 emit_vlmax_gather_insn (rtx target, rtx op, rtx sel) 929 { 930 rtx elt; 931 insn_code icode; 932 machine_mode data_mode = GET_MODE (target); 933 machine_mode sel_mode = GET_MODE (sel); 934 if (const_vec_duplicate_p (sel, &elt)) 935 { 936 icode = code_for_pred_gather_scalar (data_mode); 937 sel = elt; 938 } 939 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode))) 940 icode = code_for_pred_gatherei16 (data_mode); 941 else 942 icode = code_for_pred_gather (data_mode); 943 rtx ops[] = {target, op, sel}; 944 emit_vlmax_insn (icode, BINARY_OP, ops); 945 } 946 947 static void 948 emit_vlmax_masked_gather_mu_insn (rtx target, rtx op, rtx sel, rtx mask) 949 { 950 rtx elt; 951 insn_code icode; 952 machine_mode data_mode = GET_MODE (target); 953 machine_mode sel_mode = GET_MODE (sel); 954 if (const_vec_duplicate_p (sel, &elt)) 955 { 956 icode = code_for_pred_gather_scalar (data_mode); 957 sel = elt; 958 } 959 else if (maybe_ne (GET_MODE_SIZE (data_mode), GET_MODE_SIZE (sel_mode))) 960 icode = code_for_pred_gatherei16 (data_mode); 961 else 962 icode = code_for_pred_gather (data_mode); 963 rtx ops[] = {target, mask, target, op, sel}; 964 emit_vlmax_insn (icode, BINARY_OP_TAMU, ops); 965 } 966 967 /* According to RVV ISA spec (16.5.1. 
Synthesizing vdecompress): 968 https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc 969 970 There is no inverse vdecompress provided, as this operation can be readily 971 synthesized using iota and a masked vrgather: 972 973 Desired functionality of 'vdecompress' 974 7 6 5 4 3 2 1 0 # vid 975 976 e d c b a # packed vector of 5 elements 977 1 0 0 1 1 1 0 1 # mask vector of 8 elements 978 p q r s t u v w # destination register before vdecompress 979 980 e q r d c b v a # result of vdecompress 981 # v0 holds mask 982 # v1 holds packed data 983 # v11 holds input expanded vector and result 984 viota.m v10, v0 # Calc iota from mask in v0 985 vrgather.vv v11, v1, v10, v0.t # Expand into destination 986 p q r s t u v w # v11 destination register 987 e d c b a # v1 source vector 988 1 0 0 1 1 1 0 1 # v0 mask vector 989 990 4 4 4 3 2 1 1 0 # v10 result of viota.m 991 e q r d c b v a # v11 destination after vrgather using viota.m under mask 992 */ 993 static void 994 emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask) 995 { 996 machine_mode data_mode = GET_MODE (target); 997 machine_mode sel_mode = related_int_vector_mode (data_mode).require (); 998 if (GET_MODE_INNER (data_mode) == QImode) 999 sel_mode = get_vector_mode (HImode, GET_MODE_NUNITS (data_mode)).require (); 1000 1001 rtx sel = gen_reg_rtx (sel_mode); 1002 rtx iota_ops[] = {sel, mask}; 1003 emit_vlmax_insn (code_for_pred_iota (sel_mode), UNARY_OP, iota_ops); 1004 emit_vlmax_gather_insn (target, op0, sel); 1005 emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask); 1006 } 1007 1008 /* Emit merge instruction. */ 1009 1010 static machine_mode 1011 get_repeating_sequence_dup_machine_mode (const rvv_builder &builder, 1012 machine_mode mask_bit_mode) 1013 { 1014 unsigned mask_precision = GET_MODE_PRECISION (mask_bit_mode).to_constant (); 1015 unsigned mask_scalar_size = mask_precision > builder.inner_bits_size () 1016 ? 
builder.inner_bits_size () : mask_precision; 1017 1018 scalar_mode inner_mode; 1019 unsigned minimal_bits_size; 1020 1021 switch (mask_scalar_size) 1022 { 1023 case 8: 1024 inner_mode = QImode; 1025 minimal_bits_size = TARGET_MIN_VLEN / 8; /* AKA RVVMF8. */ 1026 break; 1027 case 16: 1028 inner_mode = HImode; 1029 minimal_bits_size = TARGET_MIN_VLEN / 4; /* AKA RVVMF4. */ 1030 break; 1031 case 32: 1032 inner_mode = SImode; 1033 minimal_bits_size = TARGET_MIN_VLEN / 2; /* AKA RVVMF2. */ 1034 break; 1035 case 64: 1036 inner_mode = DImode; 1037 minimal_bits_size = TARGET_MIN_VLEN / 1; /* AKA RVVM1. */ 1038 break; 1039 default: 1040 gcc_unreachable (); 1041 break; 1042 } 1043 1044 gcc_assert (mask_precision % mask_scalar_size == 0); 1045 1046 uint64_t dup_nunit = mask_precision > mask_scalar_size 1047 ? mask_precision / mask_scalar_size : minimal_bits_size / mask_scalar_size; 1048 1049 return get_vector_mode (inner_mode, dup_nunit).require (); 1050 } 1051 1052 /* Expand series const vector. If VID is NULL_RTX, we use vid.v 1053 instructions to generate sequence for VID: 1054 1055 VID = { 0, 1, 2, 3, ... } 1056 1057 Otherwise, we use the VID argument directly. */ 1058 1059 void 1060 expand_vec_series (rtx dest, rtx base, rtx step, rtx vid) 1061 { 1062 machine_mode mode = GET_MODE (dest); 1063 poly_int64 nunits_m1 = GET_MODE_NUNITS (mode) - 1; 1064 poly_int64 value; 1065 rtx result = register_operand (dest, mode) ? dest : gen_reg_rtx (mode); 1066 1067 /* VECT_IV = BASE + I * STEP. */ 1068 1069 /* Step 1: Generate I = { 0, 1, 2, ... } by vid.v. */ 1070 bool reverse_p = !vid && rtx_equal_p (step, constm1_rtx) 1071 && poly_int_rtx_p (base, &value) 1072 && known_eq (nunits_m1, value); 1073 if (!vid) 1074 { 1075 vid = gen_reg_rtx (mode); 1076 rtx op[] = {vid}; 1077 emit_vlmax_insn (code_for_pred_series (mode), NULLARY_OP, op); 1078 } 1079 1080 rtx step_adj; 1081 if (reverse_p) 1082 { 1083 /* Special case: 1084 {nunits - 1, nunits - 2, ... , 0}. 
1085 nunits can be either const_int or const_poly_int. 1086 1087 Code sequence: 1088 vid.v v 1089 vrsub nunits - 1, v. */ 1090 rtx ops[] 1091 = {result, vid, gen_int_mode (nunits_m1, GET_MODE_INNER (mode))}; 1092 insn_code icode = code_for_pred_sub_reverse_scalar (mode); 1093 emit_vlmax_insn (icode, BINARY_OP, ops); 1094 } 1095 else 1096 { 1097 /* Step 2: Generate I * STEP. 1098 - STEP is 1, we don't emit any instructions. 1099 - STEP is power of 2, we use vsll.vi/vsll.vx. 1100 - STEP is non-power of 2, we use vmul.vx. */ 1101 if (rtx_equal_p (step, const1_rtx)) 1102 step_adj = vid; 1103 else 1104 { 1105 step_adj = gen_reg_rtx (mode); 1106 if (CONST_INT_P (step) && pow2p_hwi (INTVAL (step))) 1107 { 1108 /* Emit logical left shift operation. */ 1109 int shift = exact_log2 (INTVAL (step)); 1110 rtx shift_amount = gen_int_mode (shift, Pmode); 1111 insn_code icode = code_for_pred_scalar (ASHIFT, mode); 1112 rtx ops[] = {step_adj, vid, shift_amount}; 1113 emit_vlmax_insn (icode, BINARY_OP, ops); 1114 } 1115 else 1116 { 1117 insn_code icode = code_for_pred_scalar (MULT, mode); 1118 rtx ops[] = {step_adj, vid, step}; 1119 emit_vlmax_insn (icode, BINARY_OP, ops); 1120 } 1121 } 1122 1123 /* Step 3: Generate BASE + I * STEP. 1124 - BASE is 0, use result of vid. 1125 - BASE is not 0, we use vadd.vx/vadd.vi. 
*/ 1126 if (rtx_equal_p (base, const0_rtx)) 1127 emit_move_insn (result, step_adj); 1128 else 1129 { 1130 insn_code icode = code_for_pred_scalar (PLUS, mode); 1131 rtx ops[] = {result, step_adj, base}; 1132 emit_vlmax_insn (icode, BINARY_OP, ops); 1133 } 1134 } 1135 1136 if (result != dest) 1137 emit_move_insn (dest, result); 1138 } 1139 1140 static void 1141 expand_const_vector (rtx target, rtx src) 1142 { 1143 machine_mode mode = GET_MODE (target); 1144 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) 1145 { 1146 rtx elt; 1147 gcc_assert ( 1148 const_vec_duplicate_p (src, &elt) 1149 && (rtx_equal_p (elt, const0_rtx) || rtx_equal_p (elt, const1_rtx))); 1150 rtx ops[] = {target, src}; 1151 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_MASK_OP, ops); 1152 return; 1153 } 1154 1155 rtx elt; 1156 if (const_vec_duplicate_p (src, &elt)) 1157 { 1158 rtx tmp = register_operand (target, mode) ? target : gen_reg_rtx (mode); 1159 /* Element in range -16 ~ 15 integer or 0.0 floating-point, 1160 we use vmv.v.i instruction. */ 1161 if (satisfies_constraint_vi (src) || satisfies_constraint_Wc0 (src)) 1162 { 1163 rtx ops[] = {tmp, src}; 1164 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP, ops); 1165 } 1166 else 1167 { 1168 /* Emit vec_duplicate<mode> split pattern before RA so that 1169 we could have a better optimization opportunity in LICM 1170 which will hoist vmv.v.x outside the loop and in fwprop && combine 1171 which will transform 'vv' into 'vx' instruction. 1172 1173 The reason we don't emit vec_duplicate<mode> split pattern during 1174 RA since the split stage after RA is a too late stage to generate 1175 RVV instruction which need an additional register (We can't 1176 allocate a new register after RA) for VL operand of vsetvl 1177 instruction (vsetvl a5, zero). 
*/ 1178 if (lra_in_progress) 1179 { 1180 rtx ops[] = {tmp, elt}; 1181 emit_vlmax_insn (code_for_pred_broadcast (mode), UNARY_OP, ops); 1182 } 1183 else 1184 { 1185 struct expand_operand ops[2]; 1186 enum insn_code icode = optab_handler (vec_duplicate_optab, mode); 1187 gcc_assert (icode != CODE_FOR_nothing); 1188 create_output_operand (&ops[0], tmp, mode); 1189 create_input_operand (&ops[1], elt, GET_MODE_INNER (mode)); 1190 expand_insn (icode, 2, ops); 1191 tmp = ops[0].value; 1192 } 1193 } 1194 1195 if (tmp != target) 1196 emit_move_insn (target, tmp); 1197 return; 1198 } 1199 1200 /* Support scalable const series vector. */ 1201 rtx base, step; 1202 if (const_vec_series_p (src, &base, &step)) 1203 { 1204 expand_vec_series (target, base, step); 1205 return; 1206 } 1207 1208 /* Handle variable-length vector. */ 1209 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src); 1210 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src); 1211 rvv_builder builder (mode, npatterns, nelts_per_pattern); 1212 for (unsigned int i = 0; i < nelts_per_pattern; i++) 1213 { 1214 for (unsigned int j = 0; j < npatterns; j++) 1215 builder.quick_push (CONST_VECTOR_ELT (src, i * npatterns + j)); 1216 } 1217 builder.finalize (); 1218 1219 if (CONST_VECTOR_DUPLICATE_P (src)) 1220 { 1221 /* Handle the case with repeating sequence that NELTS_PER_PATTERN = 1 1222 E.g. NPATTERNS = 4, v = { 0, 2, 6, 7, ... } 1223 NPATTERNS = 8, v = { 0, 2, 6, 7, 19, 20, 8, 7 ... } 1224 The elements within NPATTERNS are not necessary regular. */ 1225 if (builder.can_duplicate_repeating_sequence_p ()) 1226 { 1227 /* We handle the case that we can find a vector containter to hold 1228 element bitsize = NPATTERNS * ele_bitsize. 1229 1230 NPATTERNS = 8, element width = 8 1231 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... } 1232 In this case, we can combine NPATTERNS element into a larger 1233 element. Use element width = 64 and broadcast a vector with 1234 all element equal to 0x0706050403020100. 
*/ 1235 rtx ele = builder.get_merged_repeating_sequence (); 1236 rtx dup = expand_vector_broadcast (builder.new_mode (), ele); 1237 emit_move_insn (target, gen_lowpart (mode, dup)); 1238 } 1239 else 1240 { 1241 /* We handle the case that we can't find a vector containter to hold 1242 element bitsize = NPATTERNS * ele_bitsize. 1243 1244 NPATTERNS = 8, element width = 16 1245 v = { 0, 1, 2, 3, 4, 5, 6, 7, ... } 1246 Since NPATTERNS * element width = 128, we can't find a container 1247 to hold it. 1248 1249 In this case, we use NPATTERNS merge operations to generate such 1250 vector. */ 1251 unsigned int nbits = npatterns - 1; 1252 1253 /* Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */ 1254 rtx vid = gen_reg_rtx (builder.int_mode ()); 1255 rtx op[] = {vid}; 1256 emit_vlmax_insn (code_for_pred_series (builder.int_mode ()), 1257 NULLARY_OP, op); 1258 1259 /* Generate vid_repeat = { 0, 1, ... nbits, ... } */ 1260 rtx vid_repeat = gen_reg_rtx (builder.int_mode ()); 1261 rtx and_ops[] = {vid_repeat, vid, 1262 gen_int_mode (nbits, builder.inner_int_mode ())}; 1263 emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()), 1264 BINARY_OP, and_ops); 1265 1266 rtx tmp = gen_reg_rtx (builder.mode ()); 1267 rtx dup_ops[] = {tmp, builder.elt (0)}; 1268 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), UNARY_OP, 1269 dup_ops); 1270 for (unsigned int i = 1; i < builder.npatterns (); i++) 1271 { 1272 /* Generate mask according to i. */ 1273 rtx mask = gen_reg_rtx (builder.mask_mode ()); 1274 rtx const_vec = gen_const_vector_dup (builder.int_mode (), i); 1275 expand_vec_cmp (mask, EQ, vid_repeat, const_vec); 1276 1277 /* Merge scalar to each i. 
*/ 1278 rtx tmp2 = gen_reg_rtx (builder.mode ()); 1279 rtx merge_ops[] = {tmp2, tmp, builder.elt (i), mask}; 1280 insn_code icode = code_for_pred_merge_scalar (builder.mode ()); 1281 emit_vlmax_insn (icode, MERGE_OP, merge_ops); 1282 tmp = tmp2; 1283 } 1284 emit_move_insn (target, tmp); 1285 } 1286 } 1287 else if (CONST_VECTOR_STEPPED_P (src)) 1288 { 1289 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); 1290 if (builder.single_step_npatterns_p ()) 1291 { 1292 /* Describe the case by choosing NPATTERNS = 4 as an example. */ 1293 insn_code icode; 1294 1295 /* Step 1: Generate vid = { 0, 1, 2, 3, 4, 5, 6, 7, ... }. */ 1296 rtx vid = gen_reg_rtx (builder.mode ()); 1297 rtx vid_ops[] = {vid}; 1298 icode = code_for_pred_series (builder.mode ()); 1299 emit_vlmax_insn (icode, NULLARY_OP, vid_ops); 1300 1301 if (builder.npatterns_all_equal_p ()) 1302 { 1303 /* Generate the variable-length vector following this rule: 1304 { a, a, a + step, a + step, a + step * 2, a + step * 2, ...} 1305 E.g. { 0, 0, 8, 8, 16, 16, ... } */ 1306 1307 /* We want to create a pattern where value[idx] = floor (idx / 1308 NPATTERNS). As NPATTERNS is always a power of two we can 1309 rewrite this as = idx & -NPATTERNS. */ 1310 /* Step 2: VID AND -NPATTERNS: 1311 { 0&-4, 1&-4, 2&-4, 3 &-4, 4 &-4, 5 &-4, 6 &-4, 7 &-4, ... } 1312 */ 1313 rtx imm 1314 = gen_int_mode (-builder.npatterns (), builder.inner_mode ()); 1315 rtx tmp1 = gen_reg_rtx (builder.mode ()); 1316 rtx and_ops[] = {tmp1, vid, imm}; 1317 icode = code_for_pred_scalar (AND, builder.mode ()); 1318 emit_vlmax_insn (icode, BINARY_OP, and_ops); 1319 1320 /* Step 3: Convert to step size 1. */ 1321 rtx tmp2 = gen_reg_rtx (builder.mode ()); 1322 /* log2 (npatterns) to get the shift amount to convert 1323 Eg. { 0, 0, 0, 0, 4, 4, ... } 1324 into { 0, 0, 0, 0, 1, 1, ... }. 
*/ 1325 HOST_WIDE_INT shift_amt = exact_log2 (builder.npatterns ()) ; 1326 rtx shift = gen_int_mode (shift_amt, builder.inner_mode ()); 1327 rtx shift_ops[] = {tmp2, tmp1, shift}; 1328 icode = code_for_pred_scalar (ASHIFTRT, builder.mode ()); 1329 emit_vlmax_insn (icode, BINARY_OP, shift_ops); 1330 1331 /* Step 4: Multiply to step size n. */ 1332 HOST_WIDE_INT step_size = 1333 INTVAL (builder.elt (builder.npatterns ())) 1334 - INTVAL (builder.elt (0)); 1335 rtx tmp3 = gen_reg_rtx (builder.mode ()); 1336 if (pow2p_hwi (step_size)) 1337 { 1338 /* Power of 2 can be handled with a left shift. */ 1339 HOST_WIDE_INT shift = exact_log2 (step_size); 1340 rtx shift_amount = gen_int_mode (shift, Pmode); 1341 insn_code icode = code_for_pred_scalar (ASHIFT, mode); 1342 rtx ops[] = {tmp3, tmp2, shift_amount}; 1343 emit_vlmax_insn (icode, BINARY_OP, ops); 1344 } 1345 else 1346 { 1347 rtx mult_amt = gen_int_mode (step_size, builder.inner_mode ()); 1348 insn_code icode = code_for_pred_scalar (MULT, builder.mode ()); 1349 rtx ops[] = {tmp3, tmp2, mult_amt}; 1350 emit_vlmax_insn (icode, BINARY_OP, ops); 1351 } 1352 1353 /* Step 5: Add starting value to all elements. */ 1354 HOST_WIDE_INT init_val = INTVAL (builder.elt (0)); 1355 if (init_val == 0) 1356 emit_move_insn (target, tmp3); 1357 else 1358 { 1359 rtx dup = gen_const_vector_dup (builder.mode (), init_val); 1360 rtx add_ops[] = {target, tmp3, dup}; 1361 icode = code_for_pred (PLUS, builder.mode ()); 1362 emit_vlmax_insn (icode, BINARY_OP, add_ops); 1363 } 1364 } 1365 else 1366 { 1367 /* Generate the variable-length vector following this rule: 1368 { a, b, a + step, b + step, a + step*2, b + step*2, ... } */ 1369 1370 if (builder.npatterns_vid_diff_repeated_p ()) 1371 { 1372 /* Case 1: For example as below: 1373 {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8... } 1374 We have 3 - 0 = 3 equals 7 - 4 = 3, the sequence is 1375 repeated as below after minus vid. 
1376 {3, 1, -1, -3, 3, 1, -1, -3...} 1377 Then we can simplify the diff code gen to at most 1378 npatterns(). */ 1379 rvv_builder v (builder.mode (), builder.npatterns (), 1); 1380 1381 /* Step 1: Generate diff = TARGET - VID. */ 1382 for (unsigned int i = 0; i < v.npatterns (); ++i) 1383 { 1384 poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i; 1385 v.quick_push (gen_int_mode (diff, v.inner_mode ())); 1386 } 1387 1388 /* Step 2: Generate result = VID + diff. */ 1389 rtx vec = v.build (); 1390 rtx add_ops[] = {target, vid, vec}; 1391 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), 1392 BINARY_OP, add_ops); 1393 } 1394 else 1395 { 1396 /* Case 2: For example as below: 1397 { -4, 4, -4 + 1, 4 + 1, -4 + 2, 4 + 2, -4 + 3, 4 + 3, ... } 1398 */ 1399 rvv_builder v (builder.mode (), builder.npatterns (), 1); 1400 1401 /* Step 1: Generate { a, b, a, b, ... } */ 1402 for (unsigned int i = 0; i < v.npatterns (); ++i) 1403 v.quick_push (builder.elt (i)); 1404 rtx new_base = v.build (); 1405 1406 /* Step 2: Generate tmp = VID >> LOG2 (NPATTERNS). */ 1407 rtx shift_count 1408 = gen_int_mode (exact_log2 (builder.npatterns ()), 1409 builder.inner_mode ()); 1410 rtx tmp = expand_simple_binop (builder.mode (), LSHIFTRT, 1411 vid, shift_count, NULL_RTX, 1412 false, OPTAB_DIRECT); 1413 1414 /* Step 3: Generate tmp2 = tmp * step. */ 1415 rtx tmp2 = gen_reg_rtx (builder.mode ()); 1416 rtx step 1417 = simplify_binary_operation (MINUS, builder.inner_mode (), 1418 builder.elt (v.npatterns()), 1419 builder.elt (0)); 1420 expand_vec_series (tmp2, const0_rtx, step, tmp); 1421 1422 /* Step 4: Generate target = tmp2 + new_base. 
*/ 1423 rtx add_ops[] = {target, tmp2, new_base}; 1424 emit_vlmax_insn (code_for_pred (PLUS, builder.mode ()), 1425 BINARY_OP, add_ops); 1426 } 1427 } 1428 } 1429 else if (builder.interleaved_stepped_npatterns_p ()) 1430 { 1431 rtx base1 = builder.elt (0); 1432 rtx base2 = builder.elt (1); 1433 poly_int64 step1 1434 = rtx_to_poly_int64 (builder.elt (builder.npatterns ())) 1435 - rtx_to_poly_int64 (base1); 1436 poly_int64 step2 1437 = rtx_to_poly_int64 (builder.elt (builder.npatterns () + 1)) 1438 - rtx_to_poly_int64 (base2); 1439 1440 /* For { 1, 0, 2, 0, ... , n - 1, 0 }, we can use larger EEW 1441 integer vector mode to generate such vector efficiently. 1442 1443 E.g. EEW = 16, { 2, 0, 4, 0, ... } 1444 1445 can be interpreted into: 1446 1447 EEW = 32, { 2, 4, ... }. 1448 1449 This only works as long as the larger type does not overflow 1450 as we can't guarantee a zero value for each second element 1451 of the sequence with smaller EEW. 1452 ??? For now we assume that no overflow happens with positive 1453 steps and forbid negative steps altogether. */ 1454 unsigned int new_smode_bitsize = builder.inner_bits_size () * 2; 1455 scalar_int_mode new_smode; 1456 machine_mode new_mode; 1457 poly_uint64 new_nunits 1458 = exact_div (GET_MODE_NUNITS (builder.mode ()), 2); 1459 if (known_ge (step1, 0) && known_ge (step2, 0) 1460 && int_mode_for_size (new_smode_bitsize, 0).exists (&new_smode) 1461 && get_vector_mode (new_smode, new_nunits).exists (&new_mode)) 1462 { 1463 rtx tmp = gen_reg_rtx (new_mode); 1464 base1 = gen_int_mode (rtx_to_poly_int64 (base1), new_smode); 1465 expand_vec_series (tmp, base1, gen_int_mode (step1, new_smode)); 1466 1467 if (rtx_equal_p (base2, const0_rtx) && known_eq (step2, 0)) 1468 /* { 1, 0, 2, 0, ... }. */ 1469 emit_move_insn (target, gen_lowpart (mode, tmp)); 1470 else if (known_eq (step2, 0)) 1471 { 1472 /* { 1, 1, 2, 1, ... }. 
*/ 1473 rtx scalar = expand_simple_binop ( 1474 new_smode, ASHIFT, 1475 gen_int_mode (rtx_to_poly_int64 (base2), new_smode), 1476 gen_int_mode (builder.inner_bits_size (), new_smode), 1477 NULL_RTX, false, OPTAB_DIRECT); 1478 rtx tmp2 = gen_reg_rtx (new_mode); 1479 rtx and_ops[] = {tmp2, tmp, scalar}; 1480 emit_vlmax_insn (code_for_pred_scalar (AND, new_mode), 1481 BINARY_OP, and_ops); 1482 emit_move_insn (target, gen_lowpart (mode, tmp2)); 1483 } 1484 else 1485 { 1486 /* { 1, 3, 2, 6, ... }. */ 1487 rtx tmp2 = gen_reg_rtx (new_mode); 1488 base2 = gen_int_mode (rtx_to_poly_int64 (base2), new_smode); 1489 expand_vec_series (tmp2, base2, 1490 gen_int_mode (step2, new_smode)); 1491 rtx shifted_tmp2 = expand_simple_binop ( 1492 new_mode, ASHIFT, tmp2, 1493 gen_int_mode (builder.inner_bits_size (), Pmode), NULL_RTX, 1494 false, OPTAB_DIRECT); 1495 rtx tmp3 = gen_reg_rtx (new_mode); 1496 rtx ior_ops[] = {tmp3, tmp, shifted_tmp2}; 1497 emit_vlmax_insn (code_for_pred (IOR, new_mode), BINARY_OP, 1498 ior_ops); 1499 emit_move_insn (target, gen_lowpart (mode, tmp3)); 1500 } 1501 } 1502 else 1503 { 1504 rtx vid = gen_reg_rtx (mode); 1505 expand_vec_series (vid, const0_rtx, const1_rtx); 1506 /* Transform into { 0, 0, 1, 1, 2, 2, ... }. */ 1507 rtx shifted_vid 1508 = expand_simple_binop (mode, LSHIFTRT, vid, const1_rtx, 1509 NULL_RTX, false, OPTAB_DIRECT); 1510 rtx tmp1 = gen_reg_rtx (mode); 1511 rtx tmp2 = gen_reg_rtx (mode); 1512 expand_vec_series (tmp1, base1, 1513 gen_int_mode (step1, builder.inner_mode ()), 1514 shifted_vid); 1515 expand_vec_series (tmp2, base2, 1516 gen_int_mode (step2, builder.inner_mode ()), 1517 shifted_vid); 1518 1519 /* Transform into { 0, 1, 0, 1, 0, 1, ... }. 
*/ 1520 rtx and_vid = gen_reg_rtx (mode); 1521 rtx and_ops[] = {and_vid, vid, const1_rtx}; 1522 emit_vlmax_insn (code_for_pred_scalar (AND, mode), BINARY_OP, 1523 and_ops); 1524 rtx mask = gen_reg_rtx (builder.mask_mode ()); 1525 expand_vec_cmp (mask, EQ, and_vid, CONST1_RTX (mode)); 1526 1527 rtx ops[] = {target, tmp1, tmp2, mask}; 1528 emit_vlmax_insn (code_for_pred_merge (mode), MERGE_OP, ops); 1529 } 1530 } 1531 else if (npatterns == 1 && nelts_per_pattern == 3) 1532 { 1533 /* Generate the following CONST_VECTOR: 1534 { base0, base1, base1 + step, base1 + step * 2, ... } */ 1535 rtx base0 = builder.elt (0); 1536 rtx base1 = builder.elt (1); 1537 rtx base2 = builder.elt (2); 1538 1539 rtx step = simplify_binary_operation (MINUS, builder.inner_mode (), 1540 base2, base1); 1541 1542 /* Step 1 - { base1, base1 + step, base1 + step * 2, ... } */ 1543 rtx tmp = gen_reg_rtx (mode); 1544 expand_vec_series (tmp, base1, step); 1545 /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... } */ 1546 if (!rtx_equal_p (base0, const0_rtx)) 1547 base0 = force_reg (builder.inner_mode (), base0); 1548 1549 insn_code icode = optab_handler (vec_shl_insert_optab, mode); 1550 gcc_assert (icode != CODE_FOR_nothing); 1551 emit_insn (GEN_FCN (icode) (target, tmp, base0)); 1552 } 1553 else 1554 /* TODO: We will enable more variable-length vector in the future. */ 1555 gcc_unreachable (); 1556 } 1557 else 1558 gcc_unreachable (); 1559 } 1560 1561 /* Get the frm mode with given CONST_INT rtx, the default mode is 1562 FRM_DYN. 
*/ 1563 enum floating_point_rounding_mode 1564 get_frm_mode (rtx operand) 1565 { 1566 gcc_assert (CONST_INT_P (operand)); 1567 1568 switch (INTVAL (operand)) 1569 { 1570 case FRM_RNE: 1571 return FRM_RNE; 1572 case FRM_RTZ: 1573 return FRM_RTZ; 1574 case FRM_RDN: 1575 return FRM_RDN; 1576 case FRM_RUP: 1577 return FRM_RUP; 1578 case FRM_RMM: 1579 return FRM_RMM; 1580 case FRM_DYN: 1581 return FRM_DYN; 1582 default: 1583 gcc_unreachable (); 1584 } 1585 1586 gcc_unreachable (); 1587 } 1588 1589 /* Expand a pre-RA RVV data move from SRC to DEST. 1590 It expands move for RVV fractional vector modes. 1591 Return true if the move as already been emitted. */ 1592 bool 1593 legitimize_move (rtx dest, rtx *srcp) 1594 { 1595 rtx src = *srcp; 1596 machine_mode mode = GET_MODE (dest); 1597 if (CONST_VECTOR_P (src)) 1598 { 1599 expand_const_vector (dest, src); 1600 return true; 1601 } 1602 1603 if (riscv_v_ext_vls_mode_p (mode)) 1604 { 1605 if (GET_MODE_NUNITS (mode).to_constant () <= 31) 1606 { 1607 /* For NUNITS <= 31 VLS modes, we don't need extrac 1608 scalar regisers so we apply the naive (set (op0) (op1)) pattern. */ 1609 if (can_create_pseudo_p ()) 1610 { 1611 /* Need to force register if mem <- !reg. */ 1612 if (MEM_P (dest) && !REG_P (src)) 1613 *srcp = force_reg (mode, src); 1614 1615 return false; 1616 } 1617 } 1618 else if (GET_MODE_NUNITS (mode).to_constant () > 31 && lra_in_progress) 1619 { 1620 emit_insn (gen_mov_lra (mode, Pmode, dest, src)); 1621 return true; 1622 } 1623 } 1624 else 1625 { 1626 /* In order to decrease the memory traffic, we don't use whole register 1627 * load/store for the LMUL less than 1 and mask mode, so those case will 1628 * require one extra general purpose register, but it's not allowed during 1629 * LRA process, so we have a special move pattern used for LRA, which will 1630 * defer the expansion after LRA. 
*/ 1631 if ((known_lt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR) 1632 || GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL) 1633 && lra_in_progress) 1634 { 1635 emit_insn (gen_mov_lra (mode, Pmode, dest, src)); 1636 return true; 1637 } 1638 1639 if (known_ge (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR) 1640 && GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL) 1641 { 1642 /* Need to force register if mem <- !reg. */ 1643 if (MEM_P (dest) && !REG_P (src)) 1644 *srcp = force_reg (mode, src); 1645 1646 return false; 1647 } 1648 } 1649 1650 if (register_operand (src, mode) && register_operand (dest, mode)) 1651 { 1652 emit_insn (gen_rtx_SET (dest, src)); 1653 return true; 1654 } 1655 1656 unsigned insn_flags 1657 = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? UNARY_MASK_OP : UNARY_OP; 1658 if (!register_operand (src, mode) && !register_operand (dest, mode)) 1659 { 1660 rtx tmp = gen_reg_rtx (mode); 1661 if (MEM_P (src)) 1662 { 1663 rtx ops[] = {tmp, src}; 1664 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops); 1665 } 1666 else 1667 emit_move_insn (tmp, src); 1668 src = tmp; 1669 } 1670 1671 if (satisfies_constraint_vu (src)) 1672 return false; 1673 1674 rtx ops[] = {dest, src}; 1675 emit_vlmax_insn (code_for_pred_mov (mode), insn_flags, ops); 1676 return true; 1677 } 1678 1679 /* VTYPE information for machine_mode. 
*/ 1680 struct mode_vtype_group 1681 { 1682 enum vlmul_type vlmul[NUM_MACHINE_MODES]; 1683 uint8_t ratio[NUM_MACHINE_MODES]; 1684 machine_mode subpart_mode[NUM_MACHINE_MODES]; 1685 uint8_t nf[NUM_MACHINE_MODES]; 1686 mode_vtype_group () 1687 { 1688 #define ENTRY(MODE, REQUIREMENT, VLMUL, RATIO) \ 1689 vlmul[MODE##mode] = VLMUL; \ 1690 ratio[MODE##mode] = RATIO; 1691 #define TUPLE_ENTRY(MODE, REQUIREMENT, SUBPART_MODE, NF, VLMUL, RATIO) \ 1692 subpart_mode[MODE##mode] = SUBPART_MODE##mode; \ 1693 nf[MODE##mode] = NF; \ 1694 vlmul[MODE##mode] = VLMUL; \ 1695 ratio[MODE##mode] = RATIO; 1696 #include "riscv-vector-switch.def" 1697 #undef ENTRY 1698 #undef TUPLE_ENTRY 1699 } 1700 }; 1701 1702 static mode_vtype_group mode_vtype_infos; 1703 1704 /* Get vlmul field value by comparing LMUL with BYTES_PER_RISCV_VECTOR. */ 1705 enum vlmul_type 1706 get_vlmul (machine_mode mode) 1707 { 1708 /* For VLS modes, the vlmul should be dynamically 1709 calculated since we need to adjust VLMUL according 1710 to TARGET_MIN_VLEN. 
*/ 1711 if (riscv_v_ext_vls_mode_p (mode)) 1712 { 1713 int size = GET_MODE_BITSIZE (mode).to_constant (); 1714 int inner_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode)); 1715 if (size < TARGET_MIN_VLEN) 1716 { 1717 int factor = TARGET_MIN_VLEN / size; 1718 if (inner_size == 8) 1719 factor = MIN (factor, 8); 1720 else if (inner_size == 16) 1721 factor = MIN (factor, 4); 1722 else if (inner_size == 32) 1723 factor = MIN (factor, 2); 1724 else if (inner_size == 64) 1725 factor = MIN (factor, 1); 1726 else 1727 gcc_unreachable (); 1728 1729 switch (factor) 1730 { 1731 case 1: 1732 return LMUL_1; 1733 case 2: 1734 return LMUL_F2; 1735 case 4: 1736 return LMUL_F4; 1737 case 8: 1738 return LMUL_F8; 1739 1740 default: 1741 gcc_unreachable (); 1742 } 1743 } 1744 else 1745 { 1746 int factor = size / TARGET_MIN_VLEN; 1747 switch (factor) 1748 { 1749 case 1: 1750 return LMUL_1; 1751 case 2: 1752 return LMUL_2; 1753 case 4: 1754 return LMUL_4; 1755 case 8: 1756 return LMUL_8; 1757 1758 default: 1759 gcc_unreachable (); 1760 } 1761 } 1762 } 1763 return mode_vtype_infos.vlmul[mode]; 1764 } 1765 1766 /* Return the VLMAX rtx of vector mode MODE. */ 1767 rtx 1768 get_vlmax_rtx (machine_mode mode) 1769 { 1770 gcc_assert (riscv_v_ext_vector_mode_p (mode)); 1771 return gen_int_mode (GET_MODE_NUNITS (mode), Pmode); 1772 } 1773 1774 /* Return the NF value of the corresponding mode. */ 1775 unsigned int 1776 get_nf (machine_mode mode) 1777 { 1778 /* We don't allow non-tuple modes go through this function. */ 1779 gcc_assert (riscv_v_ext_tuple_mode_p (mode)); 1780 return mode_vtype_infos.nf[mode]; 1781 } 1782 1783 /* Return the subpart mode of the tuple mode. For RVVM2x2SImode, 1784 the subpart mode is RVVM2SImode. This will help to build 1785 array/struct type in builtins. */ 1786 machine_mode 1787 get_subpart_mode (machine_mode mode) 1788 { 1789 /* We don't allow non-tuple modes go through this function. 
*/ 1790 gcc_assert (riscv_v_ext_tuple_mode_p (mode)); 1791 return mode_vtype_infos.subpart_mode[mode]; 1792 } 1793 1794 /* Get ratio according to machine mode. */ 1795 unsigned int 1796 get_ratio (machine_mode mode) 1797 { 1798 if (riscv_v_ext_vls_mode_p (mode)) 1799 { 1800 unsigned int sew = get_sew (mode); 1801 vlmul_type vlmul = get_vlmul (mode); 1802 switch (vlmul) 1803 { 1804 case LMUL_1: 1805 return sew; 1806 case LMUL_2: 1807 return sew / 2; 1808 case LMUL_4: 1809 return sew / 4; 1810 case LMUL_8: 1811 return sew / 8; 1812 case LMUL_F8: 1813 return sew * 8; 1814 case LMUL_F4: 1815 return sew * 4; 1816 case LMUL_F2: 1817 return sew * 2; 1818 1819 default: 1820 gcc_unreachable (); 1821 } 1822 } 1823 return mode_vtype_infos.ratio[mode]; 1824 } 1825 1826 /* Get ta according to operand[tail_op_idx]. */ 1827 int 1828 get_ta (rtx ta) 1829 { 1830 if (INTVAL (ta) == TAIL_ANY) 1831 return INVALID_ATTRIBUTE; 1832 return INTVAL (ta); 1833 } 1834 1835 /* Get ma according to operand[mask_op_idx]. */ 1836 int 1837 get_ma (rtx ma) 1838 { 1839 if (INTVAL (ma) == MASK_ANY) 1840 return INVALID_ATTRIBUTE; 1841 return INTVAL (ma); 1842 } 1843 1844 /* Get prefer tail policy. */ 1845 enum tail_policy 1846 get_prefer_tail_policy () 1847 { 1848 /* TODO: By default, we choose to use TAIL_ANY which allows 1849 compiler pick up either agnostic or undisturbed. Maybe we 1850 will have a compile option like -mprefer=agnostic to set 1851 this value???. */ 1852 return TAIL_ANY; 1853 } 1854 1855 /* Get prefer mask policy. */ 1856 enum mask_policy 1857 get_prefer_mask_policy () 1858 { 1859 /* TODO: By default, we choose to use MASK_ANY which allows 1860 compiler pick up either agnostic or undisturbed. Maybe we 1861 will have a compile option like -mprefer=agnostic to set 1862 this value???. */ 1863 return MASK_ANY; 1864 } 1865 1866 /* Get avl_type rtx. 
*/ 1867 rtx 1868 get_avl_type_rtx (enum avl_type type) 1869 { 1870 return gen_int_mode (type, Pmode); 1871 } 1872 1873 /* Return the appropriate mask mode for MODE. */ 1874 1875 machine_mode 1876 get_mask_mode (machine_mode mode) 1877 { 1878 poly_int64 nunits = GET_MODE_NUNITS (mode); 1879 if (riscv_v_ext_tuple_mode_p (mode)) 1880 { 1881 unsigned int nf = get_nf (mode); 1882 nunits = exact_div (nunits, nf); 1883 } 1884 return get_vector_mode (BImode, nunits).require (); 1885 } 1886 1887 /* Return the appropriate M1 mode for MODE. */ 1888 1889 static opt_machine_mode 1890 get_m1_mode (machine_mode mode) 1891 { 1892 scalar_mode smode = GET_MODE_INNER (mode); 1893 unsigned int bytes = GET_MODE_SIZE (smode); 1894 poly_uint64 m1_nunits = exact_div (BYTES_PER_RISCV_VECTOR, bytes); 1895 return get_vector_mode (smode, m1_nunits); 1896 } 1897 1898 /* Return the RVV vector mode that has NUNITS elements of mode INNER_MODE. 1899 This function is not only used by builtins, but also will be used by 1900 auto-vectorization in the future. */ 1901 opt_machine_mode 1902 get_vector_mode (scalar_mode inner_mode, poly_uint64 nunits) 1903 { 1904 enum mode_class mclass; 1905 if (inner_mode == E_BImode) 1906 mclass = MODE_VECTOR_BOOL; 1907 else if (FLOAT_MODE_P (inner_mode)) 1908 mclass = MODE_VECTOR_FLOAT; 1909 else 1910 mclass = MODE_VECTOR_INT; 1911 machine_mode mode; 1912 FOR_EACH_MODE_IN_CLASS (mode, mclass) 1913 if (inner_mode == GET_MODE_INNER (mode) 1914 && known_eq (nunits, GET_MODE_NUNITS (mode)) 1915 && (riscv_v_ext_vector_mode_p (mode) 1916 || riscv_v_ext_vls_mode_p (mode))) 1917 return mode; 1918 return opt_machine_mode (); 1919 } 1920 1921 /* Return the RVV tuple mode if we can find the legal tuple mode for the 1922 corresponding subpart mode and NF. 
*/ 1923 opt_machine_mode 1924 get_tuple_mode (machine_mode subpart_mode, unsigned int nf) 1925 { 1926 poly_uint64 nunits = GET_MODE_NUNITS (subpart_mode) * nf; 1927 scalar_mode inner_mode = GET_MODE_INNER (subpart_mode); 1928 enum mode_class mclass = GET_MODE_CLASS (subpart_mode); 1929 machine_mode mode; 1930 FOR_EACH_MODE_IN_CLASS (mode, mclass) 1931 if (inner_mode == GET_MODE_INNER (mode) 1932 && known_eq (nunits, GET_MODE_NUNITS (mode)) 1933 && riscv_v_ext_tuple_mode_p (mode) 1934 && get_subpart_mode (mode) == subpart_mode) 1935 return mode; 1936 return opt_machine_mode (); 1937 } 1938 1939 bool 1940 simm5_p (rtx x) 1941 { 1942 if (!CONST_INT_P (x)) 1943 return false; 1944 return IN_RANGE (INTVAL (x), -16, 15); 1945 } 1946 1947 bool 1948 neg_simm5_p (rtx x) 1949 { 1950 if (!CONST_INT_P (x)) 1951 return false; 1952 return IN_RANGE (INTVAL (x), -15, 16); 1953 } 1954 1955 bool 1956 has_vi_variant_p (rtx_code code, rtx x) 1957 { 1958 switch (code) 1959 { 1960 case PLUS: 1961 case AND: 1962 case IOR: 1963 case XOR: 1964 case SS_PLUS: 1965 case US_PLUS: 1966 case EQ: 1967 case NE: 1968 case LE: 1969 case LEU: 1970 case GT: 1971 case GTU: 1972 return simm5_p (x); 1973 1974 case LT: 1975 case LTU: 1976 case GE: 1977 case GEU: 1978 case MINUS: 1979 case SS_MINUS: 1980 return neg_simm5_p (x); 1981 1982 default: 1983 return false; 1984 } 1985 } 1986 1987 bool 1988 sew64_scalar_helper (rtx *operands, rtx *scalar_op, rtx vl, 1989 machine_mode vector_mode, bool has_vi_variant_p, 1990 void (*emit_vector_func) (rtx *, rtx), enum avl_type type) 1991 { 1992 machine_mode scalar_mode = GET_MODE_INNER (vector_mode); 1993 if (has_vi_variant_p) 1994 { 1995 *scalar_op = force_reg (scalar_mode, *scalar_op); 1996 return false; 1997 } 1998 1999 if (TARGET_64BIT) 2000 { 2001 if (!rtx_equal_p (*scalar_op, const0_rtx)) 2002 *scalar_op = force_reg (scalar_mode, *scalar_op); 2003 return false; 2004 } 2005 2006 if (immediate_operand (*scalar_op, Pmode)) 2007 { 2008 if (!rtx_equal_p (*scalar_op, 
const0_rtx)) 2009 *scalar_op = force_reg (Pmode, *scalar_op); 2010 2011 *scalar_op = gen_rtx_SIGN_EXTEND (scalar_mode, *scalar_op); 2012 return false; 2013 } 2014 2015 if (CONST_INT_P (*scalar_op)) 2016 { 2017 if (maybe_gt (GET_MODE_SIZE (scalar_mode), GET_MODE_SIZE (Pmode))) 2018 *scalar_op = force_const_mem (scalar_mode, *scalar_op); 2019 else 2020 *scalar_op = force_reg (scalar_mode, *scalar_op); 2021 } 2022 2023 rtx tmp = gen_reg_rtx (vector_mode); 2024 rtx ops[] = {tmp, *scalar_op}; 2025 if (type == VLMAX) 2026 emit_vlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops); 2027 else 2028 emit_nonvlmax_insn (code_for_pred_broadcast (vector_mode), UNARY_OP, ops, 2029 vl); 2030 emit_vector_func (operands, tmp); 2031 2032 return true; 2033 } 2034 2035 /* Get { ... ,0, 0, 0, ..., 0, 0, 0, 1 } mask. */ 2036 rtx 2037 gen_scalar_move_mask (machine_mode mode) 2038 { 2039 rtx_vector_builder builder (mode, 1, 2); 2040 builder.quick_push (const1_rtx); 2041 builder.quick_push (const0_rtx); 2042 return builder.build (); 2043 } 2044 2045 static unsigned 2046 compute_vlmax (unsigned vector_bits, unsigned elt_size, unsigned min_size) 2047 { 2048 // Original equation: 2049 // VLMAX = (VectorBits / EltSize) * LMUL 2050 // where LMUL = MinSize / TARGET_MIN_VLEN 2051 // The following equations have been reordered to prevent loss of precision 2052 // when calculating fractional LMUL. 
2053 return ((vector_bits / elt_size) * min_size) / TARGET_MIN_VLEN; 2054 } 2055 2056 static unsigned 2057 get_unknown_min_value (machine_mode mode) 2058 { 2059 enum vlmul_type vlmul = get_vlmul (mode); 2060 switch (vlmul) 2061 { 2062 case LMUL_1: 2063 return TARGET_MIN_VLEN; 2064 case LMUL_2: 2065 return TARGET_MIN_VLEN * 2; 2066 case LMUL_4: 2067 return TARGET_MIN_VLEN * 4; 2068 case LMUL_8: 2069 return TARGET_MIN_VLEN * 8; 2070 default: 2071 gcc_unreachable (); 2072 } 2073 } 2074 2075 static rtx 2076 force_vector_length_operand (rtx vl) 2077 { 2078 if (CONST_INT_P (vl) && !satisfies_constraint_vl (vl)) 2079 return force_reg (Pmode, vl); 2080 return vl; 2081 } 2082 2083 rtx 2084 gen_no_side_effects_vsetvl_rtx (machine_mode vmode, rtx vl, rtx avl) 2085 { 2086 unsigned int sew = get_sew (vmode); 2087 rtx tail_policy = gen_int_mode (get_prefer_tail_policy (), Pmode); 2088 rtx mask_policy = gen_int_mode (get_prefer_mask_policy (), Pmode); 2089 return gen_vsetvl_no_side_effects (Pmode, vl, avl, gen_int_mode (sew, Pmode), 2090 gen_int_mode (get_vlmul (vmode), Pmode), 2091 tail_policy, mask_policy); 2092 } 2093 2094 /* GET VL * 2 rtx. 
*/ 2095 static rtx 2096 get_vl_x2_rtx (rtx avl, machine_mode mode, machine_mode demote_mode) 2097 { 2098 rtx i32vl = NULL_RTX; 2099 if (CONST_INT_P (avl)) 2100 { 2101 unsigned elt_size = GET_MODE_BITSIZE (GET_MODE_INNER (mode)); 2102 unsigned min_size = get_unknown_min_value (mode); 2103 unsigned vlen_max = RVV_65536; 2104 unsigned vlmax_max = compute_vlmax (vlen_max, elt_size, min_size); 2105 unsigned vlen_min = TARGET_MIN_VLEN; 2106 unsigned vlmax_min = compute_vlmax (vlen_min, elt_size, min_size); 2107 2108 unsigned HOST_WIDE_INT avl_int = INTVAL (avl); 2109 if (avl_int <= vlmax_min) 2110 i32vl = gen_int_mode (2 * avl_int, Pmode); 2111 else if (avl_int >= 2 * vlmax_max) 2112 { 2113 // Just set i32vl to VLMAX in this situation 2114 i32vl = gen_reg_rtx (Pmode); 2115 emit_insn ( 2116 gen_no_side_effects_vsetvl_rtx (demote_mode, i32vl, RVV_VLMAX)); 2117 } 2118 else 2119 { 2120 // For AVL between (MinVLMAX, 2 * MaxVLMAX), the actual working vl 2121 // is related to the hardware implementation. 
2122 // So let the following code handle 2123 } 2124 } 2125 if (!i32vl) 2126 { 2127 // Using vsetvli instruction to get actually used length which related to 2128 // the hardware implementation 2129 rtx i64vl = gen_reg_rtx (Pmode); 2130 emit_insn ( 2131 gen_no_side_effects_vsetvl_rtx (mode, i64vl, force_reg (Pmode, avl))); 2132 // scale 2 for 32-bit length 2133 i32vl = gen_reg_rtx (Pmode); 2134 emit_insn ( 2135 gen_rtx_SET (i32vl, gen_rtx_ASHIFT (Pmode, i64vl, const1_rtx))); 2136 } 2137 2138 return force_vector_length_operand (i32vl); 2139 } 2140 2141 bool 2142 slide1_sew64_helper (int unspec, machine_mode mode, machine_mode demote_mode, 2143 machine_mode demote_mask_mode, rtx *ops) 2144 { 2145 rtx scalar_op = ops[4]; 2146 rtx avl = ops[5]; 2147 machine_mode scalar_mode = GET_MODE_INNER (mode); 2148 if (rtx_equal_p (scalar_op, const0_rtx)) 2149 { 2150 ops[5] = force_vector_length_operand (ops[5]); 2151 return false; 2152 } 2153 2154 if (TARGET_64BIT) 2155 { 2156 ops[4] = force_reg (scalar_mode, scalar_op); 2157 ops[5] = force_vector_length_operand (ops[5]); 2158 return false; 2159 } 2160 2161 if (immediate_operand (scalar_op, Pmode)) 2162 { 2163 ops[4] = gen_rtx_SIGN_EXTEND (scalar_mode, force_reg (Pmode, scalar_op)); 2164 ops[5] = force_vector_length_operand (ops[5]); 2165 return false; 2166 } 2167 2168 if (CONST_INT_P (scalar_op)) 2169 scalar_op = force_reg (scalar_mode, scalar_op); 2170 2171 rtx vl_x2 = get_vl_x2_rtx (avl, mode, demote_mode); 2172 2173 rtx demote_scalar_op1, demote_scalar_op2; 2174 if (unspec == UNSPEC_VSLIDE1UP) 2175 { 2176 demote_scalar_op1 = gen_highpart (Pmode, scalar_op); 2177 demote_scalar_op2 = gen_lowpart (Pmode, scalar_op); 2178 } 2179 else 2180 { 2181 demote_scalar_op1 = gen_lowpart (Pmode, scalar_op); 2182 demote_scalar_op2 = gen_highpart (Pmode, scalar_op); 2183 } 2184 2185 rtx temp = gen_reg_rtx (demote_mode); 2186 rtx ta = gen_int_mode (get_prefer_tail_policy (), Pmode); 2187 rtx ma = gen_int_mode (get_prefer_mask_policy (), 
Pmode); 2188 rtx merge = RVV_VUNDEF (demote_mode); 2189 /* Handle vslide1<ud>_tu. */ 2190 if (register_operand (ops[2], mode) 2191 && rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1])))) 2192 { 2193 merge = gen_lowpart (demote_mode, ops[2]); 2194 ta = ops[6]; 2195 ma = ops[7]; 2196 } 2197 2198 emit_insn (gen_pred_slide (unspec, demote_mode, temp, 2199 CONSTM1_RTX (demote_mask_mode), merge, 2200 gen_lowpart (demote_mode, ops[3]), 2201 demote_scalar_op1, vl_x2, ta, ma, ops[8])); 2202 emit_insn (gen_pred_slide (unspec, demote_mode, 2203 gen_lowpart (demote_mode, ops[0]), 2204 CONSTM1_RTX (demote_mask_mode), merge, temp, 2205 demote_scalar_op2, vl_x2, ta, ma, ops[8])); 2206 2207 if (!rtx_equal_p (ops[1], CONSTM1_RTX (GET_MODE (ops[1]))) 2208 && !rtx_equal_p (ops[2], RVV_VUNDEF (GET_MODE (ops[2])))) 2209 emit_insn (gen_pred_merge (mode, ops[0], ops[2], ops[2], ops[0], ops[1], 2210 force_vector_length_operand (ops[5]), ops[6], 2211 ops[8])); 2212 return true; 2213 } 2214 2215 rtx 2216 gen_avl_for_scalar_move (rtx avl) 2217 { 2218 /* AVL for scalar move has different behavior between 0 and large than 0. */ 2219 if (CONST_INT_P (avl)) 2220 { 2221 /* So we could just set AVL to 1 for any constant other than 0. */ 2222 if (rtx_equal_p (avl, const0_rtx)) 2223 return const0_rtx; 2224 else 2225 return const1_rtx; 2226 } 2227 else 2228 { 2229 /* For non-constant value, we set any non zero value to 1 by 2230 `sgtu new_avl,input_avl,zero` + `vsetvli`. */ 2231 rtx tmp = gen_reg_rtx (Pmode); 2232 emit_insn ( 2233 gen_rtx_SET (tmp, gen_rtx_fmt_ee (GTU, Pmode, avl, const0_rtx))); 2234 return tmp; 2235 } 2236 } 2237 2238 /* Expand tuple modes data movement for. 
*/ 2239 void 2240 expand_tuple_move (rtx *ops) 2241 { 2242 unsigned int i; 2243 machine_mode tuple_mode = GET_MODE (ops[0]); 2244 machine_mode subpart_mode = get_subpart_mode (tuple_mode); 2245 poly_int64 subpart_size = GET_MODE_SIZE (subpart_mode); 2246 unsigned int nf = get_nf (tuple_mode); 2247 bool fractional_p = known_lt (subpart_size, BYTES_PER_RISCV_VECTOR); 2248 2249 if (REG_P (ops[0]) && CONST_VECTOR_P (ops[1])) 2250 { 2251 rtx val; 2252 gcc_assert (can_create_pseudo_p () 2253 && const_vec_duplicate_p (ops[1], &val)); 2254 for (i = 0; i < nf; ++i) 2255 { 2256 poly_int64 offset = i * subpart_size; 2257 rtx subreg 2258 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset); 2259 rtx dup = gen_const_vec_duplicate (subpart_mode, val); 2260 emit_move_insn (subreg, dup); 2261 } 2262 } 2263 else if (REG_P (ops[0]) && REG_P (ops[1])) 2264 { 2265 for (i = 0; i < nf; ++i) 2266 { 2267 int index = i; 2268 2269 /* Take NF = 2 and LMUL = 1 for example: 2270 2271 - move v8 to v9: 2272 vmv1r v10,v9 2273 vmv1r v9,v8 2274 2275 - move v8 to v7: 2276 vmv1r v7,v8 2277 vmv1r v8,v9 */ 2278 if (REGNO (ops[0]) > REGNO (ops[1])) 2279 index = nf - 1 - i; 2280 poly_int64 offset = index * subpart_size; 2281 rtx dst_subreg 2282 = simplify_gen_subreg (subpart_mode, ops[0], tuple_mode, offset); 2283 rtx src_subreg 2284 = simplify_gen_subreg (subpart_mode, ops[1], tuple_mode, offset); 2285 emit_insn (gen_rtx_SET (dst_subreg, src_subreg)); 2286 } 2287 } 2288 else 2289 { 2290 /* Expand tuple memory data movement. 
*/ 2291 gcc_assert (MEM_P (ops[0]) || MEM_P (ops[1])); 2292 rtx offset = gen_int_mode (subpart_size, Pmode); 2293 if (!subpart_size.is_constant ()) 2294 { 2295 emit_move_insn (ops[2], gen_int_mode (BYTES_PER_RISCV_VECTOR, Pmode)); 2296 if (fractional_p) 2297 { 2298 unsigned int factor 2299 = exact_div (BYTES_PER_RISCV_VECTOR, subpart_size) 2300 .to_constant (); 2301 rtx pat 2302 = gen_rtx_ASHIFTRT (Pmode, ops[2], 2303 gen_int_mode (exact_log2 (factor), Pmode)); 2304 emit_insn (gen_rtx_SET (ops[2], pat)); 2305 } 2306 2307 if (known_gt (subpart_size, BYTES_PER_RISCV_VECTOR)) 2308 { 2309 unsigned int factor 2310 = exact_div (subpart_size, BYTES_PER_RISCV_VECTOR) 2311 .to_constant (); 2312 rtx pat 2313 = gen_rtx_ASHIFT (Pmode, ops[2], 2314 gen_int_mode (exact_log2 (factor), Pmode)); 2315 emit_insn (gen_rtx_SET (ops[2], pat)); 2316 } 2317 offset = ops[2]; 2318 } 2319 2320 /* Non-fractional LMUL has whole register moves that don't require a 2321 vsetvl for VLMAX. */ 2322 if (fractional_p) 2323 emit_vlmax_vsetvl (subpart_mode, ops[4]); 2324 if (MEM_P (ops[1])) 2325 { 2326 /* Load operations. */ 2327 emit_move_insn (ops[3], XEXP (ops[1], 0)); 2328 for (i = 0; i < nf; i++) 2329 { 2330 rtx subreg = simplify_gen_subreg (subpart_mode, ops[0], 2331 tuple_mode, i * subpart_size); 2332 if (i != 0) 2333 { 2334 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset); 2335 emit_insn (gen_rtx_SET (ops[3], new_addr)); 2336 } 2337 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]); 2338 2339 if (fractional_p) 2340 { 2341 rtx operands[] = {subreg, mem}; 2342 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode), 2343 UNARY_OP, operands, ops[4]); 2344 } 2345 else 2346 emit_move_insn (subreg, mem); 2347 } 2348 } 2349 else 2350 { 2351 /* Store operations. 
*/ 2352 emit_move_insn (ops[3], XEXP (ops[0], 0)); 2353 for (i = 0; i < nf; i++) 2354 { 2355 rtx subreg = simplify_gen_subreg (subpart_mode, ops[1], 2356 tuple_mode, i * subpart_size); 2357 if (i != 0) 2358 { 2359 rtx new_addr = gen_rtx_PLUS (Pmode, ops[3], offset); 2360 emit_insn (gen_rtx_SET (ops[3], new_addr)); 2361 } 2362 rtx mem = gen_rtx_MEM (subpart_mode, ops[3]); 2363 2364 if (fractional_p) 2365 { 2366 rtx operands[] = {mem, subreg}; 2367 emit_vlmax_insn_lra (code_for_pred_mov (subpart_mode), 2368 UNARY_OP, operands, ops[4]); 2369 } 2370 else 2371 emit_move_insn (mem, subreg); 2372 } 2373 } 2374 } 2375 } 2376 2377 /* Return the vectorization machine mode for RVV according to LMUL. */ 2378 machine_mode 2379 preferred_simd_mode (scalar_mode mode) 2380 { 2381 if (autovec_use_vlmax_p ()) 2382 { 2383 /* We use LMUL = 1 as base bytesize which is BYTES_PER_RISCV_VECTOR and 2384 rvv_max_lmul as multiply factor to calculate the NUNITS to 2385 get the auto-vectorization mode. */ 2386 poly_uint64 nunits; 2387 poly_uint64 vector_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL; 2388 poly_uint64 scalar_size = GET_MODE_SIZE (mode); 2389 /* Disable vectorization when we can't find a RVV mode for it. 2390 E.g. -march=rv64gc_zve32x doesn't have a vector mode to vectorize 2391 a double (DFmode) type. */ 2392 if (!multiple_p (vector_size, scalar_size, &nunits)) 2393 return word_mode; 2394 machine_mode rvv_mode; 2395 if (get_vector_mode (mode, nunits).exists (&rvv_mode)) 2396 return rvv_mode; 2397 } 2398 return word_mode; 2399 } 2400 2401 /* Subroutine of riscv_vector_expand_vector_init. 2402 Works as follows: 2403 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER. 2404 (b) Skip leading elements from BUILDER, which are the same as 2405 element NELTS_REQD - 1. 2406 (c) Insert earlier elements in reverse order in TARGET using vslide1down. 
*/ 2407 2408 static void 2409 expand_vector_init_insert_elems (rtx target, const rvv_builder &builder, 2410 int nelts_reqd) 2411 { 2412 machine_mode mode = GET_MODE (target); 2413 rtx dup = expand_vector_broadcast (mode, builder.elt (0)); 2414 emit_move_insn (target, dup); 2415 int ndups = builder.count_dups (0, nelts_reqd - 1, 1); 2416 for (int i = ndups; i < nelts_reqd; i++) 2417 { 2418 unsigned int unspec 2419 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1DOWN : UNSPEC_VSLIDE1DOWN; 2420 insn_code icode = code_for_pred_slide (unspec, mode); 2421 rtx ops[] = {target, target, builder.elt (i)}; 2422 emit_vlmax_insn (icode, BINARY_OP, ops); 2423 } 2424 } 2425 2426 /* Use merge approach to initialize the vector with repeating sequence. 2427 v = {a, b, a, b, a, b, a, b}. 2428 2429 v = broadcast (a). 2430 mask = 0b01010101.... 2431 v = merge (v, b, mask) 2432 */ 2433 static void 2434 expand_vector_init_merge_repeating_sequence (rtx target, 2435 const rvv_builder &builder) 2436 { 2437 /* We can't use BIT mode (BI) directly to generate mask = 0b01010... 2438 since we don't have such instruction in RVV. 2439 Instead, we should use INT mode (QI/HI/SI/DI) with integer move 2440 instruction to generate the mask data we want. */ 2441 machine_mode mask_bit_mode = get_mask_mode (builder.mode ()); 2442 machine_mode mask_int_mode 2443 = get_repeating_sequence_dup_machine_mode (builder, mask_bit_mode); 2444 uint64_t full_nelts = builder.full_nelts ().to_constant (); 2445 2446 /* Step 1: Broadcast the first pattern. */ 2447 rtx ops[] = {target, force_reg (builder.inner_mode (), builder.elt (0))}; 2448 emit_vlmax_insn (code_for_pred_broadcast (builder.mode ()), 2449 UNARY_OP, ops); 2450 /* Step 2: Merge the rest iteration of pattern. */ 2451 for (unsigned int i = 1; i < builder.npatterns (); i++) 2452 { 2453 /* Step 2-1: Generate mask register v0 for each merge. 
*/ 2454 rtx merge_mask 2455 = builder.get_merge_scalar_mask (i, GET_MODE_INNER (mask_int_mode)); 2456 rtx mask = gen_reg_rtx (mask_bit_mode); 2457 rtx dup = gen_reg_rtx (mask_int_mode); 2458 2459 if (full_nelts <= builder.inner_bits_size ()) /* vmv.s.x. */ 2460 { 2461 rtx ops[] = {dup, merge_mask}; 2462 emit_nonvlmax_insn (code_for_pred_broadcast (GET_MODE (dup)), 2463 SCALAR_MOVE_OP, ops, CONST1_RTX (Pmode)); 2464 } 2465 else /* vmv.v.x. */ 2466 { 2467 rtx ops[] = {dup, 2468 force_reg (GET_MODE_INNER (mask_int_mode), merge_mask)}; 2469 rtx vl = gen_int_mode (CEIL (full_nelts, builder.inner_bits_size ()), 2470 Pmode); 2471 emit_nonvlmax_insn (code_for_pred_broadcast (mask_int_mode), UNARY_OP, 2472 ops, vl); 2473 } 2474 2475 emit_move_insn (mask, gen_lowpart (mask_bit_mode, dup)); 2476 2477 /* Step 2-2: Merge pattern according to the mask. */ 2478 rtx ops[] = {target, target, builder.elt (i), mask}; 2479 emit_vlmax_insn (code_for_pred_merge_scalar (GET_MODE (target)), 2480 MERGE_OP, ops); 2481 } 2482 } 2483 2484 /* Use slideup approach to combine the vectors. 2485 v = {a, a, a, a, b, b, b, b} 2486 2487 First: 2488 v1 = {a, a, a, a, a, a, a, a} 2489 v2 = {b, b, b, b, b, b, b, b} 2490 v = slideup (v1, v2, nelt / 2) 2491 */ 2492 static void 2493 expand_vector_init_slideup_combine_sequence (rtx target, 2494 const rvv_builder &builder) 2495 { 2496 machine_mode mode = GET_MODE (target); 2497 int nelts = builder.full_nelts ().to_constant (); 2498 rtx first_elt = builder.elt (0); 2499 rtx last_elt = builder.elt (nelts - 1); 2500 rtx low = expand_vector_broadcast (mode, first_elt); 2501 rtx high = expand_vector_broadcast (mode, last_elt); 2502 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, mode); 2503 rtx ops[] = {target, low, high, gen_int_mode (nelts / 2, Pmode)}; 2504 emit_vlmax_insn (icode, SLIDEUP_OP_MERGE, ops); 2505 } 2506 2507 /* Use merge approach to merge a scalar into a vector. 
2508 v = {a, a, a, a, a, a, b, b} 2509 2510 v1 = {a, a, a, a, a, a, a, a} 2511 scalar = b 2512 mask = {0, 0, 0, 0, 0, 0, 1, 1} 2513 */ 2514 static void 2515 expand_vector_init_merge_combine_sequence (rtx target, 2516 const rvv_builder &builder) 2517 { 2518 machine_mode mode = GET_MODE (target); 2519 machine_mode imode = builder.int_mode (); 2520 machine_mode mmode = builder.mask_mode (); 2521 int nelts = builder.full_nelts ().to_constant (); 2522 int leading_ndups = builder.count_dups (0, nelts - 1, 1); 2523 if ((leading_ndups > 255 && GET_MODE_INNER (imode) == QImode) 2524 || riscv_get_v_regno_alignment (imode) > 1) 2525 imode = get_vector_mode (HImode, nelts).require (); 2526 2527 /* Generate vid = { 0, 1, 2, ..., n }. */ 2528 rtx vid = gen_reg_rtx (imode); 2529 expand_vec_series (vid, const0_rtx, const1_rtx); 2530 2531 /* Generate mask. */ 2532 rtx mask = gen_reg_rtx (mmode); 2533 insn_code icode = code_for_pred_cmp_scalar (imode); 2534 rtx index = gen_int_mode (leading_ndups - 1, builder.inner_int_mode ()); 2535 rtx dup_rtx = gen_rtx_VEC_DUPLICATE (imode, index); 2536 /* vmsgtu.vi/vmsgtu.vx. */ 2537 rtx cmp = gen_rtx_fmt_ee (GTU, mmode, vid, dup_rtx); 2538 rtx sel = builder.elt (nelts - 1); 2539 rtx mask_ops[] = {mask, cmp, vid, index}; 2540 emit_vlmax_insn (icode, COMPARE_OP, mask_ops); 2541 2542 /* Duplicate the first elements. */ 2543 rtx dup = expand_vector_broadcast (mode, builder.elt (0)); 2544 /* Merge scalar into vector according to mask. */ 2545 rtx merge_ops[] = {target, dup, sel, mask}; 2546 icode = code_for_pred_merge_scalar (mode); 2547 emit_vlmax_insn (icode, MERGE_OP, merge_ops); 2548 } 2549 2550 /* Subroutine of expand_vec_init to handle case 2551 when all trailing elements of builder are same. 2552 This works as follows: 2553 (a) Use expand_insn interface to broadcast last vector element in TARGET. 2554 (b) Insert remaining elements in TARGET using insr. 2555 2556 ??? 
The heuristic used is to do above if number of same trailing elements 2557 is greater than leading_ndups, loosely based on 2558 heuristic from mostly_zeros_p. May need fine-tuning. */ 2559 2560 static bool 2561 expand_vector_init_trailing_same_elem (rtx target, 2562 const rtx_vector_builder &builder, 2563 int nelts_reqd) 2564 { 2565 int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1); 2566 int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1); 2567 machine_mode mode = GET_MODE (target); 2568 2569 if (trailing_ndups > leading_ndups) 2570 { 2571 rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1)); 2572 for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--) 2573 { 2574 unsigned int unspec 2575 = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; 2576 insn_code icode = code_for_pred_slide (unspec, mode); 2577 rtx tmp = gen_reg_rtx (mode); 2578 rtx ops[] = {tmp, dup, builder.elt (i)}; 2579 emit_vlmax_insn (icode, BINARY_OP, ops); 2580 /* slide1up need source and dest to be different REG. */ 2581 dup = tmp; 2582 } 2583 2584 emit_move_insn (target, dup); 2585 return true; 2586 } 2587 2588 return false; 2589 } 2590 2591 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */ 2592 2593 void 2594 expand_vec_init (rtx target, rtx vals) 2595 { 2596 machine_mode mode = GET_MODE (target); 2597 int nelts = XVECLEN (vals, 0); 2598 2599 rvv_builder v (mode, nelts, 1); 2600 for (int i = 0; i < nelts; i++) 2601 v.quick_push (XVECEXP (vals, 0, i)); 2602 v.finalize (); 2603 2604 /* If the sequence is v = { a, a, a, a } just broadcast an element. */ 2605 if (v.is_repeating_sequence ()) 2606 { 2607 machine_mode mode = GET_MODE (target); 2608 rtx dup = expand_vector_broadcast (mode, v.elt (0)); 2609 emit_move_insn (target, dup); 2610 return; 2611 } 2612 2613 if (nelts > 3) 2614 { 2615 /* Case 1: Convert v = { a, b, a, b } into v = { ab, ab }. 
*/ 2616 if (v.can_duplicate_repeating_sequence_p ()) 2617 { 2618 rtx ele = v.get_merged_repeating_sequence (); 2619 rtx dup = expand_vector_broadcast (v.new_mode (), ele); 2620 emit_move_insn (target, gen_lowpart (mode, dup)); 2621 return; 2622 } 2623 2624 /* Case 2: Optimize repeating sequence cases that Case 1 can 2625 not handle and it is profitable. For example: 2626 ELEMENT BITSIZE = 64. 2627 v = {a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b}. 2628 We can't find a vector mode for "ab" which will be combined into 2629 128-bit element to duplicate. */ 2630 if (v.repeating_sequence_use_merge_profitable_p ()) 2631 { 2632 expand_vector_init_merge_repeating_sequence (target, v); 2633 return; 2634 } 2635 2636 /* Case 3: Optimize combine sequence. 2637 E.g. v = {a, a, a, a, a, a, a, a, b, b, b, b, b, b, b, b}. 2638 We can combine: 2639 v1 = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}. 2640 and 2641 v2 = {b, b, b, b, b, b, b, b, b, b, b, b, b, b, b, b}. 2642 by slideup. */ 2643 if (v.combine_sequence_use_slideup_profitable_p ()) 2644 { 2645 expand_vector_init_slideup_combine_sequence (target, v); 2646 return; 2647 } 2648 2649 /* Case 4: Optimize combine sequence. 2650 E.g. v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. 2651 2652 Generate vector: 2653 v = {a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a}. 2654 2655 Generate mask: 2656 mask = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}. 2657 2658 Merge b into v by mask: 2659 v = {a, a, a, a, a, a, a, a, a, a, a, b, b, b, b, b}. */ 2660 if (v.combine_sequence_use_merge_profitable_p ()) 2661 { 2662 expand_vector_init_merge_combine_sequence (target, v); 2663 return; 2664 } 2665 } 2666 2667 /* Optimize trailing same elements sequence: 2668 v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x}; */ 2669 if (!expand_vector_init_trailing_same_elem (target, v, nelts)) 2670 /* Handle common situation by vslide1down. This function can handle any 2671 situation of vec_init<mode>. 
Only the cases that are not optimized above 2672 will fall through here. */ 2673 expand_vector_init_insert_elems (target, v, nelts); 2674 } 2675 2676 /* Get insn code for corresponding comparison. */ 2677 2678 static insn_code 2679 get_cmp_insn_code (rtx_code code, machine_mode mode) 2680 { 2681 insn_code icode; 2682 switch (code) 2683 { 2684 case EQ: 2685 case NE: 2686 case LE: 2687 case LEU: 2688 case GT: 2689 case GTU: 2690 case LTGT: 2691 icode = code_for_pred_cmp (mode); 2692 break; 2693 case LT: 2694 case LTU: 2695 case GE: 2696 case GEU: 2697 if (FLOAT_MODE_P (mode)) 2698 icode = code_for_pred_cmp (mode); 2699 else 2700 icode = code_for_pred_ltge (mode); 2701 break; 2702 default: 2703 gcc_unreachable (); 2704 } 2705 return icode; 2706 } 2707 2708 /* This hook gives the vectorizer more vector mode options. We want it to not 2709 only try modes with the maximum number of units a full vector can hold but 2710 for example also half the number of units for a smaller elements size. 2711 Such vectors can be promoted to a full vector of widened elements 2712 (still with the same number of elements, essentially vectorizing at a 2713 fixed number of units rather than a fixed number of bytes). */ 2714 unsigned int 2715 autovectorize_vector_modes (vector_modes *modes, bool) 2716 { 2717 if (autovec_use_vlmax_p ()) 2718 { 2719 poly_uint64 full_size = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL; 2720 2721 /* Start with a RVV<LMUL>QImode where LMUL is the number of units that 2722 fit a whole vector. 2723 Then try LMUL = nunits / 2, nunits / 4 and nunits / 8 which 2724 is guided by the extensions we have available (vf2, vf4 and vf8). 2725 2726 - full_size: Try using full vectors for all element types. 2727 - full_size / 2: 2728 Try using 16-bit containers for 8-bit elements and full vectors 2729 for wider elements. 2730 - full_size / 4: 2731 Try using 32-bit containers for 8-bit and 16-bit elements and 2732 full vectors for wider elements. 
2733 - full_size / 8: 2734 Try using 64-bit containers for all element types. */ 2735 static const int rvv_factors[] = {1, 2, 4, 8, 16, 32, 64}; 2736 for (unsigned int i = 0; i < sizeof (rvv_factors) / sizeof (int); i++) 2737 { 2738 poly_uint64 units; 2739 machine_mode mode; 2740 if (can_div_trunc_p (full_size, rvv_factors[i], &units) 2741 && get_vector_mode (QImode, units).exists (&mode)) 2742 modes->safe_push (mode); 2743 } 2744 } 2745 /* Push all VLSmodes according to TARGET_MIN_VLEN. */ 2746 unsigned int i = 0; 2747 unsigned int base_size = TARGET_MIN_VLEN * TARGET_MAX_LMUL / 8; 2748 unsigned int size = base_size; 2749 machine_mode mode; 2750 while (size > 0 && get_vector_mode (QImode, size).exists (&mode)) 2751 { 2752 if (vls_mode_valid_p (mode)) 2753 modes->safe_push (mode); 2754 2755 i++; 2756 size = base_size / (1U << i); 2757 } 2758 /* Enable LOOP_VINFO comparison in COST model. */ 2759 return VECT_COMPARE_COSTS; 2760 } 2761 2762 /* Return true if we can find the related MODE according to default LMUL. */ 2763 static bool 2764 can_find_related_mode_p (machine_mode vector_mode, scalar_mode element_mode, 2765 poly_uint64 *nunits) 2766 { 2767 if (!autovec_use_vlmax_p ()) 2768 return false; 2769 if (riscv_v_ext_vector_mode_p (vector_mode) 2770 && multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL, 2771 GET_MODE_SIZE (element_mode), nunits)) 2772 return true; 2773 if (riscv_v_ext_vls_mode_p (vector_mode) 2774 && multiple_p (TARGET_MIN_VLEN * TARGET_MAX_LMUL, 2775 GET_MODE_SIZE (element_mode), nunits)) 2776 return true; 2777 return false; 2778 } 2779 2780 /* If the given VECTOR_MODE is an RVV mode, first get the largest number 2781 of units that fit into a full vector at the given ELEMENT_MODE. 2782 We will have the vectorizer call us with a successively decreasing 2783 number of units (as specified in autovectorize_vector_modes). 2784 The starting mode is always the one specified by preferred_simd_mode. 
*/
opt_machine_mode
vectorize_related_mode (machine_mode vector_mode, scalar_mode element_mode,
                        poly_uint64 nunits)
{
  /* TODO: We will support RVV VLS auto-vectorization mode in the future.  */
  poly_uint64 min_units;
  if (!can_find_related_mode_p (vector_mode, element_mode, &min_units))
    return default_vectorize_related_mode (vector_mode, element_mode, nunits);

  machine_mode rvv_mode;
  if (maybe_ne (nunits, 0U))
    {
      /* If we were given a number of units NUNITS, try to find an
         RVV vector mode of inner mode ELEMENT_MODE with the same
         number of units.  */
      if (multiple_p (min_units, nunits)
          && get_vector_mode (element_mode, nunits).exists (&rvv_mode))
        return rvv_mode;
    }
  else
    {
      /* Look for a vector mode with the same number of units as the
         VECTOR_MODE we were given.  We keep track of the minimum
         number of units so far which determines the smallest necessary
         but largest possible, suitable mode for vectorization.  */
      min_units = ordered_min (min_units, GET_MODE_SIZE (vector_mode));
      if (get_vector_mode (element_mode, min_units).exists (&rvv_mode))
        return rvv_mode;
    }

  /* No RVV mode was found: defer to the default implementation.  */
  return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}

/* Expand an RVV comparison.
*/ 2819 2820 void 2821 expand_vec_cmp (rtx target, rtx_code code, rtx op0, rtx op1, rtx mask, 2822 rtx maskoff) 2823 { 2824 machine_mode mask_mode = GET_MODE (target); 2825 machine_mode data_mode = GET_MODE (op0); 2826 insn_code icode = get_cmp_insn_code (code, data_mode); 2827 2828 if (code == LTGT) 2829 { 2830 rtx lt = gen_reg_rtx (mask_mode); 2831 rtx gt = gen_reg_rtx (mask_mode); 2832 expand_vec_cmp (lt, LT, op0, op1, mask, maskoff); 2833 expand_vec_cmp (gt, GT, op0, op1, mask, maskoff); 2834 icode = code_for_pred (IOR, mask_mode); 2835 rtx ops[] = {target, lt, gt}; 2836 emit_vlmax_insn (icode, BINARY_MASK_OP, ops); 2837 return; 2838 } 2839 2840 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, op0, op1); 2841 if (!mask && !maskoff) 2842 { 2843 rtx ops[] = {target, cmp, op0, op1}; 2844 emit_vlmax_insn (icode, COMPARE_OP, ops); 2845 } 2846 else 2847 { 2848 rtx ops[] = {target, mask, maskoff, cmp, op0, op1}; 2849 emit_vlmax_insn (icode, COMPARE_OP_MU, ops); 2850 } 2851 } 2852 2853 /* Expand an RVV floating-point comparison: 2854 2855 If CAN_INVERT_P is true, the caller can also handle inverted results; 2856 return true if the result is in fact inverted. */ 2857 2858 bool 2859 expand_vec_cmp_float (rtx target, rtx_code code, rtx op0, rtx op1, 2860 bool can_invert_p) 2861 { 2862 machine_mode mask_mode = GET_MODE (target); 2863 machine_mode data_mode = GET_MODE (op0); 2864 2865 /* If can_invert_p = true: 2866 It suffices to implement a u>= b as !(a < b) but with the NaNs masked off: 2867 2868 vmfeq.vv v0, va, va 2869 vmfeq.vv v1, vb, vb 2870 vmand.mm v0, v0, v1 2871 vmflt.vv v0, va, vb, v0.t 2872 vmnot.m v0, v0 2873 2874 And, if !HONOR_SNANS, then you can remove the vmand.mm by masking the 2875 second vmfeq.vv: 2876 2877 vmfeq.vv v0, va, va 2878 vmfeq.vv v0, vb, vb, v0.t 2879 vmflt.vv v0, va, vb, v0.t 2880 vmnot.m v0, v0 2881 2882 If can_invert_p = false: 2883 2884 # Example of implementing isgreater() 2885 vmfeq.vv v0, va, va # Only set where A is not NaN. 
2886 vmfeq.vv v1, vb, vb # Only set where B is not NaN. 2887 vmand.mm v0, v0, v1 # Only set where A and B are ordered, 2888 vmfgt.vv v0, va, vb, v0.t # so only set flags on ordered values. 2889 */ 2890 2891 rtx eq0 = gen_reg_rtx (mask_mode); 2892 rtx eq1 = gen_reg_rtx (mask_mode); 2893 switch (code) 2894 { 2895 case EQ: 2896 case NE: 2897 case LT: 2898 case LE: 2899 case GT: 2900 case GE: 2901 case LTGT: 2902 /* There is native support for the comparison. */ 2903 expand_vec_cmp (target, code, op0, op1); 2904 return false; 2905 case UNEQ: 2906 case ORDERED: 2907 case UNORDERED: 2908 case UNLT: 2909 case UNLE: 2910 case UNGT: 2911 case UNGE: 2912 /* vmfeq.vv v0, va, va */ 2913 expand_vec_cmp (eq0, EQ, op0, op0); 2914 if (HONOR_SNANS (data_mode)) 2915 { 2916 /* 2917 vmfeq.vv v1, vb, vb 2918 vmand.mm v0, v0, v1 2919 */ 2920 expand_vec_cmp (eq1, EQ, op1, op1); 2921 insn_code icode = code_for_pred (AND, mask_mode); 2922 rtx ops[] = {eq0, eq0, eq1}; 2923 emit_vlmax_insn (icode, BINARY_MASK_OP, ops); 2924 } 2925 else 2926 { 2927 /* vmfeq.vv v0, vb, vb, v0.t */ 2928 expand_vec_cmp (eq0, EQ, op1, op1, eq0, eq0); 2929 } 2930 break; 2931 default: 2932 gcc_unreachable (); 2933 } 2934 2935 if (code == ORDERED) 2936 { 2937 emit_move_insn (target, eq0); 2938 return false; 2939 } 2940 2941 /* There is native support for the inverse comparison. */ 2942 code = reverse_condition_maybe_unordered (code); 2943 if (code == ORDERED) 2944 emit_move_insn (target, eq0); 2945 else 2946 expand_vec_cmp (eq0, code, op0, op1, eq0, eq0); 2947 2948 if (can_invert_p) 2949 { 2950 emit_move_insn (target, eq0); 2951 return true; 2952 } 2953 2954 /* We use one_cmpl<mode>2 to make Combine PASS to combine mask instructions 2955 into: vmand.mm/vmnor.mm/vmnand.mm/vmnor.mm/vmxnor.mm. */ 2956 emit_insn (gen_rtx_SET (target, gen_rtx_NOT (mask_mode, eq0))); 2957 return false; 2958 } 2959 2960 /* Modulo all SEL indices to ensure they are all in range if [0, MAX_SEL]. 
2961 MAX_SEL is nunits - 1 if rtx_equal_p (op0, op1). Otherwise, it is 2962 2 * nunits - 1. */ 2963 static rtx 2964 modulo_sel_indices (rtx op0, rtx op1, rtx sel) 2965 { 2966 rtx sel_mod; 2967 machine_mode sel_mode = GET_MODE (sel); 2968 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode); 2969 poly_uint64 max_sel = rtx_equal_p (op0, op1) ? nunits - 1 : 2 * nunits - 1; 2970 /* If SEL is variable-length CONST_VECTOR, we don't need to modulo it. 2971 Or if SEL is constant-length within [0, MAX_SEL], no need to modulo the 2972 indice. */ 2973 if (CONST_VECTOR_P (sel) 2974 && (!nunits.is_constant () || const_vec_all_in_range_p (sel, 0, max_sel))) 2975 sel_mod = sel; 2976 else 2977 { 2978 rtx mod = gen_const_vector_dup (sel_mode, max_sel); 2979 sel_mod 2980 = expand_simple_binop (sel_mode, AND, sel, mod, NULL, 0, OPTAB_DIRECT); 2981 } 2982 return sel_mod; 2983 } 2984 2985 /* Implement vec_perm<mode>. */ 2986 2987 void 2988 expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) 2989 { 2990 machine_mode data_mode = GET_MODE (target); 2991 machine_mode sel_mode = GET_MODE (sel); 2992 poly_uint64 nunits = GET_MODE_NUNITS (sel_mode); 2993 2994 /* Check if the sel only references the first values vector. If each select 2995 index is in range of [0, nunits - 1]. A single vrgather instructions is 2996 enough. Since we will use vrgatherei16.vv for variable-length vector, 2997 it is never out of range and we don't need to modulo the index. */ 2998 if (nunits.is_constant () && const_vec_all_in_range_p (sel, 0, nunits - 1)) 2999 { 3000 emit_vlmax_gather_insn (target, op0, sel); 3001 return; 3002 } 3003 3004 /* Check if all the indices are same. 
*/ 3005 rtx elt; 3006 if (const_vec_duplicate_p (sel, &elt)) 3007 { 3008 poly_uint64 value = rtx_to_poly_int64 (elt); 3009 rtx op = op0; 3010 if (maybe_gt (value, nunits - 1)) 3011 { 3012 sel = gen_const_vector_dup (sel_mode, value - nunits); 3013 op = op1; 3014 } 3015 emit_vlmax_gather_insn (target, op, sel); 3016 } 3017 3018 /* Note: vec_perm indices are supposed to wrap when they go beyond the 3019 size of the two value vectors, i.e. the upper bits of the indices 3020 are effectively ignored. RVV vrgather instead produces 0 for any 3021 out-of-range indices, so we need to modulo all the vec_perm indices 3022 to ensure they are all in range of [0, nunits - 1] when op0 == op1 3023 or all in range of [0, 2 * nunits - 1] when op0 != op1. */ 3024 rtx sel_mod = modulo_sel_indices (op0, op1, sel); 3025 3026 /* Check if the two values vectors are the same. */ 3027 if (rtx_equal_p (op0, op1)) 3028 { 3029 emit_vlmax_gather_insn (target, op0, sel_mod); 3030 return; 3031 } 3032 3033 /* This following sequence is handling the case that: 3034 __builtin_shufflevector (vec1, vec2, index...), the index can be any 3035 value in range of [0, 2 * nunits - 1]. */ 3036 machine_mode mask_mode; 3037 mask_mode = get_mask_mode (data_mode); 3038 rtx mask = gen_reg_rtx (mask_mode); 3039 rtx max_sel = gen_const_vector_dup (sel_mode, nunits); 3040 3041 /* Step 1: generate a mask that should select everything >= nunits into the 3042 * mask. */ 3043 expand_vec_cmp (mask, GEU, sel_mod, max_sel); 3044 3045 /* Step2: gather every op0 values indexed by sel into target, 3046 we don't need to care about the result of the element 3047 whose index >= nunits. */ 3048 emit_vlmax_gather_insn (target, op0, sel_mod); 3049 3050 /* Step3: shift the range from (nunits, max_of_mode] to 3051 [0, max_of_mode - nunits]. 
*/ 3052 rtx tmp = gen_reg_rtx (sel_mode); 3053 rtx ops[] = {tmp, sel_mod, max_sel}; 3054 emit_vlmax_insn (code_for_pred (MINUS, sel_mode), BINARY_OP, ops); 3055 3056 /* Step4: gather those into the previously masked-out elements 3057 of target. */ 3058 emit_vlmax_masked_gather_mu_insn (target, op1, tmp, mask); 3059 } 3060 3061 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST for RVV. */ 3062 3063 /* vec_perm support. */ 3064 3065 struct expand_vec_perm_d 3066 { 3067 rtx target, op0, op1; 3068 vec_perm_indices perm; 3069 machine_mode vmode; 3070 machine_mode op_mode; 3071 bool one_vector_p; 3072 bool testing_p; 3073 }; 3074 3075 /* Return the appropriate index mode for gather instructions. */ 3076 opt_machine_mode 3077 get_gather_index_mode (struct expand_vec_perm_d *d) 3078 { 3079 machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); 3080 poly_uint64 nunits = GET_MODE_NUNITS (d->vmode); 3081 3082 if (GET_MODE_INNER (d->vmode) == QImode) 3083 { 3084 if (nunits.is_constant ()) 3085 { 3086 /* If indice is LMUL8 CONST_VECTOR and any element value 3087 exceed the range of 0 ~ 255, Forbid such permutation 3088 since we need vector HI mode to hold such indice and 3089 we don't have it. */ 3090 if (!d->perm.all_in_range_p (0, 255) 3091 && !get_vector_mode (HImode, nunits).exists (&sel_mode)) 3092 return opt_machine_mode (); 3093 } 3094 else 3095 { 3096 /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv. 3097 Otherwise, it could overflow the index range. */ 3098 if (!get_vector_mode (HImode, nunits).exists (&sel_mode)) 3099 return opt_machine_mode (); 3100 } 3101 } 3102 else if (riscv_get_v_regno_alignment (sel_mode) > 1 3103 && GET_MODE_INNER (sel_mode) != HImode) 3104 sel_mode = get_vector_mode (HImode, nunits).require (); 3105 return sel_mode; 3106 } 3107 3108 /* Recognize the patterns that we can use merge operation to shuffle the 3109 vectors. The value of Each element (index i) in selector can only be 3110 either i or nunits + i. 
We will check the pattern is actually monotonic. 3111 3112 E.g. 3113 v = VEC_PERM_EXPR (v0, v1, selector), 3114 selector = { 0, nunits + 1, 2, nunits + 3, 4, nunits + 5, ... } 3115 3116 We can transform such pattern into: 3117 3118 v = vcond_mask (v0, v1, mask), 3119 mask = { 0, 1, 0, 1, 0, 1, ... }. */ 3120 3121 static bool 3122 shuffle_merge_patterns (struct expand_vec_perm_d *d) 3123 { 3124 machine_mode vmode = d->vmode; 3125 machine_mode sel_mode = related_int_vector_mode (vmode).require (); 3126 int n_patterns = d->perm.encoding ().npatterns (); 3127 poly_int64 vec_len = d->perm.length (); 3128 3129 for (int i = 0; i < n_patterns; ++i) 3130 if (!known_eq (d->perm[i], i) && !known_eq (d->perm[i], vec_len + i)) 3131 return false; 3132 3133 /* Check the pattern is monotonic here, otherwise, return false. */ 3134 for (int i = n_patterns; i < n_patterns * 2; i++) 3135 if (!d->perm.series_p (i, n_patterns, i, n_patterns) 3136 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns)) 3137 return false; 3138 3139 /* We need to use precomputed mask for such situation and such mask 3140 can only be computed in compile-time known size modes. */ 3141 bool indices_fit_selector_p 3142 = GET_MODE_BITSIZE (GET_MODE_INNER (vmode)) > 8 || known_lt (vec_len, 256); 3143 if (!indices_fit_selector_p && !vec_len.is_constant ()) 3144 return false; 3145 3146 if (d->testing_p) 3147 return true; 3148 3149 machine_mode mask_mode = get_mask_mode (vmode); 3150 rtx mask = gen_reg_rtx (mask_mode); 3151 3152 if (indices_fit_selector_p) 3153 { 3154 /* MASK = SELECTOR < NUNTIS ? 1 : 0. 
*/ 3155 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); 3156 rtx x = gen_int_mode (vec_len, GET_MODE_INNER (sel_mode)); 3157 insn_code icode = code_for_pred_cmp_scalar (sel_mode); 3158 rtx cmp = gen_rtx_fmt_ee (LTU, mask_mode, sel, x); 3159 rtx ops[] = {mask, cmp, sel, x}; 3160 emit_vlmax_insn (icode, COMPARE_OP, ops); 3161 } 3162 else 3163 { 3164 /* For EEW8 and NUNITS may be larger than 255, we can't use vmsltu 3165 directly to generate the selector mask, instead, we can only use 3166 precomputed mask. 3167 3168 E.g. selector = <0, 257, 2, 259> for EEW8 vector with NUNITS = 256, we 3169 don't have a QImode scalar register to hold larger than 255. 3170 We also cannot hold that in a vector QImode register if LMUL = 8, and, 3171 since there is no larger HI mode vector we cannot create a larger 3172 selector. 3173 3174 As the mask is a simple {0, 1, ...} pattern and the length is known we 3175 can store it in a scalar register and broadcast it to a mask register. 3176 */ 3177 gcc_assert (vec_len.is_constant ()); 3178 int size = CEIL (GET_MODE_NUNITS (mask_mode).to_constant (), 8); 3179 machine_mode mode = get_vector_mode (QImode, size).require (); 3180 rtx tmp = gen_reg_rtx (mode); 3181 rvv_builder v (mode, 1, size); 3182 for (int i = 0; i < vec_len.to_constant () / 8; i++) 3183 { 3184 uint8_t value = 0; 3185 for (int j = 0; j < 8; j++) 3186 { 3187 int index = i * 8 + j; 3188 if (known_lt (d->perm[index], 256)) 3189 value |= 1 << j; 3190 } 3191 v.quick_push (gen_int_mode (value, QImode)); 3192 } 3193 emit_move_insn (tmp, v.build ()); 3194 emit_move_insn (mask, gen_lowpart (mask_mode, tmp)); 3195 } 3196 3197 /* TARGET = MASK ? OP0 : OP1. */ 3198 /* swap op0 and op1 since the order is opposite to pred_merge. 
*/ 3199 rtx ops2[] = {d->target, d->op1, d->op0, mask}; 3200 emit_vlmax_insn (code_for_pred_merge (vmode), MERGE_OP, ops2); 3201 return true; 3202 } 3203 3204 /* Recognize the consecutive index that we can use a single 3205 vrgather.v[x|i] to shuffle the vectors. 3206 3207 e.g. short[8] = VEC_PERM_EXPR <a, a, {0,1,0,1,0,1,0,1}> 3208 Use SEW = 32, index = 1 vrgather.vi to get the result. */ 3209 static bool 3210 shuffle_consecutive_patterns (struct expand_vec_perm_d *d) 3211 { 3212 machine_mode vmode = d->vmode; 3213 scalar_mode smode = GET_MODE_INNER (vmode); 3214 poly_int64 vec_len = d->perm.length (); 3215 HOST_WIDE_INT elt; 3216 3217 if (!vec_len.is_constant () || !d->perm[0].is_constant (&elt)) 3218 return false; 3219 int vlen = vec_len.to_constant (); 3220 3221 /* Compute the last element index of consecutive pattern from the leading 3222 consecutive elements. */ 3223 int last_consecutive_idx = -1; 3224 int consecutive_num = -1; 3225 for (int i = 1; i < vlen; i++) 3226 { 3227 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1)) 3228 break; 3229 last_consecutive_idx = i; 3230 consecutive_num = last_consecutive_idx + 1; 3231 } 3232 3233 int new_vlen = vlen / consecutive_num; 3234 if (last_consecutive_idx < 0 || consecutive_num == vlen 3235 || !pow2p_hwi (consecutive_num) || !pow2p_hwi (new_vlen)) 3236 return false; 3237 /* VEC_PERM <..., (index, index + 1, ... index + consecutive_num - 1)>. 3238 All elements of index, index + 1, ... index + consecutive_num - 1 should 3239 locate at the same vector. */ 3240 if (maybe_ge (d->perm[0], vec_len) 3241 != maybe_ge (d->perm[last_consecutive_idx], vec_len)) 3242 return false; 3243 /* If a vector has 8 elements. We allow optimizations on consecutive 3244 patterns e.g. <0, 1, 2, 3, 0, 1, 2, 3> or <4, 5, 6, 7, 4, 5, 6, 7>. 3245 Other patterns like <2, 3, 4, 5, 2, 3, 4, 5> are not feasible patterns 3246 to be optimized. 
*/ 3247 if (d->perm[0].to_constant () % consecutive_num != 0) 3248 return false; 3249 unsigned int container_bits = consecutive_num * GET_MODE_BITSIZE (smode); 3250 if (container_bits > 64) 3251 return false; 3252 else if (container_bits == 64) 3253 { 3254 if (!TARGET_VECTOR_ELEN_64) 3255 return false; 3256 else if (FLOAT_MODE_P (smode) && !TARGET_VECTOR_ELEN_FP_64) 3257 return false; 3258 } 3259 3260 /* Check the rest of elements are the same consecutive pattern. */ 3261 for (int i = consecutive_num; i < vlen; i++) 3262 if (maybe_ne (d->perm[i], d->perm[i % consecutive_num])) 3263 return false; 3264 3265 if (FLOAT_MODE_P (smode)) 3266 smode = float_mode_for_size (container_bits).require (); 3267 else 3268 smode = int_mode_for_size (container_bits, 0).require (); 3269 if (!get_vector_mode (smode, new_vlen).exists (&vmode)) 3270 return false; 3271 machine_mode sel_mode = related_int_vector_mode (vmode).require (); 3272 3273 /* Success! */ 3274 if (d->testing_p) 3275 return true; 3276 3277 int index = elt / consecutive_num; 3278 if (index >= new_vlen) 3279 index = index - new_vlen; 3280 rtx sel = gen_const_vector_dup (sel_mode, index); 3281 rtx op = elt >= vlen ? d->op0 : d->op1; 3282 emit_vlmax_gather_insn (gen_lowpart (vmode, d->target), 3283 gen_lowpart (vmode, op), sel); 3284 return true; 3285 } 3286 3287 /* Recognize the patterns that we can use compress operation to shuffle the 3288 vectors. The perm selector of compress pattern is divided into 2 part: 3289 The first part is the random index number < NUNITS. 3290 The second part is consecutive last N index number >= NUNITS. 3291 3292 E.g. 3293 v = VEC_PERM_EXPR (v0, v1, selector), 3294 selector = { 0, 2, 6, 7 } 3295 3296 We can transform such pattern into: 3297 3298 op1 = vcompress (op0, mask) 3299 mask = { 1, 0, 1, 0 } 3300 v = op1. 
*/ 3301 3302 static bool 3303 shuffle_compress_patterns (struct expand_vec_perm_d *d) 3304 { 3305 machine_mode vmode = d->vmode; 3306 poly_int64 vec_len = d->perm.length (); 3307 3308 if (!vec_len.is_constant ()) 3309 return false; 3310 3311 int vlen = vec_len.to_constant (); 3312 3313 /* It's not worthwhile the compress pattern has elemenets < 4 3314 and we can't modulo indices for compress pattern. */ 3315 if (known_ge (d->perm[vlen - 1], vlen * 2) || vlen < 4) 3316 return false; 3317 3318 /* Compress pattern doesn't work for one vector. */ 3319 if (d->one_vector_p) 3320 return false; 3321 3322 /* Compress point is the point that all elements value with index i >= 3323 compress point of the selector are all consecutive series increasing and 3324 each selector value >= NUNTIS. In this case, we could compress all elements 3325 of i < compress point into the op1. */ 3326 int compress_point = -1; 3327 for (int i = 0; i < vlen; i++) 3328 { 3329 if (compress_point < 0 && known_ge (d->perm[i], vec_len)) 3330 { 3331 compress_point = i; 3332 break; 3333 } 3334 } 3335 3336 /* We don't apply compress approach if we can't find the compress point. */ 3337 if (compress_point < 0) 3338 return false; 3339 3340 /* We can only apply compress approach when all index values from 0 to 3341 compress point are increasing. */ 3342 for (int i = 1; i < compress_point; i++) 3343 if (maybe_le (d->perm[i], d->perm[i - 1])) 3344 return false; 3345 3346 /* It must be series increasing from compress point. */ 3347 for (int i = 1 + compress_point; i < vlen; i++) 3348 if (maybe_ne (d->perm[i], d->perm[i - 1] + 1)) 3349 return false; 3350 3351 /* Success! */ 3352 if (d->testing_p) 3353 return true; 3354 3355 /* Check whether we need to slideup op1 to apply compress approach. 3356 3357 E.g. For index = { 0, 2, 6, 7}, since d->perm[i - 1] = 7 which 3358 is 2 * NUNITS - 1, so we don't need to slide up. 
3359 3360 For index = { 0, 2, 5, 6}, we need to slide op1 up before 3361 we apply compress approach. */ 3362 bool need_slideup_p = maybe_ne (d->perm[vlen - 1], 2 * vec_len - 1) 3363 && !const_vec_duplicate_p (d->op1); 3364 3365 /* If we leave it directly be handled by general gather, 3366 the code sequence will be: 3367 VECTOR LOAD selector 3368 GEU mask, selector, NUNITS 3369 GATHER dest, op0, selector 3370 SUB selector, selector, NUNITS 3371 GATHER dest, op1, selector, mask 3372 Each ALU operation is considered as COST = 1 and VECTOR LOAD is considered 3373 as COST = 4. So, we consider the general gather handling COST = 9. 3374 TODO: This cost is not accurate, we can adjust it by tune info. */ 3375 int general_cost = 9; 3376 3377 /* If we can use compress approach, the code squence will be: 3378 MASK LOAD mask 3379 COMPRESS op1, op0, mask 3380 If it needs slide up, it will be: 3381 MASK LOAD mask 3382 SLIDEUP op1 3383 COMPRESS op1, op0, mask 3384 By default, mask load COST = 2. 3385 TODO: This cost is not accurate, we can adjust it by tune info. */ 3386 int compress_cost = 4; 3387 3388 if (general_cost <= compress_cost) 3389 return false; 3390 3391 /* Build a mask that is true when selector element is true. 
*/ 3392 machine_mode mask_mode = get_mask_mode (vmode); 3393 rvv_builder builder (mask_mode, vlen, 1); 3394 for (int i = 0; i < vlen; i++) 3395 { 3396 bool is_compress_index = false; 3397 for (int j = 0; j < compress_point; j++) 3398 { 3399 if (known_eq (d->perm[j], i)) 3400 { 3401 is_compress_index = true; 3402 break; 3403 } 3404 } 3405 if (is_compress_index) 3406 builder.quick_push (CONST1_RTX (BImode)); 3407 else 3408 builder.quick_push (CONST0_RTX (BImode)); 3409 } 3410 rtx mask = force_reg (mask_mode, builder.build ()); 3411 3412 rtx merge = d->op1; 3413 if (need_slideup_p) 3414 { 3415 int slideup_cnt = vlen - (d->perm[vlen - 1].to_constant () % vlen) - 1; 3416 merge = gen_reg_rtx (vmode); 3417 rtx ops[] = {merge, d->op1, gen_int_mode (slideup_cnt, Pmode)}; 3418 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEUP, vmode); 3419 emit_vlmax_insn (icode, BINARY_OP, ops); 3420 } 3421 3422 insn_code icode = code_for_pred_compress (vmode); 3423 rtx ops[] = {d->target, merge, d->op0, mask}; 3424 emit_nonvlmax_insn (icode, COMPRESS_OP_MERGE, ops, 3425 gen_int_mode (vlen, Pmode)); 3426 return true; 3427 } 3428 3429 /* Recognize decompress patterns: 3430 3431 1. VEC_PERM_EXPR op0 and op1 3432 with isel = { 0, nunits, 1, nunits + 1, ... }. 3433 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }. 3434 3435 2. VEC_PERM_EXPR op0 and op1 3436 with isel = { 1/2 nunits, 3/2 nunits, 1/2 nunits+1, 3/2 nunits+1,... }. 3437 Slide down op0 and op1 with OFFSET = 1/2 nunits. 3438 Decompress op0 and op1 vector with the mask = { 0, 1, 0, 1, ... }. 3439 */ 3440 static bool 3441 shuffle_decompress_patterns (struct expand_vec_perm_d *d) 3442 { 3443 poly_uint64 nelt = d->perm.length (); 3444 machine_mode mask_mode = get_mask_mode (d->vmode); 3445 3446 /* For constant size indices, we dont't need to handle it here. 3447 Just leave it to vec_perm<mode>. 
*/ 3448 if (d->perm.length ().is_constant ()) 3449 return false; 3450 3451 poly_uint64 first = d->perm[0]; 3452 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt)) 3453 || !d->perm.series_p (0, 2, first, 1) 3454 || !d->perm.series_p (1, 2, first + nelt, 1)) 3455 return false; 3456 3457 /* Permuting two SEW8 variable-length vectors need vrgatherei16.vv. 3458 Otherwise, it could overflow the index range. */ 3459 machine_mode sel_mode = related_int_vector_mode (d->vmode).require (); 3460 if (GET_MODE_INNER (d->vmode) == QImode 3461 && !get_vector_mode (HImode, nelt).exists (&sel_mode)) 3462 return false; 3463 3464 /* Success! */ 3465 if (d->testing_p) 3466 return true; 3467 3468 rtx op0, op1; 3469 if (known_eq (first, 0U)) 3470 { 3471 op0 = d->op0; 3472 op1 = d->op1; 3473 } 3474 else 3475 { 3476 op0 = gen_reg_rtx (d->vmode); 3477 op1 = gen_reg_rtx (d->vmode); 3478 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, d->vmode); 3479 rtx ops0[] = {op0, d->op0, gen_int_mode (first, Pmode)}; 3480 rtx ops1[] = {op1, d->op1, gen_int_mode (first, Pmode)}; 3481 emit_vlmax_insn (icode, BINARY_OP, ops0); 3482 emit_vlmax_insn (icode, BINARY_OP, ops1); 3483 } 3484 /* Generate { 0, 1, .... } mask. 
*/ 3485 rtx vid = gen_reg_rtx (sel_mode); 3486 rtx vid_repeat = gen_reg_rtx (sel_mode); 3487 expand_vec_series (vid, const0_rtx, const1_rtx); 3488 rtx and_ops[] = {vid_repeat, vid, const1_rtx}; 3489 emit_vlmax_insn (code_for_pred_scalar (AND, sel_mode), BINARY_OP, and_ops); 3490 rtx const_vec = gen_const_vector_dup (sel_mode, 1); 3491 rtx mask = gen_reg_rtx (mask_mode); 3492 expand_vec_cmp (mask, EQ, vid_repeat, const_vec); 3493 emit_vlmax_decompress_insn (d->target, op0, op1, mask); 3494 return true; 3495 } 3496 3497 static bool 3498 shuffle_bswap_pattern (struct expand_vec_perm_d *d) 3499 { 3500 HOST_WIDE_INT diff; 3501 unsigned i, size, step; 3502 3503 if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff) 3504 return false; 3505 3506 step = diff + 1; 3507 size = step * GET_MODE_UNIT_BITSIZE (d->vmode); 3508 3509 switch (size) 3510 { 3511 case 16: 3512 break; 3513 case 32: 3514 case 64: 3515 /* We will have VEC_PERM_EXPR after rtl expand when invoking 3516 __builtin_bswap. It will generate about 9 instructions in 3517 loop as below, no matter it is bswap16, bswap32 or bswap64. 3518 .L2: 3519 1 vle16.v v4,0(a0) 3520 2 vmv.v.x v2,a7 3521 3 vand.vv v2,v6,v2 3522 4 slli a2,a5,1 3523 5 vrgatherei16.vv v1,v4,v2 3524 6 sub a4,a4,a5 3525 7 vse16.v v1,0(a3) 3526 8 add a0,a0,a2 3527 9 add a3,a3,a2 3528 bne a4,zero,.L2 3529 3530 But for bswap16 we may have a even simple code gen, which 3531 has only 7 instructions in loop as below. 3532 .L5 3533 1 vle8.v v2,0(a5) 3534 2 addi a5,a5,32 3535 3 vsrl.vi v4,v2,8 3536 4 vsll.vi v2,v2,8 3537 5 vor.vv v4,v4,v2 3538 6 vse8.v v4,0(a4) 3539 7 addi a4,a4,32 3540 bne a5,a6,.L5 3541 3542 Unfortunately, the instructions in loop will grow to 13 and 24 3543 for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn) 3544 for both the bswap64 and bswap32, but take shift and or (7 insn) 3545 for bswap16. 
3546 */ 3547 default: 3548 return false; 3549 } 3550 3551 for (i = 0; i < step; i++) 3552 if (!d->perm.series_p (i, step, diff - i, step)) 3553 return false; 3554 3555 /* Disable when nunits < 4 since the later generic approach 3556 is more profitable on BSWAP. */ 3557 if (!known_gt (GET_MODE_NUNITS (d->vmode), 2)) 3558 return false; 3559 3560 if (d->testing_p) 3561 return true; 3562 3563 machine_mode vhi_mode; 3564 poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2); 3565 3566 if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode)) 3567 return false; 3568 3569 /* Step-1: Move op0 to src with VHI mode. */ 3570 rtx src = gen_reg_rtx (vhi_mode); 3571 emit_move_insn (src, gen_lowpart (vhi_mode, d->op0)); 3572 3573 /* Step-2: Shift right 8 bits to dest. */ 3574 rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode), 3575 NULL_RTX, 0, OPTAB_DIRECT); 3576 3577 /* Step-3: Shift left 8 bits to src. */ 3578 src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode), 3579 NULL_RTX, 0, OPTAB_DIRECT); 3580 3581 /* Step-4: Logic Or dest and src to dest. */ 3582 dest = expand_binop (vhi_mode, ior_optab, dest, src, 3583 NULL_RTX, 0, OPTAB_DIRECT); 3584 3585 /* Step-5: Move src to target with VQI mode. */ 3586 emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); 3587 3588 return true; 3589 } 3590 3591 /* Recognize the pattern that can be shuffled by vec_extract and slide1up 3592 approach. */ 3593 3594 static bool 3595 shuffle_extract_and_slide1up_patterns (struct expand_vec_perm_d *d) 3596 { 3597 poly_int64 nunits = GET_MODE_NUNITS (d->vmode); 3598 3599 /* Recognize { nunits - 1, nunits, nunits + 1, ... }. */ 3600 if (!d->perm.series_p (0, 2, nunits - 1, 2) 3601 || !d->perm.series_p (1, 2, nunits, 2)) 3602 return false; 3603 3604 /* Disable when nunits < 4 since the later generic approach 3605 is more profitable on indice = { nunits - 1, nunits }. */ 3606 if (!known_gt (nunits, 2)) 3607 return false; 3608 3609 /* Success! 
*/ 3610 if (d->testing_p) 3611 return true; 3612 3613 /* Extract the last element of the first vector. */ 3614 scalar_mode smode = GET_MODE_INNER (d->vmode); 3615 rtx tmp = gen_reg_rtx (smode); 3616 emit_vec_extract (tmp, d->op0, gen_int_mode (nunits - 1, Pmode)); 3617 3618 /* Insert the scalar into element 0. */ 3619 unsigned int unspec 3620 = FLOAT_MODE_P (d->vmode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP; 3621 insn_code icode = code_for_pred_slide (unspec, d->vmode); 3622 rtx ops[] = {d->target, d->op1, tmp}; 3623 emit_vlmax_insn (icode, BINARY_OP, ops); 3624 return true; 3625 } 3626 3627 static bool 3628 shuffle_series_patterns (struct expand_vec_perm_d *d) 3629 { 3630 if (!d->one_vector_p || d->perm.encoding ().npatterns () != 1) 3631 return false; 3632 3633 poly_int64 el1 = d->perm[0]; 3634 poly_int64 el2 = d->perm[1]; 3635 poly_int64 el3 = d->perm[2]; 3636 3637 poly_int64 step1 = el2 - el1; 3638 poly_int64 step2 = el3 - el2; 3639 3640 bool need_insert = false; 3641 bool have_series = false; 3642 3643 /* Check for a full series. */ 3644 if (known_ne (step1, 0) && d->perm.series_p (0, 1, el1, step1)) 3645 have_series = true; 3646 3647 /* Check for a series starting at the second element. */ 3648 else if (known_ne (step2, 0) && d->perm.series_p (1, 1, el2, step2)) 3649 { 3650 have_series = true; 3651 need_insert = true; 3652 } 3653 3654 if (!have_series) 3655 return false; 3656 3657 /* Disable shuffle if we can't find an appropriate integer index mode for 3658 gather. */ 3659 machine_mode sel_mode; 3660 if (!get_gather_index_mode (d).exists (&sel_mode)) 3661 return false; 3662 3663 /* Success! */ 3664 if (d->testing_p) 3665 return true; 3666 3667 /* Create the series. */ 3668 machine_mode eltmode = Pmode; 3669 rtx series = gen_reg_rtx (sel_mode); 3670 expand_vec_series (series, gen_int_mode (need_insert ? el2 : el1, eltmode), 3671 gen_int_mode (need_insert ? step2 : step1, eltmode)); 3672 3673 /* Insert the remaining element if necessary. 
*/ 3674 if (need_insert) 3675 { 3676 insn_code icode = code_for_pred_slide (UNSPEC_VSLIDE1UP, sel_mode); 3677 rtx ops[] 3678 = {series, series, gen_int_mode (el1, GET_MODE_INNER (sel_mode))}; 3679 emit_vlmax_insn (icode, BINARY_OP, ops); 3680 } 3681 3682 emit_vlmax_gather_insn (d->target, d->op0, series); 3683 3684 return true; 3685 } 3686 3687 /* Recognize the pattern that can be shuffled by generic approach. */ 3688 3689 static bool 3690 shuffle_generic_patterns (struct expand_vec_perm_d *d) 3691 { 3692 machine_mode sel_mode; 3693 3694 /* We don't enable SLP for non-power of 2 NPATTERNS. */ 3695 if (!pow2p_hwi (d->perm.encoding().npatterns ())) 3696 return false; 3697 3698 /* Disable shuffle if we can't find an appropriate integer index mode for 3699 gather. */ 3700 if (!get_gather_index_mode (d).exists (&sel_mode)) 3701 return false; 3702 3703 /* Success! */ 3704 if (d->testing_p) 3705 return true; 3706 3707 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); 3708 /* Some FIXED-VLMAX/VLS vector permutation situations call targethook 3709 instead of expand vec_perm<mode>, we handle it directly. */ 3710 expand_vec_perm (d->target, d->op0, d->op1, sel); 3711 return true; 3712 } 3713 3714 /* This function recognizes and supports different permutation patterns 3715 and enable VLA SLP auto-vectorization. */ 3716 static bool 3717 expand_vec_perm_const_1 (struct expand_vec_perm_d *d) 3718 { 3719 gcc_assert (d->op_mode != E_VOIDmode); 3720 3721 /* The pattern matching functions above are written to look for a small 3722 number to begin the sequence (0, 1, N/2). If we begin with an index 3723 from the second operand, we can swap the operands. 
*/ 3724 poly_int64 nelt = d->perm.length (); 3725 if (known_ge (d->perm[0], nelt)) 3726 { 3727 d->perm.rotate_inputs (1); 3728 std::swap (d->op0, d->op1); 3729 } 3730 3731 if (known_gt (nelt, 1)) 3732 { 3733 if (d->vmode == d->op_mode) 3734 { 3735 if (shuffle_merge_patterns (d)) 3736 return true; 3737 if (shuffle_consecutive_patterns (d)) 3738 return true; 3739 if (shuffle_compress_patterns (d)) 3740 return true; 3741 if (shuffle_decompress_patterns (d)) 3742 return true; 3743 if (shuffle_bswap_pattern (d)) 3744 return true; 3745 if (shuffle_extract_and_slide1up_patterns (d)) 3746 return true; 3747 if (shuffle_series_patterns (d)) 3748 return true; 3749 if (shuffle_generic_patterns (d)) 3750 return true; 3751 return false; 3752 } 3753 else 3754 return false; 3755 } 3756 return false; 3757 } 3758 3759 /* This function implements TARGET_VECTORIZE_VEC_PERM_CONST by using RVV 3760 * instructions. */ 3761 bool 3762 expand_vec_perm_const (machine_mode vmode, machine_mode op_mode, rtx target, 3763 rtx op0, rtx op1, const vec_perm_indices &sel) 3764 { 3765 /* RVV doesn't have Mask type pack/unpack instructions and we don't use 3766 mask to do the iteration loop control. Just disable it directly. */ 3767 if (GET_MODE_CLASS (vmode) == MODE_VECTOR_BOOL) 3768 return false; 3769 /* FIXME: Explicitly disable VLA interleave SLP vectorization when we 3770 may encounter ICE for poly size (1, 1) vectors in loop vectorizer. 3771 Ideally, middle-end loop vectorizer should be able to disable it 3772 itself, We can remove the codes here when middle-end code is able 3773 to disable VLA SLP vectorization for poly size (1, 1) VF. */ 3774 if (!BYTES_PER_RISCV_VECTOR.is_constant () 3775 && maybe_lt (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL, 3776 poly_int64 (16, 16))) 3777 return false; 3778 3779 struct expand_vec_perm_d d; 3780 3781 /* Check whether the mask can be applied to a single vector. 
*/ 3782 if (sel.ninputs () == 1 || (op0 && rtx_equal_p (op0, op1))) 3783 d.one_vector_p = true; 3784 else if (sel.all_from_input_p (0)) 3785 { 3786 d.one_vector_p = true; 3787 op1 = op0; 3788 } 3789 else if (sel.all_from_input_p (1)) 3790 { 3791 d.one_vector_p = true; 3792 op0 = op1; 3793 } 3794 else 3795 d.one_vector_p = false; 3796 3797 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, 3798 sel.nelts_per_input ()); 3799 d.vmode = vmode; 3800 d.op_mode = op_mode; 3801 d.target = target; 3802 d.op0 = op0; 3803 if (op0 == op1) 3804 d.op1 = d.op0; 3805 else 3806 d.op1 = op1; 3807 d.testing_p = !target; 3808 3809 if (!d.testing_p) 3810 return expand_vec_perm_const_1 (&d); 3811 3812 rtx_insn *last = get_last_insn (); 3813 bool ret = expand_vec_perm_const_1 (&d); 3814 gcc_assert (last == get_last_insn ()); 3815 3816 return ret; 3817 } 3818 3819 /* Generate no side effects vsetvl to get the vector length. */ 3820 void 3821 expand_select_vl (rtx *ops) 3822 { 3823 poly_int64 nunits = rtx_to_poly_int64 (ops[2]); 3824 if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits)) 3825 { 3826 /* If length is known <= VF, we just use the length directly instead 3827 of using vsetvli. 3828 3829 E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]); 3830 We move 3 into _255 intead of using explicit vsetvl. */ 3831 emit_move_insn (ops[0], ops[1]); 3832 return; 3833 } 3834 /* We arbitrary picked QImode as inner scalar mode to get vector mode. 3835 since vsetvl only demand ratio. We let VSETVL PASS to optimize it. */ 3836 scalar_int_mode mode = QImode; 3837 machine_mode rvv_mode = get_vector_mode (mode, nunits).require (); 3838 emit_insn (gen_no_side_effects_vsetvl_rtx (rvv_mode, ops[0], ops[1])); 3839 } 3840 3841 /* Expand MASK_LEN_{LOAD,STORE}. 
*/ 3842 void 3843 expand_load_store (rtx *ops, bool is_load) 3844 { 3845 rtx mask = ops[2]; 3846 rtx len = ops[3]; 3847 machine_mode mode = GET_MODE (ops[0]); 3848 3849 if (is_vlmax_len_p (mode, len)) 3850 { 3851 /* If the length operand is equal to VF, it is VLMAX load/store. */ 3852 if (is_load) 3853 { 3854 rtx m_ops[] = {ops[0], mask, ops[1]}; 3855 emit_vlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops); 3856 } 3857 else 3858 { 3859 len = gen_reg_rtx (Pmode); 3860 emit_vlmax_vsetvl (mode, len); 3861 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len, 3862 get_avl_type_rtx (VLMAX))); 3863 } 3864 } 3865 else 3866 { 3867 if (!satisfies_constraint_vl (len)) 3868 len = force_reg (Pmode, len); 3869 if (is_load) 3870 { 3871 rtx m_ops[] = {ops[0], mask, ops[1]}; 3872 emit_nonvlmax_insn (code_for_pred_mov (mode), UNARY_OP_TAMA, m_ops, 3873 len); 3874 } 3875 else 3876 emit_insn (gen_pred_store (mode, ops[0], mask, ops[1], len, 3877 get_avl_type_rtx (NONVLMAX))); 3878 } 3879 } 3880 3881 3882 /* Return true if the operation is the floating-point operation need FRM. */ 3883 static bool 3884 needs_fp_rounding (unsigned icode, machine_mode mode) 3885 { 3886 if (!FLOAT_MODE_P (mode)) 3887 return false; 3888 3889 return icode != maybe_code_for_pred (SMIN, mode) 3890 && icode != maybe_code_for_pred (UNSPEC_VFMIN, mode) 3891 && icode != maybe_code_for_pred (SMAX, mode) 3892 && icode != maybe_code_for_pred (UNSPEC_VFMAX, mode) 3893 && icode != maybe_code_for_pred (NEG, mode) 3894 && icode != maybe_code_for_pred (ABS, mode) 3895 /* narrower-FP -> FP */ 3896 && icode != maybe_code_for_pred_extend (mode) 3897 /* narrower-INT -> FP */ 3898 && icode != maybe_code_for_pred_widen (FLOAT, mode) 3899 && icode != maybe_code_for_pred_widen (UNSIGNED_FLOAT, mode) 3900 /* vfsgnj */ 3901 && icode != maybe_code_for_pred (UNSPEC_VCOPYSIGN, mode) 3902 && icode != maybe_code_for_pred_mov (mode); 3903 } 3904 3905 /* Subroutine to expand COND_LEN_* patterns. 
*/ 3906 static void 3907 expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len) 3908 { 3909 rtx dest = ops[0]; 3910 rtx mask = ops[1]; 3911 machine_mode mode = GET_MODE (dest); 3912 machine_mode mask_mode = GET_MODE (mask); 3913 bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode)); 3914 bool is_vlmax_len = is_vlmax_len_p (mode, len); 3915 3916 unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type; 3917 /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len, 3918 dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such 3919 simplification in RISC-V backend and may do that in middle-end in the 3920 future. */ 3921 if (is_dummy_mask && is_vlmax_len) 3922 insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P; 3923 else if (is_dummy_mask) 3924 insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P; 3925 else if (is_vlmax_len) 3926 insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P; 3927 else 3928 insn_flags |= TU_POLICY_P | MU_POLICY_P; 3929 3930 if (needs_fp_rounding (icode, mode)) 3931 insn_flags |= FRM_DYN_P; 3932 3933 if (is_vlmax_len) 3934 emit_vlmax_insn (icode, insn_flags, ops); 3935 else 3936 emit_nonvlmax_insn (icode, insn_flags, ops, len); 3937 } 3938 3939 /* Return RVV_VUNDEF if the ELSE value is scratch rtx. */ 3940 static rtx 3941 get_else_operand (rtx op) 3942 { 3943 return GET_CODE (op) == SCRATCH ? RVV_VUNDEF (GET_MODE (op)) : op; 3944 } 3945 3946 /* Expand unary ops COND_LEN_*. */ 3947 void 3948 expand_cond_len_unop (unsigned icode, rtx *ops) 3949 { 3950 rtx dest = ops[0]; 3951 rtx mask = ops[1]; 3952 rtx src = ops[2]; 3953 rtx merge = get_else_operand (ops[3]); 3954 rtx len = ops[4]; 3955 3956 rtx cond_ops[] = {dest, mask, merge, src}; 3957 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len); 3958 } 3959 3960 /* Expand unary ops COND_*. 
*/ 3961 void 3962 expand_cond_unop (unsigned icode, rtx *ops) 3963 { 3964 rtx dest = ops[0]; 3965 rtx mask = ops[1]; 3966 rtx src = ops[2]; 3967 rtx merge = get_else_operand (ops[3]); 3968 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode); 3969 3970 rtx cond_ops[] = {dest, mask, merge, src}; 3971 expand_cond_len_op (icode, UNARY_OP_P, cond_ops, len); 3972 } 3973 3974 /* Expand binary ops COND_LEN_*. */ 3975 void 3976 expand_cond_len_binop (unsigned icode, rtx *ops) 3977 { 3978 rtx dest = ops[0]; 3979 rtx mask = ops[1]; 3980 rtx src1 = ops[2]; 3981 rtx src2 = ops[3]; 3982 rtx merge = get_else_operand (ops[4]); 3983 rtx len = ops[5]; 3984 3985 rtx cond_ops[] = {dest, mask, merge, src1, src2}; 3986 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len); 3987 } 3988 3989 /* Expand binary ops COND_*. */ 3990 void 3991 expand_cond_binop (unsigned icode, rtx *ops) 3992 { 3993 rtx dest = ops[0]; 3994 rtx mask = ops[1]; 3995 rtx src1 = ops[2]; 3996 rtx src2 = ops[3]; 3997 rtx merge = get_else_operand (ops[4]); 3998 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode); 3999 4000 rtx cond_ops[] = {dest, mask, merge, src1, src2}; 4001 expand_cond_len_op (icode, BINARY_OP_P, cond_ops, len); 4002 } 4003 4004 /* Prepare insn_code for gather_load/scatter_store according to 4005 the vector mode and index mode. 
*/
static insn_code
prepare_gather_scatter (machine_mode vec_mode, machine_mode idx_mode,
                        bool is_load)
{
  /* Stores share one pattern keyed on both modes.  */
  if (!is_load)
    return code_for_pred_indexed_store (UNSPEC_UNORDERED, vec_mode, idx_mode);

  /* Loads are split by the ratio between data and index element widths.  */
  unsigned idx_bits = GET_MODE_BITSIZE (GET_MODE_INNER (idx_mode));
  unsigned data_bits = GET_MODE_BITSIZE (GET_MODE_INNER (vec_mode));

  if (data_bits == idx_bits)
    return code_for_pred_indexed_load_same_eew (UNSPEC_UNORDERED, vec_mode);

  const bool data_wider_p = data_bits > idx_bits;
  const unsigned factor
    = data_wider_p ? data_bits / idx_bits : idx_bits / data_bits;

  switch (factor)
    {
    case 2:
      return data_wider_p
               ? code_for_pred_indexed_load_x2_greater_eew (UNSPEC_UNORDERED,
                                                            vec_mode)
               : code_for_pred_indexed_load_x2_smaller_eew (UNSPEC_UNORDERED,
                                                            vec_mode);
    case 4:
      return data_wider_p
               ? code_for_pred_indexed_load_x4_greater_eew (UNSPEC_UNORDERED,
                                                            vec_mode)
               : code_for_pred_indexed_load_x4_smaller_eew (UNSPEC_UNORDERED,
                                                            vec_mode);
    case 8:
      return data_wider_p
               ? code_for_pred_indexed_load_x8_greater_eew (UNSPEC_UNORDERED,
                                                            vec_mode)
               : code_for_pred_indexed_load_x8_smaller_eew (UNSPEC_UNORDERED,
                                                            vec_mode);
    default:
      gcc_unreachable ();
    }
}

/* Expand LEN_MASK_{GATHER_LOAD,SCATTER_STORE}.
*/ 4058 void 4059 expand_gather_scatter (rtx *ops, bool is_load) 4060 { 4061 rtx ptr, vec_offset, vec_reg; 4062 bool zero_extend_p; 4063 int scale_log2; 4064 rtx mask = ops[5]; 4065 rtx len = ops[6]; 4066 if (is_load) 4067 { 4068 vec_reg = ops[0]; 4069 ptr = ops[1]; 4070 vec_offset = ops[2]; 4071 zero_extend_p = INTVAL (ops[3]); 4072 scale_log2 = exact_log2 (INTVAL (ops[4])); 4073 } 4074 else 4075 { 4076 vec_reg = ops[4]; 4077 ptr = ops[0]; 4078 vec_offset = ops[1]; 4079 zero_extend_p = INTVAL (ops[2]); 4080 scale_log2 = exact_log2 (INTVAL (ops[3])); 4081 } 4082 4083 machine_mode vec_mode = GET_MODE (vec_reg); 4084 machine_mode idx_mode = GET_MODE (vec_offset); 4085 scalar_mode inner_idx_mode = GET_MODE_INNER (idx_mode); 4086 unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode); 4087 poly_int64 nunits = GET_MODE_NUNITS (vec_mode); 4088 bool is_vlmax = is_vlmax_len_p (vec_mode, len); 4089 4090 /* Extend the offset element to address width. */ 4091 if (inner_offsize < BITS_PER_WORD) 4092 { 4093 /* 7.2. Vector Load/Store Addressing Modes. 4094 If the vector offset elements are narrower than XLEN, they are 4095 zero-extended to XLEN before adding to the ptr effective address. If 4096 the vector offset elements are wider than XLEN, the least-significant 4097 XLEN bits are used in the address calculation. An implementation must 4098 raise an illegal instruction exception if the EEW is not supported for 4099 offset elements. 4100 4101 RVV spec only refers to the scale_log == 0 case. */ 4102 if (!zero_extend_p || scale_log2 != 0) 4103 { 4104 if (zero_extend_p) 4105 inner_idx_mode 4106 = int_mode_for_size (inner_offsize * 2, 0).require (); 4107 else 4108 inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require (); 4109 machine_mode new_idx_mode 4110 = get_vector_mode (inner_idx_mode, nunits).require (); 4111 rtx tmp = gen_reg_rtx (new_idx_mode); 4112 emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode, 4113 zero_extend_p ? 
true : false)); 4114 vec_offset = tmp; 4115 idx_mode = new_idx_mode; 4116 } 4117 } 4118 4119 if (scale_log2 != 0) 4120 { 4121 rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset, 4122 gen_int_mode (scale_log2, Pmode), NULL_RTX, 0, 4123 OPTAB_DIRECT); 4124 vec_offset = tmp; 4125 } 4126 4127 insn_code icode = prepare_gather_scatter (vec_mode, idx_mode, is_load); 4128 if (is_vlmax) 4129 { 4130 if (is_load) 4131 { 4132 rtx load_ops[] 4133 = {vec_reg, mask, ptr, vec_offset}; 4134 emit_vlmax_insn (icode, BINARY_OP_TAMA, load_ops); 4135 } 4136 else 4137 { 4138 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg}; 4139 emit_vlmax_insn (icode, SCATTER_OP_M, store_ops); 4140 } 4141 } 4142 else 4143 { 4144 if (is_load) 4145 { 4146 rtx load_ops[] 4147 = {vec_reg, mask, ptr, vec_offset}; 4148 emit_nonvlmax_insn (icode, BINARY_OP_TAMA, load_ops, len); 4149 } 4150 else 4151 { 4152 rtx store_ops[] = {mask, ptr, vec_offset, vec_reg}; 4153 emit_nonvlmax_insn (icode, SCATTER_OP_M, store_ops, len); 4154 } 4155 } 4156 } 4157 4158 /* Expand COND_LEN_*. */ 4159 void 4160 expand_cond_len_ternop (unsigned icode, rtx *ops) 4161 { 4162 rtx dest = ops[0]; 4163 rtx mask = ops[1]; 4164 rtx src1 = ops[2]; 4165 rtx src2 = ops[3]; 4166 rtx src3 = ops[4]; 4167 rtx merge = get_else_operand (ops[5]); 4168 rtx len = ops[6]; 4169 4170 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge}; 4171 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len); 4172 } 4173 4174 /* Expand COND_*. */ 4175 void 4176 expand_cond_ternop (unsigned icode, rtx *ops) 4177 { 4178 rtx dest = ops[0]; 4179 rtx mask = ops[1]; 4180 rtx src1 = ops[2]; 4181 rtx src2 = ops[3]; 4182 rtx src3 = ops[4]; 4183 rtx merge = get_else_operand (ops[5]); 4184 rtx len = gen_int_mode (GET_MODE_NUNITS (GET_MODE (dest)), Pmode); 4185 4186 rtx cond_ops[] = {dest, mask, src1, src2, src3, merge}; 4187 expand_cond_len_op (icode, TERNARY_OP_P, cond_ops, len); 4188 } 4189 4190 /* Expand reduction operations. 
4191 Case 1: ops = {scalar_dest, vector_src} 4192 Case 2: ops = {scalar_dest, vector_src, mask, vl} 4193 */ 4194 void 4195 expand_reduction (unsigned unspec, unsigned unspec_for_vl0_safe, 4196 unsigned insn_flags, rtx *ops, rtx init) 4197 { 4198 rtx scalar_dest = ops[0]; 4199 rtx vector_src = ops[1]; 4200 machine_mode vmode = GET_MODE (vector_src); 4201 machine_mode vel_mode = GET_MODE (scalar_dest); 4202 machine_mode m1_mode = get_m1_mode (vel_mode).require (); 4203 rtx vl_op = NULL_RTX; 4204 bool need_vl0_safe = false; 4205 if (need_mask_operand_p (insn_flags)) 4206 { 4207 vl_op = ops[3]; 4208 need_vl0_safe = !CONST_INT_P (vl_op) && !CONST_POLY_INT_P (vl_op); 4209 } 4210 4211 rtx m1_tmp = gen_reg_rtx (m1_mode); 4212 rtx scalar_move_ops[] = {m1_tmp, init}; 4213 insn_code icode = code_for_pred_broadcast (m1_mode); 4214 if (need_mask_operand_p (insn_flags)) 4215 { 4216 if (need_vl0_safe) 4217 emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, const1_rtx); 4218 else 4219 emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, vl_op); 4220 } 4221 else 4222 emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops); 4223 4224 rtx m1_tmp2 = gen_reg_rtx (m1_mode); 4225 rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp}; 4226 4227 if (need_vl0_safe) 4228 icode = code_for_pred (unspec_for_vl0_safe, vmode); 4229 else 4230 icode = code_for_pred (unspec, vmode); 4231 4232 if (need_mask_operand_p (insn_flags)) 4233 { 4234 rtx mask_len_reduc_ops[] = {m1_tmp2, ops[2], vector_src, m1_tmp}; 4235 emit_nonvlmax_insn (icode, insn_flags, mask_len_reduc_ops, vl_op); 4236 } 4237 else 4238 emit_vlmax_insn (icode, insn_flags, reduc_ops); 4239 4240 emit_insn (gen_pred_extract_first (m1_mode, scalar_dest, m1_tmp2)); 4241 } 4242 4243 /* Prepare ops for ternary operations. 4244 It can be called before or after RA. 
*/ 4245 void 4246 prepare_ternary_operands (rtx *ops) 4247 { 4248 machine_mode mode = GET_MODE (ops[0]); 4249 4250 if (!rtx_equal_p (ops[5], RVV_VUNDEF (mode)) 4251 && (VECTOR_MODE_P (GET_MODE (ops[2])) 4252 && !rtx_equal_p (ops[2], ops[5])) 4253 && !rtx_equal_p (ops[3], ops[5]) 4254 && !rtx_equal_p (ops[4], ops[5])) 4255 { 4256 /* RA will fail to find vector REG and report ICE, so we pre-merge 4257 the ops for LMUL = 8. */ 4258 if (satisfies_constraint_Wc1 (ops[1])) 4259 { 4260 emit_move_insn (ops[0], ops[5]); 4261 emit_insn (gen_pred_mov (mode, ops[0], ops[1], ops[0], ops[4], ops[6], 4262 ops[7], ops[8], ops[9])); 4263 } 4264 else 4265 emit_insn (gen_pred_merge (mode, ops[0], RVV_VUNDEF (mode), ops[5], 4266 ops[4], ops[1], ops[6], ops[7], ops[9])); 4267 ops[5] = ops[4] = ops[0]; 4268 } 4269 else 4270 { 4271 /* Swap the multiplication ops if the fallback value is the 4272 second of the two. */ 4273 if (rtx_equal_p (ops[3], ops[5])) 4274 std::swap (ops[2], ops[3]); 4275 4276 /* TODO: ??? Maybe we could support splitting FMA (a, 4, b) 4277 into PLUS (ASHIFT (a, 2), b) according to uarchs. */ 4278 } 4279 gcc_assert (rtx_equal_p (ops[5], RVV_VUNDEF (mode)) 4280 || rtx_equal_p (ops[5], ops[2]) || rtx_equal_p (ops[5], ops[4])); 4281 } 4282 4283 /* Expand VEC_MASK_LEN_{LOAD_LANES,STORE_LANES}. */ 4284 void 4285 expand_lanes_load_store (rtx *ops, bool is_load) 4286 { 4287 rtx mask = ops[2]; 4288 rtx len = ops[3]; 4289 rtx addr = is_load ? XEXP (ops[1], 0) : XEXP (ops[0], 0); 4290 rtx reg = is_load ? ops[0] : ops[1]; 4291 machine_mode mode = GET_MODE (ops[0]); 4292 4293 if (is_vlmax_len_p (mode, len)) 4294 { 4295 /* If the length operand is equal to VF, it is VLMAX load/store. 
*/ 4296 if (is_load) 4297 { 4298 rtx m_ops[] = {reg, mask, addr}; 4299 emit_vlmax_insn (code_for_pred_unit_strided_load (mode), UNARY_OP_TAMA, 4300 m_ops); 4301 } 4302 else 4303 { 4304 len = gen_reg_rtx (Pmode); 4305 emit_vlmax_vsetvl (mode, len); 4306 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len, 4307 get_avl_type_rtx (VLMAX))); 4308 } 4309 } 4310 else 4311 { 4312 if (!satisfies_constraint_vl (len)) 4313 len = force_reg (Pmode, len); 4314 if (is_load) 4315 { 4316 rtx m_ops[] = {reg, mask, addr}; 4317 emit_nonvlmax_insn (code_for_pred_unit_strided_load (mode), 4318 UNARY_OP_TAMA, m_ops, len); 4319 } 4320 else 4321 emit_insn (gen_pred_unit_strided_store (mode, mask, addr, reg, len, 4322 get_avl_type_rtx (NONVLMAX))); 4323 } 4324 } 4325 4326 /* Expand LEN_FOLD_EXTRACT_LAST. */ 4327 void 4328 expand_fold_extract_last (rtx *ops) 4329 { 4330 rtx dst = ops[0]; 4331 rtx default_value = ops[1]; 4332 rtx mask = ops[2]; 4333 rtx anchor = gen_reg_rtx (Pmode); 4334 rtx index = gen_reg_rtx (Pmode); 4335 rtx vect = ops[3]; 4336 rtx else_label = gen_label_rtx (); 4337 rtx end_label = gen_label_rtx (); 4338 rtx len = ops[4]; 4339 machine_mode mode = GET_MODE (vect); 4340 machine_mode mask_mode = GET_MODE (mask); 4341 rtx compress_vect = gen_reg_rtx (mode); 4342 rtx slide_vect = gen_reg_rtx (mode); 4343 insn_code icode; 4344 4345 if (is_vlmax_len_p (mode, len)) 4346 len = NULL_RTX; 4347 4348 /* Calculate the number of 1-bit in mask. */ 4349 rtx cpop_ops[] = {anchor, mask}; 4350 if (len) 4351 emit_nonvlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP, 4352 cpop_ops, len); 4353 else 4354 emit_vlmax_insn (code_for_pred_popcount (mask_mode, Pmode), CPOP_OP, 4355 cpop_ops); 4356 4357 riscv_expand_conditional_branch (else_label, EQ, anchor, const0_rtx); 4358 emit_insn (gen_rtx_SET (index, gen_rtx_PLUS (Pmode, anchor, constm1_rtx))); 4359 /* Compress the vector. 
*/ 4360 icode = code_for_pred_compress (mode); 4361 rtx compress_ops[] = {compress_vect, vect, mask}; 4362 if (len) 4363 emit_nonvlmax_insn (icode, COMPRESS_OP, compress_ops, len); 4364 else 4365 emit_vlmax_insn (icode, COMPRESS_OP, compress_ops); 4366 /* Emit the slide down to index 0 in a new vector. */ 4367 rtx slide_ops[] = {slide_vect, compress_vect, index}; 4368 icode = code_for_pred_slide (UNSPEC_VSLIDEDOWN, mode); 4369 if (len) 4370 emit_nonvlmax_insn (icode, BINARY_OP, slide_ops, len); 4371 else 4372 emit_vlmax_insn (icode, BINARY_OP, slide_ops); 4373 /* Emit v(f)mv.[xf].s. */ 4374 emit_insn (gen_pred_extract_first (mode, dst, slide_vect)); 4375 4376 emit_jump_insn (gen_jump (end_label)); 4377 emit_barrier (); 4378 emit_label (else_label); 4379 emit_move_insn (dst, default_value); 4380 emit_label (end_label); 4381 } 4382 4383 /* Return true if the LMUL of comparison less than or equal to one. */ 4384 bool 4385 cmp_lmul_le_one (machine_mode mode) 4386 { 4387 if (riscv_v_ext_vector_mode_p (mode)) 4388 return known_le (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR); 4389 else if (riscv_v_ext_vls_mode_p (mode)) 4390 return known_le (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN); 4391 return false; 4392 } 4393 4394 /* Return true if the LMUL of comparison greater than one. */ 4395 bool 4396 cmp_lmul_gt_one (machine_mode mode) 4397 { 4398 if (riscv_v_ext_vector_mode_p (mode)) 4399 return known_gt (GET_MODE_SIZE (mode), BYTES_PER_RISCV_VECTOR); 4400 else if (riscv_v_ext_vls_mode_p (mode)) 4401 return known_gt (GET_MODE_BITSIZE (mode), TARGET_MIN_VLEN); 4402 return false; 4403 } 4404 4405 /* Return true if the VLS mode is legal. There are 2 cases here. 4406 4407 1. Enable VLS modes for VLA vectorization since fixed length VLMAX mode 4408 is the highest priority choice and should not conflict with VLS modes. 4409 2. Enable VLS modes for some cases in fixed-vlmax, aka the bitsize of the 4410 VLS mode are smaller than the minimal vla. 
4411 4412 Take vlen = 2048 as example for case 2. 4413 4414 Note: Below table based on vlen = 2048. 4415 +----------------------------------------------------+----------------------+ 4416 | VLS mode | VLA mode | 4417 +----------------------------------------------------+----------------------+ 4418 | Name | Precision | Inner Precision | Enabled | Min mode | Min bits | 4419 +------------+-----------+-----------------+---------+-----------+----------+ 4420 | V1BI | 1 | 1 | Yes | RVVMF64BI | 32 | 4421 | V2BI | 2 | 1 | Yes | RVVMF64BI | 32 | 4422 | V4BI | 4 | 1 | Yes | RVVMF64BI | 32 | 4423 | V8BI | 8 | 1 | Yes | RVVMF64BI | 32 | 4424 | V16BI | 16 | 1 | Yes | RVVMF64BI | 32 | 4425 | V32BI | 32 | 1 | NO | RVVMF64BI | 32 | 4426 | V64BI | 64 | 1 | NO | RVVMF64BI | 32 | 4427 | ... | ... | ... | ... | RVVMF64BI | 32 | 4428 | V4096BI | 4096 | 1 | NO | RVVMF64BI | 32 | 4429 +------------+-----------+-----------------+---------+-----------+----------+ 4430 | V1QI | 8 | 8 | Yes | RVVMF8QI | 256 | 4431 | V2QI | 16 | 8 | Yes | RVVMF8QI | 256 | 4432 | V4QI | 32 | 8 | Yes | RVVMF8QI | 256 | 4433 | V8QI | 64 | 8 | Yes | RVVMF8QI | 256 | 4434 | V16QI | 128 | 8 | Yes | RVVMF8QI | 256 | 4435 | V32QI | 256 | 8 | NO | RVVMF8QI | 256 | 4436 | V64QI | 512 | 8 | NO | RVVMF8QI | 256 | 4437 | ... | ... | .. | ... | RVVMF8QI | 256 | 4438 | V4096QI | 32768 | 8 | NO | RVVMF8QI | 256 | 4439 +------------+-----------+-----------------+---------+-----------+----------+ 4440 | V1HI | 16 | 16 | Yes | RVVMF4HI | 512 | 4441 | V2HI | 32 | 16 | Yes | RVVMF4HI | 512 | 4442 | V4HI | 64 | 16 | Yes | RVVMF4HI | 512 | 4443 | V8HI | 128 | 16 | Yes | RVVMF4HI | 512 | 4444 | V16HI | 256 | 16 | Yes | RVVMF4HI | 512 | 4445 | V32HI | 512 | 16 | NO | RVVMF4HI | 512 | 4446 | V64HI | 1024 | 16 | NO | RVVMF4HI | 512 | 4447 | ... | ... | .. | ... 
| RVVMF4HI  | 512      |
   | V2048HI    | 32768     | 16              | NO      | RVVMF4HI  | 512      |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1SI/SF    | 32        | 32              | Yes     | RVVMF2SI  | 1024     |
   | V2SI/SF    | 64        | 32              | Yes     | RVVMF2SI  | 1024     |
   | V4SI/SF    | 128       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V8SI/SF    | 256       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V16SI/SF   | 512       | 32              | Yes     | RVVMF2SI  | 1024     |
   | V32SI/SF   | 1024      | 32              | NO      | RVVMF2SI  | 1024     |
   | V64SI/SF   | 2048      | 32              | NO      | RVVMF2SI  | 1024     |
   | ...        | ...       | ..              | ...     | RVVMF2SI  | 1024     |
   | V1024SI/SF | 32768     | 32              | NO      | RVVMF2SI  | 1024     |
   +------------+-----------+-----------------+---------+-----------+----------+
   | V1DI/DF    | 64        | 64              | Yes     | RVVM1DI   | 2048     |
   | V2DI/DF    | 128       | 64              | Yes     | RVVM1DI   | 2048     |
   | V4DI/DF    | 256       | 64              | Yes     | RVVM1DI   | 2048     |
   | V8DI/DF    | 512       | 64              | Yes     | RVVM1DI   | 2048     |
   | V16DI/DF   | 1024      | 64              | Yes     | RVVM1DI   | 2048     |
   | V32DI/DF   | 2048      | 64              | NO      | RVVM1DI   | 2048     |
   | V64DI/DF   | 4096      | 64              | NO      | RVVM1DI   | 2048     |
   | ...        | ...       | ..              | ...     | RVVM1DI   | 2048     |
   | V512DI/DF  | 32768     | 64              | NO      | RVVM1DI   | 2048     |
   +------------+-----------+-----------------+---------+-----------+----------+

   Then we can have the condition for VLS mode in fixed-vlmax, aka:
     PRECISION (VLSmode) < VLEN / (64 / PRECISION(VLS_inner_mode)).  */
bool
vls_mode_valid_p (machine_mode vls_mode)
{
  if (!TARGET_VECTOR || TARGET_XTHEADVECTOR)
    return false;

  if (rvv_vector_bits == RVV_VECTOR_BITS_SCALABLE)
    /* We enable VLS modes which are aligned with TARGET_MAX_LMUL and
       BITS_PER_RISCV_VECTOR.

       e.g. When TARGET_MAX_LMUL = 1 and BITS_PER_RISCV_VECTOR = (128,128).
       We enable VLS modes have fixed size <= 128bit.  Since ordered_p is
       false between VLA modes with size = (128, 128) bits and VLS mode
       with size = 128 bits, we will end up with multiple ICEs in
       middle-end generic codes.

       Boolean vector modes are exempt from the ordering requirement.  */
    return (GET_MODE_CLASS (vls_mode) == MODE_VECTOR_BOOL
            || ordered_p (TARGET_MAX_LMUL * BITS_PER_RISCV_VECTOR,
                          GET_MODE_PRECISION (vls_mode)));

  /* Any setting other than SCALABLE and ZVL rejects every VLS mode.  */
  if (rvv_vector_bits != RVV_VECTOR_BITS_ZVL)
    return false;

  /* Fixed-vlmax: the VLS mode must be strictly smaller than the smallest
     VLA mode with the same element precision (see the table above).  */
  machine_mode inner_mode = GET_MODE_INNER (vls_mode);
  int precision = GET_MODE_PRECISION (inner_mode).to_constant ();
  int min_vlmax_bitsize = TARGET_MIN_VLEN / (64 / precision);

  return GET_MODE_PRECISION (vls_mode).to_constant () < min_vlmax_bitsize;
}

/* We don't have to convert the floating point to integer when the
   mantissa is zero.  Thus, there will be a limitation for both the
   single and double precision floating point.  There will be no
   mantissa if the floating point is greater than the limit.

   1. Half floating point.
      +-----------+---------------+
      | float     | binary layout |
      +-----------+---------------+
      | 1023.5    | 0x63ff        |
      +-----------+---------------+
      | 1024.0    | 0x6400        |
      +-----------+---------------+
      | 1025.0    | 0x6401        |
      +-----------+---------------+
      | ...       | ...           |

      All half floating point will be unchanged for ceil if it is
      greater than and equal to 1024.

   2. Single floating point.
      +-----------+---------------+
      | float     | binary layout |
      +-----------+---------------+
      | 8388607.5 | 0x4affffff    |
      +-----------+---------------+
      | 8388608.0 | 0x4b000000    |
      +-----------+---------------+
      | 8388609.0 | 0x4b000001    |
      +-----------+---------------+
      | ...       | ...           |

      All single floating point will be unchanged for ceil if it is
      greater than and equal to 8388608.

   3. Double floating point.
4544 +--------------------+--------------------+ 4545 | float | binary layout | 4546 +--------------------+--------------------+ 4547 | 4503599627370495.5 | 0X432fffffffffffff | 4548 +--------------------+--------------------+ 4549 | 4503599627370496.0 | 0X4330000000000000 | 4550 +--------------------+--------------------+ 4551 | 4503599627370497.0 | 0X4340000000000000 | 4552 +--------------------+--------------------+ 4553 | ... | ... | 4554 4555 All double floating point will be unchanged for ceil if it is 4556 greater than and equal to 4503599627370496. 4557 */ 4558 static rtx 4559 get_fp_rounding_coefficient (machine_mode inner_mode) 4560 { 4561 REAL_VALUE_TYPE real; 4562 4563 if (inner_mode == E_HFmode) 4564 real_from_integer (&real, inner_mode, 1024, SIGNED); 4565 else if (inner_mode == E_SFmode) 4566 real_from_integer (&real, inner_mode, 8388608, SIGNED); 4567 else if (inner_mode == E_DFmode) 4568 real_from_integer (&real, inner_mode, 4503599627370496, SIGNED); 4569 else 4570 gcc_unreachable (); 4571 4572 return const_double_from_real_value (real, inner_mode); 4573 } 4574 4575 static rtx 4576 emit_vec_float_cmp_mask (rtx fp_vector, rtx_code code, rtx fp_scalar, 4577 machine_mode vec_fp_mode) 4578 { 4579 /* Step-1: Prepare the scalar float compare register. */ 4580 rtx fp_reg = gen_reg_rtx (GET_MODE_INNER (vec_fp_mode)); 4581 emit_insn (gen_move_insn (fp_reg, fp_scalar)); 4582 4583 /* Step-2: Generate the mask. 
*/ 4584 machine_mode mask_mode = get_mask_mode (vec_fp_mode); 4585 rtx mask = gen_reg_rtx (mask_mode); 4586 rtx cmp = gen_rtx_fmt_ee (code, mask_mode, fp_vector, fp_reg); 4587 rtx cmp_ops[] = {mask, cmp, fp_vector, fp_reg}; 4588 insn_code icode = code_for_pred_cmp_scalar (vec_fp_mode); 4589 emit_vlmax_insn (icode, COMPARE_OP, cmp_ops); 4590 4591 return mask; 4592 } 4593 4594 static void 4595 emit_vec_copysign (rtx op_dest, rtx op_src_0, rtx op_src_1, 4596 machine_mode vec_mode) 4597 { 4598 rtx sgnj_ops[] = {op_dest, op_src_0, op_src_1}; 4599 insn_code icode = code_for_pred (UNSPEC_VCOPYSIGN, vec_mode); 4600 4601 emit_vlmax_insn (icode, BINARY_OP, sgnj_ops); 4602 } 4603 4604 static void 4605 emit_vec_abs (rtx op_dest, rtx op_src, machine_mode vec_mode) 4606 { 4607 rtx abs_ops[] = {op_dest, op_src}; 4608 insn_code icode = code_for_pred (ABS, vec_mode); 4609 4610 emit_vlmax_insn (icode, UNARY_OP, abs_ops); 4611 } 4612 4613 static void 4614 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, rtx mask, 4615 insn_type type, machine_mode vec_mode) 4616 { 4617 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode); 4618 4619 if (type & USE_VUNDEF_MERGE_P) 4620 { 4621 rtx cvt_x_ops[] = {op_dest, mask, op_src}; 4622 emit_vlmax_insn (icode, type, cvt_x_ops); 4623 } 4624 else 4625 { 4626 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src}; 4627 emit_vlmax_insn (icode, type, cvt_x_ops); 4628 } 4629 } 4630 4631 static void 4632 emit_vec_cvt_x_f (rtx op_dest, rtx op_src, insn_type type, 4633 machine_mode vec_mode) 4634 { 4635 rtx ops[] = {op_dest, op_src}; 4636 insn_code icode = code_for_pred_fcvt_x_f (UNSPEC_VFCVT, vec_mode); 4637 4638 emit_vlmax_insn (icode, type, ops); 4639 } 4640 4641 static void 4642 emit_vec_narrow_cvt_x_f (rtx op_dest, rtx op_src, insn_type type, 4643 machine_mode vec_mode) 4644 { 4645 rtx ops[] = {op_dest, op_src}; 4646 insn_code icode = code_for_pred_narrow_fcvt_x_f (UNSPEC_VFCVT, vec_mode); 4647 4648 emit_vlmax_insn (icode, type, ops); 4649 } 4650 4651 
static void 4652 emit_vec_widden_cvt_x_f (rtx op_dest, rtx op_src, insn_type type, 4653 machine_mode vec_mode) 4654 { 4655 rtx ops[] = {op_dest, op_src}; 4656 insn_code icode = code_for_pred_widen_fcvt_x_f (UNSPEC_VFCVT, vec_mode); 4657 4658 emit_vlmax_insn (icode, type, ops); 4659 } 4660 4661 static void 4662 emit_vec_widden_cvt_f_f (rtx op_dest, rtx op_src, insn_type type, 4663 machine_mode vec_mode) 4664 { 4665 rtx ops[] = {op_dest, op_src}; 4666 insn_code icode = code_for_pred_extend (vec_mode); 4667 4668 emit_vlmax_insn (icode, type, ops); 4669 } 4670 4671 static void 4672 emit_vec_cvt_f_x (rtx op_dest, rtx op_src, rtx mask, 4673 insn_type type, machine_mode vec_mode) 4674 { 4675 rtx cvt_fp_ops[] = {op_dest, mask, op_dest, op_src}; 4676 insn_code icode = code_for_pred (FLOAT, vec_mode); 4677 4678 emit_vlmax_insn (icode, type, cvt_fp_ops); 4679 } 4680 4681 static void 4682 emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask, 4683 insn_type type, machine_mode vec_mode) 4684 { 4685 insn_code icode = code_for_pred (FIX, vec_mode); 4686 4687 if (type & USE_VUNDEF_MERGE_P) 4688 { 4689 rtx cvt_x_ops[] = {op_dest, mask, op_src}; 4690 emit_vlmax_insn (icode, type, cvt_x_ops); 4691 } 4692 else 4693 { 4694 rtx cvt_x_ops[] = {op_dest, mask, op_dest, op_src}; 4695 emit_vlmax_insn (icode, type, cvt_x_ops); 4696 } 4697 } 4698 4699 void 4700 expand_vec_ceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4701 machine_mode vec_int_mode) 4702 { 4703 /* Step-1: Get the abs float value for mask generation. */ 4704 emit_vec_abs (op_0, op_1, vec_fp_mode); 4705 4706 /* Step-2: Generate the mask on const fp. */ 4707 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode)); 4708 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode); 4709 4710 /* Step-3: Convert to integer on mask, with rounding up (aka ceil). 
*/ 4711 rtx tmp = gen_reg_rtx (vec_int_mode); 4712 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RUP, vec_fp_mode); 4713 4714 /* Step-4: Convert to floating-point on mask for the final result. 4715 To avoid unnecessary frm register access, we use RUP here and it will 4716 never do the rounding up because the tmp rtx comes from the float 4717 to int conversion. */ 4718 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RUP, vec_fp_mode); 4719 4720 /* Step-5: Retrieve the sign bit for -0.0. */ 4721 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode); 4722 } 4723 4724 void 4725 expand_vec_floor (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4726 machine_mode vec_int_mode) 4727 { 4728 /* Step-1: Get the abs float value for mask generation. */ 4729 emit_vec_abs (op_0, op_1, vec_fp_mode); 4730 4731 /* Step-2: Generate the mask on const fp. */ 4732 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode)); 4733 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode); 4734 4735 /* Step-3: Convert to integer on mask, with rounding down (aka floor). */ 4736 rtx tmp = gen_reg_rtx (vec_int_mode); 4737 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RDN, vec_fp_mode); 4738 4739 /* Step-4: Convert to floating-point on mask for the floor result. */ 4740 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RDN, vec_fp_mode); 4741 4742 /* Step-5: Retrieve the sign bit for -0.0. */ 4743 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode); 4744 } 4745 4746 void 4747 expand_vec_nearbyint (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4748 machine_mode vec_int_mode) 4749 { 4750 /* Step-1: Get the abs float value for mask generation. */ 4751 emit_vec_abs (op_0, op_1, vec_fp_mode); 4752 4753 /* Step-2: Generate the mask on const fp. 
*/ 4754 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode)); 4755 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode); 4756 4757 /* Step-3: Backup FP exception flags, nearbyint never raise exceptions. */ 4758 rtx fflags = gen_reg_rtx (SImode); 4759 emit_insn (gen_riscv_frflags (fflags)); 4760 4761 /* Step-4: Convert to integer on mask, with rounding down (aka nearbyint). */ 4762 rtx tmp = gen_reg_rtx (vec_int_mode); 4763 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode); 4764 4765 /* Step-5: Convert to floating-point on mask for the nearbyint result. */ 4766 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode); 4767 4768 /* Step-6: Restore FP exception flags. */ 4769 emit_insn (gen_riscv_fsflags (fflags)); 4770 4771 /* Step-7: Retrieve the sign bit for -0.0. */ 4772 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode); 4773 } 4774 4775 void 4776 expand_vec_rint (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4777 machine_mode vec_int_mode) 4778 { 4779 /* Step-1: Get the abs float value for mask generation. */ 4780 emit_vec_abs (op_0, op_1, vec_fp_mode); 4781 4782 /* Step-2: Generate the mask on const fp. */ 4783 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode)); 4784 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode); 4785 4786 /* Step-3: Convert to integer on mask, with dyn rounding (aka rint). */ 4787 rtx tmp = gen_reg_rtx (vec_int_mode); 4788 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_DYN, vec_fp_mode); 4789 4790 /* Step-4: Convert to floating-point on mask for the rint result. */ 4791 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode); 4792 4793 /* Step-5: Retrieve the sign bit for -0.0. 
*/ 4794 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode); 4795 } 4796 4797 void 4798 expand_vec_round (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4799 machine_mode vec_int_mode) 4800 { 4801 /* Step-1: Get the abs float value for mask generation. */ 4802 emit_vec_abs (op_0, op_1, vec_fp_mode); 4803 4804 /* Step-2: Generate the mask on const fp. */ 4805 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode)); 4806 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode); 4807 4808 /* Step-3: Convert to integer on mask, rounding to nearest (aka round). */ 4809 rtx tmp = gen_reg_rtx (vec_int_mode); 4810 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RMM, vec_fp_mode); 4811 4812 /* Step-4: Convert to floating-point on mask for the round result. */ 4813 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RMM, vec_fp_mode); 4814 4815 /* Step-5: Retrieve the sign bit for -0.0. */ 4816 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode); 4817 } 4818 4819 void 4820 expand_vec_trunc (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4821 machine_mode vec_int_mode) 4822 { 4823 /* Step-1: Get the abs float value for mask generation. */ 4824 emit_vec_abs (op_0, op_1, vec_fp_mode); 4825 4826 /* Step-2: Generate the mask on const fp. */ 4827 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode)); 4828 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode); 4829 4830 /* Step-3: Convert to integer on mask, rounding to zero (aka truncate). */ 4831 rtx tmp = gen_reg_rtx (vec_int_mode); 4832 emit_vec_cvt_x_f_rtz (tmp, op_1, mask, UNARY_OP_TAMA, vec_fp_mode); 4833 4834 /* Step-4: Convert to floating-point on mask for the rint result. */ 4835 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_DYN, vec_fp_mode); 4836 4837 /* Step-5: Retrieve the sign bit for -0.0. 
*/ 4838 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode); 4839 } 4840 4841 void 4842 expand_vec_roundeven (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4843 machine_mode vec_int_mode) 4844 { 4845 /* Step-1: Get the abs float value for mask generation. */ 4846 emit_vec_abs (op_0, op_1, vec_fp_mode); 4847 4848 /* Step-2: Generate the mask on const fp. */ 4849 rtx const_fp = get_fp_rounding_coefficient (GET_MODE_INNER (vec_fp_mode)); 4850 rtx mask = emit_vec_float_cmp_mask (op_0, LT, const_fp, vec_fp_mode); 4851 4852 /* Step-3: Convert to integer on mask, rounding to nearest, ties to even. */ 4853 rtx tmp = gen_reg_rtx (vec_int_mode); 4854 emit_vec_cvt_x_f (tmp, op_1, mask, UNARY_OP_TAMA_FRM_RNE, vec_fp_mode); 4855 4856 /* Step-4: Convert to floating-point on mask for the rint result. */ 4857 emit_vec_cvt_f_x (op_0, tmp, mask, UNARY_OP_TAMU_FRM_RNE, vec_fp_mode); 4858 4859 /* Step-5: Retrieve the sign bit for -0.0. */ 4860 emit_vec_copysign (op_0, op_0, op_1, vec_fp_mode); 4861 } 4862 4863 /* Handling the rounding from floating-point to int/long/long long. */ 4864 static void 4865 emit_vec_rounding_to_integer (rtx op_0, rtx op_1, insn_type type, 4866 machine_mode vec_fp_mode, 4867 machine_mode vec_int_mode, 4868 machine_mode vec_bridge_mode = E_VOIDmode) 4869 { 4870 poly_uint16 vec_fp_size = GET_MODE_SIZE (vec_fp_mode); 4871 poly_uint16 vec_int_size = GET_MODE_SIZE (vec_int_mode); 4872 4873 if (known_eq (vec_fp_size, vec_int_size)) /* SF => SI, DF => DI. */ 4874 emit_vec_cvt_x_f (op_0, op_1, type, vec_fp_mode); 4875 else if (maybe_eq (vec_fp_size, vec_int_size * 2)) /* DF => SI. */ 4876 emit_vec_narrow_cvt_x_f (op_0, op_1, type, vec_fp_mode); 4877 else if (maybe_eq (vec_fp_size * 2, vec_int_size)) /* SF => DI, HF => SI. */ 4878 emit_vec_widden_cvt_x_f (op_0, op_1, type, vec_int_mode); 4879 else if (maybe_eq (vec_fp_size * 4, vec_int_size)) /* HF => DI. 
*/ 4880 { 4881 gcc_assert (vec_bridge_mode != E_VOIDmode); 4882 4883 rtx op_sf = gen_reg_rtx (vec_bridge_mode); 4884 4885 /* Step-1: HF => SF, no rounding here. */ 4886 emit_vec_widden_cvt_f_f (op_sf, op_1, UNARY_OP, vec_bridge_mode); 4887 /* Step-2: SF => DI. */ 4888 emit_vec_widden_cvt_x_f (op_0, op_sf, type, vec_int_mode); 4889 } 4890 else 4891 gcc_unreachable (); 4892 } 4893 4894 void 4895 expand_vec_lrint (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4896 machine_mode vec_int_mode, machine_mode vec_bridge_mode) 4897 { 4898 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_DYN, vec_fp_mode, 4899 vec_int_mode, vec_bridge_mode); 4900 } 4901 4902 void 4903 expand_vec_lround (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4904 machine_mode vec_int_mode, machine_mode vec_bridge_mode) 4905 { 4906 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RMM, vec_fp_mode, 4907 vec_int_mode, vec_bridge_mode); 4908 } 4909 4910 void 4911 expand_vec_lceil (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4912 machine_mode vec_int_mode) 4913 { 4914 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RUP, vec_fp_mode, 4915 vec_int_mode); 4916 } 4917 4918 void 4919 expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode, 4920 machine_mode vec_int_mode) 4921 { 4922 emit_vec_rounding_to_integer (op_0, op_1, UNARY_OP_FRM_RDN, vec_fp_mode, 4923 vec_int_mode); 4924 } 4925 4926 /* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as 4927 well. 
*/ 4928 void 4929 expand_popcount (rtx *ops) 4930 { 4931 rtx dst = ops[0]; 4932 rtx src = ops[1]; 4933 machine_mode mode = GET_MODE (dst); 4934 scalar_mode imode = GET_MODE_INNER (mode); 4935 static const uint64_t m5 = 0x5555555555555555ULL; 4936 static const uint64_t m3 = 0x3333333333333333ULL; 4937 static const uint64_t mf = 0x0F0F0F0F0F0F0F0FULL; 4938 static const uint64_t m1 = 0x0101010101010101ULL; 4939 4940 rtx x1 = gen_reg_rtx (mode); 4941 rtx x2 = gen_reg_rtx (mode); 4942 rtx x3 = gen_reg_rtx (mode); 4943 rtx x4 = gen_reg_rtx (mode); 4944 4945 /* x1 = src - (src >> 1) & 0x555...); */ 4946 rtx shift1 = expand_binop (mode, lshr_optab, src, GEN_INT (1), NULL, true, 4947 OPTAB_DIRECT); 4948 4949 rtx and1 = gen_reg_rtx (mode); 4950 rtx ops1[] = {and1, shift1, gen_int_mode (m5, imode)}; 4951 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP, 4952 ops1); 4953 4954 x1 = expand_binop (mode, sub_optab, src, and1, NULL, true, OPTAB_DIRECT); 4955 4956 /* x2 = (x1 & 0x3333333333333333ULL) + ((x1 >> 2) & 0x3333333333333333ULL); 4957 */ 4958 rtx and2 = gen_reg_rtx (mode); 4959 rtx ops2[] = {and2, x1, gen_int_mode (m3, imode)}; 4960 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP, 4961 ops2); 4962 4963 rtx shift2 = expand_binop (mode, lshr_optab, x1, GEN_INT (2), NULL, true, 4964 OPTAB_DIRECT); 4965 4966 rtx and22 = gen_reg_rtx (mode); 4967 rtx ops22[] = {and22, shift2, gen_int_mode (m3, imode)}; 4968 emit_vlmax_insn (code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP, 4969 ops22); 4970 4971 x2 = expand_binop (mode, add_optab, and2, and22, NULL, true, OPTAB_DIRECT); 4972 4973 /* x3 = (x2 + (x2 >> 4)) & 0x0f0f0f0f0f0f0f0fULL; */ 4974 rtx shift3 = expand_binop (mode, lshr_optab, x2, GEN_INT (4), NULL, true, 4975 OPTAB_DIRECT); 4976 4977 rtx plus3 4978 = expand_binop (mode, add_optab, x2, shift3, NULL, true, OPTAB_DIRECT); 4979 4980 rtx ops3[] = {x3, plus3, gen_int_mode (mf, imode)}; 4981 emit_vlmax_insn 
(code_for_pred_scalar (AND, mode), riscv_vector::BINARY_OP, 4982 ops3); 4983 4984 /* dest = (x3 * 0x0101010101010101ULL) >> 56; */ 4985 rtx mul4 = gen_reg_rtx (mode); 4986 rtx ops4[] = {mul4, x3, gen_int_mode (m1, imode)}; 4987 emit_vlmax_insn (code_for_pred_scalar (MULT, mode), riscv_vector::BINARY_OP, 4988 ops4); 4989 4990 x4 = expand_binop (mode, lshr_optab, mul4, 4991 GEN_INT (GET_MODE_BITSIZE (imode) - 8), NULL, true, 4992 OPTAB_DIRECT); 4993 4994 emit_move_insn (dst, x4); 4995 } 4996 4997 /* Return true if it is VLMAX AVL TYPE. */ 4998 bool 4999 vlmax_avl_type_p (rtx_insn *rinsn) 5000 { 5001 extract_insn_cached (rinsn); 5002 int index = get_attr_avl_type_idx (rinsn); 5003 if (index == INVALID_ATTRIBUTE) 5004 return false; 5005 5006 gcc_assert (index < recog_data.n_operands); 5007 5008 rtx avl_type = recog_data.operand[index]; 5009 return INTVAL (avl_type) == VLMAX; 5010 } 5011 5012 /* Return true if it is an RVV instruction depends on VL global 5013 status register. */ 5014 bool 5015 has_vl_op (rtx_insn *rinsn) 5016 { 5017 return recog_memoized (rinsn) >= 0 && get_attr_has_vl_op (rinsn); 5018 } 5019 5020 /* Get default tail policy. */ 5021 static bool 5022 get_default_ta () 5023 { 5024 /* For the instruction that doesn't require TA, we still need a default value 5025 to emit vsetvl. We pick up the default value according to prefer policy. */ 5026 return (bool) (get_prefer_tail_policy () & 0x1 5027 || (get_prefer_tail_policy () >> 1 & 0x1)); 5028 } 5029 5030 /* Helper function to get TA operand. */ 5031 bool 5032 tail_agnostic_p (rtx_insn *rinsn) 5033 { 5034 /* If it doesn't have TA, we return agnostic by default. */ 5035 extract_insn_cached (rinsn); 5036 int ta = get_attr_ta (rinsn); 5037 return ta == INVALID_ATTRIBUTE ? get_default_ta () : IS_AGNOSTIC (ta); 5038 } 5039 5040 /* Change insn and Assert the change always happens. 
*/ 5041 void 5042 validate_change_or_fail (rtx object, rtx *loc, rtx new_rtx, bool in_group) 5043 { 5044 bool change_p = validate_change (object, loc, new_rtx, in_group); 5045 gcc_assert (change_p); 5046 } 5047 5048 /* Return true if it is NONVLMAX AVL TYPE. */ 5049 bool 5050 nonvlmax_avl_type_p (rtx_insn *rinsn) 5051 { 5052 extract_insn_cached (rinsn); 5053 int index = get_attr_avl_type_idx (rinsn); 5054 if (index == INVALID_ATTRIBUTE) 5055 return false; 5056 5057 gcc_assert (index < recog_data.n_operands); 5058 5059 rtx avl_type = recog_data.operand[index]; 5060 return INTVAL (avl_type) == NONVLMAX; 5061 } 5062 5063 /* Return true if RTX is RVV VLMAX AVL. */ 5064 bool 5065 vlmax_avl_p (rtx x) 5066 { 5067 return x && rtx_equal_p (x, RVV_VLMAX); 5068 } 5069 5070 /* Helper function to get SEW operand. We always have SEW value for 5071 all RVV instructions that have VTYPE OP. */ 5072 uint8_t 5073 get_sew (rtx_insn *rinsn) 5074 { 5075 return get_attr_sew (rinsn); 5076 } 5077 5078 /* Helper function to get VLMUL operand. We always have VLMUL value for 5079 all RVV instructions that have VTYPE OP. */ 5080 enum vlmul_type 5081 get_vlmul (rtx_insn *rinsn) 5082 { 5083 return (enum vlmul_type) get_attr_vlmul (rinsn); 5084 } 5085 5086 /* Count the number of REGNO in RINSN. */ 5087 int 5088 count_regno_occurrences (rtx_insn *rinsn, unsigned int regno) 5089 { 5090 int count = 0; 5091 extract_insn (rinsn); 5092 for (int i = 0; i < recog_data.n_operands; i++) 5093 if (refers_to_regno_p (regno, recog_data.operand[i])) 5094 count++; 5095 return count; 5096 } 5097 5098 /* Return true if the OP can be directly broadcasted. */ 5099 bool 5100 can_be_broadcasted_p (rtx op) 5101 { 5102 machine_mode mode = GET_MODE (op); 5103 /* We don't allow RA (register allocation) reload generate 5104 (vec_duplicate:DI reg) in RV32 system wheras we allow 5105 (vec_duplicate:DI mem) in RV32 system. 
*/ 5106 if (!can_create_pseudo_p () && !FLOAT_MODE_P (mode) 5107 && maybe_gt (GET_MODE_SIZE (mode), GET_MODE_SIZE (Pmode)) 5108 && !satisfies_constraint_Wdm (op)) 5109 return false; 5110 5111 if (satisfies_constraint_vl (op) || register_operand (op, mode) 5112 || satisfies_constraint_Wdm (op) || rtx_equal_p (op, CONST0_RTX (mode))) 5113 return true; 5114 5115 return can_create_pseudo_p () && nonmemory_operand (op, mode); 5116 } 5117 5118 void 5119 emit_vec_extract (rtx target, rtx src, rtx index) 5120 { 5121 machine_mode vmode = GET_MODE (src); 5122 machine_mode smode = GET_MODE (target); 5123 class expand_operand ops[3]; 5124 enum insn_code icode 5125 = convert_optab_handler (vec_extract_optab, vmode, smode); 5126 gcc_assert (icode != CODE_FOR_nothing); 5127 create_output_operand (&ops[0], target, smode); 5128 ops[0].target = 1; 5129 create_input_operand (&ops[1], src, vmode); 5130 5131 poly_int64 val; 5132 if (poly_int_rtx_p (index, &val)) 5133 create_integer_operand (&ops[2], val); 5134 else 5135 create_input_operand (&ops[2], index, Pmode); 5136 5137 expand_insn (icode, 3, ops); 5138 if (ops[0].value != target) 5139 emit_move_insn (target, ops[0].value); 5140 } 5141 5142 /* Return true if the offset mode is valid mode that we use for gather/scatter 5143 autovectorization. */ 5144 bool 5145 gather_scatter_valid_offset_p (machine_mode mode) 5146 { 5147 /* If the element size of offset mode is already >= Pmode size, 5148 we don't need any extensions. */ 5149 if (known_ge (GET_MODE_SIZE (GET_MODE_INNER (mode)), UNITS_PER_WORD)) 5150 return true; 5151 5152 /* Since we are very likely extend the offset mode into vector Pmode, 5153 Disable gather/scatter autovectorization if we can't extend the offset 5154 mode into vector Pmode. */ 5155 if (!get_vector_mode (Pmode, GET_MODE_NUNITS (mode)).exists ()) 5156 return false; 5157 return true; 5158 } 5159 5160 /* Implement TARGET_ESTIMATED_POLY_VALUE. 5161 Look into the tuning structure for an estimate. 
5162 KIND specifies the type of requested estimate: min, max or likely. 5163 For cores with a known VLA width all three estimates are the same. 5164 For generic VLA tuning we want to distinguish the maximum estimate from 5165 the minimum and likely ones. 5166 The likely estimate is the same as the minimum in that case to give a 5167 conservative behavior of auto-vectorizing with VLA when it is a win 5168 even for VLA vectorization. 5169 When VLA width information is available VAL.coeffs[1] is multiplied by 5170 the number of VLA chunks over the initial VLS bits. */ 5171 HOST_WIDE_INT 5172 estimated_poly_value (poly_int64 val, unsigned int kind) 5173 { 5174 unsigned int width_source 5175 = BITS_PER_RISCV_VECTOR.is_constant () 5176 ? (unsigned int) BITS_PER_RISCV_VECTOR.to_constant () 5177 : (unsigned int) RVV_VECTOR_BITS_SCALABLE; 5178 5179 /* If there is no core-specific information then the minimum and likely 5180 values are based on TARGET_MIN_VLEN vectors and the maximum is based on 5181 the architectural maximum of 65536 bits. */ 5182 unsigned int min_vlen_bytes = TARGET_MIN_VLEN / 8 - 1; 5183 if (width_source == RVV_VECTOR_BITS_SCALABLE) 5184 switch (kind) 5185 { 5186 case POLY_VALUE_MIN: 5187 case POLY_VALUE_LIKELY: 5188 return val.coeffs[0]; 5189 5190 case POLY_VALUE_MAX: 5191 return val.coeffs[0] + val.coeffs[1] * min_vlen_bytes; 5192 } 5193 5194 /* Allow BITS_PER_RISCV_VECTOR to be a bitmask of different VL, treating the 5195 lowest as likely. This could be made more general if future -mtune 5196 options need it to be. */ 5197 if (kind == POLY_VALUE_MAX) 5198 width_source = 1 << floor_log2 (width_source); 5199 else 5200 width_source = least_bit_hwi (width_source); 5201 5202 /* If the core provides width information, use that. */ 5203 HOST_WIDE_INT over_min_vlen = width_source - TARGET_MIN_VLEN; 5204 return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN; 5205 } 5206 5207 /* Return true it is whole register-register move. 
*/ 5208 bool 5209 whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index) 5210 { 5211 /* An operation is a whole-register move if either 5212 (1) Its vlmax operand equals VLMAX 5213 (2) Its vl operand equals the number of units of its mode. */ 5214 if (register_operand (ops[0], mode) 5215 && register_operand (ops[3], mode) 5216 && satisfies_constraint_vu (ops[2]) 5217 && satisfies_constraint_Wc1 (ops[1])) 5218 { 5219 if (INTVAL (ops[avl_type_index]) == VLMAX) 5220 return true; 5221 /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32 5222 into NON-VLMAX with LEN = NUNITS. */ 5223 else if (CONST_INT_P (ops[4]) 5224 && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode))) 5225 return true; 5226 } 5227 return false; 5228 } 5229 5230 /* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. */ 5231 bool 5232 splat_to_scalar_move_p (rtx *ops) 5233 { 5234 return satisfies_constraint_Wc1 (ops[1]) 5235 && satisfies_constraint_vu (ops[2]) 5236 && !MEM_P (ops[3]) 5237 && satisfies_constraint_c01 (ops[4]) 5238 && INTVAL (ops[7]) == NONVLMAX 5239 && known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3]))); 5240 } 5241 5242 } // namespace riscv_vector 5243