Home | History | Annotate | Line # | Download | only in sparc
      1 ;; Scheduling description for the SPARC M8.
      2 ;;   Copyright (C) 2017-2022 Free Software Foundation, Inc.
      3 ;;
      4 ;; This file is part of GCC.
      5 ;;
      6 ;; GCC is free software; you can redistribute it and/or modify
      7 ;; it under the terms of the GNU General Public License as published by
      8 ;; the Free Software Foundation; either version 3, or (at your option)
      9 ;; any later version.
     10 ;;
     11 ;; GCC is distributed in the hope that it will be useful,
     12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 ;; GNU General Public License for more details.
     15 ;;
     16 ;; You should have received a copy of the GNU General Public License
     17 ;; along with GCC; see the file COPYING3.  If not see
     18 ;; <http://www.gnu.org/licenses/>.
     19 
     20 ;; Things to improve:
     21 ;;
     22 ;; - Store instructions are implemented by micro-ops, one of which
     23 ;;   generates the store address and is executed in the store address
     24 ;;   generation unit in the slot0.  We need to model that.
     25 ;;
     26 ;; - There are two V3 pipes connected to different slots.  The current
     27 ;;   implementation assumes that all the instructions executing in a
     28 ;;   V3 pipe are issued to the unit in slot3.
     29 ;;
     30 ;; - Single-issue ALU operations incur an additional cycle of latency to
     31 ;;   slot 0 and slot 1 instructions.  This is not currently reflected
     32 ;;   in the DFA.
     33 
     34 (define_automaton "m8_0")  ;; Automaton holding the four m8_slot* issue units.
     35 
     36 ;; The S5 core has two dual-issue queues, PQLS and PQEX.  Each queue
     37 ;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
     38 ;; PQEX corresponds to slots 2 and 3.  The core can issue 4
     39 ;; instructions per-cycle, and up to 4 instructions are committed each
     40 ;; cycle.
     41 ;;
     42 ;;                            
     43 ;;                   m8_slot0  - Load Unit.
     44 ;;                             - Store address gen. Unit.
     45 ;;                                                       
     46 ;;                            
     47 ;;   === PQLS ==>    m8_slot1  - Store data unit.
     48 ;;                             - Branch unit.
     49 ;;                                            
     50 ;;                             
     51 ;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).                     
     52 ;;                             - 3-cycles Crypto Unit (SPU2).
     53 ;;                                                     
     54 ;;                   m8_slot3  - Integer Unit (EXU3).
     55 ;;                             - 3-cycles Crypto Unit (SPU3).
     56 ;;                             - Floating-point and graphics unit (FPG).
     57 ;;                             - Long-latency Crypto Unit.
     58 ;;                             - Oracle Numbers Unit (ONU).
     59 
     60 (define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0")  ;; One unit per issue slot.
     61 
     62 ;; Some instructions stall the pipeline and prevent any other
     63 ;; instruction from being issued in the same cycle.  We assume the
     64 ;; same for multi-instruction insns.  Such insns reserve all four
     65 ;; slots at once for one cycle.
     66 (define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")
     67 
     68 (define_insn_reservation "m8_single" 1
     69   (and (eq_attr "cpu" "m8")
     70        (eq_attr "type" "multi,savew,flushw,trap,bmask"))
     71   "m8_single_issue")
     72 
     73 ;; Most of the instructions executing in the integer units have a
     74 ;; latency of 1.  NOTE(review): "bmask" is also listed in m8_single
     75 ;; above; confirm which of the two reservations is intended for it.
     76 (define_insn_reservation "m8_integer" 1
     77   (and (eq_attr "cpu" "m8")
     78        (eq_attr "type" "ialu,ialuX,shift,cmove,compare,bmask"))
     79   "(m8_slot2 | m8_slot3)")
     80 
     81 ;; Flushing the instruction memory takes 27 cycles.  The insn issues
     82 ;; to either integer slot; the 26 following cycles of the
     83 ;; reservation hold no unit.
     84 (define_insn_reservation "m8_iflush" 27
     85   (and (eq_attr "cpu" "m8")
     86        (eq_attr "type" "iflush"))
     87   "(m8_slot2 | m8_slot3), nothing*26")
     88 
     89 ;; The integer multiplication instructions have a latency of 10 cycles
     90 ;; and execute in integer units.
     91 ;;
     92 ;; Likewise for array*, edge* and pdistn instructions.
     93 ;;
     94 ;; However, the latency is only 9 cycles if the consumer of the
     95 ;; operation is also capable of 9 cycles latency.  We model this with
     96 ;; a bypass.
     97 
     98 (define_insn_reservation "m8_imul" 10
     99   (and (eq_attr "cpu" "m8")
    100        (eq_attr "type" "imul,array,edge,edgen,pdistn"))
    101   "(m8_slot2 | m8_slot3), nothing*12")
    102 
    103 (define_bypass 9 "m8_imul" "m8_imul")
    104 
    105 ;; The integer division instructions `sdiv' and `udivx' have a latency
    106 ;; of 30 cycles and execute in integer units.  Only the issue cycle
    107 ;; occupies a slot; the remaining 29 cycles reserve no unit.
    108 (define_insn_reservation "m8_idiv" 30
    109   (and (eq_attr "cpu" "m8")
    110        (eq_attr "type" "idiv"))
    111   "(m8_slot2 | m8_slot3), nothing*29")
    112 
    113 ;; Both integer and floating-point load instructions have a latency of
    114 ;; only 3 cycles, and execute in slot0.
    115 ;;
    116 ;; Misaligned load instructions feature a latency of 11 cycles; their
    117 ;; reservation is currently commented out.
    118 ;;
    119 ;; The prefetch instruction also executes in the load unit, but its
    120 ;; latency is only 1 cycle.
    121 (define_insn_reservation "m8_load" 3
    122   (and (eq_attr "cpu" "m8")
    123        (ior (eq_attr "type" "fpload,sload")
    124             (and (eq_attr "type" "load")
    125                  (eq_attr "subtype" "regular"))))
    126   "m8_slot0, nothing*2")
    127 
    128 ;; (define_insn_reservation "m8_load_misalign" 11
    129 ;;  (and (eq_attr "cpu" "m8")
    130 ;;       (eq_attr "type" "load_mis,fpload_mis"))
    131 ;;  "m8_slot0, nothing*10")
    132 
    133 (define_insn_reservation "m8_prefetch" 1  ;; Prefetch-subtype loads: slot0 for one cycle.
    134   (and (eq_attr "cpu" "m8")
    135        (eq_attr "type" "load")
    136        (eq_attr "subtype" "prefetch"))
    137   "m8_slot0")
    138 
    139 ;; Both integer and floating-point store instructions have a latency
    140 ;; of 1 cycle, and execute in the store data unit in slot1.
    141 ;;
    142 ;; However, misaligned store instructions feature a latency of 3
    143 ;; cycles; their reservation is currently commented out.
    144 
    145 (define_insn_reservation "m8_store" 1
    146   (and (eq_attr "cpu" "m8")
    147        (eq_attr "type" "store,fpstore"))
    148   "m8_slot1")
    149 
    150 ;; (define_insn_reservation "m8_store_misalign" 3
    151 ;;   (and (eq_attr "cpu" "m8")
    152 ;;        (eq_attr "type" "store_mis,fpstore_mis"))
    153 ;;   "m8_slot1, nothing*2")
    154 
    155 ;; Control-transfer instructions execute in the Branch Unit in
    156 ;; slot1, which they occupy for one cycle.
    157 
    158 (define_insn_reservation "m8_cti" 1
    159   (and (eq_attr "cpu" "m8")
    160        (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
    161   "m8_slot1")
    162 
    163 ;; Many instructions executing in the Floating-point and Graphics Unit
    164 ;; (FGU) serving slot3 feature a default latency of 9 cycles.  Slot3 is
    165 ;; held in the issue cycle only; the other 8 cycles reserve no unit.
    166 (define_insn_reservation "m8_fp" 9
    167   (and (eq_attr "cpu" "m8")
    168        (ior (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")
    169             (and (eq_attr "type" "fga")
    170                  (eq_attr "subtype" "fpu"))))
    171   "m8_slot3, nothing*8")
    172 
    173 ;; Floating-point division and floating-point square-root instructions
    174 ;; have high latencies.  They execute in the FGU, issuing through slot3
    175 ;; and keeping no unit reserved after the issue cycle.
    176 (define_insn_reservation "m8_fpdivs" 26
    177   (and (eq_attr "cpu" "m8")
    178        (eq_attr "type" "fpdivs"))
    179   "m8_slot3, nothing*25")
    180 
    181 (define_insn_reservation "m8_fpsqrts" 33  ;; 33-cycle latency; slot3 in the issue cycle only.
    182   (and (eq_attr "cpu" "m8")
    183        (eq_attr "type" "fpsqrts"))
    184   "m8_slot3, nothing*32")
    185 
    186 (define_insn_reservation "m8_fpdivd" 30  ;; 30-cycle latency; slot3 in the issue cycle only.
    187   (and (eq_attr "cpu" "m8")
    188        (eq_attr "type" "fpdivd"))
    189   "m8_slot3, nothing*29")
    190 
    191 (define_insn_reservation "m8_fpsqrtd" 41  ;; 41-cycle latency; slot3 in the issue cycle only.
    192   (and (eq_attr "cpu" "m8")
    193        (eq_attr "type" "fpsqrtd"))
    194   "m8_slot3, nothing*40")
    195 
    196 ;; SIMD VIS instructions executing in the Floating-point and graphics
    197 ;; unit (FPG) in slot3 usually have a latency of 5 cycles.
    198 ;;
    199 ;; However, the latency for many instructions is only 3 cycles if the
    200 ;; consumer can also be executed in 3 cycles.  We model this with a
    201 ;; bypass.  In these cases the instructions are executed in one of the
    202 ;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
    203 ;; and 3, although the reservation below only ever uses slot3.
    204 
    205 (define_insn_reservation "m8_vis" 5
    206   (and (eq_attr "cpu" "m8")
    207        (ior (eq_attr "type" "viscmp,lzd")
    208             (and (eq_attr "type" "fga")
    209                  (eq_attr "subtype" "maxmin,cmask,other"))
    210             (and (eq_attr "type" "vismv")
    211                  (eq_attr "subtype" "single,movstouw"))
    212             (and (eq_attr "type" "visl")
    213                  (eq_attr "subtype" "single"))))
    214   "m8_slot3, nothing*4")
    215 
    216 (define_bypass 3 "m8_vis" "m8_vis")
    217 
    218 (define_insn_reservation "m8_gsr" 5  ;; alignaddr-subtype gsr insns: 5-cycle latency, slot3.
    219   (and (eq_attr "cpu" "m8")
    220        (eq_attr "type" "gsr")
    221        (eq_attr "subtype" "alignaddr"))
    222   "m8_slot3, nothing*4")
    223 
    224 ;; A few VIS instructions have a latency of 1.  They issue in slot3.
    225 
    226 (define_insn_reservation "m8_vis_1cycle" 1
    227   (and (eq_attr "cpu" "m8")
    228        (ior (and (eq_attr "type" "vismv")
    229                  (eq_attr "subtype" "double,movxtod,movdtox"))
    230             (and (eq_attr "type" "visl")
    231                  (eq_attr "subtype" "double"))
    232             (and (eq_attr "type" "fga")
    233                  (eq_attr "subtype" "addsub64"))))
    234   "m8_slot3")
    235 
    236 ;; Reading and writing to the gsr register takes more than 70 cycles;
    237 ;; the latency is modelled as 70.
    238 (define_insn_reservation "m8_gsr_reg" 70
    239   (and (eq_attr "cpu" "m8")
    240        (eq_attr "type" "gsr")
    241        (eq_attr "subtype" "reg"))
    242   "m8_slot3, nothing*69")
    243