;; Scheduling description for the SPARC M8.
;; Copyright (C) 2017-2022 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3. If not see
;; <http://www.gnu.org/licenses/>.

;; Things to improve:
;;
;; - Store instructions are implemented by micro-ops, one of which
;;   generates the store address and is executed in the store address
;;   generation unit in the slot0. We need to model that.
;;
;; - There are two V3 pipes connected to different slots. The current
;;   implementation assumes that all the instructions executing in a
;;   V3 pipe are issued to the unit in slot3.
;;
;; - Single-issue ALU operations incur an additional cycle of latency to
;;   slot 0 and slot 1 instructions. This is not currently reflected
;;   in the DFA.

;; Automaton that the issue-slot units of this description are
;; attached to.
(define_automaton "m8_0")

;; The S5 core has two dual-issue queues, PQLS and PQEX. Each queue
;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
;; PQEX corresponds to slots 2 and 3. The core can issue 4
;; instructions per-cycle, and up to 4 instructions are committed each
;; cycle.
;;
;;
;;                   m8_slot0  - Load Unit.
;;                             - Store address gen. Unit.
;;
;;
;;   === PQLS ==>    m8_slot1  - Store data unit.
;;                             - Branch unit.
;;
;;
;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).
;;                             - 3-cycles Crypto Unit (SPU2).
;;
;;                   m8_slot3  - Integer Unit (EXU3).
;;                             - 3-cycles Crypto Unit (SPU3).
;;                             - Floating-point and graphics unit (FPG).
;;                             - Long-latency Crypto Unit.
;;                             - Oracle Numbers Unit (ONU).

;; One CPU unit per issue slot, all of them in the "m8_0" automaton.

(define_cpu_unit "m8_slot0,m8_slot1,m8_slot2,m8_slot3" "m8_0")

;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle. We assume the
;; same for multi-instruction insns.
;;
;; This is modelled by a reservation that claims all four slots at
;; once.

(define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")

(define_insn_reservation "m8_single" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "multi,savew,flushw,trap,bmask"))
  "m8_single_issue")

;; Most of the instructions executing in the integer units have a
;; latency of 1.
;;
;; The "bmask" type is deliberately not listed here: it is already
;; claimed by m8_single, and an insn is assigned to the first
;; reservation whose condition matches, so a second listing would
;; never be used.
;; NOTE(review): confirm that single-issue (rather than a plain
;; integer-unit reservation) is the intended model for bmask.

(define_insn_reservation "m8_integer" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
  "(m8_slot2 | m8_slot3)")

;; Flushing the instruction memory takes 27 cycles.

(define_insn_reservation "m8_iflush" 27
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "iflush"))
  "(m8_slot2 | m8_slot3), nothing*26")

;; The integer multiplication instructions have a latency of 10 cycles
;; and execute in integer units.
;;
;; Likewise for array*, edge* and pdistn instructions.
;;
;; However, the latency is only 9 cycles if the consumer of the
;; operation is also capable of 9 cycles latency. We model this with
;; a bypass.
;;
;; The reservation is one issue cycle followed by latency-1 idle
;; cycles ("nothing*9"), the same shape used by the other multi-cycle
;; reservations in this file.

(define_insn_reservation "m8_imul" 10
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
  "(m8_slot2 | m8_slot3), nothing*9")

(define_bypass 9 "m8_imul" "m8_imul")

;; The integer division instructions `sdiv' and `udivx' have a latency
;; of 30 cycles and execute in integer units.
(define_insn_reservation "m8_idiv" 30
  (and (eq_attr "cpu" "m8") (eq_attr "type" "idiv"))
  "(m8_slot2 | m8_slot3), nothing*29")

;; Loads, whether integer or floating-point, are issued to slot0 and
;; have a latency of only 3 cycles.
;;
;; A misaligned load has a latency of 11 cycles (reservation kept
;; below, currently disabled).
;;
;; Prefetches go through the load unit as well, but their latency is
;; only 1 cycle.

(define_insn_reservation "m8_load" 3
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "load")
                 (eq_attr "subtype" "regular"))
            (eq_attr "type" "fpload,sload")))
  "m8_slot0, nothing*2")

;; (define_insn_reservation "m8_load_misalign" 11
;;  (and (eq_attr "cpu" "m8")
;;       (eq_attr "type" "load_mis,fpload_mis"))
;;  "m8_slot0, nothing*10")

(define_insn_reservation "m8_prefetch" 1
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "load")
            (eq_attr "subtype" "prefetch")))
  "m8_slot0")

;; Stores, whether integer or floating-point, are issued to the store
;; data unit in slot1 and have a latency of 1 cycle.
;;
;; A misaligned store, however, has a latency of 3 cycles
;; (reservation kept below, currently disabled).

(define_insn_reservation "m8_store" 1
  (and (eq_attr "cpu" "m8") (eq_attr "type" "store,fpstore"))
  "m8_slot1")

;; (define_insn_reservation "m8_store_misalign" 3
;;  (and (eq_attr "cpu" "m8")
;;       (eq_attr "type" "store_mis,fpstore_mis"))
;;  "m8_slot1, nothing*2")

;; Control-transfer instructions are handled by the Branch Unit,
;; which sits in slot1.

(define_insn_reservation "m8_cti" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "cbcond,uncond_cbcond,branch,call,sibcall,call_no_delay_slot,uncond_branch,return"))
  "m8_slot1")

;; Many instructions executing in the Floating-point and Graphics Unit
;; (FGU) serving slot3 feature a default latency of 9 cycles.
(define_insn_reservation "m8_fp" 9
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "fpu"))
            (eq_attr "type" "fpmove,fpcmove,fpcrmove,fp,fpcmp,fpmul,fgm_pack,fgm_mul,pdist")))
  "m8_slot3, nothing*8")

;; Floating-point divide and square-root are high-latency operations.
;; They execute in the FGU, i.e. they are issued to slot3.
;;
;;   single-precision divide        26 cycles
;;   single-precision square root   33 cycles
;;   double-precision divide        30 cycles
;;   double-precision square root   41 cycles

(define_insn_reservation "m8_fpdivs" 26
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpdivs"))
  "m8_slot3, nothing*25")

(define_insn_reservation "m8_fpsqrts" 33
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpsqrts"))
  "m8_slot3, nothing*32")

(define_insn_reservation "m8_fpdivd" 30
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpdivd"))
  "m8_slot3, nothing*29")

(define_insn_reservation "m8_fpsqrtd" 41
  (and (eq_attr "cpu" "m8") (eq_attr "type" "fpsqrtd"))
  "m8_slot3, nothing*40")

;; SIMD VIS instructions executing in the Floating-point and graphics
;; unit (FPG) in slot3 usually have a latency of 5 cycles.
;;
;; However, the latency for many instructions is only 3 cycles if the
;; consumer can also be executed in 3 cycles. We model this with a
;; bypass. In these cases the instructions are executed in one of the
;; two 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2
;; and 3.
(define_insn_reservation "m8_vis" 5
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "viscmp,lzd")
            (ior (and (eq_attr "type" "fga")
                      (eq_attr "subtype" "maxmin,cmask,other"))
                 (and (eq_attr "type" "vismv")
                      (eq_attr "subtype" "single,movstouw"))
                 (and (eq_attr "type" "visl")
                      (eq_attr "subtype" "single")))))
  "m8_slot3, nothing*4")

;; A result produced by an m8_vis insn and consumed by another m8_vis
;; insn is available after 3 cycles instead of 5.

(define_bypass 3 "m8_vis" "m8_vis")

;; GSR insns of the "alignaddr" subtype: 5 cycles, issued to slot3.

(define_insn_reservation "m8_gsr" 5
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "alignaddr")))
  "m8_slot3, nothing*4")

;; A handful of VIS instructions complete in a single cycle.

(define_insn_reservation "m8_vis_1cycle" 1
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "double,movxtod,movdtox"))
            (ior (and (eq_attr "type" "visl")
                      (eq_attr "subtype" "double"))
                 (and (eq_attr "type" "fga")
                      (eq_attr "subtype" "addsub64")))))
  "m8_slot3")

;; Reading and writing to the gsr register takes more than 70 cycles;
;; it is modelled here with a latency of 70.

(define_insn_reservation "m8_gsr_reg" 70
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "reg")))
  "m8_slot3, nothing*69")