Home | History | Annotate | Line # | Download | only in i386
      1 ;; Scheduling for Core 2 and derived processors.
      2 ;; Copyright (C) 2004-2022 Free Software Foundation, Inc.
      3 ;;
      4 ;; This file is part of GCC.
      5 ;;
      6 ;; GCC is free software; you can redistribute it and/or modify
      7 ;; it under the terms of the GNU General Public License as published by
      8 ;; the Free Software Foundation; either version 3, or (at your option)
      9 ;; any later version.
     10 ;;
     11 ;; GCC is distributed in the hope that it will be useful,
     12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 ;; GNU General Public License for more details.
     15 ;;
     16 ;; You should have received a copy of the GNU General Public License
     17 ;; along with GCC; see the file COPYING3.  If not see
     18 ;; <http://www.gnu.org/licenses/>.  */
     19 
     20 ;; The scheduling description in this file is based on the one in ppro.md,
     21 ;; with additional information obtained from
     22 ;;
     23 ;;    "How to optimize for the Pentium family of microprocessors",
     24 ;;    by Agner Fog, PhD.
     25 ;;
     26 ;; The major difference from the P6 pipeline is one extra decoder, and
     27 ;; one extra execute unit.  Due to micro-op fusion, many insns no longer
     28 ;; need to be decoded in decoder 0, but can be handled by all of them.
     29 
     30 ;; The core2_idiv, core2_fdiv and core2_ssediv automata are used to
     31 ;; model issue latencies of idiv, fdiv and ssediv type insns.
        ;; The remaining automata hold the decoders (core2_decoder), execution
        ;; ports 0, 1 and 5 (core2_core), load port 2 (core2_load) and store
        ;; ports 3 and 4 (core2_store).
     32 (define_automaton "core2_decoder,core2_core,core2_idiv,core2_fdiv,core2_ssediv,core2_load,core2_store")
     33 
     34 ;; The CPU domain, used for Core i7 bypass latencies
        ;;  - x87 insn types are always in the "float" domain.
        ;;  - SSE insn types are "float" for the floating-point modes listed
        ;;    below, "int" for SImode, and "simd" for every other mode.
        ;;  - MMX insn types are "simd".
        ;;  - Every other insn type is "int".
     35 (define_attr "i7_domain" "int,float,simd"
     36   (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint")
     37 	   (const_string "float")
     38 	 (eq_attr "type" "sselog,sselog1,sseiadd,sseiadd1,sseishft,sseishft1,sseimul,
     39 			  sse,ssemov,sseadd,sseadd1,ssemul,ssecmp,ssecomi,ssecvt,
     40 			  ssecvt1,sseicvt,ssediv,sseins,ssemuladd,sse4arg")
     41 	   (cond [(eq_attr "mode" "V4DF,V8SF,V2DF,V4SF,SF,DF")
     42 		    (const_string "float")
     43 		  (eq_attr "mode" "SI")
     44 		    (const_string "int")]
     45 		  (const_string "simd"))
     46 	 (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
     47 	   (const_string "simd")]
     48 	(const_string "int")))
     49 
     50 ;; As for the Pentium Pro,
     51 ;;  - an instruction with 1 uop can be decoded by any of the four
     52 ;;    decoders in one cycle.
     53 ;;  - an instruction with 1 to 4 uops can be decoded only by decoder 0
     54 ;;    but still in only one cycle.
     55 ;;  - a complex (microcode) instruction can also only be decoded by
     56 ;;    decoder 0, and this takes an unspecified number of cycles.
     57 ;;
     58 ;; The goal is to schedule such that we have a few-one-one-one uops
     59 ;; sequence in each cycle, to decode as many instructions per cycle as possible.
     60 (define_cpu_unit "c2_decoder0" "core2_decoder")
     61 (define_cpu_unit "c2_decoder1" "core2_decoder")
     62 (define_cpu_unit "c2_decoder2" "core2_decoder")
     63 (define_cpu_unit "c2_decoder3" "core2_decoder")
     64 
     65 ;; We first wish to find an instruction for c2_decoder0, so exclude
     66 ;; c2_decoder1, c2_decoder2 and c2_decoder3 from being reserved until
     67 ;; c2_decoder0 is reserved.
     68 (presence_set "c2_decoder1" "c2_decoder0")
     69 (presence_set "c2_decoder2" "c2_decoder0")
     70 (presence_set "c2_decoder3" "c2_decoder0")
     71 
     72 ;; Most instructions can be decoded on any of the four decoders.
     73 (define_reservation "c2_decodern" "(c2_decoder0|c2_decoder1|c2_decoder2|c2_decoder3)")
     74 
     75 ;; The out-of-order core has six pipelines.  These are similar to the
     76 ;; Pentium Pro's five pipelines.  Port 2 is responsible for memory loads,
     77 ;; port 3 for store address calculations, port 4 for memory stores, and
     78 ;; ports 0, 1 and 5 for everything else.
     79 
     80 (define_cpu_unit "c2_p0,c2_p1,c2_p5" "core2_core")
     81 (define_cpu_unit "c2_p2" "core2_load")
     82 (define_cpu_unit "c2_p3,c2_p4" "core2_store")
        ;; Divider units, one per divide automaton.  The divide reservations
        ;; hold them for many consecutive cycles to model issue latency.
     83 (define_cpu_unit "c2_idiv" "core2_idiv")
     84 (define_cpu_unit "c2_fdiv" "core2_fdiv")
     85 (define_cpu_unit "c2_ssediv" "core2_ssediv")
     86 
     87 ;; Only the irregular instructions have to be modeled here.  A load
     88 ;; increases the latency by 2 or 3, or by nothing if the manual gives
     89 ;; a latency already.  Store latencies are not accounted for.
     90 ;;
     91 ;; The simple instructions follow a very regular pattern of 1 uop per
     92 ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
     93 ;; on port 4 and port 3.  These instructions are modelled at the bottom
     94 ;; of this file.
     95 ;;
     96 ;; For microcoded instructions we don't know how many uops are produced.
     97 ;; These instructions are the "complex" ones in the Intel manuals.  All
     98 ;; we _do_ know is that they typically produce four or more uops, so
     99 ;; they can only be decoded on c2_decoder0.  Modelling their latencies
    100 ;; doesn't make sense because we don't know how these instructions are
    101 ;; executed in the core.  So we just model that they can only be decoded
    102 ;; on decoder 0, and say that it takes a little while before the result
    103 ;; is available.
        ;; The insn types treated as complex are "other", "multi" and "str".
    104 (define_insn_reservation "c2_complex_insn" 6
    105 			 (and (eq_attr "cpu" "core2,nehalem")
    106 			      (eq_attr "type" "other,multi,str"))
    107 			 "c2_decoder0")
    108 
        ;; Calls can only be decoded on decoder 0; no execution port is
        ;; reserved for them.
    109 (define_insn_reservation "c2_call" 1
    110 			 (and (eq_attr "cpu" "core2,nehalem")
    111 			      (eq_attr "type" "call,callv"))
    112 			 "c2_decoder0")
    113 
    114 ;; imov with memory operands does not use the integer units.
    115 ;; imovx always decodes to one uop, and also doesn't use the integer
    116 ;; units if it has memory operands.
    117 (define_insn_reservation "c2_imov" 1
    118 			 (and (eq_attr "cpu" "core2,nehalem")
    119 			      (and (eq_attr "memory" "none")
    120 				   (eq_attr "type" "imov,imovx")))
    121 			 "c2_decodern,(c2_p0|c2_p1|c2_p5)")
    122 
        ;; Loads only occupy load port 2, with a latency of 4.
    123 (define_insn_reservation "c2_imov_load" 4
    124 			 (and (eq_attr "cpu" "core2,nehalem")
    125 			      (and (eq_attr "memory" "load")
    126 				   (eq_attr "type" "imov,imovx")))
    127 			 "c2_decodern,c2_p2")
    128 
        ;; Only plain imov stores are matched here (imovx is not listed); they
        ;; reserve store ports 4 and 3 in the same cycle.
    129 (define_insn_reservation "c2_imov_store" 1
    130 			 (and (eq_attr "cpu" "core2,nehalem")
    131 			      (and (eq_attr "memory" "store")
    132 				   (eq_attr "type" "imov")))
    133 			 "c2_decodern,c2_p4+c2_p3")
    134 
        ;; Conditional moves must be decoded on decoder 0 and occupy one of
        ;; ports 0, 1 or 5 for two cycles.
    135 (define_insn_reservation "c2_icmov" 2
    136 			 (and (eq_attr "cpu" "core2,nehalem")
    137 			      (and (eq_attr "memory" "none")
    138 				   (eq_attr "type" "icmov")))
    139 			 "c2_decoder0,(c2_p0|c2_p1|c2_p5)*2")
    140 
        ;; The load form first spends a cycle on load port 2; its latency is
        ;; given as 2, the same as the register form.
    141 (define_insn_reservation "c2_icmov_load" 2
    142 			 (and (eq_attr "cpu" "core2,nehalem")
    143 			      (and (eq_attr "memory" "load")
    144 				   (eq_attr "type" "icmov")))
    145 			 "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5)*2")
    146 
        ;; push of a register is a pure store: any decoder, then store ports
        ;; 4 and 3 together.
    147 (define_insn_reservation "c2_push_reg" 1
    148 			 (and (eq_attr "cpu" "core2,nehalem")
    149 			      (and (eq_attr "memory" "store")
    150 				   (eq_attr "type" "push")))
    151 			 "c2_decodern,c2_p4+c2_p3")
    152 
        ;; push of a memory operand (memory "both") must use decoder 0 and
        ;; loads through port 2 before storing.
    153 (define_insn_reservation "c2_push_mem" 1
    154 			 (and (eq_attr "cpu" "core2,nehalem")
    155 			      (and (eq_attr "memory" "both")
    156 				   (eq_attr "type" "push")))
    157 			 "c2_decoder0,c2_p2,c2_p4+c2_p3")
    158 
    159 ;; lea executes on port 0 with latency one and throughput 1.  It can be
        ;; decoded by any decoder.
    160 (define_insn_reservation "c2_lea" 1
    161 			 (and (eq_attr "cpu" "core2,nehalem")
    162 			      (and (eq_attr "memory" "none")
    163 				   (eq_attr "type" "lea")))
    164 			 "c2_decodern,c2_p0")
    165 
    166 ;; Shift and rotate decode as two uops which can go to port 0 or 5.
    167 ;; The load and store units need to be reserved when memory operands
    168 ;; are involved.
        ;; NOTE(review): the register form below accepts any decoder and
        ;; reserves a single port for one cycle, which does not match the
        ;; "two uops" statement above -- confirm which is intended.
    169 (define_insn_reservation "c2_shift_rotate" 1
    170 			 (and (eq_attr "cpu" "core2,nehalem")
    171 			      (and (eq_attr "memory" "none")
    172 				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
    173 			 "c2_decodern,(c2_p0|c2_p5)")
    174 
        ;; The memory form matches every memory kind other than "none" and
        ;; always reserves the load port and both store ports.
    175 (define_insn_reservation "c2_shift_rotate_mem" 4
    176 			 (and (eq_attr "cpu" "core2,nehalem")
    177 			      (and (eq_attr "memory" "!none")
    178 				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
    179 			 "c2_decoder0,c2_p2,(c2_p0|c2_p5),c2_p4+c2_p3")
    180 
    181 ;; See comments in ppro.md for the corresponding reservation.
        ;; Branches without a memory operand execute on port 5.
    182 (define_insn_reservation "c2_branch" 1
    183 			 (and (eq_attr "cpu" "core2,nehalem")
    184 			      (and (eq_attr "memory" "none")
    185 				   (eq_attr "type" "ibr")))
    186 			 "c2_decodern,c2_p5")
    187 
    188 ;; ??? Indirect branches probably have worse latency than this.
        ;; Branches with a memory operand need decoder 0 and reserve load
        ;; port 2 together with port 5.
    189 (define_insn_reservation "c2_indirect_branch" 6
    190 			 (and (eq_attr "cpu" "core2,nehalem")
    191 			      (and (eq_attr "memory" "!none")
    192 				   (eq_attr "type" "ibr")))
    193 			 "c2_decoder0,c2_p2+c2_p5")
    194 
        ;; leave: decoder 0 only.  The first execution cycle reserves load
        ;; port 2 together with port 0 or 1; the second reserves port 0 or 1.
    195 (define_insn_reservation "c2_leave" 4
    196 			 (and (eq_attr "cpu" "core2,nehalem")
    197 			      (eq_attr "type" "leave"))
    198 			 "c2_decoder0,c2_p2+(c2_p0|c2_p1),(c2_p0|c2_p1)")
    199 
    200 ;; mul and imul with two/three operands only execute on port 1 for HImode
    201 ;; and SImode, port 0 for DImode.  The memory forms match every memory
        ;; kind other than "none", must be decoded on decoder 0, and reserve
        ;; load port 2 in the same cycle as the multiplier port.  Their
        ;; latencies are given the same values as the register forms.
    202 (define_insn_reservation "c2_imul_hisi" 3
    203 			 (and (eq_attr "cpu" "core2,nehalem")
    204 			      (and (eq_attr "memory" "none")
    205 				   (and (eq_attr "mode" "HI,SI")
    206 					(eq_attr "type" "imul"))))
    207 			 "c2_decodern,c2_p1")
    208 
    209 (define_insn_reservation "c2_imul_hisi_mem" 3
    210 			 (and (eq_attr "cpu" "core2,nehalem")
    211 			      (and (eq_attr "memory" "!none")
    212 				   (and (eq_attr "mode" "HI,SI")
    213 					(eq_attr "type" "imul"))))
    214 			 "c2_decoder0,c2_p2+c2_p1")
    215 
    216 (define_insn_reservation "c2_imul_di" 5
    217 			 (and (eq_attr "cpu" "core2,nehalem")
    218 			      (and (eq_attr "memory" "none")
    219 				   (and (eq_attr "mode" "DI")
    220 					(eq_attr "type" "imul"))))
    221 			 "c2_decodern,c2_p0")
    222 
    223 (define_insn_reservation "c2_imul_di_mem" 5
    224 			 (and (eq_attr "cpu" "core2,nehalem")
    225 			      (and (eq_attr "memory" "!none")
    226 				   (and (eq_attr "mode" "DI")
    227 					(eq_attr "type" "imul"))))
    228 			 "c2_decoder0,c2_p2+c2_p0")
    229 
    230 ;; div and idiv are very similar, so we model them the same.
    231 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
    232 ;; These issue latencies are modelled via the core2_idiv automaton: each
        ;; reservation below holds the c2_idiv unit for that many cycles in
        ;; total.  The load forms match memory "load" only and reserve load
        ;; port 2 in their first execution cycle.
    233 (define_insn_reservation "c2_idiv_QI" 19
    234 			 (and (eq_attr "cpu" "core2,nehalem")
    235 			      (and (eq_attr "memory" "none")
    236 				   (and (eq_attr "mode" "QI")
    237 					(eq_attr "type" "idiv"))))
    238 			 "c2_decoder0,(c2_p0+c2_idiv)*2,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9")
    239 
    240 (define_insn_reservation "c2_idiv_QI_load" 19
    241 			 (and (eq_attr "cpu" "core2,nehalem")
    242 			      (and (eq_attr "memory" "load")
    243 				   (and (eq_attr "mode" "QI")
    244 					(eq_attr "type" "idiv"))))
    245 			 "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9")
    246 
    247 (define_insn_reservation "c2_idiv_HI" 23
    248 			 (and (eq_attr "cpu" "core2,nehalem")
    249 			      (and (eq_attr "memory" "none")
    250 				   (and (eq_attr "mode" "HI")
    251 					(eq_attr "type" "idiv"))))
    252 			 "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*17")
    253 
    254 (define_insn_reservation "c2_idiv_HI_load" 23
    255 			 (and (eq_attr "cpu" "core2,nehalem")
    256 			      (and (eq_attr "memory" "load")
    257 				   (and (eq_attr "mode" "HI")
    258 					(eq_attr "type" "idiv"))))
    259 			 "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*18")
    260 
    261 (define_insn_reservation "c2_idiv_SI" 39
    262 			 (and (eq_attr "cpu" "core2,nehalem")
    263 			      (and (eq_attr "memory" "none")
    264 				   (and (eq_attr "mode" "SI")
    265 					(eq_attr "type" "idiv"))))
    266 			 "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*33")
    267 
    268 (define_insn_reservation "c2_idiv_SI_load" 39
    269 			 (and (eq_attr "cpu" "core2,nehalem")
    270 			      (and (eq_attr "memory" "load")
    271 				   (and (eq_attr "mode" "SI")
    272 					(eq_attr "type" "idiv"))))
    273 			 "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*34")
    274 
    275 ;; x87 floating point operations.
    276 
        ;; fxch has zero latency and only occupies a decoder; no execution
        ;; port is reserved.
    277 (define_insn_reservation "c2_fxch" 0
    278 			 (and (eq_attr "cpu" "core2,nehalem")
    279 			      (eq_attr "type" "fxch"))
    280 			 "c2_decodern")
    281 
        ;; Generic x87 arithmetic.  The register form (memory "none" or
        ;; "unknown") can use any decoder; every memory form needs decoder 0.
        ;; NOTE(review): the register and load forms execute on port 1 while
        ;; the store and both forms reserve port 0 -- confirm this is intended.
    282 (define_insn_reservation "c2_fop" 3
    283 			 (and (eq_attr "cpu" "core2,nehalem")
    284 			      (and (eq_attr "memory" "none,unknown")
    285 				   (eq_attr "type" "fop")))
    286 			 "c2_decodern,c2_p1")
    287 
    288 (define_insn_reservation "c2_fop_load" 5
    289 			 (and (eq_attr "cpu" "core2,nehalem")
    290 			      (and (eq_attr "memory" "load")
    291 				   (eq_attr "type" "fop")))
    292 			 "c2_decoder0,c2_p2+c2_p1,c2_p1")
    293 
    294 (define_insn_reservation "c2_fop_store" 3
    295 			 (and (eq_attr "cpu" "core2,nehalem")
    296 			      (and (eq_attr "memory" "store")
    297 				   (eq_attr "type" "fop")))
    298 			 "c2_decoder0,c2_p0,c2_p0,c2_p0+c2_p4+c2_p3")
    299 
    300 (define_insn_reservation "c2_fop_both" 5
    301 			 (and (eq_attr "cpu" "core2,nehalem")
    302 			      (and (eq_attr "memory" "both")
    303 				   (eq_attr "type" "fop")))
    304 			 "c2_decoder0,c2_p2+c2_p0,c2_p0+c2_p4+c2_p3")
    305 
        ;; Sign manipulation: any decoder, one cycle on port 0.
    306 (define_insn_reservation "c2_fsgn" 1
    307 			 (and (eq_attr "cpu" "core2,nehalem")
    308 			      (eq_attr "type" "fsgn"))
    309 			 "c2_decodern,c2_p0")
    310 
        ;; fistp: decoder 0, two cycles on port 0, then store ports 4 and 3.
    311 (define_insn_reservation "c2_fistp" 5
    312 			 (and (eq_attr "cpu" "core2,nehalem")
    313 			      (eq_attr "type" "fistp"))
    314 			 "c2_decoder0,c2_p0*2,c2_p4+c2_p3")
    315 
        ;; fcmov: decoder 0, two cycles on port 0.
    316 (define_insn_reservation "c2_fcmov" 2
    317 			 (and (eq_attr "cpu" "core2,nehalem")
    318 			      (eq_attr "type" "fcmov"))
    319 			 "c2_decoder0,c2_p0*2")
    320 
        ;; x87 compares execute on port 1.  The load form needs decoder 0 and
        ;; reserves load port 2 in the same cycle.
    321 (define_insn_reservation "c2_fcmp" 1
    322 			 (and (eq_attr "cpu" "core2,nehalem")
    323 			      (and (eq_attr "memory" "none")
    324 				   (eq_attr "type" "fcmp")))
    325 			 "c2_decodern,c2_p1")
    326 
    327 (define_insn_reservation "c2_fcmp_load" 4
    328 			 (and (eq_attr "cpu" "core2,nehalem")
    329 			      (and (eq_attr "memory" "load")
    330 				   (eq_attr "type" "fcmp")))
    331 			 "c2_decoder0,c2_p2+c2_p1")
    332 
        ;; x87 moves.  Register moves use port 0.  Loads and stores are split
        ;; by mode: non-XFmode accesses can use any decoder and take one cycle
        ;; on the memory ports, while XFmode accesses need decoder 0 and take
        ;; two cycles on the memory ports.
    333 (define_insn_reservation "c2_fmov" 1
    334 			 (and (eq_attr "cpu" "core2,nehalem")
    335 			      (and (eq_attr "memory" "none")
    336 				   (eq_attr "type" "fmov")))
    337 			 "c2_decodern,c2_p0")
    338 
    339 (define_insn_reservation "c2_fmov_load" 1
    340 			 (and (eq_attr "cpu" "core2,nehalem")
    341 			      (and (eq_attr "memory" "load")
    342 				   (and (eq_attr "mode" "!XF")
    343 					(eq_attr "type" "fmov"))))
    344 			 "c2_decodern,c2_p2")
    345 
        ;; The XFmode load also occupies port 0 during both cycles.
    346 (define_insn_reservation "c2_fmov_XF_load" 3
    347 			 (and (eq_attr "cpu" "core2,nehalem")
    348 			      (and (eq_attr "memory" "load")
    349 				   (and (eq_attr "mode" "XF")
    350 					(eq_attr "type" "fmov"))))
    351 			 "c2_decoder0,(c2_p2+c2_p0)*2")
    352 
    353 (define_insn_reservation "c2_fmov_store" 1
    354 			 (and (eq_attr "cpu" "core2,nehalem")
    355 			      (and (eq_attr "memory" "store")
    356 				   (and (eq_attr "mode" "!XF")
    357 					(eq_attr "type" "fmov"))))
    358 			 "c2_decodern,c2_p3+c2_p4")
    359 
    360 (define_insn_reservation "c2_fmov_XF_store" 3
    361 			 (and (eq_attr "cpu" "core2,nehalem")
    362 			      (and (eq_attr "memory" "store")
    363 				   (and (eq_attr "mode" "XF")
    364 					(eq_attr "type" "fmov"))))
    365 			 "c2_decoder0,(c2_p3+c2_p4),(c2_p3+c2_p4)")
    366 
    367 ;; fmul executes on port 0 with latency 5.  It has issue latency 2,
    368 ;; but we don't model this.
        ;; NOTE(review): both reservations below occupy port 0 for two
        ;; consecutive cycles, which looks like it does model an issue latency
        ;; of 2 -- confirm whether the statement above is stale.
    369 (define_insn_reservation "c2_fmul" 5
    370 			 (and (eq_attr "cpu" "core2,nehalem")
    371 			      (and (eq_attr "memory" "none")
    372 				   (eq_attr "type" "fmul")))
    373 			 "c2_decoder0,c2_p0*2")
    374 
    375 (define_insn_reservation "c2_fmul_load" 6
    376 			 (and (eq_attr "cpu" "core2,nehalem")
    377 			      (and (eq_attr "memory" "load")
    378 				   (eq_attr "type" "fmul")))
    379 			 "c2_decoder0,c2_p2+c2_p0,c2_p0")
    380 
    381 ;; fdiv latencies depend on the mode of the operands.  XFmode gives
    382 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
    383 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
    384 ;; that.  Throughput is equal to latency - 1, which we model using the
    385 ;; core2_fdiv automaton: each reservation holds the c2_fdiv unit for
        ;; (register-form latency - 1) cycles.  Insns of type fpspc are modelled
        ;; the same way as fdiv.  The load forms need decoder 0, reserve load
        ;; port 2 in the first execution cycle, and add one cycle of latency.
    386 (define_insn_reservation "c2_fdiv_SF" 18
    387 			 (and (eq_attr "cpu" "core2,nehalem")
    388 			      (and (eq_attr "memory" "none")
    389 				   (and (eq_attr "mode" "SF")
    390 					(eq_attr "type" "fdiv,fpspc"))))
    391 			 "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*16")
    392 
    393 (define_insn_reservation "c2_fdiv_SF_load" 19
    394 			 (and (eq_attr "cpu" "core2,nehalem")
    395 			      (and (eq_attr "memory" "load")
    396 				   (and (eq_attr "mode" "SF")
    397 					(eq_attr "type" "fdiv,fpspc"))))
    398 			 "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*16")
    399 
    400 (define_insn_reservation "c2_fdiv_DF" 32
    401 			 (and (eq_attr "cpu" "core2,nehalem")
    402 			      (and (eq_attr "memory" "none")
    403 				   (and (eq_attr "mode" "DF")
    404 					(eq_attr "type" "fdiv,fpspc"))))
    405 			 "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*30")
    406 
    407 (define_insn_reservation "c2_fdiv_DF_load" 33
    408 			 (and (eq_attr "cpu" "core2,nehalem")
    409 			      (and (eq_attr "memory" "load")
    410 				   (and (eq_attr "mode" "DF")
    411 					(eq_attr "type" "fdiv,fpspc"))))
    412 			 "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*30")
    413 
    414 (define_insn_reservation "c2_fdiv_XF" 38
    415 			 (and (eq_attr "cpu" "core2,nehalem")
    416 			      (and (eq_attr "memory" "none")
    417 				   (and (eq_attr "mode" "XF")
    418 					(eq_attr "type" "fdiv,fpspc"))))
    419 			 "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*36")
    420 
    421 (define_insn_reservation "c2_fdiv_XF_load" 39
    422 			 (and (eq_attr "cpu" "core2,nehalem")
    423 			      (and (eq_attr "memory" "load")
    424 				   (and (eq_attr "mode" "XF")
    425 					(eq_attr "type" "fdiv,fpspc"))))
    426 			 "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*36")
    427 
    428 ;; MMX instructions.
    429 
        ;; Integer vector adds (types mmxadd and sseiadd) without a memory
        ;; operand: any decoder, then port 0 or port 5.
    430 (define_insn_reservation "c2_mmx_add" 1
    431 			 (and (eq_attr "cpu" "core2,nehalem")
    432 			      (and (eq_attr "memory" "none")
    433 				   (eq_attr "type" "mmxadd,sseiadd")))
    434 			 "c2_decodern,c2_p0|c2_p5")
    435 
    436 (define_insn_reservation "c2_mmx_add_load" 2
    437 			 (and (eq_attr "cpu" "core2,nehalem")
    438 			      (and (eq_attr "memory" "load")
    439 				   (eq_attr "type" "mmxadd,sseiadd")))
    440 			 "c2_decodern,c2_p2+c2_p0|c2_p5")
    441 
        ;; MMX shifts without a memory operand: any decoder, port 0 or 5.
    442 (define_insn_reservation "c2_mmx_shft" 1
    443 			 (and (eq_attr "cpu" "core2,nehalem")
    444 			      (and (eq_attr "memory" "none")
    445 				   (eq_attr "type" "mmxshft")))
    446 			 "c2_decodern,c2_p0|c2_p5")
    447 
        ;; Load form: decoder 0, load port 2 together with port 1.
        ;; NOTE(review): this uses port 1 whereas the register form above
        ;; uses port 0 or 5 -- confirm this is intended.
    448 (define_insn_reservation "c2_mmx_shft_load" 2
    449 			 (and (eq_attr "cpu" "core2,nehalem")
    450 			      (and (eq_attr "memory" "load")
    451 				   (eq_attr "type" "mmxshft")))
    452 			 "c2_decoder0,c2_p2+c2_p1")
    453 
        ;; sseishft with a nonzero length_immediate attribute and no memory
        ;; operand: latency 1, any decoder, port 1.
    454 (define_insn_reservation "c2_mmx_sse_shft" 1
    455 			 (and (eq_attr "cpu" "core2,nehalem")
    456 			      (and (eq_attr "memory" "none")
    457 				   (and (eq_attr "type" "sseishft")
    458 					(eq_attr "length_immediate" "!0"))))
    459 			 "c2_decodern,c2_p1")
    460 
    461 (define_insn_reservation "c2_mmx_sse_shft_load" 2
    462 			 (and (eq_attr "cpu" "core2,nehalem")
    463 			      (and (eq_attr "memory" "load")
    464 				   (and (eq_attr "type" "sseishft")
    465 					(eq_attr "length_immediate" "!0"))))
    466 			 "c2_decodern,c2_p1")
    467 
        ;; sseishft with a zero length_immediate attribute and no memory
        ;; operand: latency 2, any decoder, port 1.
    468 (define_insn_reservation "c2_mmx_sse_shft1" 2
    469 			 (and (eq_attr "cpu" "core2,nehalem")
    470 			      (and (eq_attr "memory" "none")
    471 				   (and (eq_attr "type" "sseishft")
    472 					(eq_attr "length_immediate" "0"))))
    473 			 "c2_decodern,c2_p1")
    474 
    475 (define_insn_reservation "c2_mmx_sse_shft1_load" 3
    476 			 (and (eq_attr "cpu" "core2,nehalem")
    477 			      (and (eq_attr "memory" "load")
    478 				   (and (eq_attr "type" "sseishft")
    479 					(eq_attr "length_immediate" "0"))))
    480 			 "c2_decodern,c2_p1")
    481 
        ;; Integer vector multiplies (types mmxmul and sseimul) without a
        ;; memory operand: latency 3, any decoder, port 1.
    482 (define_insn_reservation "c2_mmx_mul" 3
    483 			 (and (eq_attr "cpu" "core2,nehalem")
    484 			      (and (eq_attr "memory" "none")
    485 				   (eq_attr "type" "mmxmul,sseimul")))
    486 			 "c2_decodern,c2_p1")
    487 
    488 (define_insn_reservation "c2_mmx_mul_load" 3
    489 			 (and (eq_attr "cpu" "core2,nehalem")
    490 			      (and (eq_attr "memory" "none")
    491 				   (eq_attr "type" "mmxmul,sseimul")))
    492 			 "c2_decoder0,c2_p2+c2_p1")
    493 
        ;; DImode mmxcvt insns: latency 4, any decoder, port 1.  The memory
        ;; attribute is not tested, so every memory kind matches.
    494 (define_insn_reservation "c2_sse_mmxcvt" 4
    495 			 (and (eq_attr "cpu" "core2,nehalem")
    496 			      (and (eq_attr "mode" "DI")
    497 				   (eq_attr "type" "mmxcvt")))
    498 			 "c2_decodern,c2_p1")
    499 
    500 ;; FIXME: These are Pentium III only, but we cannot tell here if
    501 ;; we're generating code for PentiumPro/Pentium II or Pentium III
    502 ;; (define_insn_reservation "c2_sse_mmxshft" 2
    503 ;;			 (and (eq_attr "cpu" "core2,nehalem")
    504 ;;			      (and (eq_attr "mode" "TI")
    505 ;;				   (eq_attr "type" "mmxshft")))
    506 ;;			 "c2_decodern,c2_p0")
    507 
    508 ;; The sfence instruction.  It is recognized as type "sse" with memory
        ;; "unknown", needs decoder 0 and reserves store ports 4 and 3.
    509 (define_insn_reservation "c2_sse_sfence" 3
    510 			 (and (eq_attr "cpu" "core2,nehalem")
    511 			      (and (eq_attr "memory" "unknown")
    512 				   (eq_attr "type" "sse")))
    513 			 "c2_decoder0,c2_p4+c2_p3")
    514 
    515 ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
        ;; Scalar (SFmode/DFmode) insns of type "sse": any decoder, port 0.
        ;; The memory attribute is not tested here.
    516 (define_insn_reservation "c2_sse_SFDF" 3
    517 			 (and (eq_attr "cpu" "core2,nehalem")
    518 			      (and (eq_attr "mode" "SF,DF")
    519 				   (eq_attr "type" "sse")))
    520 			 "c2_decodern,c2_p0")
    521 
        ;; V4SFmode insns of type "sse": decoder 0, two cycles on port 1.
    522 (define_insn_reservation "c2_sse_V4SF" 4
    523 			 (and (eq_attr "cpu" "core2,nehalem")
    524 			      (and (eq_attr "mode" "V4SF")
    525 				   (eq_attr "type" "sse")))
    526 			 "c2_decoder0,c2_p1*2")
    527 
        ;; SSE floating-point add and compare types execute on port 1 with
        ;; latency 3.  The load form reserves load port 2 in the same cycle
        ;; and keeps the same latency.
    528 (define_insn_reservation "c2_sse_addcmp" 3
    529 			 (and (eq_attr "cpu" "core2,nehalem")
    530 			      (and (eq_attr "memory" "none")
    531 				   (eq_attr "type" "sseadd,sseadd1,ssecmp,ssecomi")))
    532 			 "c2_decodern,c2_p1")
    533 
    534 (define_insn_reservation "c2_sse_addcmp_load" 3
    535 			 (and (eq_attr "cpu" "core2,nehalem")
    536 			      (and (eq_attr "memory" "load")
    537 				   (eq_attr "type" "sseadd,sseadd1,ssecmp,ssecomi")))
    538 			 "c2_decodern,c2_p2+c2_p1")
    539 
        ;; SSE multiplies execute on port 0: latency 4 for SF/V4SF modes and
        ;; latency 5 for DF/V2DF modes.  The load forms reserve load port 2 in
        ;; the same cycle and keep the same latency.
    540 (define_insn_reservation "c2_sse_mul_SF" 4
    541 			 (and (eq_attr "cpu" "core2,nehalem")
    542 			      (and (eq_attr "memory" "none")
    543 				   (and (eq_attr "mode" "SF,V4SF")
    544 					(eq_attr "type" "ssemul"))))
    545 			"c2_decodern,c2_p0")
    546 
    547 (define_insn_reservation "c2_sse_mul_SF_load" 4
    548 			 (and (eq_attr "cpu" "core2,nehalem")
    549 			      (and (eq_attr "memory" "load")
    550 				   (and (eq_attr "mode" "SF,V4SF")
    551 					(eq_attr "type" "ssemul"))))
    552 			"c2_decodern,c2_p2+c2_p0")
    553 
    554 (define_insn_reservation "c2_sse_mul_DF" 5
    555 			 (and (eq_attr "cpu" "core2,nehalem")
    556 			      (and (eq_attr "memory" "none")
    557 				   (and (eq_attr "mode" "DF,V2DF")
    558 					(eq_attr "type" "ssemul"))))
    559 			"c2_decodern,c2_p0")
    560 
    561 (define_insn_reservation "c2_sse_mul_DF_load" 5
    562 			 (and (eq_attr "cpu" "core2,nehalem")
    563 			      (and (eq_attr "memory" "load")
    564 				   (and (eq_attr "mode" "DF,V2DF")
    565 					(eq_attr "type" "ssemul"))))
    566 			"c2_decodern,c2_p2+c2_p0")
    567 
        ;; SF/V4SF SSE divide without a memory operand: one cycle on port 0,
        ;; then the c2_ssediv unit is held for 17 cycles (latency - 1).
    568 (define_insn_reservation "c2_sse_div_SF" 18
    569 			 (and (eq_attr "cpu" "core2,nehalem")
    570 			      (and (eq_attr "memory" "none")
    571 				   (and (eq_attr "mode" "SF,V4SF")
    572 					(eq_attr "type" "ssediv"))))
    573 			 "c2_decodern,c2_p0,c2_ssediv*17")
    574 
    575 (define_insn_reservation "c2_sse_div_SF_load" 18
    576 			 (and (eq_attr "cpu" "core2,nehalem")
    577 			      (and (eq_attr "memory" "none")
    578 				   (and (eq_attr "mode" "SF,V4SF")
    579 					(eq_attr "type" "ssediv"))))
    580 			 "c2_decodern,(c2_p2+c2_p0),c2_ssediv*17")
    581 
        ;; DF/V2DF SSE divide without a memory operand: one cycle on port 0,
        ;; then the c2_ssediv unit is held for 31 cycles (latency - 1).
    582 (define_insn_reservation "c2_sse_div_DF" 32
    583 			 (and (eq_attr "cpu" "core2,nehalem")
    584 			      (and (eq_attr "memory" "none")
    585 				   (and (eq_attr "mode" "DF,V2DF")
    586 					(eq_attr "type" "ssediv"))))
    587 			 "c2_decodern,c2_p0,c2_ssediv*31")
    588 
    589 (define_insn_reservation "c2_sse_div_DF_load" 32
    590 			 (and (eq_attr "cpu" "core2,nehalem")
    591 			      (and (eq_attr "memory" "none")
    592 				   (and (eq_attr "mode" "DF,V2DF")
    593 					(eq_attr "type" "ssediv"))))
    594 			 "c2_decodern,(c2_p2+c2_p0),c2_ssediv*31")
    595 
    596 ;; FIXME: these have limited throughput
        ;; sseicvt conversions, split by mode.  The "_load" forms match every
        ;; memory kind other than "none" and reserve load port 2 together with
        ;; port 1.  Both DFmode forms must be decoded on decoder 0; the DFmode
        ;; register form reserves ports 0 and 1 in the same cycle.
    597 (define_insn_reservation "c2_sse_icvt_SF" 4
    598 			 (and (eq_attr "cpu" "core2,nehalem")
    599 			      (and (eq_attr "memory" "none")
    600 				   (and (eq_attr "mode" "SF")
    601 					(eq_attr "type" "sseicvt"))))
    602 			 "c2_decodern,c2_p1")
    603 
    604 (define_insn_reservation "c2_sse_icvt_SF_load" 4
    605 			 (and (eq_attr "cpu" "core2,nehalem")
    606 			      (and (eq_attr "memory" "!none")
    607 				   (and (eq_attr "mode" "SF")
    608 					(eq_attr "type" "sseicvt"))))
    609 			 "c2_decodern,c2_p2+c2_p1")
    610 
    611 (define_insn_reservation "c2_sse_icvt_DF" 4
    612 			 (and (eq_attr "cpu" "core2,nehalem")
    613 			      (and (eq_attr "memory" "none")
    614 				   (and (eq_attr "mode" "DF")
    615 					(eq_attr "type" "sseicvt"))))
    616 			 "c2_decoder0,c2_p0+c2_p1")
    617 
    618 (define_insn_reservation "c2_sse_icvt_DF_load" 4
    619 			 (and (eq_attr "cpu" "core2,nehalem")
    620 			      (and (eq_attr "memory" "!none")
    621 				   (and (eq_attr "mode" "DF")
    622 					(eq_attr "type" "sseicvt"))))
    623 			 "c2_decoder0,(c2_p2+c2_p1)")
    624 
    625 (define_insn_reservation "c2_sse_icvt_SI" 3
    626 			 (and (eq_attr "cpu" "core2,nehalem")
    627 			      (and (eq_attr "memory" "none")
    628 				   (and (eq_attr "mode" "SI")
    629 					(eq_attr "type" "sseicvt"))))
    630 			 "c2_decodern,c2_p1")
    631 
    632 (define_insn_reservation "c2_sse_icvt_SI_load" 3
    633 			 (and (eq_attr "cpu" "core2,nehalem")
    634 			      (and (eq_attr "memory" "!none")
    635 				   (and (eq_attr "mode" "SI")
    636 					(eq_attr "type" "sseicvt"))))
    637 			 "c2_decodern,(c2_p2+c2_p1)")
    638 
        ;; SSE moves can use any decoder.  Register moves execute on port 0,
        ;; 1 or 5; loads use only load port 2 (latency 2); stores use store
        ;; ports 4 and 3 together.
    639 (define_insn_reservation "c2_sse_mov" 1
    640 			 (and (eq_attr "cpu" "core2,nehalem")
    641 			      (and (eq_attr "memory" "none")
    642 				   (eq_attr "type" "ssemov")))
    643 			 "c2_decodern,(c2_p0|c2_p1|c2_p5)")
    644 
    645 (define_insn_reservation "c2_sse_mov_load" 2
    646 			 (and (eq_attr "cpu" "core2,nehalem")
    647 			      (and (eq_attr "memory" "load")
    648 				   (eq_attr "type" "ssemov")))
    649 			 "c2_decodern,c2_p2")
    650 
    651 (define_insn_reservation "c2_sse_mov_store" 1
    652 			 (and (eq_attr "cpu" "core2,nehalem")
    653 			      (and (eq_attr "memory" "store")
    654 				   (eq_attr "type" "ssemov")))
    655 			 "c2_decodern,c2_p4+c2_p3")
    656 
    657 ;; All other instructions are modelled as simple instructions.
    658 ;; We have already modelled all i387 floating point instructions, so all
    659 ;; other instructions execute on either port 0, 1 or 5.  This includes
    660 ;; the ALU units, and the MMX units.
    661 ;;
    662 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
    663 ;; the four decoders.  Loads benefit from micro-op fusion and can be
    664 ;; treated in the same way.
    665 (define_insn_reservation "c2_insn" 1
    666 			 (and (eq_attr "cpu" "core2,nehalem")
    667 			      (and (eq_attr "memory" "none,unknown")
    668 				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp")))
    669 			 "c2_decodern,(c2_p0|c2_p1|c2_p5)")
    670 
        ;; The load and read-modify-store reservations additionally match
        ;; insns of type "pop".
    671 (define_insn_reservation "c2_insn_load" 4
    672 			 (and (eq_attr "cpu" "core2,nehalem")
    673 			      (and (eq_attr "memory" "load")
    674 				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp")))
    675 			 "c2_decodern,c2_p2,(c2_p0|c2_p1|c2_p5)")
    676 
    677 ;; register-memory instructions have three uops, so they have to be
    678 ;; decoded on c2_decoder0.
    679 (define_insn_reservation "c2_insn_store" 1
    680 			 (and (eq_attr "cpu" "core2,nehalem")
    681 			      (and (eq_attr "memory" "store")
    682 				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp")))
    683 			 "c2_decoder0,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3")
    684 
    685 ;; read-modify-store instructions produce 4 uops so they have to be
    686 ;; decoded on c2_decoder0 as well.
    687 (define_insn_reservation "c2_insn_both" 4
    688 			 (and (eq_attr "cpu" "core2,nehalem")
    689 			      (and (eq_attr "memory" "both")
    690 				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp")))
    691 			 "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3")
    692