Home | History | Annotate | Line # | Download | only in i386
      1 ;; Scheduling for the Intel P6 family of processors
      2 ;; Copyright (C) 2004-2022 Free Software Foundation, Inc.
      3 ;;
      4 ;; This file is part of GCC.
      5 ;;
      6 ;; GCC is free software; you can redistribute it and/or modify
      7 ;; it under the terms of the GNU General Public License as published by
      8 ;; the Free Software Foundation; either version 3, or (at your option)
      9 ;; any later version.
     10 ;;
     11 ;; GCC is distributed in the hope that it will be useful,
     12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
     13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     14 ;; GNU General Public License for more details.
     15 ;;
     16 ;; You should have received a copy of the GNU General Public License
     17 ;; along with GCC; see the file COPYING3.  If not see
     18 ;; <http://www.gnu.org/licenses/>.
     19 
     20 ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
     21 ;; and Xeon lines of CPUs.  The DFA scheduler description in this file is
     22 ;; based on information that can be found in the following three documents:
     23 ;;
     24 ;;    "P6 Family of Processors Hardware Developer's Manual",
     25 ;;    Intel, September 1999.
     26 ;;
     27 ;;    "Intel Architecture Optimization Manual",
     28 ;;    Intel, 1999 (Order Number: 245127-001).
     29 ;;
     30 ;;    "How to optimize for the Pentium family of microprocessors",
     31 ;;    by Agner Fog, PhD.
     32 ;;
     33 ;; The P6 pipeline has three major components:
     34 ;;   1) the FETCH/DECODE unit, an in-order issue front-end
     35 ;;   2) the DISPATCH/EXECUTE unit, which is the out-of-order core
     36 ;;   3) the RETIRE unit, an in-order retirement unit
     37 ;;
     38 ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
     39 ;; retirement unit are naturally in-order.
     40 ;;
     41 ;;                       BUS INTERFACE UNIT
     42 ;;                     /                   \
     43 ;;                L1 ICACHE             L1 DCACHE
     44 ;;              /     |     \              |     \
     45 ;;       DECODER0  DECODER1  DECODER2  DISP/EXEC  RETIRE
     46 ;;              \     |     /              |        |
     47 ;;            INSTRUCTION POOL   __________|_______/
     48 ;;          (inc. reorder buffer)
     49 ;;
     50 ;; Since the P6 CPUs execute instructions out-of-order, the most important
     51 ;; consideration in performance tuning is making sure enough micro-ops are
     52 ;; ready for execution in the out-of-order core, while not stalling the
     53 ;; decoder.
     54 ;;
     55 ;; TODO:
     56 ;; - Find a less crude way to model complex instructions, in
     57 ;;   particular how many cycles they take to be decoded.
     58 ;; - Include decoder latencies in the total reservation latencies.
     59 ;;   This isn't necessary right now because we assume for every
     60 ;;   instruction that it never blocks a decoder.
     61 ;; - Figure out where the p0 and p1 reservations come from.  These
     62 ;;   appear not to be in the manual.
     63 ;; - Lots more because I'm sure this is still far from optimal :-)
     64 
     65 ;; Six automata: decoder, core (p0/p1), load (p2), store (p3/p4), and
     66 ;; ppro_idiv/ppro_fdiv, which model issue latencies of idiv and fdiv insns.
     67 (define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
     68 
     69 ;; Simple instructions of the register-register form have only one uop.
     70 ;; Load instructions are also only one uop.  Store instructions decode to
     71 ;; two uops, and simple read-modify instructions also take two uops.
     72 ;; Simple instructions of the register-memory form have two to three uops.
     73 ;; Simple read-modify-write instructions have four uops.  The rules for
     74 ;; the decoder are simple:
     75 ;;  - an instruction with 1 uop can be decoded by any of the three
     76 ;;    decoders in one cycle.
     77 ;;  - an instruction with 2 to 4 uops can be decoded only by decoder 0
     78 ;;    but still in only one cycle.
     79 ;;  - a complex (microcode) instruction can also only be decoded by
     80 ;;    decoder 0, and this takes an unspecified number of cycles.
     81 ;;
     82 ;; The goal is a few-one-one uops sequence in each cycle (a multi-uop insn
     83 ;; on decoder 0, then two 1-uop insns), to decode as many insns as possible.
     84 (define_cpu_unit "decoder0" "ppro_decoder")
     85 (define_cpu_unit "decoder1" "ppro_decoder")
     86 (define_cpu_unit "decoder2" "ppro_decoder")
     87 
     88 ;; We first wish to find an instruction for decoder0, so decoder1 and
     89 ;; decoder2 may only be reserved when decoder0 is reserved as well
     90 ;; (see presence_set in the GCC internals manual).
     91 (presence_set "decoder1" "decoder0")
     92 (presence_set "decoder2" "decoder0")
     93 
     94 ;; Most instructions can be decoded on any one of the three decoders ("|" = alternative).
     95 (define_reservation "decodern" "(decoder0|decoder1|decoder2)")
     96 
     97 ;; The out-of-order core has five pipelines.  During each cycle, the core
     98 ;; may dispatch zero or one uop on the port of any of the five pipelines
     99 ;; so the maximum number of dispatched uops per cycle is 5.  In practice,
    100 ;; 3 uops per cycle is more realistic.
    101 ;;
    102 ;; Two of the five pipelines contain several execution units:
    103 ;;
    104 ;; Port 0	Port 1		Port 2		Port 3		Port 4
    105 ;; ALU		ALU		LOAD		SAC		SDA
    106 ;; FPU		JUE
    107 ;; AGU		MMX
    108 ;; MMX		P3FPU
    109 ;; P3FPU
    110 ;;
    111 ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
    112 ;;  JUE = Jump Execution Unit, AGU = Address Generation Unit)
    113 ;; Units p0..p4 stand for the five ports; idiv and fdiv model the dividers.
    114 (define_cpu_unit "p0,p1" "ppro_core")
    115 (define_cpu_unit "p2" "ppro_load")
    116 (define_cpu_unit "p3,p4" "ppro_store")
    117 (define_cpu_unit "idiv" "ppro_idiv")
    118 (define_cpu_unit "fdiv" "ppro_fdiv")
    119 
    120 ;; Only the irregular instructions have to be modeled here.  A load
    121 ;; increases the latency by 2 or 3, or by nothing if the manual gives
    122 ;; a latency already.  Store latencies are not accounted for.
    123 ;;
    124 ;; The simple instructions follow a very regular pattern of 1 uop per
    125 ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
    126 ;; on port 4 and port 3.  These instructions are modelled at the bottom
    127 ;; of this file.
    128 ;;
    129 ;; For microcoded instructions we don't know how many uops are produced.
    130 ;; These instructions are the "complex" ones in the Intel manuals.  All
    131 ;; we _do_ know is that they typically produce four or more uops, so
    132 ;; they can only be decoded on decoder0.  Modelling their latencies
    133 ;; doesn't make sense because we don't know how these instructions are
    134 ;; executed in the core.  So we just model that they can only be decoded
    135 ;; on decoder 0, and say that it takes a little while (latency 6) before
    136 ;; the result is available.
    137 (define_insn_reservation "ppro_complex_insn" 6
    138 			 (and (eq_attr "cpu" "pentiumpro")
    139 			      (eq_attr "type" "other,multi,call,callv,str"))
    140 			 "decoder0")
    141 
    142 ;; Register moves use p0 or p1; imov with memory operands skips those ports.
    143 (define_insn_reservation "ppro_imov" 1
    144 			 (and (eq_attr "cpu" "pentiumpro")
    145 			      (and (eq_attr "memory" "none")
    146 				   (eq_attr "type" "imov")))
    147 			 "decodern,(p0|p1)")
    148 ;; Integer load: any decoder, then the load port p2; latency 4.
    149 (define_insn_reservation "ppro_imov_load" 4
    150 			 (and (eq_attr "cpu" "pentiumpro")
    151 			      (and (eq_attr "memory" "load")
    152 				   (eq_attr "type" "imov")))
    153 			 "decodern,p2")
    154 ;; Integer store: decoder0 only, then p4 and p3 together; latency 1.
    155 (define_insn_reservation "ppro_imov_store" 1
    156 			 (and (eq_attr "cpu" "pentiumpro")
    157 			      (and (eq_attr "memory" "store")
    158 				   (eq_attr "type" "imov")))
    159 			 "decoder0,p4+p3")
    160 
    161 ;; imovx always decodes to one uop, and also doesn't use the integer
    162 ;; units if it has memory operands.  Register form: p0 or p1, latency 1.
    163 (define_insn_reservation "ppro_imovx" 1
    164 			 (and (eq_attr "cpu" "pentiumpro")
    165 			      (and (eq_attr "memory" "none")
    166 				   (eq_attr "type" "imovx")))
    167 			 "decodern,(p0|p1)")
    168 ;; imovx from memory: any decoder, then the load port p2; latency 4.
    169 (define_insn_reservation "ppro_imovx_load" 4
    170 			 (and (eq_attr "cpu" "pentiumpro")
    171 			      (and (eq_attr "memory" "load")
    172 				   (eq_attr "type" "imovx")))
    173 			 "decodern,p2")
    174 
    175 ;; lea decodes on any decoder and executes on port 0 with latency and throughput 1.
    176 (define_insn_reservation "ppro_lea" 1
    177 			 (and (eq_attr "cpu" "pentiumpro")
    178 			      (and (eq_attr "memory" "none")
    179 				   (eq_attr "type" "lea")))
    180 			 "decodern,p0")
    181 
    182 ;; Shift and rotate execute on port 0 with latency and throughput 1.
    183 ;; The load and store units need to be reserved when memory operands
    184 ;; are involved.
    185 (define_insn_reservation "ppro_shift_rotate" 1
    186 			 (and (eq_attr "cpu" "pentiumpro")
    187 			      (and (eq_attr "memory" "none")
    188 				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
    189 			 "decodern,p0")
    190 ;; Any memory form ("!none"): decoder0, p2 together with p0, then p4+p3; latency 4.
    191 (define_insn_reservation "ppro_shift_rotate_mem" 4
    192 			 (and (eq_attr "cpu" "pentiumpro")
    193 			      (and (eq_attr "memory" "!none")
    194 				   (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
    195 			 "decoder0,p2+p0,p4+p3")
    196 
    197 
    198 ;; The P6 has a sophisticated branch prediction mechanism to minimize
    199 ;; latencies due to branching.  In particular, it has a fast way to
    200 ;; execute branches that are taken multiple times (such as in loops).
    201 ;; Branches not taken suffer no penalty, and correctly predicted
    202 ;; branches cost only one fetch cycle.  Mispredicted branches are very
    203 ;; costly: typically 15 cycles and possibly as many as 26 cycles.
    204 ;;
    205 ;; Unfortunately all this makes it quite difficult to properly model
    206 ;; the latencies for the compiler.  Here I've made the choice to be
    207 ;; optimistic and assume branches are often predicted correctly, so
    208 ;; they have latency 1, and the decoders are not blocked.
    209 ;;
    210 ;; In addition, the model assumes a branch always decodes to only 1 uop,
    211 ;; which is not exactly true because there are a few instructions that
    212 ;; decode to 2 uops or microcode.  But this probably gives the best
    213 ;; results because we can assume these instructions can decode on all
    214 ;; decoders.  Execution is reserved on port 1.
    215 (define_insn_reservation "ppro_branch" 1
    216 			 (and (eq_attr "cpu" "pentiumpro")
    217 			      (and (eq_attr "memory" "none")
    218 				   (eq_attr "type" "ibr")))
    219 			 "decodern,p1")
    220 
    221 ;; Branches with a memory operand: decoder0, p2+p1.  ??? Latency is probably worse.
    222 (define_insn_reservation "ppro_indirect_branch" 6
    223 			 (and (eq_attr "cpu" "pentiumpro")
    224 			      (and (eq_attr "memory" "!none")
    225 				   (eq_attr "type" "ibr")))
    226 			 "decoder0,p2+p1")
    227 ;; leave: decoder0, p2 together with p0 or p1, then p0 or p1 again; latency 4.
    228 (define_insn_reservation "ppro_leave" 4
    229 			 (and (eq_attr "cpu" "pentiumpro")
    230 			      (eq_attr "type" "leave"))
    231 			 "decoder0,p2+(p0|p1),(p0|p1)")
    232 
    233 ;; imul has throughput one, but latency 4, and can only execute on port 0.
    234 (define_insn_reservation "ppro_imul" 4
    235 			 (and (eq_attr "cpu" "pentiumpro")
    236 			      (and (eq_attr "memory" "none")
    237 				   (eq_attr "type" "imul")))
    238 			 "decodern,p0")
    239 ;; imul with a memory operand: decoder0, p2 together with p0; latency 4.
    240 (define_insn_reservation "ppro_imul_mem" 4
    241 			 (and (eq_attr "cpu" "pentiumpro")
    242 			      (and (eq_attr "memory" "!none")
    243 				   (eq_attr "type" "imul")))
    244 			 "decoder0,p2+p0")
    245 
    246 ;; div and idiv are very similar, so we model them the same.
    247 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
    248 ;; These issue latencies are modelled via the ppro_idiv automaton.
    249 (define_insn_reservation "ppro_idiv_QI" 19
    250 			 (and (eq_attr "cpu" "pentiumpro")
    251 			      (and (eq_attr "memory" "none")
    252 				   (and (eq_attr "mode" "QI")
    253 					(eq_attr "type" "idiv"))))
    254 			 "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
    255 ;; QI divide with a load: idiv held for 1+1+1+9 = 12 cycles, p2 in the first.
    256 (define_insn_reservation "ppro_idiv_QI_load" 19
    257 			 (and (eq_attr "cpu" "pentiumpro")
    258 			      (and (eq_attr "memory" "load")
    259 				   (and (eq_attr "mode" "QI")
    260 					(eq_attr "type" "idiv"))))
    261 			 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
    262 ;; HI divide: idiv unit held for 3+1+17 = 21 cycles; result latency 23.
    263 (define_insn_reservation "ppro_idiv_HI" 23
    264 			 (and (eq_attr "cpu" "pentiumpro")
    265 			      (and (eq_attr "memory" "none")
    266 				   (and (eq_attr "mode" "HI")
    267 					(eq_attr "type" "idiv"))))
    268 			 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
    269 ;; HI divide with a load: idiv held for 1+1+1+18 = 21 cycles, p2 in the first.
    270 (define_insn_reservation "ppro_idiv_HI_load" 23
    271 			 (and (eq_attr "cpu" "pentiumpro")
    272 			      (and (eq_attr "memory" "load")
    273 				   (and (eq_attr "mode" "HI")
    274 					(eq_attr "type" "idiv"))))
    275 			 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
    276 ;; SI divide: idiv unit held for 3+1+33 = 37 cycles; result latency 39.
    277 (define_insn_reservation "ppro_idiv_SI" 39
    278 			 (and (eq_attr "cpu" "pentiumpro")
    279 			      (and (eq_attr "memory" "none")
    280 				   (and (eq_attr "mode" "SI")
    281 					(eq_attr "type" "idiv"))))
    282 			 "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
    283 ;; SI divide with a load: idiv held for 1+1+1+34 = 37 cycles, p2 in the first.
    284 (define_insn_reservation "ppro_idiv_SI_load" 39
    285 			 (and (eq_attr "cpu" "pentiumpro")
    286 			      (and (eq_attr "memory" "load")
    287 				   (and (eq_attr "mode" "SI")
    288 					(eq_attr "type" "idiv"))))
    289 			 "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
    290 
    291 ;; Floating point operations always execute on port 0.
    292 ;; ??? where do these latencies come from? fadd has latency 3 and
    293 ;;     has throughput "1/cycle (align with FADD)".  What do they
    294 ;;     mean and how can we model that?
    295 (define_insn_reservation "ppro_fop" 3
    296 			 (and (eq_attr "cpu" "pentiumpro")
    297 			      (and (eq_attr "memory" "none,unknown")
    298 				   (eq_attr "type" "fop")))
    299 			 "decodern,p0")
    300 ;; fop with a load: decoder0, p2 together with p0, then p0 again; latency 5.
    301 (define_insn_reservation "ppro_fop_load" 5
    302 			 (and (eq_attr "cpu" "pentiumpro")
    303 			      (and (eq_attr "memory" "load")
    304 				   (eq_attr "type" "fop")))
    305 			 "decoder0,p2+p0,p0")
    306 ;; fop with a store: decoder0, p0 for three cycles, p4+p3 joining the third.
    307 (define_insn_reservation "ppro_fop_store" 3
    308 			 (and (eq_attr "cpu" "pentiumpro")
    309 			      (and (eq_attr "memory" "store")
    310 				   (eq_attr "type" "fop")))
    311 			 "decoder0,p0,p0,p0+p4+p3")
    312 ;; fop with load and store: decoder0, p2+p0, then p0+p4+p3; latency 5.
    313 (define_insn_reservation "ppro_fop_both" 5
    314 			 (and (eq_attr "cpu" "pentiumpro")
    315 			      (and (eq_attr "memory" "both")
    316 				   (eq_attr "type" "fop")))
    317 			 "decoder0,p2+p0,p0+p4+p3")
    318 ;; fsgn: any decoder, one cycle on port 0; latency 1.
    319 (define_insn_reservation "ppro_fsgn" 1
    320 			 (and (eq_attr "cpu" "pentiumpro")
    321 			      (eq_attr "type" "fsgn"))
    322 			 "decodern,p0")
    323 ;; fistp: decoder0, two cycles on port 0, then p4+p3; latency 5.
    324 (define_insn_reservation "ppro_fistp" 5
    325 			 (and (eq_attr "cpu" "pentiumpro")
    326 			      (eq_attr "type" "fistp"))
    327 			 "decoder0,p0*2,p4+p3")
    328 ;; fcmov: decoder0, two cycles on port 0; latency 2.
    329 (define_insn_reservation "ppro_fcmov" 2
    330 			 (and (eq_attr "cpu" "pentiumpro")
    331 			      (eq_attr "type" "fcmov"))
    332 			 "decoder0,p0*2")
    333 ;; fcmp, register form: any decoder, port 0; latency 1.
    334 (define_insn_reservation "ppro_fcmp" 1
    335 			 (and (eq_attr "cpu" "pentiumpro")
    336 			      (and (eq_attr "memory" "none")
    337 				   (eq_attr "type" "fcmp")))
    338 			 "decodern,p0")
    339 ;; fcmp with a load: decoder0, p2 together with p0; latency 4.
    340 (define_insn_reservation "ppro_fcmp_load" 4
    341 			 (and (eq_attr "cpu" "pentiumpro")
    342 			      (and (eq_attr "memory" "load")
    343 				   (eq_attr "type" "fcmp")))
    344 			 "decoder0,p2+p0")
    345 ;; fmov, register form: any decoder, port 0; latency 1.
    346 (define_insn_reservation "ppro_fmov" 1
    347 			 (and (eq_attr "cpu" "pentiumpro")
    348 			      (and (eq_attr "memory" "none")
    349 				   (eq_attr "type" "fmov")))
    350 			 "decodern,p0")
    351 ;; fmov load, any mode but XF: any decoder, load port p2 only; latency 1.
    352 (define_insn_reservation "ppro_fmov_load" 1
    353 			 (and (eq_attr "cpu" "pentiumpro")
    354 			      (and (eq_attr "memory" "load")
    355 				   (and (eq_attr "mode" "!XF")
    356 					(eq_attr "type" "fmov"))))
    357 			 "decodern,p2")
    358 ;; XFmode load: decoder0, p2+p0 for two cycles; latency 3.
    359 (define_insn_reservation "ppro_fmov_XF_load" 3
    360 			 (and (eq_attr "cpu" "pentiumpro")
    361 			      (and (eq_attr "memory" "load")
    362 				   (and (eq_attr "mode" "XF")
    363 					(eq_attr "type" "fmov"))))
    364 			 "decoder0,(p2+p0)*2")
    365 ;; fmov store, any mode but XF.  NOTE(review): only p0 is reserved, not p4/p3 -- confirm.
    366 (define_insn_reservation "ppro_fmov_store" 1
    367 			 (and (eq_attr "cpu" "pentiumpro")
    368 			      (and (eq_attr "memory" "store")
    369 				   (and (eq_attr "mode" "!XF")
    370 					(eq_attr "type" "fmov"))))
    371 			 "decodern,p0")
    372 ;; XFmode store: decoder0, p0+p4 then p0+p3; latency 3.
    373 (define_insn_reservation "ppro_fmov_XF_store" 3
    374 			 (and (eq_attr "cpu" "pentiumpro")
    375 			      (and (eq_attr "memory" "store")
    376 				   (and (eq_attr "mode" "XF")
    377 					(eq_attr "type" "fmov"))))
    378 			 "decoder0,(p0+p4),(p0+p3)")
    379 
    380 ;; fmul executes on port 0 with latency 5.  It has issue latency 2,
    381 ;; which the two-cycle p0 reservations below appear to capture (review).
    382 (define_insn_reservation "ppro_fmul" 5
    383 			 (and (eq_attr "cpu" "pentiumpro")
    384 			      (and (eq_attr "memory" "none")
    385 				   (eq_attr "type" "fmul")))
    386 			 "decoder0,p0*2")
    387 ;; fmul with a load: decoder0, p2 together with p0, then p0 again; latency 6.
    388 (define_insn_reservation "ppro_fmul_load" 6
    389 			 (and (eq_attr "cpu" "pentiumpro")
    390 			      (and (eq_attr "memory" "load")
    391 				   (eq_attr "type" "fmul")))
    392 			 "decoder0,p2+p0,p0")
    393 
    394 ;; fdiv latencies depend on the mode of the operands.  XFmode gives
    395 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
    396 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
    397 ;; that.  Throughput is equal to latency - 1, which we model using the
    398 ;; ppro_fdiv automaton.  Type fpspc insns share these reservations.
    399 (define_insn_reservation "ppro_fdiv_SF" 18
    400 			 (and (eq_attr "cpu" "pentiumpro")
    401 			      (and (eq_attr "memory" "none")
    402 				   (and (eq_attr "mode" "SF")
    403 					(eq_attr "type" "fdiv,fpspc"))))
    404 			 "decodern,p0+fdiv,fdiv*16")
    405 ;; SF with a load: decoder0, p2 joins the first cycle; latency 19.
    406 (define_insn_reservation "ppro_fdiv_SF_load" 19
    407 			 (and (eq_attr "cpu" "pentiumpro")
    408 			      (and (eq_attr "memory" "load")
    409 				   (and (eq_attr "mode" "SF")
    410 					(eq_attr "type" "fdiv,fpspc"))))
    411 			 "decoder0,p2+p0+fdiv,fdiv*16")
    412 ;; DF: fdiv unit held for 1+30 = 31 cycles; latency 32.
    413 (define_insn_reservation "ppro_fdiv_DF" 32
    414 			 (and (eq_attr "cpu" "pentiumpro")
    415 			      (and (eq_attr "memory" "none")
    416 				   (and (eq_attr "mode" "DF")
    417 					(eq_attr "type" "fdiv,fpspc"))))
    418 			 "decodern,p0+fdiv,fdiv*30")
    419 ;; DF with a load: decoder0, p2 joins the first cycle; latency 33.
    420 (define_insn_reservation "ppro_fdiv_DF_load" 33
    421 			 (and (eq_attr "cpu" "pentiumpro")
    422 			      (and (eq_attr "memory" "load")
    423 				   (and (eq_attr "mode" "DF")
    424 					(eq_attr "type" "fdiv,fpspc"))))
    425 			 "decoder0,p2+p0+fdiv,fdiv*30")
    426 ;; XF: fdiv unit held for 1+36 = 37 cycles; latency 38.
    427 (define_insn_reservation "ppro_fdiv_XF" 38
    428 			 (and (eq_attr "cpu" "pentiumpro")
    429 			      (and (eq_attr "memory" "none")
    430 				   (and (eq_attr "mode" "XF")
    431 					(eq_attr "type" "fdiv,fpspc"))))
    432 			 "decodern,p0+fdiv,fdiv*36")
    433 ;; XF with a load: decoder0, p2 joins the first cycle; latency 39.
    434 (define_insn_reservation "ppro_fdiv_XF_load" 39
    435 			 (and (eq_attr "cpu" "pentiumpro")
    436 			      (and (eq_attr "memory" "load")
    437 				   (and (eq_attr "mode" "XF")
    438 					(eq_attr "type" "fdiv,fpspc"))))
    439 			 "decoder0,p2+p0+fdiv,fdiv*36")
    440 
    441 ;; MMX instructions can execute on either port 0 or port 1 with a
    442 ;; throughput of 1/cycle.
    443 ;;   on port 0:	- ALU (latency 1)
    444 ;;		- Multiplier Unit (latency 3)
    445 ;;   on port 1:	- ALU (latency 1)
    446 ;;		- Shift Unit (latency 1)
    447 ;;
    448 ;; MMX instructions are either of the type reg-reg, or read-modify, and
    449 ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
    450 ;; so they behave as "simple" instructions that need no special modelling.
    451 ;; We only have to model mmxshft (port 1) and mmxmul (port 0).
    452 (define_insn_reservation "ppro_mmx_shft" 1
    453 			 (and (eq_attr "cpu" "pentiumpro")
    454 			      (and (eq_attr "memory" "none")
    455 				   (eq_attr "type" "mmxshft")))
    456 			 "decodern,p1")
    457 
    458 (define_insn_reservation "ppro_mmx_shft_load" 2
    459 			 (and (eq_attr "cpu" "pentiumpro")
    460 			      (and (eq_attr "memory" "none")
    461 				   (eq_attr "type" "mmxshft")))
    462 			 "decoder0,p2+p1")
    463 ;; mmxmul, register form: any decoder, port 0; latency 3.
    464 (define_insn_reservation "ppro_mmx_mul" 3
    465 			 (and (eq_attr "cpu" "pentiumpro")
    466 			      (and (eq_attr "memory" "none")
    467 				   (eq_attr "type" "mmxmul")))
    468 			 "decodern,p0")
    469 
    470 (define_insn_reservation "ppro_mmx_mul_load" 3
    471 			 (and (eq_attr "cpu" "pentiumpro")
    472 			      (and (eq_attr "memory" "none")
    473 				   (eq_attr "type" "mmxmul")))
    474 			 "decoder0,p2+p0")
    475 ;; DImode mmxcvt: any decoder, port 1; latency 4.
    476 (define_insn_reservation "ppro_sse_mmxcvt" 4
    477 			 (and (eq_attr "cpu" "pentiumpro")
    478 			      (and (eq_attr "mode" "DI")
    479 				   (eq_attr "type" "mmxcvt")))
    480 			 "decodern,p1")
    481 
    482 ;; FIXME: These are Pentium III only, but we cannot tell here if
    483 ;; we're generating code for PentiumPro/Pentium II or Pentium III
    484 ;; (define_insn_reservation "ppro_sse_mmxshft" 2
    485 ;;			 (and (eq_attr "cpu" "pentiumpro")
    486 ;;			      (and (eq_attr "mode" "DI")
    487 ;;				   (eq_attr "type" "mmxshft")))
    488 ;;			 "decodern,p0")
    489 
    490 ;; SSE is very complicated, and takes a bit more effort.
    491 ;; ??? I assumed that all SSE instructions decode on decoder0,
    492 ;;     but is this correct?
    493 
    494 ;; The sfence instruction (type sse, memory unknown): decoder0, then p4+p3.
    495 (define_insn_reservation "ppro_sse_sfence" 3
    496 			 (and (eq_attr "cpu" "pentiumpro")
    497 			      (and (eq_attr "memory" "unknown")
    498 				   (eq_attr "type" "sse")))
    499 			 "decoder0,p4+p3")
    500 ;; SFmode insns of type sse: any decoder, port 0; latency 3.
    501 ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
    502 (define_insn_reservation "ppro_sse_SF" 3
    503 			 (and (eq_attr "cpu" "pentiumpro")
    504 			      (and (eq_attr "mode" "SF")
    505 				   (eq_attr "type" "sse")))
    506 			 "decodern,p0")
    507 ;; SFmode SSE add: any decoder, port 1; latency 3.
    508 (define_insn_reservation "ppro_sse_add_SF" 3
    509 			 (and (eq_attr "cpu" "pentiumpro")
    510 			      (and (eq_attr "memory" "none")
    511 				   (and (eq_attr "mode" "SF")
    512 					(eq_attr "type" "sseadd,sseadd1"))))
    513 			 "decodern,p1")
    514 ;; SFmode SSE add with a load: decoder0, p2 together with p1.
    515 (define_insn_reservation "ppro_sse_add_SF_load" 3
    516 			 (and (eq_attr "cpu" "pentiumpro")
    517 			      (and (eq_attr "memory" "load")
    518 				   (and (eq_attr "mode" "SF")
    519 					(eq_attr "type" "sseadd,sseadd1"))))
    520 			 "decoder0,p2+p1")
    521 ;; SFmode SSE compare: decoder0, port 1; latency 3.
    522 (define_insn_reservation "ppro_sse_cmp_SF" 3
    523 			 (and (eq_attr "cpu" "pentiumpro")
    524 			      (and (eq_attr "memory" "none")
    525 				   (and (eq_attr "mode" "SF")
    526 					(eq_attr "type" "ssecmp"))))
    527 			 "decoder0,p1")
    528 ;; SFmode SSE compare with a load: decoder0, p2 together with p1.
    529 (define_insn_reservation "ppro_sse_cmp_SF_load" 3
    530 			 (and (eq_attr "cpu" "pentiumpro")
    531 			      (and (eq_attr "memory" "load")
    532 				   (and (eq_attr "mode" "SF")
    533 					(eq_attr "type" "ssecmp"))))
    534 			 "decoder0,p2+p1")
    535 ;; SFmode comi: any decoder, port 0; latency 1.
    536 (define_insn_reservation "ppro_sse_comi_SF" 1
    537 			 (and (eq_attr "cpu" "pentiumpro")
    538 			      (and (eq_attr "memory" "none")
    539 				   (and (eq_attr "mode" "SF")
    540 					(eq_attr "type" "ssecomi"))))
    541 			 "decodern,p0")
    542 ;; SFmode comi with a load: decoder0, p2 together with p0.
    543 (define_insn_reservation "ppro_sse_comi_SF_load" 1
    544 			 (and (eq_attr "cpu" "pentiumpro")
    545 			      (and (eq_attr "memory" "load")
    546 				   (and (eq_attr "mode" "SF")
    547 					(eq_attr "type" "ssecomi"))))
    548 			 "decoder0,p2+p0")
    549 ;; SFmode SSE multiply: any decoder, port 0; latency 4.
    550 (define_insn_reservation "ppro_sse_mul_SF" 4
    551 			 (and (eq_attr "cpu" "pentiumpro")
    552 			      (and (eq_attr "memory" "none")
    553 				   (and (eq_attr "mode" "SF")
    554 					(eq_attr "type" "ssemul"))))
    555 			"decodern,p0")
    556 ;; SFmode SSE multiply with a load: decoder0, p2 together with p0.
    557 (define_insn_reservation "ppro_sse_mul_SF_load" 4
    558 			 (and (eq_attr "cpu" "pentiumpro")
    559 			      (and (eq_attr "memory" "load")
    560 				   (and (eq_attr "mode" "SF")
    561 					(eq_attr "type" "ssemul"))))
    562 			"decoder0,p2+p0")
    563 
    564 ;; SFmode ssediv.  FIXME: it doesn't close p0 for 17 cycles, surely???
    565 (define_insn_reservation "ppro_sse_div_SF" 18
    566 			 (and (eq_attr "cpu" "pentiumpro")
    567 			      (and (eq_attr "memory" "none")
    568 				   (and (eq_attr "mode" "SF")
    569 					(eq_attr "type" "ssediv"))))
    570 			 "decoder0,p0*17")
    571 
    572 (define_insn_reservation "ppro_sse_div_SF_load" 18
    573 			 (and (eq_attr "cpu" "pentiumpro")
    574 			      (and (eq_attr "memory" "none")
    575 				   (and (eq_attr "mode" "SF")
    576 					(eq_attr "type" "ssediv"))))
    577 			 "decoder0,(p2+p0),p0*16")
    578 ;; SFmode sseicvt: decoder0, p2+p1 for two cycles; latency 4.
    579 (define_insn_reservation "ppro_sse_icvt_SF" 4
    580 			 (and (eq_attr "cpu" "pentiumpro")
    581 			      (and (eq_attr "mode" "SF")
    582 				   (eq_attr "type" "sseicvt")))
    583 			 "decoder0,(p2+p1)*2")
    584 ;; SImode sseicvt: decoder0, p2+p1 for one cycle; latency 3.
    585 (define_insn_reservation "ppro_sse_icvt_SI" 3
    586 			 (and (eq_attr "cpu" "pentiumpro")
    587 			      (and (eq_attr "mode" "SI")
    588 				   (eq_attr "type" "sseicvt")))
    589 			 "decoder0,(p2+p1)")
    590 ;; SFmode ssemov, register form: decoder0, then p0 or p1; latency 3.
    591 (define_insn_reservation "ppro_sse_mov_SF" 3
    592 			 (and (eq_attr "cpu" "pentiumpro")
    593 			      (and (eq_attr "memory" "none")
    594 				   (and (eq_attr "mode" "SF")
    595 					(eq_attr "type" "ssemov"))))
    596 			 "decoder0,(p0|p1)")
    597 ;; SFmode ssemov load: decoder0, p2 together with p0 or p1.
    598 (define_insn_reservation "ppro_sse_mov_SF_load" 3
    599 			 (and (eq_attr "cpu" "pentiumpro")
    600 			      (and (eq_attr "memory" "load")
    601 				   (and (eq_attr "mode" "SF")
    602 					(eq_attr "type" "ssemov"))))
    603 			 "decoder0,p2+(p0|p1)")
    604 ;; SFmode ssemov store: decoder0, then the store ports p4+p3.
    605 (define_insn_reservation "ppro_sse_mov_SF_store" 3
    606 			 (and (eq_attr "cpu" "pentiumpro")
    607 			      (and (eq_attr "memory" "store")
    608 				   (and (eq_attr "mode" "SF")
    609 					(eq_attr "type" "ssemov"))))
    610 			 "decoder0,p4+p3")
    611 ;; V4SFmode insns of type sse: decoder0, port 1 for two cycles; latency 4.
    612 (define_insn_reservation "ppro_sse_V4SF" 4
    613 			 (and (eq_attr "cpu" "pentiumpro")
    614 			      (and (eq_attr "mode" "V4SF")
    615 				   (eq_attr "type" "sse")))
    616 			 "decoder0,p1*2")
    617 ;; V4SFmode add: decoder0, port 1 for two cycles; latency 3.
    618 (define_insn_reservation "ppro_sse_add_V4SF" 3
    619 			 (and (eq_attr "cpu" "pentiumpro")
    620 			      (and (eq_attr "memory" "none")
    621 				   (and (eq_attr "mode" "V4SF")
    622 					(eq_attr "type" "sseadd,sseadd1"))))
    623 			 "decoder0,p1*2")
    624 ;; V4SFmode add with a load: decoder0, p2+p1 for two cycles.
    625 (define_insn_reservation "ppro_sse_add_V4SF_load" 3
    626 			 (and (eq_attr "cpu" "pentiumpro")
    627 			      (and (eq_attr "memory" "load")
    628 				   (and (eq_attr "mode" "V4SF")
    629 					(eq_attr "type" "sseadd,sseadd1"))))
    630 			 "decoder0,(p2+p1)*2")
    631 ;; V4SFmode compare: decoder0, port 1 for two cycles; latency 3.
    632 (define_insn_reservation "ppro_sse_cmp_V4SF" 3
    633 			 (and (eq_attr "cpu" "pentiumpro")
    634 			      (and (eq_attr "memory" "none")
    635 				   (and (eq_attr "mode" "V4SF")
    636 					(eq_attr "type" "ssecmp"))))
    637 			 "decoder0,p1*2")
    638 ;; V4SFmode compare with a load: decoder0, p2+p1 for two cycles.
    639 (define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
    640 			 (and (eq_attr "cpu" "pentiumpro")
    641 			      (and (eq_attr "memory" "load")
    642 				   (and (eq_attr "mode" "V4SF")
    643 					(eq_attr "type" "ssecmp"))))
    644 			 "decoder0,(p2+p1)*2")
    645 ;; V4SFmode ssecvt with memory none or unknown: decoder0, p1 for two cycles.
    646 (define_insn_reservation "ppro_sse_cvt_V4SF" 3
    647 			 (and (eq_attr "cpu" "pentiumpro")
    648 			      (and (eq_attr "memory" "none,unknown")
    649 				   (and (eq_attr "mode" "V4SF")
    650 					(eq_attr "type" "ssecvt"))))
    651 			 "decoder0,p1*2")
    652 
    653 (define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
    654 			 (and (eq_attr "cpu" "pentiumpro")
    655 			      (and (eq_attr "memory" "!none,unknown")
    656 				   (and (eq_attr "mode" "V4SF")
    657 					(eq_attr "type" "ssecmp"))))
    658 			 "decoder0,p1,p4+p3")
    659 ;; V4SFmode multiply: decoder0, port 0 for two cycles; latency 5.
    660 (define_insn_reservation "ppro_sse_mul_V4SF" 5
    661 			 (and (eq_attr "cpu" "pentiumpro")
    662 			      (and (eq_attr "memory" "none")
    663 				   (and (eq_attr "mode" "V4SF")
    664 					(eq_attr "type" "ssemul"))))
    665 			"decoder0,p0*2")
    666 ;; V4SFmode multiply with a load: decoder0, p2+p0 for two cycles.
    667 (define_insn_reservation "ppro_sse_mul_V4SF_load" 5
    668 			 (and (eq_attr "cpu" "pentiumpro")
    669 			      (and (eq_attr "memory" "load")
    670 				   (and (eq_attr "mode" "V4SF")
    671 					(eq_attr "type" "ssemul"))))
    672 			"decoder0,(p2+p0)*2")
    673 
    674 ;; V4SFmode ssediv holds p0 for 34 cycles.  FIXME: p0 really closed this long???
    675 (define_insn_reservation "ppro_sse_div_V4SF" 48
    676 			 (and (eq_attr "cpu" "pentiumpro")
    677 			      (and (eq_attr "memory" "none")
    678 				   (and (eq_attr "mode" "V4SF")
    679 					(eq_attr "type" "ssediv"))))
    680 			 "decoder0,p0*34")
    681 ;; V4SFmode ssediv with a load: p2+p0 for two cycles, then p0 for 32 more.
    682 (define_insn_reservation "ppro_sse_div_V4SF_load" 48
    683 			 (and (eq_attr "cpu" "pentiumpro")
    684 			      (and (eq_attr "memory" "load")
    685 				   (and (eq_attr "mode" "V4SF")
    686 					(eq_attr "type" "ssediv"))))
    687 			 "decoder0,(p2+p0)*2,p0*32")
    688 ;; V4SFmode logic and shuffle insns: any decoder, port 1; latency 2.
    689 (define_insn_reservation "ppro_sse_log_V4SF" 2
    690 			 (and (eq_attr "cpu" "pentiumpro")
    691 			      (and (eq_attr "memory" "none")
    692 				   (and (eq_attr "mode" "V4SF")
    693 					(eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
    694 			 "decodern,p1")
    695 ;; The same insn types with a load: decoder0, p2 together with p1.
    696 (define_insn_reservation "ppro_sse_log_V4SF_load" 2
    697 			 (and (eq_attr "cpu" "pentiumpro")
    698 			      (and (eq_attr "memory" "load")
    699 				   (and (eq_attr "mode" "V4SF")
    700 					(eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
    701 			 "decoder0,(p2+p1)")
    702 ;; V4SFmode ssemov, register form: decoder0, p0 or p1 in each of two cycles.
    703 (define_insn_reservation "ppro_sse_mov_V4SF" 1
    704 			 (and (eq_attr "cpu" "pentiumpro")
    705 			      (and (eq_attr "memory" "none")
    706 				   (and (eq_attr "mode" "V4SF")
    707 					(eq_attr "type" "ssemov"))))
    708 			 "decoder0,(p0|p1)*2")
    709 ;; V4SFmode ssemov load: decoder0, load port p2 for two cycles; latency 2.
    710 (define_insn_reservation "ppro_sse_mov_V4SF_load" 2
    711 			 (and (eq_attr "cpu" "pentiumpro")
    712 			      (and (eq_attr "memory" "load")
    713 				   (and (eq_attr "mode" "V4SF")
    714 					(eq_attr "type" "ssemov"))))
    715 			 "decoder0,p2*2")
    716 ;; V4SFmode ssemov store: decoder0, p4+p3 for two cycles; latency 3.
    717 (define_insn_reservation "ppro_sse_mov_V4SF_store" 3
    718 			 (and (eq_attr "cpu" "pentiumpro")
    719 			      (and (eq_attr "memory" "store")
    720 				   (and (eq_attr "mode" "V4SF")
    721 					(eq_attr "type" "ssemov"))))
    722 			 "decoder0,(p4+p3)*2")
    723 
    724 ;; All other instructions are modelled as simple instructions.
    725 ;; We have already modelled all i387 floating point instructions, so all
    726 ;; other instructions execute on either port 0 or port 1.  This includes
    727 ;; the ALU units, and the MMX units.
    728 ;;
    729 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
    730 ;; the three decoders.
    731 (define_insn_reservation "ppro_insn" 1
    732 			 (and (eq_attr "cpu" "pentiumpro")
    733 			      (and (eq_attr "memory" "none,unknown")
    734 				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
    735 			 "decodern,(p0|p1)")
    736 ;;
    737 ;; read-modify and register-memory instructions have two or three uops,
    738 ;; so they have to be decoded on decoder0.
    739 (define_insn_reservation "ppro_insn_load" 3
    740 			 (and (eq_attr "cpu" "pentiumpro")
    741 			      (and (eq_attr "memory" "load")
    742 				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
    743 			 "decoder0,p2+(p0|p1)")
    744 ;; Store forms: decoder0, p0 or p1, then the store ports p4+p3; latency 1.
    745 (define_insn_reservation "ppro_insn_store" 1
    746 			 (and (eq_attr "cpu" "pentiumpro")
    747 			      (and (eq_attr "memory" "store")
    748 				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
    749 			 "decoder0,(p0|p1),p4+p3")
    750 ;;
    751 ;; read-modify-store instructions produce 4 uops so they have to be
    752 ;; decoded on decoder0 as well.
    753 (define_insn_reservation "ppro_insn_both" 4
    754 			 (and (eq_attr "cpu" "pentiumpro")
    755 			      (and (eq_attr "memory" "both")
    756 				   (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
    757 			 "decoder0,p2+(p0|p1),p4+p3")
    758 
    759