1 1.1 mrg /* Array prefetching. 2 1.1 mrg Copyright (C) 2005-2022 Free Software Foundation, Inc. 3 1.1 mrg 4 1.1 mrg This file is part of GCC. 5 1.1 mrg 6 1.1 mrg GCC is free software; you can redistribute it and/or modify it 7 1.1 mrg under the terms of the GNU General Public License as published by the 8 1.1 mrg Free Software Foundation; either version 3, or (at your option) any 9 1.1 mrg later version. 10 1.1 mrg 11 1.1 mrg GCC is distributed in the hope that it will be useful, but WITHOUT 12 1.1 mrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 1.1 mrg FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 1.1 mrg for more details. 15 1.1 mrg 16 1.1 mrg You should have received a copy of the GNU General Public License 17 1.1 mrg along with GCC; see the file COPYING3. If not see 18 1.1 mrg <http://www.gnu.org/licenses/>. */ 19 1.1 mrg 20 1.1 mrg #include "config.h" 21 1.1 mrg #include "system.h" 22 1.1 mrg #include "coretypes.h" 23 1.1 mrg #include "backend.h" 24 1.1 mrg #include "target.h" 25 1.1 mrg #include "rtl.h" 26 1.1 mrg #include "tree.h" 27 1.1 mrg #include "gimple.h" 28 1.1 mrg #include "predict.h" 29 1.1 mrg #include "tree-pass.h" 30 1.1 mrg #include "gimple-ssa.h" 31 1.1 mrg #include "optabs-query.h" 32 1.1 mrg #include "tree-pretty-print.h" 33 1.1 mrg #include "fold-const.h" 34 1.1 mrg #include "stor-layout.h" 35 1.1 mrg #include "gimplify.h" 36 1.1 mrg #include "gimple-iterator.h" 37 1.1 mrg #include "gimplify-me.h" 38 1.1 mrg #include "tree-ssa-loop-ivopts.h" 39 1.1 mrg #include "tree-ssa-loop-manip.h" 40 1.1 mrg #include "tree-ssa-loop-niter.h" 41 1.1 mrg #include "tree-ssa-loop.h" 42 1.1 mrg #include "ssa.h" 43 1.1 mrg #include "tree-into-ssa.h" 44 1.1 mrg #include "cfgloop.h" 45 1.1 mrg #include "tree-scalar-evolution.h" 46 1.1 mrg #include "langhooks.h" 47 1.1 mrg #include "tree-inline.h" 48 1.1 mrg #include "tree-data-ref.h" 49 1.1 mrg #include "diagnostic-core.h" 50 1.1 mrg #include "dbgcnt.h" 51 

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determines
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
	{
	  a[255] = ...;		(0)
	  a[i] = ...;		(1)
	  a[i + 64] = ...;	(2)
	  a[16*i] = ...;	(3)
	  a[187*i] = ...;	(4)
	  a[187*i + 50] = ...;	(5)
	}

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
	   location 64 iterations before it, and PREFETCH_MOD 64 (since
	   it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
	   the cache line accessed by (5) is the same with probability only
	   7/32.
       (5) has PREFETCH_MOD 1 as well.

      Additionally, we use data dependence analysis to determine for each
      reference the distance till the first reuse; this information is used
      to determine the temporality of the issued prefetch instruction.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by machine description).  We prefetch as many prefetches as possible
      while still within this bound (starting with those with lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   A cost model is implemented to determine whether or not prefetching is
   profitable for a given loop.  The cost model has three heuristics:

   1. Function trip_count_to_ahead_ratio_too_small_p implements a
      heuristic that determines whether or not the loop has too few
      iterations (compared to ahead).  Prefetching is not likely to be
      beneficial if the trip count to ahead ratio is below a certain
      minimum.

   2. Function mem_ref_count_reasonable_p implements a heuristic that
      determines whether the given loop has enough CPU ops that can be
      overlapped with cache missing memory ops.  If not, the loop
      won't benefit from prefetching.  In the implementation,
      prefetching is not considered beneficial if the ratio between
      the instruction count and the mem ref count is below a certain
      minimum.

   3. Function insn_to_prefetch_ratio_too_small_p implements a
      heuristic that disables prefetching in a loop if the prefetching
      cost is above a certain limit.  The relative prefetching cost is
      estimated by taking the ratio between the prefetch count and the
      total instruction count (this models the I-cache cost).

   The limits used in these heuristics are defined as parameters with
   reasonable default values.  Machine-specific default values will be
   added later.

   Some other TODO:
      -- write and use more general reuse analysis (that could be also used
	 in other cache aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user put their own prefetches)
      -- we assume cache line size alignment of arrays; this could be
	 improved.  */

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* The size of the block loaded by a single prefetch.  Usually, this is
   the same as cache line size (at the moment, we only consider one level
   of cache hierarchy).  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK param_l1_cache_line_size
#endif

/* Do we have a forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have a backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
#define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))

/* We consider a memory access nontemporal if it is not reused sooner than
   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   so that we use nontemporal prefetches e.g. if single memory location
   is accessed several times in a single iteration of the loop.  */
#define NONTEMPORAL_FRACTION 16

/* In case we have to emit a memory fence instruction after the loop that
   uses nontemporal stores, this defines the builtin to use.  */

#ifndef FENCE_FOLLOWING_MOVNT
#define FENCE_FOLLOWING_MOVNT NULL_TREE
#endif

/* It is not profitable to prefetch when the trip count is not at
   least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
   For example, in a loop with a prefetch ahead distance of 10,
   supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
   profitable to prefetch when the trip count is greater or equal to
   40.  In that case, 30 out of the 40 iterations will benefit from
   prefetching.  */

#ifndef TRIP_COUNT_TO_AHEAD_RATIO
#define TRIP_COUNT_TO_AHEAD_RATIO 4
#endif

/* The group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  tree step;			/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
  unsigned int uid;		/* Group UID, used only for debugging.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL HOST_WIDE_INT_M1U

/* Do not generate a prefetch if the unroll factor is significantly less
   than what is required by the prefetch.  This is to avoid redundant
   prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
   2, prefetching requires unrolling the loop 16 times, but
   the loop is actually unrolled twice.  In this case (ratio = 8),
   prefetching is not likely to be beneficial.  */

#ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
#define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
#endif

/* Some of the prefetch computations have quadratic complexity.  We want to
   avoid huge compile times and, therefore, want to limit the amount of
   memory references per loop where we consider prefetching.  */

#ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
#define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
#endif

/* The memory reference.  */

struct mem_ref
{
  gimple *stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  unsigned reuse_distance;	/* The amount of data accessed before the first
				   reuse of this value.  */
  struct mem_ref *next;		/* The next reference in the group.  */
  unsigned int uid;		/* Ref UID, used only for debugging.  */
  unsigned write_p : 1;		/* Is it a write?  */
  unsigned independent_p : 1;	/* True if the reference is independent on
				   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1;	/* Should we really issue the prefetch?  */
  unsigned storent_p : 1;	/* True if we changed the store to a
				   nontemporal one.  */
};

/* Dumps to FILE information about a memory reference: its BASE and STEP
   trees, constant byte offset DELTA, and whether it is a write (WRITE_P).  */

static void
dump_mem_details (FILE *file, tree base, tree step,
		  HOST_WIDE_INT delta, bool write_p)
{
  fprintf (file, "(base ");
  print_generic_expr (file, base, TDF_SLIM);
  fprintf (file, ", step ");
  if (cst_and_fits_in_hwi (step))
    fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
  else
    print_generic_expr (file, step, TDF_SLIM);
  fprintf (file, ")\n");
  fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
  fprintf (file, " %s\n\n", write_p ? "write" : "read");
}

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid);
  print_generic_expr (file, ref->mem, TDF_SLIM);
  fprintf (file, ")\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
{
  /* Global count for setting struct mem_ref_group->uid.  */
  static unsigned int last_mem_ref_group_uid = 0;

  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if (operand_equal_p ((*groups)->step, step, 0)
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* If step is an integer constant, keep the list of groups sorted
	 by decreasing step.  */
      if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
	  && int_cst_value ((*groups)->step) < int_cst_value (step))
	break;
    }

  /* No matching group found: link a fresh one in at the insertion point
     found above (keeps constant-step groups sorted).  */
  group = XNEW (struct mem_ref_group);
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->uid = ++last_mem_ref_group_uid;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  unsigned int last_mem_ref_uid = 0;
  struct mem_ref **aref;

  /* Do not record the same address twice.  While scanning also remember the
     last UID seen, so a newly appended reference gets the next UID.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      last_mem_ref_uid = (*aref)->uid;

      /* It does not have to be possible for write reference to reuse the read
	 prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  /* AREF now points at the tail of the group's list; append the new ref
     with conservative defaults (prefetch everything, every iteration).  */
  (*aref) = XNEW (struct mem_ref);
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->reuse_distance = 0;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;
  (*aref)->independent_p = false;
  (*aref)->storent_p = false;
  (*aref)->uid = last_mem_ref_uid + 1;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_mem_ref (dump_file, *aref);

      fprintf (dump_file, " group %u ", group->uid);
      dump_mem_details (dump_file, group->base, group->step, delta,
			write_p);
    }
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  class loop *loop;		/* Loop of the reference.  */
  gimple *stmt;			/* Statement of the reference.  */
  tree *step;			/* Step of the memory reference.  */
  HOST_WIDE_INT *delta;		/* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = (struct ar_data *) data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT idelta = 0, imult = 1;
  affine_iv iv;

  /* The index must be a simple affine induction variable of the loop;
     otherwise we cannot express the access as base + step * iter.  */
  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
		  *index, &iv, true))
    return false;
  ibase = iv.base;
  step = iv.step;

  /* Fold a constant pointer offset of the IV base into the delta.  */
  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  /* For array indices, scale both step and delta by the element size.  */
  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);
      step = fold_build2 (MULT_EXPR, sizetype,
			  fold_convert (sizetype, step),
			  fold_convert (sizetype, stepsize));
      idelta *= imult;
    }

  /* Accumulate the contribution of this index into the overall step and
     delta of the reference.  */
  if (*ar_data->step == NULL_TREE)
    *ar_data->step = step;
  else
    *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
				  fold_convert (sizetype, *ar_data->step),
				  fold_convert (sizetype, step));
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (class loop *loop, tree *ref_p, tree *base,
	     tree *step, HOST_WIDE_INT *delta,
	     gimple *stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = NULL_TREE;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.
     Also strip off the real and imaginary parts of a complex, so that
     they can have the same base.  */
  if (TREE_CODE (ref) == REALPART_EXPR
      || TREE_CODE (ref) == IMAGPART_EXPR
      || (TREE_CODE (ref) == COMPONENT_REF
	  && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
    {
      /* The imaginary part lives right after the real part, so account for
	 its offset in DELTA.  */
      if (TREE_CODE (ref) == IMAGPART_EXPR)
	*delta += int_size_in_bytes (TREE_TYPE (ref));
      ref = TREE_OPERAND (ref, 0);
    }

  *ref_p = ref;

  /* Fold constant field offsets of enclosing COMPONENT_REFs into DELTA.  */
  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
   reference was recorded, false otherwise.  */

static bool
gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, gimple *stmt)
{
  tree base, step;
  HOST_WIDE_INT delta;
  struct mem_ref_group *agrp;

  if (get_base_address (ref) == NULL)
    return false;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return false;
  /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
  if (step == NULL_TREE)
    return false;

  /* Stop if the address of BASE could not be taken.  */
  if (may_be_nonaddressable_p (base))
    return false;

  /* Limit non-constant step prefetching only to the innermost loops and
     only when the step is loop invariant in the entire loop nest.  */
  if (!cst_and_fits_in_hwi (step))
    {
      if (loop->inner != NULL)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "Memory expression %p\n", (void *) ref);
	      print_generic_expr (dump_file, ref, TDF_SLIM);
	      fprintf (dump_file, ":");
	      dump_mem_details (dump_file, base, step, delta, write_p);
	      fprintf (dump_file,
		       "Ignoring %p, non-constant step prefetching is "
		       "limited to inner most loops \n",
		       (void *) ref);
	    }
	  return false;
	}
      else
	{
	  if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		{
		  fprintf (dump_file, "Memory expression %p\n", (void *) ref);
		  print_generic_expr (dump_file, ref, TDF_SLIM);
		  fprintf (dump_file, ":");
		  dump_mem_details (dump_file, base, step, delta, write_p);
		  fprintf (dump_file,
			   "Not prefetching, ignoring %p due to "
			   "loop variant step\n",
			   (void *) ref);
		}
	      return false;
	    }
	}
    }

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);

  return true;
}

/* Record the suitable memory references in LOOP.
   NO_OTHER_REFS is set to
   true if there are no other memory references inside the loop.  */

static struct mem_ref_group *
gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_count)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  gimple_stmt_iterator bsi;
  gimple *stmt;
  tree lhs, rhs;
  struct mem_ref_group *refs = NULL;

  *no_other_refs = true;
  *ref_count = 0;

  /* Scan the loop body in order, so that the former references precede the
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      /* Skip blocks belonging to inner loops; only this loop's own
	 statements are considered.  */
      if (bb->loop_father != loop)
	continue;

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	{
	  stmt = gsi_stmt (bsi);

	  /* Non-assignments that touch memory (or non-const calls) mean we
	     cannot claim the gathered refs are the only memory accesses.  */
	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
	    {
	      if (gimple_vuse (stmt)
		  || (is_gimple_call (stmt)
		      && !(gimple_call_flags (stmt) & ECF_CONST)))
		*no_other_refs = false;
	      continue;
	    }

	  /* Assignments without a virtual use do not access memory.  */
	  if (! gimple_vuse (stmt))
	    continue;

	  lhs = gimple_assign_lhs (stmt);
	  rhs = gimple_assign_rhs1 (stmt);

	  if (REFERENCE_CLASS_P (rhs))
	    {
	      *no_other_refs &= gather_memory_references_ref (loop, &refs,
							      rhs, false, stmt);
	      *ref_count += 1;
	    }
	  if (REFERENCE_CLASS_P (lhs))
	    {
	      *no_other_refs &= gather_memory_references_ref (loop, &refs,
							      lhs, true, stmt);
	      *ref_count += 1;
	    }
	}
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step;
  bool backward;

  /* If the step size is non constant, we cannot calculate prefetch_mod.  */
  if (!cst_and_fits_in_hwi (ref->group->step))
    return;

  step = int_cst_value (ref->group->step);

  backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  /* Work with the magnitude of the step from here on.  */
  if (backward)
    step = -step;

  /* Each iteration lands on a new cache line: prefetch every time.  */
  if (step > PREFETCH_BLOCK)
    return;

  /* If the hardware already prefetches sequentially in this direction,
     a single software prefetch to start the stream is enough.  */
  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  /* Consecutive iterations share a cache line; prefetch only once per
     PREFETCH_BLOCK / step iterations.  */
  ref->prefetch_mod = PREFETCH_BLOCK / step;
}

/* Divides X by BY, rounding down.
   NOTE(review): C division truncates toward zero, so for negative X the
   formula below is not a true floor division for all inputs -- confirm the
   callers only rely on it for the value ranges they pass.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / (HOST_WIDE_INT) by;
  else
    return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by;
}

/* Given a CACHE_LINE_SIZE and two inductive memory references
   with a common STEP greater than CACHE_LINE_SIZE and an address
   difference DELTA, compute the probability that they will fall
   in different cache lines.  Return true if the computed miss rate
   is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
   number of distinct iterations after which the pattern repeats itself.
   ALIGN_UNIT is the unit of alignment in bytes.  */

static bool
is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
			 HOST_WIDE_INT step, HOST_WIDE_INT delta,
			 unsigned HOST_WIDE_INT distinct_iters,
			 int align_unit)
{
  unsigned align, iter;
  int total_positions, miss_positions, max_allowed_miss_positions;
  int address1, address2, cache_line1, cache_line2;

  /* It always misses if delta is greater than or equal to the cache
     line size.  */
  if (delta >= (HOST_WIDE_INT) cache_line_size)
    return false;

  gcc_assert (align_unit > 0);

  miss_positions = 0;
  total_positions = (cache_line_size / align_unit) * distinct_iters;
  max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;

  /* Iterate through all possible alignments of the first
     memory reference within its cache line.  */
  for (align = 0; align < cache_line_size; align += align_unit)

    /* Iterate through all distinct iterations.  */
    for (iter = 0; iter < distinct_iters; iter++)
      {
	address1 = align + step * iter;
	address2 = address1 + delta;
	cache_line1 = address1 / cache_line_size;
	cache_line2 = address2 / cache_line_size;
	if (cache_line1 != cache_line2)
	  {
	    miss_positions += 1;
	    /* Bail out early once the threshold is exceeded.  */
	    if (miss_positions > max_allowed_miss_positions)
	      return false;
	  }
      }
  return true;
}

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step;
  bool backward;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
  HOST_WIDE_INT reduced_step;
  unsigned HOST_WIDE_INT reduced_prefetch_block;
  tree ref_type;
  int align_unit;

  /* If the step is non constant we cannot calculate prefetch_before.  */
  if (!cst_and_fits_in_hwi (ref->group->step)) {
    return;
  }

  step = int_cst_value (ref->group->step);

  backward = step < 0;


  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 former.
*/ 801 1.1 mrg if (by_is_before) 802 1.1 mrg ref->prefetch_before = 0; 803 1.1 mrg 804 1.1 mrg return; 805 1.1 mrg } 806 1.1 mrg 807 1.1 mrg if (!step) 808 1.1 mrg { 809 1.1 mrg /* If the reference addresses are invariant and fall into the 810 1.1 mrg same cache line, prefetch just the first one. */ 811 1.1 mrg if (!by_is_before) 812 1.1 mrg return; 813 1.1 mrg 814 1.1 mrg if (ddown (ref->delta, PREFETCH_BLOCK) 815 1.1 mrg != ddown (by->delta, PREFETCH_BLOCK)) 816 1.1 mrg return; 817 1.1 mrg 818 1.1 mrg ref->prefetch_before = 0; 819 1.1 mrg return; 820 1.1 mrg } 821 1.1 mrg 822 1.1 mrg /* Only prune the reference that is behind in the array. */ 823 1.1 mrg if (backward) 824 1.1 mrg { 825 1.1 mrg if (delta > 0) 826 1.1 mrg return; 827 1.1 mrg 828 1.1 mrg /* Transform the data so that we may assume that the accesses 829 1.1 mrg are forward. */ 830 1.1 mrg delta = - delta; 831 1.1 mrg step = -step; 832 1.1 mrg delta_r = PREFETCH_BLOCK - 1 - delta_r; 833 1.1 mrg delta_b = PREFETCH_BLOCK - 1 - delta_b; 834 1.1 mrg } 835 1.1 mrg else 836 1.1 mrg { 837 1.1 mrg if (delta < 0) 838 1.1 mrg return; 839 1.1 mrg } 840 1.1 mrg 841 1.1 mrg /* Check whether the two references are likely to hit the same cache 842 1.1 mrg line, and how distant the iterations in that it occurs are from 843 1.1 mrg each other. */ 844 1.1 mrg 845 1.1 mrg if (step <= PREFETCH_BLOCK) 846 1.1 mrg { 847 1.1 mrg /* The accesses are sure to meet. Let us check when. */ 848 1.1 mrg hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK; 849 1.1 mrg prefetch_before = (hit_from - delta_r + step - 1) / step; 850 1.1 mrg 851 1.1 mrg /* Do not reduce prefetch_before if we meet beyond cache size. 
*/ 852 1.1 mrg if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step)) 853 1.1 mrg prefetch_before = PREFETCH_ALL; 854 1.1 mrg if (prefetch_before < ref->prefetch_before) 855 1.1 mrg ref->prefetch_before = prefetch_before; 856 1.1 mrg 857 1.1 mrg return; 858 1.1 mrg } 859 1.1 mrg 860 1.1 mrg /* A more complicated case with step > prefetch_block. First reduce 861 1.1 mrg the ratio between the step and the cache line size to its simplest 862 1.1 mrg terms. The resulting denominator will then represent the number of 863 1.1 mrg distinct iterations after which each address will go back to its 864 1.1 mrg initial location within the cache line. This computation assumes 865 1.1 mrg that PREFETCH_BLOCK is a power of two. */ 866 1.1 mrg prefetch_block = PREFETCH_BLOCK; 867 1.1 mrg reduced_prefetch_block = prefetch_block; 868 1.1 mrg reduced_step = step; 869 1.1 mrg while ((reduced_step & 1) == 0 870 1.1 mrg && reduced_prefetch_block > 1) 871 1.1 mrg { 872 1.1 mrg reduced_step >>= 1; 873 1.1 mrg reduced_prefetch_block >>= 1; 874 1.1 mrg } 875 1.1 mrg 876 1.1 mrg prefetch_before = delta / step; 877 1.1 mrg delta %= step; 878 1.1 mrg ref_type = TREE_TYPE (ref->mem); 879 1.1 mrg align_unit = TYPE_ALIGN (ref_type) / 8; 880 1.1 mrg if (is_miss_rate_acceptable (prefetch_block, step, delta, 881 1.1 mrg reduced_prefetch_block, align_unit)) 882 1.1 mrg { 883 1.1 mrg /* Do not reduce prefetch_before if we meet beyond cache size. */ 884 1.1 mrg if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK) 885 1.1 mrg prefetch_before = PREFETCH_ALL; 886 1.1 mrg if (prefetch_before < ref->prefetch_before) 887 1.1 mrg ref->prefetch_before = prefetch_before; 888 1.1 mrg 889 1.1 mrg return; 890 1.1 mrg } 891 1.1 mrg 892 1.1 mrg /* Try also the following iteration. 
*/ 893 1.1 mrg prefetch_before++; 894 1.1 mrg delta = step - delta; 895 1.1 mrg if (is_miss_rate_acceptable (prefetch_block, step, delta, 896 1.1 mrg reduced_prefetch_block, align_unit)) 897 1.1 mrg { 898 1.1 mrg if (prefetch_before < ref->prefetch_before) 899 1.1 mrg ref->prefetch_before = prefetch_before; 900 1.1 mrg 901 1.1 mrg return; 902 1.1 mrg } 903 1.1 mrg 904 1.1 mrg /* The ref probably does not reuse by. */ 905 1.1 mrg return; 906 1.1 mrg } 907 1.1 mrg 908 1.1 mrg /* Prune the prefetch candidate REF using the reuses with other references 909 1.1 mrg in REFS. */ 910 1.1 mrg 911 1.1 mrg static void 912 1.1 mrg prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs) 913 1.1 mrg { 914 1.1 mrg struct mem_ref *prune_by; 915 1.1 mrg bool before = true; 916 1.1 mrg 917 1.1 mrg prune_ref_by_self_reuse (ref); 918 1.1 mrg 919 1.1 mrg for (prune_by = refs; prune_by; prune_by = prune_by->next) 920 1.1 mrg { 921 1.1 mrg if (prune_by == ref) 922 1.1 mrg { 923 1.1 mrg before = false; 924 1.1 mrg continue; 925 1.1 mrg } 926 1.1 mrg 927 1.1 mrg if (!WRITE_CAN_USE_READ_PREFETCH 928 1.1 mrg && ref->write_p 929 1.1 mrg && !prune_by->write_p) 930 1.1 mrg continue; 931 1.1 mrg if (!READ_CAN_USE_WRITE_PREFETCH 932 1.1 mrg && !ref->write_p 933 1.1 mrg && prune_by->write_p) 934 1.1 mrg continue; 935 1.1 mrg 936 1.1 mrg prune_ref_by_group_reuse (ref, prune_by, before); 937 1.1 mrg } 938 1.1 mrg } 939 1.1 mrg 940 1.1 mrg /* Prune the prefetch candidates in GROUP using the reuse analysis. 
*/ 941 1.1 mrg 942 1.1 mrg static void 943 1.1 mrg prune_group_by_reuse (struct mem_ref_group *group) 944 1.1 mrg { 945 1.1 mrg struct mem_ref *ref_pruned; 946 1.1 mrg 947 1.1 mrg for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next) 948 1.1 mrg { 949 1.1 mrg prune_ref_by_reuse (ref_pruned, group->refs); 950 1.1 mrg 951 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 952 1.1 mrg { 953 1.1 mrg dump_mem_ref (dump_file, ref_pruned); 954 1.1 mrg 955 1.1 mrg if (ref_pruned->prefetch_before == PREFETCH_ALL 956 1.1 mrg && ref_pruned->prefetch_mod == 1) 957 1.1 mrg fprintf (dump_file, " no restrictions"); 958 1.1 mrg else if (ref_pruned->prefetch_before == 0) 959 1.1 mrg fprintf (dump_file, " do not prefetch"); 960 1.1 mrg else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod) 961 1.1 mrg fprintf (dump_file, " prefetch once"); 962 1.1 mrg else 963 1.1 mrg { 964 1.1 mrg if (ref_pruned->prefetch_before != PREFETCH_ALL) 965 1.1 mrg { 966 1.1 mrg fprintf (dump_file, " prefetch before "); 967 1.1 mrg fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, 968 1.1 mrg ref_pruned->prefetch_before); 969 1.1 mrg } 970 1.1 mrg if (ref_pruned->prefetch_mod != 1) 971 1.1 mrg { 972 1.1 mrg fprintf (dump_file, " prefetch mod "); 973 1.1 mrg fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC, 974 1.1 mrg ref_pruned->prefetch_mod); 975 1.1 mrg } 976 1.1 mrg } 977 1.1 mrg fprintf (dump_file, "\n"); 978 1.1 mrg } 979 1.1 mrg } 980 1.1 mrg } 981 1.1 mrg 982 1.1 mrg /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */ 983 1.1 mrg 984 1.1 mrg static void 985 1.1 mrg prune_by_reuse (struct mem_ref_group *groups) 986 1.1 mrg { 987 1.1 mrg for (; groups; groups = groups->next) 988 1.1 mrg prune_group_by_reuse (groups); 989 1.1 mrg } 990 1.1 mrg 991 1.1 mrg /* Returns true if we should issue prefetch for REF. 
*/ 992 1.1 mrg 993 1.1 mrg static bool 994 1.1 mrg should_issue_prefetch_p (struct mem_ref *ref) 995 1.1 mrg { 996 1.1 mrg /* Do we want to issue prefetches for non-constant strides? */ 997 1.1 mrg if (!cst_and_fits_in_hwi (ref->group->step) 998 1.1 mrg && param_prefetch_dynamic_strides == 0) 999 1.1 mrg { 1000 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1001 1.1 mrg fprintf (dump_file, 1002 1.1 mrg "Skipping non-constant step for reference %u:%u\n", 1003 1.1 mrg ref->group->uid, ref->uid); 1004 1.1 mrg return false; 1005 1.1 mrg } 1006 1.1 mrg 1007 1.1 mrg /* Some processors may have a hardware prefetcher that may conflict with 1008 1.1 mrg prefetch hints for a range of strides. Make sure we don't issue 1009 1.1 mrg prefetches for such cases if the stride is within this particular 1010 1.1 mrg range. */ 1011 1.1 mrg if (cst_and_fits_in_hwi (ref->group->step) 1012 1.1 mrg && abs_hwi (int_cst_value (ref->group->step)) 1013 1.1 mrg < (HOST_WIDE_INT) param_prefetch_minimum_stride) 1014 1.1 mrg { 1015 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1016 1.1 mrg fprintf (dump_file, 1017 1.1 mrg "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC 1018 1.1 mrg ") is less than the mininum required stride of %d\n", 1019 1.1 mrg ref->group->uid, ref->uid, int_cst_value (ref->group->step), 1020 1.1 mrg param_prefetch_minimum_stride); 1021 1.1 mrg return false; 1022 1.1 mrg } 1023 1.1 mrg 1024 1.1 mrg /* For now do not issue prefetches for only first few of the 1025 1.1 mrg iterations. */ 1026 1.1 mrg if (ref->prefetch_before != PREFETCH_ALL) 1027 1.1 mrg { 1028 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1029 1.1 mrg fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n", 1030 1.1 mrg ref->group->uid, ref->uid); 1031 1.1 mrg return false; 1032 1.1 mrg } 1033 1.1 mrg 1034 1.1 mrg /* Do not prefetch nontemporal stores. 
*/ 1035 1.1 mrg if (ref->storent_p) 1036 1.1 mrg { 1037 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1038 1.1 mrg fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid); 1039 1.1 mrg return false; 1040 1.1 mrg } 1041 1.1 mrg 1042 1.1 mrg return true; 1043 1.1 mrg } 1044 1.1 mrg 1045 1.1 mrg /* Decide which of the prefetch candidates in GROUPS to prefetch. 1046 1.1 mrg AHEAD is the number of iterations to prefetch ahead (which corresponds 1047 1.1 mrg to the number of simultaneous instances of one prefetch running at a 1048 1.1 mrg time). UNROLL_FACTOR is the factor by that the loop is going to be 1049 1.1 mrg unrolled. Returns true if there is anything to prefetch. */ 1050 1.1 mrg 1051 1.1 mrg static bool 1052 1.1 mrg schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor, 1053 1.1 mrg unsigned ahead) 1054 1.1 mrg { 1055 1.1 mrg unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots; 1056 1.1 mrg unsigned slots_per_prefetch; 1057 1.1 mrg struct mem_ref *ref; 1058 1.1 mrg bool any = false; 1059 1.1 mrg 1060 1.1 mrg /* At most param_simultaneous_prefetches should be running 1061 1.1 mrg at the same time. */ 1062 1.1 mrg remaining_prefetch_slots = param_simultaneous_prefetches; 1063 1.1 mrg 1064 1.1 mrg /* The prefetch will run for AHEAD iterations of the original loop, i.e., 1065 1.1 mrg AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration, 1066 1.1 mrg it will need a prefetch slot. */ 1067 1.1 mrg slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor; 1068 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1069 1.1 mrg fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n", 1070 1.1 mrg slots_per_prefetch); 1071 1.1 mrg 1072 1.1 mrg /* For now we just take memory references one by one and issue 1073 1.1 mrg prefetches for as many as possible. 
The groups are sorted 1074 1.1 mrg starting with the largest step, since the references with 1075 1.1 mrg large step are more likely to cause many cache misses. */ 1076 1.1 mrg 1077 1.1 mrg for (; groups; groups = groups->next) 1078 1.1 mrg for (ref = groups->refs; ref; ref = ref->next) 1079 1.1 mrg { 1080 1.1 mrg if (!should_issue_prefetch_p (ref)) 1081 1.1 mrg continue; 1082 1.1 mrg 1083 1.1 mrg /* The loop is far from being sufficiently unrolled for this 1084 1.1 mrg prefetch. Do not generate prefetch to avoid many redudant 1085 1.1 mrg prefetches. */ 1086 1.1 mrg if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO) 1087 1.1 mrg continue; 1088 1.1 mrg 1089 1.1 mrg /* If we need to prefetch the reference each PREFETCH_MOD iterations, 1090 1.1 mrg and we unroll the loop UNROLL_FACTOR times, we need to insert 1091 1.1 mrg ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each 1092 1.1 mrg iteration. */ 1093 1.1 mrg n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) 1094 1.1 mrg / ref->prefetch_mod); 1095 1.1 mrg prefetch_slots = n_prefetches * slots_per_prefetch; 1096 1.1 mrg 1097 1.1 mrg /* If more than half of the prefetches would be lost anyway, do not 1098 1.1 mrg issue the prefetch. */ 1099 1.1 mrg if (2 * remaining_prefetch_slots < prefetch_slots) 1100 1.1 mrg continue; 1101 1.1 mrg 1102 1.1 mrg /* Stop prefetching if debug counter is activated. 
*/ 1103 1.1 mrg if (!dbg_cnt (prefetch)) 1104 1.1 mrg continue; 1105 1.1 mrg 1106 1.1 mrg ref->issue_prefetch_p = true; 1107 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1108 1.1 mrg fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n", 1109 1.1 mrg ref->group->uid, ref->uid); 1110 1.1 mrg 1111 1.1 mrg if (remaining_prefetch_slots <= prefetch_slots) 1112 1.1 mrg return true; 1113 1.1 mrg remaining_prefetch_slots -= prefetch_slots; 1114 1.1 mrg any = true; 1115 1.1 mrg } 1116 1.1 mrg 1117 1.1 mrg return any; 1118 1.1 mrg } 1119 1.1 mrg 1120 1.1 mrg /* Return TRUE if no prefetch is going to be generated in the given 1121 1.1 mrg GROUPS. */ 1122 1.1 mrg 1123 1.1 mrg static bool 1124 1.1 mrg nothing_to_prefetch_p (struct mem_ref_group *groups) 1125 1.1 mrg { 1126 1.1 mrg struct mem_ref *ref; 1127 1.1 mrg 1128 1.1 mrg for (; groups; groups = groups->next) 1129 1.1 mrg for (ref = groups->refs; ref; ref = ref->next) 1130 1.1 mrg if (should_issue_prefetch_p (ref)) 1131 1.1 mrg return false; 1132 1.1 mrg 1133 1.1 mrg return true; 1134 1.1 mrg } 1135 1.1 mrg 1136 1.1 mrg /* Estimate the number of prefetches in the given GROUPS. 1137 1.1 mrg UNROLL_FACTOR is the factor by which LOOP was unrolled. */ 1138 1.1 mrg 1139 1.1 mrg static int 1140 1.1 mrg estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor) 1141 1.1 mrg { 1142 1.1 mrg struct mem_ref *ref; 1143 1.1 mrg unsigned n_prefetches; 1144 1.1 mrg int prefetch_count = 0; 1145 1.1 mrg 1146 1.1 mrg for (; groups; groups = groups->next) 1147 1.1 mrg for (ref = groups->refs; ref; ref = ref->next) 1148 1.1 mrg if (should_issue_prefetch_p (ref)) 1149 1.1 mrg { 1150 1.1 mrg n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) 1151 1.1 mrg / ref->prefetch_mod); 1152 1.1 mrg prefetch_count += n_prefetches; 1153 1.1 mrg } 1154 1.1 mrg 1155 1.1 mrg return prefetch_count; 1156 1.1 mrg } 1157 1.1 mrg 1158 1.1 mrg /* Issue prefetches for the reference REF into loop as decided before. 
1159 1.1 mrg HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR 1160 1.1 mrg is the factor by which LOOP was unrolled. */ 1161 1.1 mrg 1162 1.1 mrg static void 1163 1.1 mrg issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) 1164 1.1 mrg { 1165 1.1 mrg HOST_WIDE_INT delta; 1166 1.1 mrg tree addr, addr_base, write_p, local, forward; 1167 1.1 mrg gcall *prefetch; 1168 1.1 mrg gimple_stmt_iterator bsi; 1169 1.1 mrg unsigned n_prefetches, ap; 1170 1.1 mrg bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES; 1171 1.1 mrg 1172 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1173 1.1 mrg fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n", 1174 1.1 mrg nontemporal ? " nontemporal" : "", 1175 1.1 mrg ref->group->uid, ref->uid); 1176 1.1 mrg 1177 1.1 mrg bsi = gsi_for_stmt (ref->stmt); 1178 1.1 mrg 1179 1.1 mrg n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) 1180 1.1 mrg / ref->prefetch_mod); 1181 1.1 mrg addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node); 1182 1.1 mrg addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base), 1183 1.1 mrg true, NULL, true, GSI_SAME_STMT); 1184 1.1 mrg write_p = ref->write_p ? integer_one_node : integer_zero_node; 1185 1.1 mrg local = nontemporal ? integer_zero_node : integer_three_node; 1186 1.1 mrg 1187 1.1 mrg for (ap = 0; ap < n_prefetches; ap++) 1188 1.1 mrg { 1189 1.1 mrg if (cst_and_fits_in_hwi (ref->group->step)) 1190 1.1 mrg { 1191 1.1 mrg /* Determine the address to prefetch. */ 1192 1.1 mrg delta = (ahead + ap * ref->prefetch_mod) * 1193 1.1 mrg int_cst_value (ref->group->step); 1194 1.1 mrg addr = fold_build_pointer_plus_hwi (addr_base, delta); 1195 1.1 mrg addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, 1196 1.1 mrg NULL, true, GSI_SAME_STMT); 1197 1.1 mrg } 1198 1.1 mrg else 1199 1.1 mrg { 1200 1.1 mrg /* The step size is non-constant but loop-invariant. 
We use the 1201 1.1 mrg heuristic to simply prefetch ahead iterations ahead. */ 1202 1.1 mrg forward = fold_build2 (MULT_EXPR, sizetype, 1203 1.1 mrg fold_convert (sizetype, ref->group->step), 1204 1.1 mrg fold_convert (sizetype, size_int (ahead))); 1205 1.1 mrg addr = fold_build_pointer_plus (addr_base, forward); 1206 1.1 mrg addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, 1207 1.1 mrg NULL, true, GSI_SAME_STMT); 1208 1.1 mrg } 1209 1.1 mrg 1210 1.1 mrg if (addr_base != addr 1211 1.1 mrg && TREE_CODE (addr_base) == SSA_NAME 1212 1.1 mrg && TREE_CODE (addr) == SSA_NAME) 1213 1.1 mrg { 1214 1.1 mrg duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base)); 1215 1.1 mrg /* As this isn't a plain copy we have to reset alignment 1216 1.1 mrg information. */ 1217 1.1 mrg if (SSA_NAME_PTR_INFO (addr)) 1218 1.1 mrg mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr)); 1219 1.1 mrg } 1220 1.1 mrg 1221 1.1 mrg /* Create the prefetch instruction. */ 1222 1.1 mrg prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), 1223 1.1 mrg 3, addr, write_p, local); 1224 1.1 mrg gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); 1225 1.1 mrg } 1226 1.1 mrg } 1227 1.1 mrg 1228 1.1 mrg /* Issue prefetches for the references in GROUPS into loop as decided before. 1229 1.1 mrg HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the 1230 1.1 mrg factor by that LOOP was unrolled. 
*/ 1231 1.1 mrg 1232 1.1 mrg static void 1233 1.1 mrg issue_prefetches (struct mem_ref_group *groups, 1234 1.1 mrg unsigned unroll_factor, unsigned ahead) 1235 1.1 mrg { 1236 1.1 mrg struct mem_ref *ref; 1237 1.1 mrg 1238 1.1 mrg for (; groups; groups = groups->next) 1239 1.1 mrg for (ref = groups->refs; ref; ref = ref->next) 1240 1.1 mrg if (ref->issue_prefetch_p) 1241 1.1 mrg issue_prefetch_ref (ref, unroll_factor, ahead); 1242 1.1 mrg } 1243 1.1 mrg 1244 1.1 mrg /* Returns true if REF is a memory write for that a nontemporal store insn 1245 1.1 mrg can be used. */ 1246 1.1 mrg 1247 1.1 mrg static bool 1248 1.1 mrg nontemporal_store_p (struct mem_ref *ref) 1249 1.1 mrg { 1250 1.1 mrg machine_mode mode; 1251 1.1 mrg enum insn_code code; 1252 1.1 mrg 1253 1.1 mrg /* REF must be a write that is not reused. We require it to be independent 1254 1.1 mrg on all other memory references in the loop, as the nontemporal stores may 1255 1.1 mrg be reordered with respect to other memory references. */ 1256 1.1 mrg if (!ref->write_p 1257 1.1 mrg || !ref->independent_p 1258 1.1 mrg || ref->reuse_distance < L2_CACHE_SIZE_BYTES) 1259 1.1 mrg return false; 1260 1.1 mrg 1261 1.1 mrg /* Check that we have the storent instruction for the mode. */ 1262 1.1 mrg mode = TYPE_MODE (TREE_TYPE (ref->mem)); 1263 1.1 mrg if (mode == BLKmode) 1264 1.1 mrg return false; 1265 1.1 mrg 1266 1.1 mrg code = optab_handler (storent_optab, mode); 1267 1.1 mrg return code != CODE_FOR_nothing; 1268 1.1 mrg } 1269 1.1 mrg 1270 1.1 mrg /* If REF is a nontemporal store, we mark the corresponding modify statement 1271 1.1 mrg and return true. Otherwise, we return false. 
*/ 1272 1.1 mrg 1273 1.1 mrg static bool 1274 1.1 mrg mark_nontemporal_store (struct mem_ref *ref) 1275 1.1 mrg { 1276 1.1 mrg if (!nontemporal_store_p (ref)) 1277 1.1 mrg return false; 1278 1.1 mrg 1279 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1280 1.1 mrg fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n", 1281 1.1 mrg ref->group->uid, ref->uid); 1282 1.1 mrg 1283 1.1 mrg gimple_assign_set_nontemporal_move (ref->stmt, true); 1284 1.1 mrg ref->storent_p = true; 1285 1.1 mrg 1286 1.1 mrg return true; 1287 1.1 mrg } 1288 1.1 mrg 1289 1.1 mrg /* Issue a memory fence instruction after LOOP. */ 1290 1.1 mrg 1291 1.1 mrg static void 1292 1.1 mrg emit_mfence_after_loop (class loop *loop) 1293 1.1 mrg { 1294 1.1 mrg auto_vec<edge> exits = get_loop_exit_edges (loop); 1295 1.1 mrg edge exit; 1296 1.1 mrg gcall *call; 1297 1.1 mrg gimple_stmt_iterator bsi; 1298 1.1 mrg unsigned i; 1299 1.1 mrg 1300 1.1 mrg FOR_EACH_VEC_ELT (exits, i, exit) 1301 1.1 mrg { 1302 1.1 mrg call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0); 1303 1.1 mrg 1304 1.1 mrg if (!single_pred_p (exit->dest) 1305 1.1 mrg /* If possible, we prefer not to insert the fence on other paths 1306 1.1 mrg in cfg. */ 1307 1.1 mrg && !(exit->flags & EDGE_ABNORMAL)) 1308 1.1 mrg split_loop_exit_edge (exit); 1309 1.1 mrg bsi = gsi_after_labels (exit->dest); 1310 1.1 mrg 1311 1.1 mrg gsi_insert_before (&bsi, call, GSI_NEW_STMT); 1312 1.1 mrg } 1313 1.1 mrg 1314 1.1 mrg update_ssa (TODO_update_ssa_only_virtuals); 1315 1.1 mrg } 1316 1.1 mrg 1317 1.1 mrg /* Returns true if we can use storent in loop, false otherwise. 
*/ 1318 1.1 mrg 1319 1.1 mrg static bool 1320 1.1 mrg may_use_storent_in_loop_p (class loop *loop) 1321 1.1 mrg { 1322 1.1 mrg bool ret = true; 1323 1.1 mrg 1324 1.1 mrg if (loop->inner != NULL) 1325 1.1 mrg return false; 1326 1.1 mrg 1327 1.1 mrg /* If we must issue a mfence insn after using storent, check that there 1328 1.1 mrg is a suitable place for it at each of the loop exits. */ 1329 1.1 mrg if (FENCE_FOLLOWING_MOVNT != NULL_TREE) 1330 1.1 mrg { 1331 1.1 mrg auto_vec<edge> exits = get_loop_exit_edges (loop); 1332 1.1 mrg unsigned i; 1333 1.1 mrg edge exit; 1334 1.1 mrg 1335 1.1 mrg FOR_EACH_VEC_ELT (exits, i, exit) 1336 1.1 mrg if ((exit->flags & EDGE_ABNORMAL) 1337 1.1 mrg && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) 1338 1.1 mrg ret = false; 1339 1.1 mrg } 1340 1.1 mrg 1341 1.1 mrg return ret; 1342 1.1 mrg } 1343 1.1 mrg 1344 1.1 mrg /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory 1345 1.1 mrg references in the loop. */ 1346 1.1 mrg 1347 1.1 mrg static void 1348 1.1 mrg mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups) 1349 1.1 mrg { 1350 1.1 mrg struct mem_ref *ref; 1351 1.1 mrg bool any = false; 1352 1.1 mrg 1353 1.1 mrg if (!may_use_storent_in_loop_p (loop)) 1354 1.1 mrg return; 1355 1.1 mrg 1356 1.1 mrg for (; groups; groups = groups->next) 1357 1.1 mrg for (ref = groups->refs; ref; ref = ref->next) 1358 1.1 mrg any |= mark_nontemporal_store (ref); 1359 1.1 mrg 1360 1.1 mrg if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE) 1361 1.1 mrg emit_mfence_after_loop (loop); 1362 1.1 mrg } 1363 1.1 mrg 1364 1.1 mrg /* Determines whether we can profitably unroll LOOP FACTOR times, and if 1365 1.1 mrg this is the case, fill in DESC by the description of number of 1366 1.1 mrg iterations. 
*/ 1367 1.1 mrg 1368 1.1 mrg static bool 1369 1.1 mrg should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc, 1370 1.1 mrg unsigned factor) 1371 1.1 mrg { 1372 1.1 mrg if (!can_unroll_loop_p (loop, factor, desc)) 1373 1.1 mrg return false; 1374 1.1 mrg 1375 1.1 mrg /* We only consider loops without control flow for unrolling. This is not 1376 1.1 mrg a hard restriction -- tree_unroll_loop works with arbitrary loops 1377 1.1 mrg as well; but the unrolling/prefetching is usually more profitable for 1378 1.1 mrg loops consisting of a single basic block, and we want to limit the 1379 1.1 mrg code growth. */ 1380 1.1 mrg if (loop->num_nodes > 2) 1381 1.1 mrg return false; 1382 1.1 mrg 1383 1.1 mrg return true; 1384 1.1 mrg } 1385 1.1 mrg 1386 1.1 mrg /* Determine the coefficient by that unroll LOOP, from the information 1387 1.1 mrg contained in the list of memory references REFS. Description of 1388 1.1 mrg number of iterations of LOOP is stored to DESC. NINSNS is the number of 1389 1.1 mrg insns of the LOOP. EST_NITER is the estimated number of iterations of 1390 1.1 mrg the loop, or -1 if no estimate is available. */ 1391 1.1 mrg 1392 1.1 mrg static unsigned 1393 1.1 mrg determine_unroll_factor (class loop *loop, struct mem_ref_group *refs, 1394 1.1 mrg unsigned ninsns, class tree_niter_desc *desc, 1395 1.1 mrg HOST_WIDE_INT est_niter) 1396 1.1 mrg { 1397 1.1 mrg unsigned upper_bound; 1398 1.1 mrg unsigned nfactor, factor, mod_constraint; 1399 1.1 mrg struct mem_ref_group *agp; 1400 1.1 mrg struct mem_ref *ref; 1401 1.1 mrg 1402 1.1 mrg /* First check whether the loop is not too large to unroll. We ignore 1403 1.1 mrg PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us 1404 1.1 mrg from unrolling them enough to make exactly one cache line covered by each 1405 1.1 mrg iteration. 
Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent 1406 1.1 mrg us from unrolling the loops too many times in cases where we only expect 1407 1.1 mrg gains from better scheduling and decreasing loop overhead, which is not 1408 1.1 mrg the case here. */ 1409 1.1 mrg upper_bound = param_max_unrolled_insns / ninsns; 1410 1.1 mrg 1411 1.1 mrg /* If we unrolled the loop more times than it iterates, the unrolled version 1412 1.1 mrg of the loop would be never entered. */ 1413 1.1 mrg if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound) 1414 1.1 mrg upper_bound = est_niter; 1415 1.1 mrg 1416 1.1 mrg if (upper_bound <= 1) 1417 1.1 mrg return 1; 1418 1.1 mrg 1419 1.1 mrg /* Choose the factor so that we may prefetch each cache just once, 1420 1.1 mrg but bound the unrolling by UPPER_BOUND. */ 1421 1.1 mrg factor = 1; 1422 1.1 mrg for (agp = refs; agp; agp = agp->next) 1423 1.1 mrg for (ref = agp->refs; ref; ref = ref->next) 1424 1.1 mrg if (should_issue_prefetch_p (ref)) 1425 1.1 mrg { 1426 1.1 mrg mod_constraint = ref->prefetch_mod; 1427 1.1 mrg nfactor = least_common_multiple (mod_constraint, factor); 1428 1.1 mrg if (nfactor <= upper_bound) 1429 1.1 mrg factor = nfactor; 1430 1.1 mrg } 1431 1.1 mrg 1432 1.1 mrg if (!should_unroll_loop_p (loop, desc, factor)) 1433 1.1 mrg return 1; 1434 1.1 mrg 1435 1.1 mrg return factor; 1436 1.1 mrg } 1437 1.1 mrg 1438 1.1 mrg /* Returns the total volume of the memory references REFS, taking into account 1439 1.1 mrg reuses in the innermost loop and cache line size. TODO -- we should also 1440 1.1 mrg take into account reuses across the iterations of the loops in the loop 1441 1.1 mrg nest. 
*/ 1442 1.1 mrg 1443 1.1 mrg static unsigned 1444 1.1 mrg volume_of_references (struct mem_ref_group *refs) 1445 1.1 mrg { 1446 1.1 mrg unsigned volume = 0; 1447 1.1 mrg struct mem_ref_group *gr; 1448 1.1 mrg struct mem_ref *ref; 1449 1.1 mrg 1450 1.1 mrg for (gr = refs; gr; gr = gr->next) 1451 1.1 mrg for (ref = gr->refs; ref; ref = ref->next) 1452 1.1 mrg { 1453 1.1 mrg /* Almost always reuses another value? */ 1454 1.1 mrg if (ref->prefetch_before != PREFETCH_ALL) 1455 1.1 mrg continue; 1456 1.1 mrg 1457 1.1 mrg /* If several iterations access the same cache line, use the size of 1458 1.1 mrg the line divided by this number. Otherwise, a cache line is 1459 1.1 mrg accessed in each iteration. TODO -- in the latter case, we should 1460 1.1 mrg take the size of the reference into account, rounding it up on cache 1461 1.1 mrg line size multiple. */ 1462 1.1 mrg volume += param_l1_cache_line_size / ref->prefetch_mod; 1463 1.1 mrg } 1464 1.1 mrg return volume; 1465 1.1 mrg } 1466 1.1 mrg 1467 1.1 mrg /* Returns the volume of memory references accessed across VEC iterations of 1468 1.1 mrg loops, whose sizes are described in the LOOP_SIZES array. N is the number 1469 1.1 mrg of the loops in the nest (length of VEC and LOOP_SIZES vectors). */ 1470 1.1 mrg 1471 1.1 mrg static unsigned 1472 1.1 mrg volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n) 1473 1.1 mrg { 1474 1.1 mrg unsigned i; 1475 1.1 mrg 1476 1.1 mrg for (i = 0; i < n; i++) 1477 1.1 mrg if (vec[i] != 0) 1478 1.1 mrg break; 1479 1.1 mrg 1480 1.1 mrg if (i == n) 1481 1.1 mrg return 0; 1482 1.1 mrg 1483 1.1 mrg gcc_assert (vec[i] > 0); 1484 1.1 mrg 1485 1.1 mrg /* We ignore the parts of the distance vector in subloops, since usually 1486 1.1 mrg the numbers of iterations are much smaller. 
*/ 1487 1.1 mrg return loop_sizes[i] * vec[i]; 1488 1.1 mrg } 1489 1.1 mrg 1490 1.1 mrg /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE 1491 1.1 mrg at the position corresponding to the loop of the step. N is the depth 1492 1.1 mrg of the considered loop nest, and, LOOP is its innermost loop. */ 1493 1.1 mrg 1494 1.1 mrg static void 1495 1.1 mrg add_subscript_strides (tree access_fn, unsigned stride, 1496 1.1 mrg HOST_WIDE_INT *strides, unsigned n, class loop *loop) 1497 1.1 mrg { 1498 1.1 mrg class loop *aloop; 1499 1.1 mrg tree step; 1500 1.1 mrg HOST_WIDE_INT astep; 1501 1.1 mrg unsigned min_depth = loop_depth (loop) - n; 1502 1.1 mrg 1503 1.1 mrg while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC) 1504 1.1 mrg { 1505 1.1 mrg aloop = get_chrec_loop (access_fn); 1506 1.1 mrg step = CHREC_RIGHT (access_fn); 1507 1.1 mrg access_fn = CHREC_LEFT (access_fn); 1508 1.1 mrg 1509 1.1 mrg if ((unsigned) loop_depth (aloop) <= min_depth) 1510 1.1 mrg continue; 1511 1.1 mrg 1512 1.1 mrg if (tree_fits_shwi_p (step)) 1513 1.1 mrg astep = tree_to_shwi (step); 1514 1.1 mrg else 1515 1.1 mrg astep = param_l1_cache_line_size; 1516 1.1 mrg 1517 1.1 mrg strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride; 1518 1.1 mrg 1519 1.1 mrg } 1520 1.1 mrg } 1521 1.1 mrg 1522 1.1 mrg /* Returns the volume of memory references accessed between two consecutive 1523 1.1 mrg self-reuses of the reference DR. We consider the subscripts of DR in N 1524 1.1 mrg loops, and LOOP_SIZES contains the volumes of accesses in each of the 1525 1.1 mrg loops. LOOP is the innermost loop of the current loop nest. 
*/

static unsigned
self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
		     class loop *loop)
{
  tree stride, access_fn;
  HOST_WIDE_INT *strides, astride;
  vec<tree> access_fns;
  tree ref = DR_REF (dr);
  unsigned i, dist = ~0u;

  /* In the following example:

     for (i = 0; i < N; i++)
       for (j = 0; j < N; j++)
         use (a[j][i]);
     the same cache line is accessed each N steps (except if the change from
     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
     we cannot rely purely on the results of the data dependence analysis.

     Instead, we compute the stride of the reference in each loop, and consider
     the innermost loop in that the stride is less than cache size.  */

  strides = XCNEWVEC (HOST_WIDE_INT, n);
  access_fns = DR_ACCESS_FNS (dr);

  FOR_EACH_VEC_ELT (access_fns, i, access_fn)
    {
      /* Keep track of the reference corresponding to the subscript, so that we
	 know its stride.  */
      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
	ref = TREE_OPERAND (ref, 0);

      if (TREE_CODE (ref) == ARRAY_REF)
	{
	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
	  if (tree_fits_uhwi_p (stride))
	    astride = tree_to_uhwi (stride);
	  else
	    /* Element size unknown at compile time; assume a full line.  */
	    astride = param_l1_cache_line_size;

	  ref = TREE_OPERAND (ref, 0);
	}
      else
	astride = 1;

      add_subscript_strides (access_fn, astride, strides, n, loop);
    }

  /* Scan from the innermost loop outwards for the first one whose stride
     stays within a cache line while its data volume is large enough to
     matter.  */
  for (i = n; i-- > 0; )
    {
      unsigned HOST_WIDE_INT s
	= strides[i] < 0 ? -strides[i] : strides[i];

      if (s < (unsigned) param_l1_cache_line_size
	  && (loop_sizes[i]
	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
	{
	  dist = loop_sizes[i];
	  break;
	}
    }

  free (strides);
  return dist;
}

/* Determines the distance till the first reuse of each reference in REFS
   in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
   memory references in the loop.  Return false if the analysis fails.
*/ 1597 1.1 mrg 1598 1.1 mrg static bool 1599 1.1 mrg determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs, 1600 1.1 mrg bool no_other_refs) 1601 1.1 mrg { 1602 1.1 mrg class loop *nest, *aloop; 1603 1.1 mrg vec<data_reference_p> datarefs = vNULL; 1604 1.1 mrg vec<ddr_p> dependences = vNULL; 1605 1.1 mrg struct mem_ref_group *gr; 1606 1.1 mrg struct mem_ref *ref, *refb; 1607 1.1 mrg auto_vec<loop_p> vloops; 1608 1.1 mrg unsigned *loop_data_size; 1609 1.1 mrg unsigned i, j, n; 1610 1.1 mrg unsigned volume, dist, adist; 1611 1.1 mrg HOST_WIDE_INT vol; 1612 1.1 mrg data_reference_p dr; 1613 1.1 mrg ddr_p dep; 1614 1.1 mrg 1615 1.1 mrg if (loop->inner) 1616 1.1 mrg return true; 1617 1.1 mrg 1618 1.1 mrg /* Find the outermost loop of the loop nest of loop (we require that 1619 1.1 mrg there are no sibling loops inside the nest). */ 1620 1.1 mrg nest = loop; 1621 1.1 mrg while (1) 1622 1.1 mrg { 1623 1.1 mrg aloop = loop_outer (nest); 1624 1.1 mrg 1625 1.1 mrg if (aloop == current_loops->tree_root 1626 1.1 mrg || aloop->inner->next) 1627 1.1 mrg break; 1628 1.1 mrg 1629 1.1 mrg nest = aloop; 1630 1.1 mrg } 1631 1.1 mrg 1632 1.1 mrg /* For each loop, determine the amount of data accessed in each iteration. 1633 1.1 mrg We use this to estimate whether the reference is evicted from the 1634 1.1 mrg cache before its reuse. */ 1635 1.1 mrg find_loop_nest (nest, &vloops); 1636 1.1 mrg n = vloops.length (); 1637 1.1 mrg loop_data_size = XNEWVEC (unsigned, n); 1638 1.1 mrg volume = volume_of_references (refs); 1639 1.1 mrg i = n; 1640 1.1 mrg while (i-- != 0) 1641 1.1 mrg { 1642 1.1 mrg loop_data_size[i] = volume; 1643 1.1 mrg /* Bound the volume by the L2 cache size, since above this bound, 1644 1.1 mrg all dependence distances are equivalent. 
*/ 1645 1.1 mrg if (volume > L2_CACHE_SIZE_BYTES) 1646 1.1 mrg continue; 1647 1.1 mrg 1648 1.1 mrg aloop = vloops[i]; 1649 1.1 mrg vol = estimated_stmt_executions_int (aloop); 1650 1.1 mrg if (vol == -1) 1651 1.1 mrg vol = expected_loop_iterations (aloop); 1652 1.1 mrg volume *= vol; 1653 1.1 mrg } 1654 1.1 mrg 1655 1.1 mrg /* Prepare the references in the form suitable for data dependence 1656 1.1 mrg analysis. We ignore unanalyzable data references (the results 1657 1.1 mrg are used just as a heuristics to estimate temporality of the 1658 1.1 mrg references, hence we do not need to worry about correctness). */ 1659 1.1 mrg for (gr = refs; gr; gr = gr->next) 1660 1.1 mrg for (ref = gr->refs; ref; ref = ref->next) 1661 1.1 mrg { 1662 1.1 mrg dr = create_data_ref (loop_preheader_edge (nest), 1663 1.1 mrg loop_containing_stmt (ref->stmt), 1664 1.1 mrg ref->mem, ref->stmt, !ref->write_p, false); 1665 1.1 mrg 1666 1.1 mrg if (dr) 1667 1.1 mrg { 1668 1.1 mrg ref->reuse_distance = volume; 1669 1.1 mrg dr->aux = ref; 1670 1.1 mrg datarefs.safe_push (dr); 1671 1.1 mrg } 1672 1.1 mrg else 1673 1.1 mrg no_other_refs = false; 1674 1.1 mrg } 1675 1.1 mrg 1676 1.1 mrg FOR_EACH_VEC_ELT (datarefs, i, dr) 1677 1.1 mrg { 1678 1.1 mrg dist = self_reuse_distance (dr, loop_data_size, n, loop); 1679 1.1 mrg ref = (struct mem_ref *) dr->aux; 1680 1.1 mrg if (ref->reuse_distance > dist) 1681 1.1 mrg ref->reuse_distance = dist; 1682 1.1 mrg 1683 1.1 mrg if (no_other_refs) 1684 1.1 mrg ref->independent_p = true; 1685 1.1 mrg } 1686 1.1 mrg 1687 1.1 mrg if (!compute_all_dependences (datarefs, &dependences, vloops, true)) 1688 1.1 mrg return false; 1689 1.1 mrg 1690 1.1 mrg FOR_EACH_VEC_ELT (dependences, i, dep) 1691 1.1 mrg { 1692 1.1 mrg if (DDR_ARE_DEPENDENT (dep) == chrec_known) 1693 1.1 mrg continue; 1694 1.1 mrg 1695 1.1 mrg ref = (struct mem_ref *) DDR_A (dep)->aux; 1696 1.1 mrg refb = (struct mem_ref *) DDR_B (dep)->aux; 1697 1.1 mrg 1698 1.1 mrg if (DDR_ARE_DEPENDENT (dep) == 
chrec_dont_know 1699 1.1 mrg || DDR_COULD_BE_INDEPENDENT_P (dep) 1700 1.1 mrg || DDR_NUM_DIST_VECTS (dep) == 0) 1701 1.1 mrg { 1702 1.1 mrg /* If the dependence cannot be analyzed, assume that there might be 1703 1.1 mrg a reuse. */ 1704 1.1 mrg dist = 0; 1705 1.1 mrg 1706 1.1 mrg ref->independent_p = false; 1707 1.1 mrg refb->independent_p = false; 1708 1.1 mrg } 1709 1.1 mrg else 1710 1.1 mrg { 1711 1.1 mrg /* The distance vectors are normalized to be always lexicographically 1712 1.1 mrg positive, hence we cannot tell just from them whether DDR_A comes 1713 1.1 mrg before DDR_B or vice versa. However, it is not important, 1714 1.1 mrg anyway -- if DDR_A is close to DDR_B, then it is either reused in 1715 1.1 mrg DDR_B (and it is not nontemporal), or it reuses the value of DDR_B 1716 1.1 mrg in cache (and marking it as nontemporal would not affect 1717 1.1 mrg anything). */ 1718 1.1 mrg 1719 1.1 mrg dist = volume; 1720 1.1 mrg for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++) 1721 1.1 mrg { 1722 1.1 mrg adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j), 1723 1.1 mrg loop_data_size, n); 1724 1.1 mrg 1725 1.1 mrg /* If this is a dependence in the innermost loop (i.e., the 1726 1.1 mrg distances in all superloops are zero) and it is not 1727 1.1 mrg the trivial self-dependence with distance zero, record that 1728 1.1 mrg the references are not completely independent. */ 1729 1.1 mrg if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1) 1730 1.1 mrg && (ref != refb 1731 1.1 mrg || DDR_DIST_VECT (dep, j)[n-1] != 0)) 1732 1.1 mrg { 1733 1.1 mrg ref->independent_p = false; 1734 1.1 mrg refb->independent_p = false; 1735 1.1 mrg } 1736 1.1 mrg 1737 1.1 mrg /* Ignore accesses closer than 1738 1.1 mrg L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION, 1739 1.1 mrg so that we use nontemporal prefetches e.g. if single memory 1740 1.1 mrg location is accessed several times in a single iteration of 1741 1.1 mrg the loop. 
*/ 1742 1.1 mrg if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION) 1743 1.1 mrg continue; 1744 1.1 mrg 1745 1.1 mrg if (adist < dist) 1746 1.1 mrg dist = adist; 1747 1.1 mrg } 1748 1.1 mrg } 1749 1.1 mrg 1750 1.1 mrg if (ref->reuse_distance > dist) 1751 1.1 mrg ref->reuse_distance = dist; 1752 1.1 mrg if (refb->reuse_distance > dist) 1753 1.1 mrg refb->reuse_distance = dist; 1754 1.1 mrg } 1755 1.1 mrg 1756 1.1 mrg free_dependence_relations (dependences); 1757 1.1 mrg free_data_refs (datarefs); 1758 1.1 mrg free (loop_data_size); 1759 1.1 mrg 1760 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1761 1.1 mrg { 1762 1.1 mrg fprintf (dump_file, "Reuse distances:\n"); 1763 1.1 mrg for (gr = refs; gr; gr = gr->next) 1764 1.1 mrg for (ref = gr->refs; ref; ref = ref->next) 1765 1.1 mrg fprintf (dump_file, " reference %u:%u distance %u\n", 1766 1.1 mrg ref->group->uid, ref->uid, ref->reuse_distance); 1767 1.1 mrg } 1768 1.1 mrg 1769 1.1 mrg return true; 1770 1.1 mrg } 1771 1.1 mrg 1772 1.1 mrg /* Determine whether or not the trip count to ahead ratio is too small based 1773 1.1 mrg on prefitablility consideration. 1774 1.1 mrg AHEAD: the iteration ahead distance, 1775 1.1 mrg EST_NITER: the estimated trip count. */ 1776 1.1 mrg 1777 1.1 mrg static bool 1778 1.1 mrg trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter) 1779 1.1 mrg { 1780 1.1 mrg /* Assume trip count to ahead ratio is big enough if the trip count could not 1781 1.1 mrg be estimated at compile time. 
*/ 1782 1.1 mrg if (est_niter < 0) 1783 1.1 mrg return false; 1784 1.1 mrg 1785 1.1 mrg if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead)) 1786 1.1 mrg { 1787 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1788 1.1 mrg fprintf (dump_file, 1789 1.1 mrg "Not prefetching -- loop estimated to roll only %d times\n", 1790 1.1 mrg (int) est_niter); 1791 1.1 mrg return true; 1792 1.1 mrg } 1793 1.1 mrg 1794 1.1 mrg return false; 1795 1.1 mrg } 1796 1.1 mrg 1797 1.1 mrg /* Determine whether or not the number of memory references in the loop is 1798 1.1 mrg reasonable based on the profitablity and compilation time considerations. 1799 1.1 mrg NINSNS: estimated number of instructions in the loop, 1800 1.1 mrg MEM_REF_COUNT: total number of memory references in the loop. */ 1801 1.1 mrg 1802 1.1 mrg static bool 1803 1.1 mrg mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count) 1804 1.1 mrg { 1805 1.1 mrg int insn_to_mem_ratio; 1806 1.1 mrg 1807 1.1 mrg if (mem_ref_count == 0) 1808 1.1 mrg return false; 1809 1.1 mrg 1810 1.1 mrg /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis 1811 1.1 mrg (compute_all_dependences) have high costs based on quadratic complexity. 1812 1.1 mrg To avoid huge compilation time, we give up prefetching if mem_ref_count 1813 1.1 mrg is too large. */ 1814 1.1 mrg if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP) 1815 1.1 mrg return false; 1816 1.1 mrg 1817 1.1 mrg /* Prefetching improves performance by overlapping cache missing 1818 1.1 mrg memory accesses with CPU operations. If the loop does not have 1819 1.1 mrg enough CPU operations to overlap with memory operations, prefetching 1820 1.1 mrg won't give a significant benefit. One approximate way of checking 1821 1.1 mrg this is to require the ratio of instructions to memory references to 1822 1.1 mrg be above a certain limit. This approximation works well in practice. 
1823 1.1 mrg TODO: Implement a more precise computation by estimating the time 1824 1.1 mrg for each CPU or memory op in the loop. Time estimates for memory ops 1825 1.1 mrg should account for cache misses. */ 1826 1.1 mrg insn_to_mem_ratio = ninsns / mem_ref_count; 1827 1.1 mrg 1828 1.1 mrg if (insn_to_mem_ratio < param_prefetch_min_insn_to_mem_ratio) 1829 1.1 mrg { 1830 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1831 1.1 mrg fprintf (dump_file, 1832 1.1 mrg "Not prefetching -- instruction to memory reference ratio (%d) too small\n", 1833 1.1 mrg insn_to_mem_ratio); 1834 1.1 mrg return false; 1835 1.1 mrg } 1836 1.1 mrg 1837 1.1 mrg return true; 1838 1.1 mrg } 1839 1.1 mrg 1840 1.1 mrg /* Determine whether or not the instruction to prefetch ratio in the loop is 1841 1.1 mrg too small based on the profitablity consideration. 1842 1.1 mrg NINSNS: estimated number of instructions in the loop, 1843 1.1 mrg PREFETCH_COUNT: an estimate of the number of prefetches, 1844 1.1 mrg UNROLL_FACTOR: the factor to unroll the loop if prefetching. */ 1845 1.1 mrg 1846 1.1 mrg static bool 1847 1.1 mrg insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count, 1848 1.1 mrg unsigned unroll_factor) 1849 1.1 mrg { 1850 1.1 mrg int insn_to_prefetch_ratio; 1851 1.1 mrg 1852 1.1 mrg /* Prefetching most likely causes performance degradation when the instruction 1853 1.1 mrg to prefetch ratio is too small. Too many prefetch instructions in a loop 1854 1.1 mrg may reduce the I-cache performance. 1855 1.1 mrg (unroll_factor * ninsns) is used to estimate the number of instructions in 1856 1.1 mrg the unrolled loop. This implementation is a bit simplistic -- the number 1857 1.1 mrg of issued prefetch instructions is also affected by unrolling. So, 1858 1.1 mrg prefetch_mod and the unroll factor should be taken into account when 1859 1.1 mrg determining prefetch_count. 
Also, the number of insns of the unrolled 1860 1.1 mrg loop will usually be significantly smaller than the number of insns of the 1861 1.1 mrg original loop * unroll_factor (at least the induction variable increases 1862 1.1 mrg and the exit branches will get eliminated), so it might be better to use 1863 1.1 mrg tree_estimate_loop_size + estimated_unrolled_size. */ 1864 1.1 mrg insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count; 1865 1.1 mrg if (insn_to_prefetch_ratio < param_min_insn_to_prefetch_ratio) 1866 1.1 mrg { 1867 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1868 1.1 mrg fprintf (dump_file, 1869 1.1 mrg "Not prefetching -- instruction to prefetch ratio (%d) too small\n", 1870 1.1 mrg insn_to_prefetch_ratio); 1871 1.1 mrg return true; 1872 1.1 mrg } 1873 1.1 mrg 1874 1.1 mrg return false; 1875 1.1 mrg } 1876 1.1 mrg 1877 1.1 mrg 1878 1.1 mrg /* Issue prefetch instructions for array references in LOOP. Returns 1879 1.1 mrg true if the LOOP was unrolled. */ 1880 1.1 mrg 1881 1.1 mrg static bool 1882 1.1 mrg loop_prefetch_arrays (class loop *loop) 1883 1.1 mrg { 1884 1.1 mrg struct mem_ref_group *refs; 1885 1.1 mrg unsigned ahead, ninsns, time, unroll_factor; 1886 1.1 mrg HOST_WIDE_INT est_niter; 1887 1.1 mrg class tree_niter_desc desc; 1888 1.1 mrg bool unrolled = false, no_other_refs; 1889 1.1 mrg unsigned prefetch_count; 1890 1.1 mrg unsigned mem_ref_count; 1891 1.1 mrg 1892 1.1 mrg if (optimize_loop_nest_for_size_p (loop)) 1893 1.1 mrg { 1894 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1895 1.1 mrg fprintf (dump_file, " ignored (cold area)\n"); 1896 1.1 mrg return false; 1897 1.1 mrg } 1898 1.1 mrg 1899 1.1 mrg /* FIXME: the time should be weighted by the probabilities of the blocks in 1900 1.1 mrg the loop body. 
*/ 1901 1.1 mrg time = tree_num_loop_insns (loop, &eni_time_weights); 1902 1.1 mrg if (time == 0) 1903 1.1 mrg return false; 1904 1.1 mrg 1905 1.1 mrg ahead = (param_prefetch_latency + time - 1) / time; 1906 1.1 mrg est_niter = estimated_stmt_executions_int (loop); 1907 1.1 mrg if (est_niter == -1) 1908 1.1 mrg est_niter = likely_max_stmt_executions_int (loop); 1909 1.1 mrg 1910 1.1 mrg /* Prefetching is not likely to be profitable if the trip count to ahead 1911 1.1 mrg ratio is too small. */ 1912 1.1 mrg if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter)) 1913 1.1 mrg return false; 1914 1.1 mrg 1915 1.1 mrg ninsns = tree_num_loop_insns (loop, &eni_size_weights); 1916 1.1 mrg 1917 1.1 mrg /* Step 1: gather the memory references. */ 1918 1.1 mrg refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count); 1919 1.1 mrg 1920 1.1 mrg /* Give up prefetching if the number of memory references in the 1921 1.1 mrg loop is not reasonable based on profitablity and compilation time 1922 1.1 mrg considerations. */ 1923 1.1 mrg if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count)) 1924 1.1 mrg goto fail; 1925 1.1 mrg 1926 1.1 mrg /* Step 2: estimate the reuse effects. */ 1927 1.1 mrg prune_by_reuse (refs); 1928 1.1 mrg 1929 1.1 mrg if (nothing_to_prefetch_p (refs)) 1930 1.1 mrg goto fail; 1931 1.1 mrg 1932 1.1 mrg if (!determine_loop_nest_reuse (loop, refs, no_other_refs)) 1933 1.1 mrg goto fail; 1934 1.1 mrg 1935 1.1 mrg /* Step 3: determine unroll factor. */ 1936 1.1 mrg unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc, 1937 1.1 mrg est_niter); 1938 1.1 mrg 1939 1.1 mrg /* Estimate prefetch count for the unrolled loop. 
*/ 1940 1.1 mrg prefetch_count = estimate_prefetch_count (refs, unroll_factor); 1941 1.1 mrg if (prefetch_count == 0) 1942 1.1 mrg goto fail; 1943 1.1 mrg 1944 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1945 1.1 mrg fprintf (dump_file, "Ahead %d, unroll factor %d, trip count " 1946 1.1 mrg HOST_WIDE_INT_PRINT_DEC "\n" 1947 1.1 mrg "insn count %d, mem ref count %d, prefetch count %d\n", 1948 1.1 mrg ahead, unroll_factor, est_niter, 1949 1.1 mrg ninsns, mem_ref_count, prefetch_count); 1950 1.1 mrg 1951 1.1 mrg /* Prefetching is not likely to be profitable if the instruction to prefetch 1952 1.1 mrg ratio is too small. */ 1953 1.1 mrg if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count, 1954 1.1 mrg unroll_factor)) 1955 1.1 mrg goto fail; 1956 1.1 mrg 1957 1.1 mrg mark_nontemporal_stores (loop, refs); 1958 1.1 mrg 1959 1.1 mrg /* Step 4: what to prefetch? */ 1960 1.1 mrg if (!schedule_prefetches (refs, unroll_factor, ahead)) 1961 1.1 mrg goto fail; 1962 1.1 mrg 1963 1.1 mrg /* Step 5: unroll the loop. TODO -- peeling of first and last few 1964 1.1 mrg iterations so that we do not issue superfluous prefetches. */ 1965 1.1 mrg if (unroll_factor != 1) 1966 1.1 mrg { 1967 1.1 mrg tree_unroll_loop (loop, unroll_factor, &desc); 1968 1.1 mrg unrolled = true; 1969 1.1 mrg } 1970 1.1 mrg 1971 1.1 mrg /* Step 6: issue the prefetches. */ 1972 1.1 mrg issue_prefetches (refs, unroll_factor, ahead); 1973 1.1 mrg 1974 1.1 mrg fail: 1975 1.1 mrg release_mem_refs (refs); 1976 1.1 mrg return unrolled; 1977 1.1 mrg } 1978 1.1 mrg 1979 1.1 mrg /* Issue prefetch instructions for array references in loops. */ 1980 1.1 mrg 1981 1.1 mrg unsigned int 1982 1.1 mrg tree_ssa_prefetch_arrays (void) 1983 1.1 mrg { 1984 1.1 mrg bool unrolled = false; 1985 1.1 mrg int todo_flags = 0; 1986 1.1 mrg 1987 1.1 mrg if (!targetm.have_prefetch () 1988 1.1 mrg /* It is possible to ask compiler for say -mtune=i486 -march=pentium4. 
1989 1.1 mrg -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part 1990 1.1 mrg of processor costs and i486 does not have prefetch, but 1991 1.1 mrg -march=pentium4 causes targetm.have_prefetch to be true. Ugh. */ 1992 1.1 mrg || PREFETCH_BLOCK == 0) 1993 1.1 mrg return 0; 1994 1.1 mrg 1995 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 1996 1.1 mrg { 1997 1.1 mrg fprintf (dump_file, "Prefetching parameters:\n"); 1998 1.1 mrg fprintf (dump_file, " simultaneous prefetches: %d\n", 1999 1.1 mrg param_simultaneous_prefetches); 2000 1.1 mrg fprintf (dump_file, " prefetch latency: %d\n", param_prefetch_latency); 2001 1.1 mrg fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK); 2002 1.1 mrg fprintf (dump_file, " L1 cache size: %d lines, %d kB\n", 2003 1.1 mrg L1_CACHE_SIZE_BYTES / param_l1_cache_line_size, 2004 1.1 mrg param_l1_cache_size); 2005 1.1 mrg fprintf (dump_file, " L1 cache line size: %d\n", 2006 1.1 mrg param_l1_cache_line_size); 2007 1.1 mrg fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size); 2008 1.1 mrg fprintf (dump_file, " min insn-to-prefetch ratio: %d \n", 2009 1.1 mrg param_min_insn_to_prefetch_ratio); 2010 1.1 mrg fprintf (dump_file, " min insn-to-mem ratio: %d \n", 2011 1.1 mrg param_prefetch_min_insn_to_mem_ratio); 2012 1.1 mrg fprintf (dump_file, "\n"); 2013 1.1 mrg } 2014 1.1 mrg 2015 1.1 mrg initialize_original_copy_tables (); 2016 1.1 mrg 2017 1.1 mrg if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH)) 2018 1.1 mrg { 2019 1.1 mrg tree type = build_function_type_list (void_type_node, 2020 1.1 mrg const_ptr_type_node, NULL_TREE); 2021 1.1 mrg tree decl = add_builtin_function ("__builtin_prefetch", type, 2022 1.1 mrg BUILT_IN_PREFETCH, BUILT_IN_NORMAL, 2023 1.1 mrg NULL, NULL_TREE); 2024 1.1 mrg DECL_IS_NOVOPS (decl) = true; 2025 1.1 mrg set_builtin_decl (BUILT_IN_PREFETCH, decl, false); 2026 1.1 mrg } 2027 1.1 mrg 2028 1.1 mrg for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) 2029 1.1 mrg { 2030 
1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 2031 1.1 mrg fprintf (dump_file, "Processing loop %d:\n", loop->num); 2032 1.1 mrg 2033 1.1 mrg unrolled |= loop_prefetch_arrays (loop); 2034 1.1 mrg 2035 1.1 mrg if (dump_file && (dump_flags & TDF_DETAILS)) 2036 1.1 mrg fprintf (dump_file, "\n\n"); 2037 1.1 mrg } 2038 1.1 mrg 2039 1.1 mrg if (unrolled) 2040 1.1 mrg { 2041 1.1 mrg scev_reset (); 2042 1.1 mrg todo_flags |= TODO_cleanup_cfg; 2043 1.1 mrg } 2044 1.1 mrg 2045 1.1 mrg free_original_copy_tables (); 2046 1.1 mrg return todo_flags; 2047 1.1 mrg } 2048 1.1 mrg 2049 1.1 mrg /* Prefetching. */ 2050 1.1 mrg 2051 1.1 mrg namespace { 2052 1.1 mrg 2053 1.1 mrg const pass_data pass_data_loop_prefetch = 2054 1.1 mrg { 2055 1.1 mrg GIMPLE_PASS, /* type */ 2056 1.1 mrg "aprefetch", /* name */ 2057 1.1 mrg OPTGROUP_LOOP, /* optinfo_flags */ 2058 1.1 mrg TV_TREE_PREFETCH, /* tv_id */ 2059 1.1 mrg ( PROP_cfg | PROP_ssa ), /* properties_required */ 2060 1.1 mrg 0, /* properties_provided */ 2061 1.1 mrg 0, /* properties_destroyed */ 2062 1.1 mrg 0, /* todo_flags_start */ 2063 1.1 mrg 0, /* todo_flags_finish */ 2064 1.1 mrg }; 2065 1.1 mrg 2066 1.1 mrg class pass_loop_prefetch : public gimple_opt_pass 2067 1.1 mrg { 2068 1.1 mrg public: 2069 1.1 mrg pass_loop_prefetch (gcc::context *ctxt) 2070 1.1 mrg : gimple_opt_pass (pass_data_loop_prefetch, ctxt) 2071 1.1 mrg {} 2072 1.1 mrg 2073 1.1 mrg /* opt_pass methods: */ 2074 1.1 mrg virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; } 2075 1.1 mrg virtual unsigned int execute (function *); 2076 1.1 mrg 2077 1.1 mrg }; // class pass_loop_prefetch 2078 1.1 mrg 2079 1.1 mrg unsigned int 2080 1.1 mrg pass_loop_prefetch::execute (function *fun) 2081 1.1 mrg { 2082 1.1 mrg if (number_of_loops (fun) <= 1) 2083 1.1 mrg return 0; 2084 1.1 mrg 2085 1.1 mrg if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0) 2086 1.1 mrg { 2087 1.1 mrg static bool warned = false; 2088 1.1 mrg 2089 1.1 mrg if (!warned) 2090 1.1 mrg { 
2091 1.1 mrg warning (OPT_Wdisabled_optimization, 2092 1.1 mrg "%<l1-cache-size%> parameter is not a power of two %d", 2093 1.1 mrg PREFETCH_BLOCK); 2094 1.1 mrg warned = true; 2095 1.1 mrg } 2096 1.1 mrg return 0; 2097 1.1 mrg } 2098 1.1 mrg 2099 1.1 mrg return tree_ssa_prefetch_arrays (); 2100 1.1 mrg } 2101 1.1 mrg 2102 1.1 mrg } // anon namespace 2103 1.1 mrg 2104 1.1 mrg gimple_opt_pass * 2105 1.1 mrg make_pass_loop_prefetch (gcc::context *ctxt) 2106 1.1 mrg { 2107 1.1 mrg return new pass_loop_prefetch (ctxt); 2108 1.1 mrg } 2109 1.1 mrg 2110 1.1 mrg 2111