Home | History | Annotate | Line # | Download | only in gcc
      1  1.1  mrg /* Array prefetching.
      2  1.1  mrg    Copyright (C) 2005-2022 Free Software Foundation, Inc.
      3  1.1  mrg 
      4  1.1  mrg This file is part of GCC.
      5  1.1  mrg 
      6  1.1  mrg GCC is free software; you can redistribute it and/or modify it
      7  1.1  mrg under the terms of the GNU General Public License as published by the
      8  1.1  mrg Free Software Foundation; either version 3, or (at your option) any
      9  1.1  mrg later version.
     10  1.1  mrg 
     11  1.1  mrg GCC is distributed in the hope that it will be useful, but WITHOUT
     12  1.1  mrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
     13  1.1  mrg FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
     14  1.1  mrg for more details.
     15  1.1  mrg 
     16  1.1  mrg You should have received a copy of the GNU General Public License
     17  1.1  mrg along with GCC; see the file COPYING3.  If not see
     18  1.1  mrg <http://www.gnu.org/licenses/>.  */
     19  1.1  mrg 
     20  1.1  mrg #include "config.h"
     21  1.1  mrg #include "system.h"
     22  1.1  mrg #include "coretypes.h"
     23  1.1  mrg #include "backend.h"
     24  1.1  mrg #include "target.h"
     25  1.1  mrg #include "rtl.h"
     26  1.1  mrg #include "tree.h"
     27  1.1  mrg #include "gimple.h"
     28  1.1  mrg #include "predict.h"
     29  1.1  mrg #include "tree-pass.h"
     30  1.1  mrg #include "gimple-ssa.h"
     31  1.1  mrg #include "optabs-query.h"
     32  1.1  mrg #include "tree-pretty-print.h"
     33  1.1  mrg #include "fold-const.h"
     34  1.1  mrg #include "stor-layout.h"
     35  1.1  mrg #include "gimplify.h"
     36  1.1  mrg #include "gimple-iterator.h"
     37  1.1  mrg #include "gimplify-me.h"
     38  1.1  mrg #include "tree-ssa-loop-ivopts.h"
     39  1.1  mrg #include "tree-ssa-loop-manip.h"
     40  1.1  mrg #include "tree-ssa-loop-niter.h"
     41  1.1  mrg #include "tree-ssa-loop.h"
     42  1.1  mrg #include "ssa.h"
     43  1.1  mrg #include "tree-into-ssa.h"
     44  1.1  mrg #include "cfgloop.h"
     45  1.1  mrg #include "tree-scalar-evolution.h"
     46  1.1  mrg #include "langhooks.h"
     47  1.1  mrg #include "tree-inline.h"
     48  1.1  mrg #include "tree-data-ref.h"
     49  1.1  mrg #include "diagnostic-core.h"
     50  1.1  mrg #include "dbgcnt.h"
     51  1.1  mrg 
     52  1.1  mrg /* This pass inserts prefetch instructions to optimize cache usage during
     53  1.1  mrg    accesses to arrays in loops.  It processes loops sequentially and:
     54  1.1  mrg 
     55  1.1  mrg    1) Gathers all memory references in the single loop.
     56  1.1  mrg    2) For each of the references it decides when it is profitable to prefetch
      it.  To do it, we evaluate the reuse among the accesses, and determine
     58  1.1  mrg       two values: PREFETCH_BEFORE (meaning that it only makes sense to do
     59  1.1  mrg       prefetching in the first PREFETCH_BEFORE iterations of the loop) and
     60  1.1  mrg       PREFETCH_MOD (meaning that it only makes sense to prefetch in the
     61  1.1  mrg       iterations of the loop that are zero modulo PREFETCH_MOD).  For example
     62  1.1  mrg       (assuming cache line size is 64 bytes, char has size 1 byte and there
     63  1.1  mrg       is no hardware sequential prefetch):
     64  1.1  mrg 
     65  1.1  mrg       char *a;
     66  1.1  mrg       for (i = 0; i < max; i++)
     67  1.1  mrg 	{
     68  1.1  mrg 	  a[255] = ...;		(0)
     69  1.1  mrg 	  a[i] = ...;		(1)
     70  1.1  mrg 	  a[i + 64] = ...;	(2)
     71  1.1  mrg 	  a[16*i] = ...;	(3)
     72  1.1  mrg 	  a[187*i] = ...;	(4)
     73  1.1  mrg 	  a[187*i + 50] = ...;	(5)
     74  1.1  mrg 	}
     75  1.1  mrg 
     76  1.1  mrg        (0) obviously has PREFETCH_BEFORE 1
     77  1.1  mrg        (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
     78  1.1  mrg            location 64 iterations before it, and PREFETCH_MOD 64 (since
     79  1.1  mrg 	   it hits the same cache line otherwise).
     80  1.1  mrg        (2) has PREFETCH_MOD 64
     81  1.1  mrg        (3) has PREFETCH_MOD 4
     82  1.1  mrg        (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
     83  1.1  mrg            the cache line accessed by (5) is the same with probability only
     84  1.1  mrg 	   7/32.
     85  1.1  mrg        (5) has PREFETCH_MOD 1 as well.
     86  1.1  mrg 
     87  1.1  mrg       Additionally, we use data dependence analysis to determine for each
     88  1.1  mrg       reference the distance till the first reuse; this information is used
     89  1.1  mrg       to determine the temporality of the issued prefetch instruction.
     90  1.1  mrg 
     91  1.1  mrg    3) We determine how much ahead we need to prefetch.  The number of
     92  1.1  mrg       iterations needed is time to fetch / time spent in one iteration of
     93  1.1  mrg       the loop.  The problem is that we do not know either of these values,
     94  1.1  mrg       so we just make a heuristic guess based on a magic (possibly)
     95  1.1  mrg       target-specific constant and size of the loop.
     96  1.1  mrg 
     97  1.1  mrg    4) Determine which of the references we prefetch.  We take into account
     98  1.1  mrg       that there is a maximum number of simultaneous prefetches (provided
     99  1.1  mrg       by machine description).  We prefetch as many prefetches as possible
    100  1.1  mrg       while still within this bound (starting with those with lowest
    101  1.1  mrg       prefetch_mod, since they are responsible for most of the cache
    102  1.1  mrg       misses).
    103  1.1  mrg 
    104  1.1  mrg    5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
    105  1.1  mrg       and PREFETCH_BEFORE requirements (within some bounds), and to avoid
    106  1.1  mrg       prefetching nonaccessed memory.
    107  1.1  mrg       TODO -- actually implement peeling.
    108  1.1  mrg 
    109  1.1  mrg    6) We actually emit the prefetch instructions.  ??? Perhaps emit the
    110  1.1  mrg       prefetch instructions with guards in cases where 5) was not sufficient
    111  1.1  mrg       to satisfy the constraints?
    112  1.1  mrg 
    113  1.1  mrg    A cost model is implemented to determine whether or not prefetching is
    114  1.1  mrg    profitable for a given loop.  The cost model has three heuristics:
    115  1.1  mrg 
    116  1.1  mrg    1. Function trip_count_to_ahead_ratio_too_small_p implements a
    117  1.1  mrg       heuristic that determines whether or not the loop has too few
    118  1.1  mrg       iterations (compared to ahead).  Prefetching is not likely to be
    119  1.1  mrg       beneficial if the trip count to ahead ratio is below a certain
    120  1.1  mrg       minimum.
    121  1.1  mrg 
    122  1.1  mrg    2. Function mem_ref_count_reasonable_p implements a heuristic that
    123  1.1  mrg       determines whether the given loop has enough CPU ops that can be
    124  1.1  mrg       overlapped with cache missing memory ops.  If not, the loop
    125  1.1  mrg       won't benefit from prefetching.  In the implementation,
    126  1.1  mrg       prefetching is not considered beneficial if the ratio between
    127  1.1  mrg       the instruction count and the mem ref count is below a certain
    128  1.1  mrg       minimum.
    129  1.1  mrg 
    130  1.1  mrg    3. Function insn_to_prefetch_ratio_too_small_p implements a
    131  1.1  mrg       heuristic that disables prefetching in a loop if the prefetching
    132  1.1  mrg       cost is above a certain limit.  The relative prefetching cost is
    133  1.1  mrg       estimated by taking the ratio between the prefetch count and the
      total instruction count (this models the I-cache cost).
    135  1.1  mrg 
    136  1.1  mrg    The limits used in these heuristics are defined as parameters with
    137  1.1  mrg    reasonable default values. Machine-specific default values will be
    138  1.1  mrg    added later.
    139  1.1  mrg 
    140  1.1  mrg    Some other TODO:
    141  1.1  mrg       -- write and use more general reuse analysis (that could be also used
    142  1.1  mrg 	 in other cache aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user has inserted their own prefetches)
    146  1.1  mrg       -- we assume cache line size alignment of arrays; this could be
    147  1.1  mrg 	 improved.  */
    148  1.1  mrg 
    149  1.1  mrg /* Magic constants follow.  These should be replaced by machine specific
    150  1.1  mrg    numbers.  */
    151  1.1  mrg 
    152  1.1  mrg /* True if write can be prefetched by a read prefetch.  */
    153  1.1  mrg 
    154  1.1  mrg #ifndef WRITE_CAN_USE_READ_PREFETCH
    155  1.1  mrg #define WRITE_CAN_USE_READ_PREFETCH 1
    156  1.1  mrg #endif
    157  1.1  mrg 
    158  1.1  mrg /* True if read can be prefetched by a write prefetch. */
    159  1.1  mrg 
    160  1.1  mrg #ifndef READ_CAN_USE_WRITE_PREFETCH
    161  1.1  mrg #define READ_CAN_USE_WRITE_PREFETCH 0
    162  1.1  mrg #endif
    163  1.1  mrg 
    164  1.1  mrg /* The size of the block loaded by a single prefetch.  Usually, this is
    165  1.1  mrg    the same as cache line size (at the moment, we only consider one level
    166  1.1  mrg    of cache hierarchy).  */
    167  1.1  mrg 
    168  1.1  mrg #ifndef PREFETCH_BLOCK
    169  1.1  mrg #define PREFETCH_BLOCK param_l1_cache_line_size
    170  1.1  mrg #endif
    171  1.1  mrg 
    172  1.1  mrg /* Do we have a forward hardware sequential prefetching?  */
    173  1.1  mrg 
    174  1.1  mrg #ifndef HAVE_FORWARD_PREFETCH
    175  1.1  mrg #define HAVE_FORWARD_PREFETCH 0
    176  1.1  mrg #endif
    177  1.1  mrg 
    178  1.1  mrg /* Do we have a backward hardware sequential prefetching?  */
    179  1.1  mrg 
    180  1.1  mrg #ifndef HAVE_BACKWARD_PREFETCH
    181  1.1  mrg #define HAVE_BACKWARD_PREFETCH 0
    182  1.1  mrg #endif
    183  1.1  mrg 
    184  1.1  mrg /* In some cases we are only able to determine that there is a certain
    185  1.1  mrg    probability that the two accesses hit the same cache line.  In this
    186  1.1  mrg    case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */
    188  1.1  mrg 
    189  1.1  mrg #ifndef ACCEPTABLE_MISS_RATE
    190  1.1  mrg #define ACCEPTABLE_MISS_RATE 50
    191  1.1  mrg #endif
    192  1.1  mrg 
    193  1.1  mrg #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024))
    194  1.1  mrg #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024))
    195  1.1  mrg 
    196  1.1  mrg /* We consider a memory access nontemporal if it is not reused sooner than
    197  1.1  mrg    after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
    198  1.1  mrg    accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
    199  1.1  mrg    so that we use nontemporal prefetches e.g. if single memory location
    200  1.1  mrg    is accessed several times in a single iteration of the loop.  */
    201  1.1  mrg #define NONTEMPORAL_FRACTION 16
    202  1.1  mrg 
    203  1.1  mrg /* In case we have to emit a memory fence instruction after the loop that
    204  1.1  mrg    uses nontemporal stores, this defines the builtin to use.  */
    205  1.1  mrg 
    206  1.1  mrg #ifndef FENCE_FOLLOWING_MOVNT
    207  1.1  mrg #define FENCE_FOLLOWING_MOVNT NULL_TREE
    208  1.1  mrg #endif
    209  1.1  mrg 
    210  1.1  mrg /* It is not profitable to prefetch when the trip count is not at
    211  1.1  mrg    least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
    212  1.1  mrg    For example, in a loop with a prefetch ahead distance of 10,
    213  1.1  mrg    supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
    214  1.1  mrg    profitable to prefetch when the trip count is greater or equal to
    215  1.1  mrg    40.  In that case, 30 out of the 40 iterations will benefit from
    216  1.1  mrg    prefetching.  */
    217  1.1  mrg 
    218  1.1  mrg #ifndef TRIP_COUNT_TO_AHEAD_RATIO
    219  1.1  mrg #define TRIP_COUNT_TO_AHEAD_RATIO 4
    220  1.1  mrg #endif
    221  1.1  mrg 
    222  1.1  mrg /* The group of references between that reuse may occur.  */
    223  1.1  mrg 
struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  tree step;			/* Step of the reference; an integer constant
				   or a loop-invariant expression.  */
  struct mem_ref *refs;		/* Singly-linked list of references in the
				   group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
  unsigned int uid;		/* Group UID, used only for debugging.  */
};
    232  1.1  mrg 
    233  1.1  mrg /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */
    234  1.1  mrg 
    235  1.1  mrg #define PREFETCH_ALL		HOST_WIDE_INT_M1U
    236  1.1  mrg 
    237  1.1  mrg /* Do not generate a prefetch if the unroll factor is significantly less
    238  1.1  mrg    than what is required by the prefetch.  This is to avoid redundant
    239  1.1  mrg    prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
    240  1.1  mrg    2, prefetching requires unrolling the loop 16 times, but
    241  1.1  mrg    the loop is actually unrolled twice.  In this case (ratio = 8),
    242  1.1  mrg    prefetching is not likely to be beneficial.  */
    243  1.1  mrg 
    244  1.1  mrg #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
    245  1.1  mrg #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
    246  1.1  mrg #endif
    247  1.1  mrg 
    248  1.1  mrg /* Some of the prefetch computations have quadratic complexity.  We want to
    249  1.1  mrg    avoid huge compile times and, therefore, want to limit the amount of
    250  1.1  mrg    memory references per loop where we consider prefetching.  */
    251  1.1  mrg 
    252  1.1  mrg #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
    253  1.1  mrg #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
    254  1.1  mrg #endif
    255  1.1  mrg 
    256  1.1  mrg /* The memory reference.  */
    257  1.1  mrg 
struct mem_ref
{
  gimple *stmt;			/* Statement in that the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations; PREFETCH_ALL means every
				   iteration.  */
  unsigned reuse_distance;	/* The amount of data accessed before the first
				   reuse of this value.  */
  struct mem_ref *next;		/* The next reference in the group.  */
  unsigned int uid;		/* Ref UID, used only for debugging.  */
  unsigned write_p : 1;		/* Is it a write?  */
  unsigned independent_p : 1;	/* True if the reference is independent on
				   all other references inside the loop.  */
  unsigned issue_prefetch_p : 1;	/* Should we really issue the prefetch?  */
  unsigned storent_p : 1;	/* True if we changed the store to a
				   nontemporal one.  */
};
    281  1.1  mrg 
    282  1.1  mrg /* Dumps information about memory reference */
    283  1.1  mrg static void
    284  1.1  mrg dump_mem_details (FILE *file, tree base, tree step,
    285  1.1  mrg 	    HOST_WIDE_INT delta, bool write_p)
    286  1.1  mrg {
    287  1.1  mrg   fprintf (file, "(base ");
    288  1.1  mrg   print_generic_expr (file, base, TDF_SLIM);
    289  1.1  mrg   fprintf (file, ", step ");
    290  1.1  mrg   if (cst_and_fits_in_hwi (step))
    291  1.1  mrg     fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
    292  1.1  mrg   else
    293  1.1  mrg     print_generic_expr (file, step, TDF_SLIM);
    294  1.1  mrg   fprintf (file, ")\n");
    295  1.1  mrg   fprintf (file, "  delta " HOST_WIDE_INT_PRINT_DEC "\n", delta);
    296  1.1  mrg   fprintf (file, "  %s\n\n", write_p ? "write" : "read");
    297  1.1  mrg }
    298  1.1  mrg 
    299  1.1  mrg /* Dumps information about reference REF to FILE.  */
    300  1.1  mrg 
    301  1.1  mrg static void
    302  1.1  mrg dump_mem_ref (FILE *file, struct mem_ref *ref)
    303  1.1  mrg {
    304  1.1  mrg   fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid);
    305  1.1  mrg   print_generic_expr (file, ref->mem, TDF_SLIM);
    306  1.1  mrg   fprintf (file, ")\n");
    307  1.1  mrg }
    308  1.1  mrg 
    309  1.1  mrg /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
    310  1.1  mrg    exist.  */
    311  1.1  mrg 
    312  1.1  mrg static struct mem_ref_group *
    313  1.1  mrg find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
    314  1.1  mrg {
    315  1.1  mrg   /* Global count for setting struct mem_ref_group->uid.  */
    316  1.1  mrg   static unsigned int last_mem_ref_group_uid = 0;
    317  1.1  mrg 
    318  1.1  mrg   struct mem_ref_group *group;
    319  1.1  mrg 
    320  1.1  mrg   for (; *groups; groups = &(*groups)->next)
    321  1.1  mrg     {
    322  1.1  mrg       if (operand_equal_p ((*groups)->step, step, 0)
    323  1.1  mrg 	  && operand_equal_p ((*groups)->base, base, 0))
    324  1.1  mrg 	return *groups;
    325  1.1  mrg 
    326  1.1  mrg       /* If step is an integer constant, keep the list of groups sorted
    327  1.1  mrg          by decreasing step.  */
    328  1.1  mrg       if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
    329  1.1  mrg 	  && int_cst_value ((*groups)->step) < int_cst_value (step))
    330  1.1  mrg 	break;
    331  1.1  mrg     }
    332  1.1  mrg 
    333  1.1  mrg   group = XNEW (struct mem_ref_group);
    334  1.1  mrg   group->base = base;
    335  1.1  mrg   group->step = step;
    336  1.1  mrg   group->refs = NULL;
    337  1.1  mrg   group->uid = ++last_mem_ref_group_uid;
    338  1.1  mrg   group->next = *groups;
    339  1.1  mrg   *groups = group;
    340  1.1  mrg 
    341  1.1  mrg   return group;
    342  1.1  mrg }
    343  1.1  mrg 
    344  1.1  mrg /* Records a memory reference MEM in GROUP with offset DELTA and write status
    345  1.1  mrg    WRITE_P.  The reference occurs in statement STMT.  */
    346  1.1  mrg 
    347  1.1  mrg static void
    348  1.1  mrg record_ref (struct mem_ref_group *group, gimple *stmt, tree mem,
    349  1.1  mrg 	    HOST_WIDE_INT delta, bool write_p)
    350  1.1  mrg {
    351  1.1  mrg   unsigned int last_mem_ref_uid = 0;
    352  1.1  mrg   struct mem_ref **aref;
    353  1.1  mrg 
    354  1.1  mrg   /* Do not record the same address twice.  */
    355  1.1  mrg   for (aref = &group->refs; *aref; aref = &(*aref)->next)
    356  1.1  mrg     {
    357  1.1  mrg       last_mem_ref_uid = (*aref)->uid;
    358  1.1  mrg 
    359  1.1  mrg       /* It does not have to be possible for write reference to reuse the read
    360  1.1  mrg 	 prefetch, or vice versa.  */
    361  1.1  mrg       if (!WRITE_CAN_USE_READ_PREFETCH
    362  1.1  mrg 	  && write_p
    363  1.1  mrg 	  && !(*aref)->write_p)
    364  1.1  mrg 	continue;
    365  1.1  mrg       if (!READ_CAN_USE_WRITE_PREFETCH
    366  1.1  mrg 	  && !write_p
    367  1.1  mrg 	  && (*aref)->write_p)
    368  1.1  mrg 	continue;
    369  1.1  mrg 
    370  1.1  mrg       if ((*aref)->delta == delta)
    371  1.1  mrg 	return;
    372  1.1  mrg     }
    373  1.1  mrg 
    374  1.1  mrg   (*aref) = XNEW (struct mem_ref);
    375  1.1  mrg   (*aref)->stmt = stmt;
    376  1.1  mrg   (*aref)->mem = mem;
    377  1.1  mrg   (*aref)->delta = delta;
    378  1.1  mrg   (*aref)->write_p = write_p;
    379  1.1  mrg   (*aref)->prefetch_before = PREFETCH_ALL;
    380  1.1  mrg   (*aref)->prefetch_mod = 1;
    381  1.1  mrg   (*aref)->reuse_distance = 0;
    382  1.1  mrg   (*aref)->issue_prefetch_p = false;
    383  1.1  mrg   (*aref)->group = group;
    384  1.1  mrg   (*aref)->next = NULL;
    385  1.1  mrg   (*aref)->independent_p = false;
    386  1.1  mrg   (*aref)->storent_p = false;
    387  1.1  mrg   (*aref)->uid = last_mem_ref_uid + 1;
    388  1.1  mrg 
    389  1.1  mrg   if (dump_file && (dump_flags & TDF_DETAILS))
    390  1.1  mrg     {
    391  1.1  mrg       dump_mem_ref (dump_file, *aref);
    392  1.1  mrg 
    393  1.1  mrg       fprintf (dump_file, "  group %u ", group->uid);
    394  1.1  mrg       dump_mem_details (dump_file, group->base, group->step, delta,
    395  1.1  mrg 			write_p);
    396  1.1  mrg     }
    397  1.1  mrg }
    398  1.1  mrg 
    399  1.1  mrg /* Release memory references in GROUPS.  */
    400  1.1  mrg 
    401  1.1  mrg static void
    402  1.1  mrg release_mem_refs (struct mem_ref_group *groups)
    403  1.1  mrg {
    404  1.1  mrg   struct mem_ref_group *next_g;
    405  1.1  mrg   struct mem_ref *ref, *next_r;
    406  1.1  mrg 
    407  1.1  mrg   for (; groups; groups = next_g)
    408  1.1  mrg     {
    409  1.1  mrg       next_g = groups->next;
    410  1.1  mrg       for (ref = groups->refs; ref; ref = next_r)
    411  1.1  mrg 	{
    412  1.1  mrg 	  next_r = ref->next;
    413  1.1  mrg 	  free (ref);
    414  1.1  mrg 	}
    415  1.1  mrg       free (groups);
    416  1.1  mrg     }
    417  1.1  mrg }
    418  1.1  mrg 
/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  class loop *loop;			/* Loop of the reference.  */
  gimple *stmt;				/* Statement of the reference.  */
  tree *step;				/* Step of the memory reference;
					   accumulated over all indices.  */
  HOST_WIDE_INT *delta;			/* Offset of the memory reference;
					   accumulated over all indices.  */
};
    428  1.1  mrg 
/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  Returns false
   if the index cannot be expressed in the required affine form.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = (struct ar_data *) data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT idelta = 0, imult = 1;
  affine_iv iv;

  /* The index must be a simple affine induction variable of the loop,
     otherwise the reference cannot be expressed as BASE + STEP * iter
     + DELTA.  */
  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
		  *index, &iv, true))
    return false;
  ibase = iv.base;
  step = iv.step;

  /* Split a constant pointer adjustment off the IV base into the delta.  */
  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  /* A fully constant base also moves into the delta, leaving a zero base.  */
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  /* For an array index, scale both the step and the delta by the element
     size; bail out if the element size is not a known constant.  */
  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);
      step = fold_build2 (MULT_EXPR, sizetype,
			  fold_convert (sizetype, step),
			  fold_convert (sizetype, stepsize));
      idelta *= imult;
    }

  /* Accumulate this index's contribution into the overall step and delta,
     and replace the index by its (possibly zeroed) base.  */
  if (*ar_data->step == NULL_TREE)
    *ar_data->step = step;
  else
    *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
				  fold_convert (sizetype, *ar_data->step),
				  fold_convert (sizetype, step));
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}
    481  1.1  mrg 
/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
   STEP are integer constants and iter is number of iterations of LOOP.  The
   reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  Returns false if the reference cannot be put in
   this form.  */

static bool
analyze_ref (class loop *loop, tree *ref_p, tree *base,
	     tree *step, HOST_WIDE_INT *delta,
	     gimple *stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = NULL_TREE;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.
     Also strip off the real and imagine parts of a complex, so that
     they can have the same base.  */
  if (TREE_CODE (ref) == REALPART_EXPR
      || TREE_CODE (ref) == IMAGPART_EXPR
      || (TREE_CODE (ref) == COMPONENT_REF
          && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
    {
      /* Account for the offset of the imaginary part within the complex
	 value.  */
      if (TREE_CODE (ref) == IMAGPART_EXPR)
        *delta += int_size_in_bytes (TREE_TYPE (ref));
      ref = TREE_OPERAND (ref, 0);
    }

  *ref_p = ref;

  /* Fold the byte offset of each remaining COMPONENT_REF field into
     DELTA.  The offsets are asserted to be whole bytes (bitfields are
     not expected here).  */
  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  /* Let idx_analyze_ref accumulate the step and delta contributions of
     every index of the reference.  */
  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}
    531  1.1  mrg 
    532  1.1  mrg /* Record a memory reference REF to the list REFS.  The reference occurs in
    533  1.1  mrg    LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
    534  1.1  mrg    reference was recorded, false otherwise.  */
    535  1.1  mrg 
    536  1.1  mrg static bool
    537  1.1  mrg gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs,
    538  1.1  mrg 			      tree ref, bool write_p, gimple *stmt)
    539  1.1  mrg {
    540  1.1  mrg   tree base, step;
    541  1.1  mrg   HOST_WIDE_INT delta;
    542  1.1  mrg   struct mem_ref_group *agrp;
    543  1.1  mrg 
    544  1.1  mrg   if (get_base_address (ref) == NULL)
    545  1.1  mrg     return false;
    546  1.1  mrg 
    547  1.1  mrg   if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    548  1.1  mrg     return false;
    549  1.1  mrg   /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
    550  1.1  mrg   if (step == NULL_TREE)
    551  1.1  mrg     return false;
    552  1.1  mrg 
    553  1.1  mrg   /* Stop if the address of BASE could not be taken.  */
    554  1.1  mrg   if (may_be_nonaddressable_p (base))
    555  1.1  mrg     return false;
    556  1.1  mrg 
    557  1.1  mrg   /* Limit non-constant step prefetching only to the innermost loops and
    558  1.1  mrg      only when the step is loop invariant in the entire loop nest. */
    559  1.1  mrg   if (!cst_and_fits_in_hwi (step))
    560  1.1  mrg     {
    561  1.1  mrg       if (loop->inner != NULL)
    562  1.1  mrg         {
    563  1.1  mrg           if (dump_file && (dump_flags & TDF_DETAILS))
    564  1.1  mrg             {
    565  1.1  mrg               fprintf (dump_file, "Memory expression %p\n",(void *) ref );
    566  1.1  mrg 	      print_generic_expr (dump_file, ref, TDF_SLIM);
    567  1.1  mrg 	      fprintf (dump_file,":");
    568  1.1  mrg               dump_mem_details (dump_file, base, step, delta, write_p);
    569  1.1  mrg               fprintf (dump_file,
    570  1.1  mrg                        "Ignoring %p, non-constant step prefetching is "
    571  1.1  mrg                        "limited to inner most loops \n",
    572  1.1  mrg                        (void *) ref);
    573  1.1  mrg             }
    574  1.1  mrg             return false;
    575  1.1  mrg          }
    576  1.1  mrg       else
    577  1.1  mrg         {
    578  1.1  mrg           if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
    579  1.1  mrg           {
    580  1.1  mrg             if (dump_file && (dump_flags & TDF_DETAILS))
    581  1.1  mrg               {
    582  1.1  mrg                 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
    583  1.1  mrg 		print_generic_expr (dump_file, ref, TDF_SLIM);
    584  1.1  mrg                 fprintf (dump_file,":");
    585  1.1  mrg                 dump_mem_details (dump_file, base, step, delta, write_p);
    586  1.1  mrg                 fprintf (dump_file,
    587  1.1  mrg                          "Not prefetching, ignoring %p due to "
    588  1.1  mrg                          "loop variant step\n",
    589  1.1  mrg                          (void *) ref);
    590  1.1  mrg               }
    591  1.1  mrg               return false;
    592  1.1  mrg             }
    593  1.1  mrg         }
    594  1.1  mrg     }
    595  1.1  mrg 
    596  1.1  mrg   /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
    597  1.1  mrg      are integer constants.  */
    598  1.1  mrg   agrp = find_or_create_group (refs, base, step);
    599  1.1  mrg   record_ref (agrp, stmt, ref, delta, write_p);
    600  1.1  mrg 
    601  1.1  mrg   return true;
    602  1.1  mrg }
    603  1.1  mrg 
    604  1.1  mrg /* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
    605  1.1  mrg    true if there are no other memory references inside the loop.  */
    606  1.1  mrg 
    607  1.1  mrg static struct mem_ref_group *
    608  1.1  mrg gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_count)
    609  1.1  mrg {
    610  1.1  mrg   basic_block *body = get_loop_body_in_dom_order (loop);
    611  1.1  mrg   basic_block bb;
    612  1.1  mrg   unsigned i;
    613  1.1  mrg   gimple_stmt_iterator bsi;
    614  1.1  mrg   gimple *stmt;
    615  1.1  mrg   tree lhs, rhs;
    616  1.1  mrg   struct mem_ref_group *refs = NULL;
    617  1.1  mrg 
    618  1.1  mrg   *no_other_refs = true;
    619  1.1  mrg   *ref_count = 0;
    620  1.1  mrg 
    621  1.1  mrg   /* Scan the loop body in order, so that the former references precede the
    622  1.1  mrg      later ones.  */
    623  1.1  mrg   for (i = 0; i < loop->num_nodes; i++)
    624  1.1  mrg     {
    625  1.1  mrg       bb = body[i];
    626  1.1  mrg       if (bb->loop_father != loop)
    627  1.1  mrg 	continue;
    628  1.1  mrg 
    629  1.1  mrg       for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
    630  1.1  mrg 	{
    631  1.1  mrg 	  stmt = gsi_stmt (bsi);
    632  1.1  mrg 
    633  1.1  mrg 	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
    634  1.1  mrg 	    {
    635  1.1  mrg 	      if (gimple_vuse (stmt)
    636  1.1  mrg 		  || (is_gimple_call (stmt)
    637  1.1  mrg 		      && !(gimple_call_flags (stmt) & ECF_CONST)))
    638  1.1  mrg 		*no_other_refs = false;
    639  1.1  mrg 	      continue;
    640  1.1  mrg 	    }
    641  1.1  mrg 
    642  1.1  mrg 	  if (! gimple_vuse (stmt))
    643  1.1  mrg 	    continue;
    644  1.1  mrg 
    645  1.1  mrg 	  lhs = gimple_assign_lhs (stmt);
    646  1.1  mrg 	  rhs = gimple_assign_rhs1 (stmt);
    647  1.1  mrg 
    648  1.1  mrg 	  if (REFERENCE_CLASS_P (rhs))
    649  1.1  mrg 	    {
    650  1.1  mrg 	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
    651  1.1  mrg 							    rhs, false, stmt);
    652  1.1  mrg 	    *ref_count += 1;
    653  1.1  mrg 	    }
    654  1.1  mrg 	  if (REFERENCE_CLASS_P (lhs))
    655  1.1  mrg 	    {
    656  1.1  mrg 	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
    657  1.1  mrg 							    lhs, true, stmt);
    658  1.1  mrg 	    *ref_count += 1;
    659  1.1  mrg 	    }
    660  1.1  mrg 	}
    661  1.1  mrg     }
    662  1.1  mrg   free (body);
    663  1.1  mrg 
    664  1.1  mrg   return refs;
    665  1.1  mrg }
    666  1.1  mrg 
    667  1.1  mrg /* Prune the prefetch candidate REF using the self-reuse.  */
    668  1.1  mrg 
    669  1.1  mrg static void
    670  1.1  mrg prune_ref_by_self_reuse (struct mem_ref *ref)
    671  1.1  mrg {
    672  1.1  mrg   HOST_WIDE_INT step;
    673  1.1  mrg   bool backward;
    674  1.1  mrg 
    675  1.1  mrg   /* If the step size is non constant, we cannot calculate prefetch_mod.  */
    676  1.1  mrg   if (!cst_and_fits_in_hwi (ref->group->step))
    677  1.1  mrg     return;
    678  1.1  mrg 
    679  1.1  mrg   step = int_cst_value (ref->group->step);
    680  1.1  mrg 
    681  1.1  mrg   backward = step < 0;
    682  1.1  mrg 
    683  1.1  mrg   if (step == 0)
    684  1.1  mrg     {
    685  1.1  mrg       /* Prefetch references to invariant address just once.  */
    686  1.1  mrg       ref->prefetch_before = 1;
    687  1.1  mrg       return;
    688  1.1  mrg     }
    689  1.1  mrg 
    690  1.1  mrg   if (backward)
    691  1.1  mrg     step = -step;
    692  1.1  mrg 
    693  1.1  mrg   if (step > PREFETCH_BLOCK)
    694  1.1  mrg     return;
    695  1.1  mrg 
    696  1.1  mrg   if ((backward && HAVE_BACKWARD_PREFETCH)
    697  1.1  mrg       || (!backward && HAVE_FORWARD_PREFETCH))
    698  1.1  mrg     {
    699  1.1  mrg       ref->prefetch_before = 1;
    700  1.1  mrg       return;
    701  1.1  mrg     }
    702  1.1  mrg 
    703  1.1  mrg   ref->prefetch_mod = PREFETCH_BLOCK / step;
    704  1.1  mrg }
    705  1.1  mrg 
    706  1.1  mrg /* Divides X by BY, rounding down.  */
    707  1.1  mrg 
    708  1.1  mrg static HOST_WIDE_INT
    709  1.1  mrg ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
    710  1.1  mrg {
    711  1.1  mrg   gcc_assert (by > 0);
    712  1.1  mrg 
    713  1.1  mrg   if (x >= 0)
    714  1.1  mrg     return x / (HOST_WIDE_INT) by;
    715  1.1  mrg   else
    716  1.1  mrg     return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by;
    717  1.1  mrg }
    718  1.1  mrg 
    719  1.1  mrg /* Given a CACHE_LINE_SIZE and two inductive memory references
    720  1.1  mrg    with a common STEP greater than CACHE_LINE_SIZE and an address
    721  1.1  mrg    difference DELTA, compute the probability that they will fall
    722  1.1  mrg    in different cache lines.  Return true if the computed miss rate
    723  1.1  mrg    is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
    724  1.1  mrg    number of distinct iterations after which the pattern repeats itself.
    725  1.1  mrg    ALIGN_UNIT is the unit of alignment in bytes.  */
    726  1.1  mrg 
    727  1.1  mrg static bool
    728  1.1  mrg is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
    729  1.1  mrg 		   HOST_WIDE_INT step, HOST_WIDE_INT delta,
    730  1.1  mrg 		   unsigned HOST_WIDE_INT distinct_iters,
    731  1.1  mrg 		   int align_unit)
    732  1.1  mrg {
    733  1.1  mrg   unsigned align, iter;
    734  1.1  mrg   int total_positions, miss_positions, max_allowed_miss_positions;
    735  1.1  mrg   int address1, address2, cache_line1, cache_line2;
    736  1.1  mrg 
    737  1.1  mrg   /* It always misses if delta is greater than or equal to the cache
    738  1.1  mrg      line size.  */
    739  1.1  mrg   if (delta >= (HOST_WIDE_INT) cache_line_size)
    740  1.1  mrg     return false;
    741  1.1  mrg 
    742  1.1  mrg   gcc_assert (align_unit > 0);
    743  1.1  mrg 
    744  1.1  mrg   miss_positions = 0;
    745  1.1  mrg   total_positions = (cache_line_size / align_unit) * distinct_iters;
    746  1.1  mrg   max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
    747  1.1  mrg 
    748  1.1  mrg   /* Iterate through all possible alignments of the first
    749  1.1  mrg      memory reference within its cache line.  */
    750  1.1  mrg   for (align = 0; align < cache_line_size; align += align_unit)
    751  1.1  mrg 
    752  1.1  mrg     /* Iterate through all distinct iterations.  */
    753  1.1  mrg     for (iter = 0; iter < distinct_iters; iter++)
    754  1.1  mrg       {
    755  1.1  mrg 	address1 = align + step * iter;
    756  1.1  mrg 	address2 = address1 + delta;
    757  1.1  mrg 	cache_line1 = address1 / cache_line_size;
    758  1.1  mrg 	cache_line2 = address2 / cache_line_size;
    759  1.1  mrg 	if (cache_line1 != cache_line2)
    760  1.1  mrg 	  {
    761  1.1  mrg 	    miss_positions += 1;
    762  1.1  mrg             if (miss_positions > max_allowed_miss_positions)
    763  1.1  mrg 	      return false;
    764  1.1  mrg           }
    765  1.1  mrg       }
    766  1.1  mrg   return true;
    767  1.1  mrg }
    768  1.1  mrg 
    769  1.1  mrg /* Prune the prefetch candidate REF using the reuse with BY.
    770  1.1  mrg    If BY_IS_BEFORE is true, BY is before REF in the loop.  */
    771  1.1  mrg 
    772  1.1  mrg static void
    773  1.1  mrg prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
    774  1.1  mrg 			  bool by_is_before)
    775  1.1  mrg {
    776  1.1  mrg   HOST_WIDE_INT step;
    777  1.1  mrg   bool backward;
    778  1.1  mrg   HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
    779  1.1  mrg   HOST_WIDE_INT delta = delta_b - delta_r;
    780  1.1  mrg   HOST_WIDE_INT hit_from;
    781  1.1  mrg   unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
    782  1.1  mrg   HOST_WIDE_INT reduced_step;
    783  1.1  mrg   unsigned HOST_WIDE_INT reduced_prefetch_block;
    784  1.1  mrg   tree ref_type;
    785  1.1  mrg   int align_unit;
    786  1.1  mrg 
    787  1.1  mrg   /* If the step is non constant we cannot calculate prefetch_before.  */
    788  1.1  mrg   if (!cst_and_fits_in_hwi (ref->group->step)) {
    789  1.1  mrg     return;
    790  1.1  mrg   }
    791  1.1  mrg 
    792  1.1  mrg   step = int_cst_value (ref->group->step);
    793  1.1  mrg 
    794  1.1  mrg   backward = step < 0;
    795  1.1  mrg 
    796  1.1  mrg 
    797  1.1  mrg   if (delta == 0)
    798  1.1  mrg     {
    799  1.1  mrg       /* If the references has the same address, only prefetch the
    800  1.1  mrg 	 former.  */
    801  1.1  mrg       if (by_is_before)
    802  1.1  mrg 	ref->prefetch_before = 0;
    803  1.1  mrg 
    804  1.1  mrg       return;
    805  1.1  mrg     }
    806  1.1  mrg 
    807  1.1  mrg   if (!step)
    808  1.1  mrg     {
    809  1.1  mrg       /* If the reference addresses are invariant and fall into the
    810  1.1  mrg 	 same cache line, prefetch just the first one.  */
    811  1.1  mrg       if (!by_is_before)
    812  1.1  mrg 	return;
    813  1.1  mrg 
    814  1.1  mrg       if (ddown (ref->delta, PREFETCH_BLOCK)
    815  1.1  mrg 	  != ddown (by->delta, PREFETCH_BLOCK))
    816  1.1  mrg 	return;
    817  1.1  mrg 
    818  1.1  mrg       ref->prefetch_before = 0;
    819  1.1  mrg       return;
    820  1.1  mrg     }
    821  1.1  mrg 
    822  1.1  mrg   /* Only prune the reference that is behind in the array.  */
    823  1.1  mrg   if (backward)
    824  1.1  mrg     {
    825  1.1  mrg       if (delta > 0)
    826  1.1  mrg 	return;
    827  1.1  mrg 
    828  1.1  mrg       /* Transform the data so that we may assume that the accesses
    829  1.1  mrg 	 are forward.  */
    830  1.1  mrg       delta = - delta;
    831  1.1  mrg       step = -step;
    832  1.1  mrg       delta_r = PREFETCH_BLOCK - 1 - delta_r;
    833  1.1  mrg       delta_b = PREFETCH_BLOCK - 1 - delta_b;
    834  1.1  mrg     }
    835  1.1  mrg   else
    836  1.1  mrg     {
    837  1.1  mrg       if (delta < 0)
    838  1.1  mrg 	return;
    839  1.1  mrg     }
    840  1.1  mrg 
    841  1.1  mrg   /* Check whether the two references are likely to hit the same cache
    842  1.1  mrg      line, and how distant the iterations in that it occurs are from
    843  1.1  mrg      each other.  */
    844  1.1  mrg 
    845  1.1  mrg   if (step <= PREFETCH_BLOCK)
    846  1.1  mrg     {
    847  1.1  mrg       /* The accesses are sure to meet.  Let us check when.  */
    848  1.1  mrg       hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
    849  1.1  mrg       prefetch_before = (hit_from - delta_r + step - 1) / step;
    850  1.1  mrg 
    851  1.1  mrg       /* Do not reduce prefetch_before if we meet beyond cache size.  */
    852  1.1  mrg       if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
    853  1.1  mrg         prefetch_before = PREFETCH_ALL;
    854  1.1  mrg       if (prefetch_before < ref->prefetch_before)
    855  1.1  mrg 	ref->prefetch_before = prefetch_before;
    856  1.1  mrg 
    857  1.1  mrg       return;
    858  1.1  mrg     }
    859  1.1  mrg 
    860  1.1  mrg   /* A more complicated case with step > prefetch_block.  First reduce
    861  1.1  mrg      the ratio between the step and the cache line size to its simplest
    862  1.1  mrg      terms.  The resulting denominator will then represent the number of
    863  1.1  mrg      distinct iterations after which each address will go back to its
    864  1.1  mrg      initial location within the cache line.  This computation assumes
    865  1.1  mrg      that PREFETCH_BLOCK is a power of two.  */
    866  1.1  mrg   prefetch_block = PREFETCH_BLOCK;
    867  1.1  mrg   reduced_prefetch_block = prefetch_block;
    868  1.1  mrg   reduced_step = step;
    869  1.1  mrg   while ((reduced_step & 1) == 0
    870  1.1  mrg 	 && reduced_prefetch_block > 1)
    871  1.1  mrg     {
    872  1.1  mrg       reduced_step >>= 1;
    873  1.1  mrg       reduced_prefetch_block >>= 1;
    874  1.1  mrg     }
    875  1.1  mrg 
    876  1.1  mrg   prefetch_before = delta / step;
    877  1.1  mrg   delta %= step;
    878  1.1  mrg   ref_type = TREE_TYPE (ref->mem);
    879  1.1  mrg   align_unit = TYPE_ALIGN (ref_type) / 8;
    880  1.1  mrg   if (is_miss_rate_acceptable (prefetch_block, step, delta,
    881  1.1  mrg 			       reduced_prefetch_block, align_unit))
    882  1.1  mrg     {
    883  1.1  mrg       /* Do not reduce prefetch_before if we meet beyond cache size.  */
    884  1.1  mrg       if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
    885  1.1  mrg         prefetch_before = PREFETCH_ALL;
    886  1.1  mrg       if (prefetch_before < ref->prefetch_before)
    887  1.1  mrg 	ref->prefetch_before = prefetch_before;
    888  1.1  mrg 
    889  1.1  mrg       return;
    890  1.1  mrg     }
    891  1.1  mrg 
    892  1.1  mrg   /* Try also the following iteration.  */
    893  1.1  mrg   prefetch_before++;
    894  1.1  mrg   delta = step - delta;
    895  1.1  mrg   if (is_miss_rate_acceptable (prefetch_block, step, delta,
    896  1.1  mrg 			       reduced_prefetch_block, align_unit))
    897  1.1  mrg     {
    898  1.1  mrg       if (prefetch_before < ref->prefetch_before)
    899  1.1  mrg 	ref->prefetch_before = prefetch_before;
    900  1.1  mrg 
    901  1.1  mrg       return;
    902  1.1  mrg     }
    903  1.1  mrg 
    904  1.1  mrg   /* The ref probably does not reuse by.  */
    905  1.1  mrg   return;
    906  1.1  mrg }
    907  1.1  mrg 
    908  1.1  mrg /* Prune the prefetch candidate REF using the reuses with other references
    909  1.1  mrg    in REFS.  */
    910  1.1  mrg 
    911  1.1  mrg static void
    912  1.1  mrg prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
    913  1.1  mrg {
    914  1.1  mrg   struct mem_ref *prune_by;
    915  1.1  mrg   bool before = true;
    916  1.1  mrg 
    917  1.1  mrg   prune_ref_by_self_reuse (ref);
    918  1.1  mrg 
    919  1.1  mrg   for (prune_by = refs; prune_by; prune_by = prune_by->next)
    920  1.1  mrg     {
    921  1.1  mrg       if (prune_by == ref)
    922  1.1  mrg 	{
    923  1.1  mrg 	  before = false;
    924  1.1  mrg 	  continue;
    925  1.1  mrg 	}
    926  1.1  mrg 
    927  1.1  mrg       if (!WRITE_CAN_USE_READ_PREFETCH
    928  1.1  mrg 	  && ref->write_p
    929  1.1  mrg 	  && !prune_by->write_p)
    930  1.1  mrg 	continue;
    931  1.1  mrg       if (!READ_CAN_USE_WRITE_PREFETCH
    932  1.1  mrg 	  && !ref->write_p
    933  1.1  mrg 	  && prune_by->write_p)
    934  1.1  mrg 	continue;
    935  1.1  mrg 
    936  1.1  mrg       prune_ref_by_group_reuse (ref, prune_by, before);
    937  1.1  mrg     }
    938  1.1  mrg }
    939  1.1  mrg 
    940  1.1  mrg /* Prune the prefetch candidates in GROUP using the reuse analysis.  */
    941  1.1  mrg 
    942  1.1  mrg static void
    943  1.1  mrg prune_group_by_reuse (struct mem_ref_group *group)
    944  1.1  mrg {
    945  1.1  mrg   struct mem_ref *ref_pruned;
    946  1.1  mrg 
    947  1.1  mrg   for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    948  1.1  mrg     {
    949  1.1  mrg       prune_ref_by_reuse (ref_pruned, group->refs);
    950  1.1  mrg 
    951  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
    952  1.1  mrg 	{
    953  1.1  mrg 	  dump_mem_ref (dump_file, ref_pruned);
    954  1.1  mrg 
    955  1.1  mrg 	  if (ref_pruned->prefetch_before == PREFETCH_ALL
    956  1.1  mrg 	      && ref_pruned->prefetch_mod == 1)
    957  1.1  mrg 	    fprintf (dump_file, " no restrictions");
    958  1.1  mrg 	  else if (ref_pruned->prefetch_before == 0)
    959  1.1  mrg 	    fprintf (dump_file, " do not prefetch");
    960  1.1  mrg 	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
    961  1.1  mrg 	    fprintf (dump_file, " prefetch once");
    962  1.1  mrg 	  else
    963  1.1  mrg 	    {
    964  1.1  mrg 	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
    965  1.1  mrg 		{
    966  1.1  mrg 		  fprintf (dump_file, " prefetch before ");
    967  1.1  mrg 		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
    968  1.1  mrg 			   ref_pruned->prefetch_before);
    969  1.1  mrg 		}
    970  1.1  mrg 	      if (ref_pruned->prefetch_mod != 1)
    971  1.1  mrg 		{
    972  1.1  mrg 		  fprintf (dump_file, " prefetch mod ");
    973  1.1  mrg 		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
    974  1.1  mrg 			   ref_pruned->prefetch_mod);
    975  1.1  mrg 		}
    976  1.1  mrg 	    }
    977  1.1  mrg 	  fprintf (dump_file, "\n");
    978  1.1  mrg 	}
    979  1.1  mrg     }
    980  1.1  mrg }
    981  1.1  mrg 
    982  1.1  mrg /* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
    983  1.1  mrg 
    984  1.1  mrg static void
    985  1.1  mrg prune_by_reuse (struct mem_ref_group *groups)
    986  1.1  mrg {
    987  1.1  mrg   for (; groups; groups = groups->next)
    988  1.1  mrg     prune_group_by_reuse (groups);
    989  1.1  mrg }
    990  1.1  mrg 
    991  1.1  mrg /* Returns true if we should issue prefetch for REF.  */
    992  1.1  mrg 
    993  1.1  mrg static bool
    994  1.1  mrg should_issue_prefetch_p (struct mem_ref *ref)
    995  1.1  mrg {
    996  1.1  mrg   /* Do we want to issue prefetches for non-constant strides?  */
    997  1.1  mrg   if (!cst_and_fits_in_hwi (ref->group->step)
    998  1.1  mrg       && param_prefetch_dynamic_strides == 0)
    999  1.1  mrg     {
   1000  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   1001  1.1  mrg 	fprintf (dump_file,
   1002  1.1  mrg 		 "Skipping non-constant step for reference %u:%u\n",
   1003  1.1  mrg 		 ref->group->uid, ref->uid);
   1004  1.1  mrg       return false;
   1005  1.1  mrg     }
   1006  1.1  mrg 
   1007  1.1  mrg   /* Some processors may have a hardware prefetcher that may conflict with
   1008  1.1  mrg      prefetch hints for a range of strides.  Make sure we don't issue
   1009  1.1  mrg      prefetches for such cases if the stride is within this particular
   1010  1.1  mrg      range.  */
   1011  1.1  mrg   if (cst_and_fits_in_hwi (ref->group->step)
   1012  1.1  mrg       && abs_hwi (int_cst_value (ref->group->step))
   1013  1.1  mrg 	  < (HOST_WIDE_INT) param_prefetch_minimum_stride)
   1014  1.1  mrg     {
   1015  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   1016  1.1  mrg 	fprintf (dump_file,
   1017  1.1  mrg 		 "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC
   1018  1.1  mrg 		 ") is less than the mininum required stride of %d\n",
   1019  1.1  mrg 		 ref->group->uid, ref->uid, int_cst_value (ref->group->step),
   1020  1.1  mrg 		 param_prefetch_minimum_stride);
   1021  1.1  mrg       return false;
   1022  1.1  mrg     }
   1023  1.1  mrg 
   1024  1.1  mrg   /* For now do not issue prefetches for only first few of the
   1025  1.1  mrg      iterations.  */
   1026  1.1  mrg   if (ref->prefetch_before != PREFETCH_ALL)
   1027  1.1  mrg     {
   1028  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   1029  1.1  mrg         fprintf (dump_file, "Ignoring reference %u:%u due to prefetch_before\n",
   1030  1.1  mrg 		 ref->group->uid, ref->uid);
   1031  1.1  mrg       return false;
   1032  1.1  mrg     }
   1033  1.1  mrg 
   1034  1.1  mrg   /* Do not prefetch nontemporal stores.  */
   1035  1.1  mrg   if (ref->storent_p)
   1036  1.1  mrg     {
   1037  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   1038  1.1  mrg         fprintf (dump_file, "Ignoring nontemporal store reference %u:%u\n", ref->group->uid, ref->uid);
   1039  1.1  mrg       return false;
   1040  1.1  mrg     }
   1041  1.1  mrg 
   1042  1.1  mrg   return true;
   1043  1.1  mrg }
   1044  1.1  mrg 
   1045  1.1  mrg /* Decide which of the prefetch candidates in GROUPS to prefetch.
   1046  1.1  mrg    AHEAD is the number of iterations to prefetch ahead (which corresponds
   1047  1.1  mrg    to the number of simultaneous instances of one prefetch running at a
   1048  1.1  mrg    time).  UNROLL_FACTOR is the factor by that the loop is going to be
   1049  1.1  mrg    unrolled.  Returns true if there is anything to prefetch.  */
   1050  1.1  mrg 
   1051  1.1  mrg static bool
   1052  1.1  mrg schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
   1053  1.1  mrg 		     unsigned ahead)
   1054  1.1  mrg {
   1055  1.1  mrg   unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
   1056  1.1  mrg   unsigned slots_per_prefetch;
   1057  1.1  mrg   struct mem_ref *ref;
   1058  1.1  mrg   bool any = false;
   1059  1.1  mrg 
   1060  1.1  mrg   /* At most param_simultaneous_prefetches should be running
   1061  1.1  mrg      at the same time.  */
   1062  1.1  mrg   remaining_prefetch_slots = param_simultaneous_prefetches;
   1063  1.1  mrg 
   1064  1.1  mrg   /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
   1065  1.1  mrg      AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
   1066  1.1  mrg      it will need a prefetch slot.  */
   1067  1.1  mrg   slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
   1068  1.1  mrg   if (dump_file && (dump_flags & TDF_DETAILS))
   1069  1.1  mrg     fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
   1070  1.1  mrg 	     slots_per_prefetch);
   1071  1.1  mrg 
   1072  1.1  mrg   /* For now we just take memory references one by one and issue
   1073  1.1  mrg      prefetches for as many as possible.  The groups are sorted
   1074  1.1  mrg      starting with the largest step, since the references with
   1075  1.1  mrg      large step are more likely to cause many cache misses.  */
   1076  1.1  mrg 
   1077  1.1  mrg   for (; groups; groups = groups->next)
   1078  1.1  mrg     for (ref = groups->refs; ref; ref = ref->next)
   1079  1.1  mrg       {
   1080  1.1  mrg 	if (!should_issue_prefetch_p (ref))
   1081  1.1  mrg 	  continue;
   1082  1.1  mrg 
   1083  1.1  mrg         /* The loop is far from being sufficiently unrolled for this
   1084  1.1  mrg            prefetch.  Do not generate prefetch to avoid many redudant
   1085  1.1  mrg            prefetches.  */
   1086  1.1  mrg         if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
   1087  1.1  mrg           continue;
   1088  1.1  mrg 
   1089  1.1  mrg 	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
   1090  1.1  mrg 	   and we unroll the loop UNROLL_FACTOR times, we need to insert
   1091  1.1  mrg 	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
   1092  1.1  mrg 	   iteration.  */
   1093  1.1  mrg 	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
   1094  1.1  mrg 			/ ref->prefetch_mod);
   1095  1.1  mrg 	prefetch_slots = n_prefetches * slots_per_prefetch;
   1096  1.1  mrg 
   1097  1.1  mrg 	/* If more than half of the prefetches would be lost anyway, do not
   1098  1.1  mrg 	   issue the prefetch.  */
   1099  1.1  mrg 	if (2 * remaining_prefetch_slots < prefetch_slots)
   1100  1.1  mrg 	  continue;
   1101  1.1  mrg 
   1102  1.1  mrg 	/* Stop prefetching if debug counter is activated.  */
   1103  1.1  mrg 	if (!dbg_cnt (prefetch))
   1104  1.1  mrg 	  continue;
   1105  1.1  mrg 
   1106  1.1  mrg 	ref->issue_prefetch_p = true;
   1107  1.1  mrg 	if (dump_file && (dump_flags & TDF_DETAILS))
   1108  1.1  mrg 	  fprintf (dump_file, "Decided to issue prefetch for reference %u:%u\n",
   1109  1.1  mrg 		   ref->group->uid, ref->uid);
   1110  1.1  mrg 
   1111  1.1  mrg 	if (remaining_prefetch_slots <= prefetch_slots)
   1112  1.1  mrg 	  return true;
   1113  1.1  mrg 	remaining_prefetch_slots -= prefetch_slots;
   1114  1.1  mrg 	any = true;
   1115  1.1  mrg       }
   1116  1.1  mrg 
   1117  1.1  mrg   return any;
   1118  1.1  mrg }
   1119  1.1  mrg 
   1120  1.1  mrg /* Return TRUE if no prefetch is going to be generated in the given
   1121  1.1  mrg    GROUPS.  */
   1122  1.1  mrg 
   1123  1.1  mrg static bool
   1124  1.1  mrg nothing_to_prefetch_p (struct mem_ref_group *groups)
   1125  1.1  mrg {
   1126  1.1  mrg   struct mem_ref *ref;
   1127  1.1  mrg 
   1128  1.1  mrg   for (; groups; groups = groups->next)
   1129  1.1  mrg     for (ref = groups->refs; ref; ref = ref->next)
   1130  1.1  mrg       if (should_issue_prefetch_p (ref))
   1131  1.1  mrg 	return false;
   1132  1.1  mrg 
   1133  1.1  mrg   return true;
   1134  1.1  mrg }
   1135  1.1  mrg 
   1136  1.1  mrg /* Estimate the number of prefetches in the given GROUPS.
   1137  1.1  mrg    UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
   1138  1.1  mrg 
   1139  1.1  mrg static int
   1140  1.1  mrg estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
   1141  1.1  mrg {
   1142  1.1  mrg   struct mem_ref *ref;
   1143  1.1  mrg   unsigned n_prefetches;
   1144  1.1  mrg   int prefetch_count = 0;
   1145  1.1  mrg 
   1146  1.1  mrg   for (; groups; groups = groups->next)
   1147  1.1  mrg     for (ref = groups->refs; ref; ref = ref->next)
   1148  1.1  mrg       if (should_issue_prefetch_p (ref))
   1149  1.1  mrg 	{
   1150  1.1  mrg 	  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
   1151  1.1  mrg 			  / ref->prefetch_mod);
   1152  1.1  mrg 	  prefetch_count += n_prefetches;
   1153  1.1  mrg 	}
   1154  1.1  mrg 
   1155  1.1  mrg   return prefetch_count;
   1156  1.1  mrg }
   1157  1.1  mrg 
   1158  1.1  mrg /* Issue prefetches for the reference REF into loop as decided before.
   1159  1.1  mrg    HEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   1160  1.1  mrg    is the factor by which LOOP was unrolled.  */
   1161  1.1  mrg 
   1162  1.1  mrg static void
   1163  1.1  mrg issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
   1164  1.1  mrg {
   1165  1.1  mrg   HOST_WIDE_INT delta;
   1166  1.1  mrg   tree addr, addr_base, write_p, local, forward;
   1167  1.1  mrg   gcall *prefetch;
   1168  1.1  mrg   gimple_stmt_iterator bsi;
   1169  1.1  mrg   unsigned n_prefetches, ap;
   1170  1.1  mrg   bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
   1171  1.1  mrg 
   1172  1.1  mrg   if (dump_file && (dump_flags & TDF_DETAILS))
   1173  1.1  mrg     fprintf (dump_file, "Issued%s prefetch for reference %u:%u.\n",
   1174  1.1  mrg 	     nontemporal ? " nontemporal" : "",
   1175  1.1  mrg 	     ref->group->uid, ref->uid);
   1176  1.1  mrg 
   1177  1.1  mrg   bsi = gsi_for_stmt (ref->stmt);
   1178  1.1  mrg 
   1179  1.1  mrg   n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
   1180  1.1  mrg 		  / ref->prefetch_mod);
   1181  1.1  mrg   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
   1182  1.1  mrg   addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
   1183  1.1  mrg 					true, NULL, true, GSI_SAME_STMT);
   1184  1.1  mrg   write_p = ref->write_p ? integer_one_node : integer_zero_node;
   1185  1.1  mrg   local = nontemporal ? integer_zero_node : integer_three_node;
   1186  1.1  mrg 
   1187  1.1  mrg   for (ap = 0; ap < n_prefetches; ap++)
   1188  1.1  mrg     {
   1189  1.1  mrg       if (cst_and_fits_in_hwi (ref->group->step))
   1190  1.1  mrg         {
   1191  1.1  mrg           /* Determine the address to prefetch.  */
   1192  1.1  mrg           delta = (ahead + ap * ref->prefetch_mod) *
   1193  1.1  mrg 		   int_cst_value (ref->group->step);
   1194  1.1  mrg           addr = fold_build_pointer_plus_hwi (addr_base, delta);
   1195  1.1  mrg           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
   1196  1.1  mrg 					   NULL, true, GSI_SAME_STMT);
   1197  1.1  mrg         }
   1198  1.1  mrg       else
   1199  1.1  mrg         {
   1200  1.1  mrg           /* The step size is non-constant but loop-invariant.  We use the
   1201  1.1  mrg              heuristic to simply prefetch ahead iterations ahead.  */
   1202  1.1  mrg           forward = fold_build2 (MULT_EXPR, sizetype,
   1203  1.1  mrg                                  fold_convert (sizetype, ref->group->step),
   1204  1.1  mrg                                  fold_convert (sizetype, size_int (ahead)));
   1205  1.1  mrg           addr = fold_build_pointer_plus (addr_base, forward);
   1206  1.1  mrg           addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
   1207  1.1  mrg 					   NULL, true, GSI_SAME_STMT);
   1208  1.1  mrg       }
   1209  1.1  mrg 
   1210  1.1  mrg       if (addr_base != addr
   1211  1.1  mrg 	  && TREE_CODE (addr_base) == SSA_NAME
   1212  1.1  mrg 	  && TREE_CODE (addr) == SSA_NAME)
   1213  1.1  mrg 	{
   1214  1.1  mrg 	  duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base));
   1215  1.1  mrg 	  /* As this isn't a plain copy we have to reset alignment
   1216  1.1  mrg 	     information.  */
   1217  1.1  mrg 	  if (SSA_NAME_PTR_INFO (addr))
   1218  1.1  mrg 	    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
   1219  1.1  mrg 	}
   1220  1.1  mrg 
   1221  1.1  mrg       /* Create the prefetch instruction.  */
   1222  1.1  mrg       prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
   1223  1.1  mrg 				    3, addr, write_p, local);
   1224  1.1  mrg       gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
   1225  1.1  mrg     }
   1226  1.1  mrg }
   1227  1.1  mrg 
   1228  1.1  mrg /* Issue prefetches for the references in GROUPS into loop as decided before.
   1229  1.1  mrg    HEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR is the
   1230  1.1  mrg    factor by that LOOP was unrolled.  */
   1231  1.1  mrg 
   1232  1.1  mrg static void
   1233  1.1  mrg issue_prefetches (struct mem_ref_group *groups,
   1234  1.1  mrg 		  unsigned unroll_factor, unsigned ahead)
   1235  1.1  mrg {
   1236  1.1  mrg   struct mem_ref *ref;
   1237  1.1  mrg 
   1238  1.1  mrg   for (; groups; groups = groups->next)
   1239  1.1  mrg     for (ref = groups->refs; ref; ref = ref->next)
   1240  1.1  mrg       if (ref->issue_prefetch_p)
   1241  1.1  mrg 	issue_prefetch_ref (ref, unroll_factor, ahead);
   1242  1.1  mrg }
   1243  1.1  mrg 
   1244  1.1  mrg /* Returns true if REF is a memory write for that a nontemporal store insn
   1245  1.1  mrg    can be used.  */
   1246  1.1  mrg 
   1247  1.1  mrg static bool
   1248  1.1  mrg nontemporal_store_p (struct mem_ref *ref)
   1249  1.1  mrg {
   1250  1.1  mrg   machine_mode mode;
   1251  1.1  mrg   enum insn_code code;
   1252  1.1  mrg 
   1253  1.1  mrg   /* REF must be a write that is not reused.  We require it to be independent
   1254  1.1  mrg      on all other memory references in the loop, as the nontemporal stores may
   1255  1.1  mrg      be reordered with respect to other memory references.  */
   1256  1.1  mrg   if (!ref->write_p
   1257  1.1  mrg       || !ref->independent_p
   1258  1.1  mrg       || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
   1259  1.1  mrg     return false;
   1260  1.1  mrg 
   1261  1.1  mrg   /* Check that we have the storent instruction for the mode.  */
   1262  1.1  mrg   mode = TYPE_MODE (TREE_TYPE (ref->mem));
   1263  1.1  mrg   if (mode == BLKmode)
   1264  1.1  mrg     return false;
   1265  1.1  mrg 
   1266  1.1  mrg   code = optab_handler (storent_optab, mode);
   1267  1.1  mrg   return code != CODE_FOR_nothing;
   1268  1.1  mrg }
   1269  1.1  mrg 
   1270  1.1  mrg /* If REF is a nontemporal store, we mark the corresponding modify statement
   1271  1.1  mrg    and return true.  Otherwise, we return false.  */
   1272  1.1  mrg 
   1273  1.1  mrg static bool
   1274  1.1  mrg mark_nontemporal_store (struct mem_ref *ref)
   1275  1.1  mrg {
   1276  1.1  mrg   if (!nontemporal_store_p (ref))
   1277  1.1  mrg     return false;
   1278  1.1  mrg 
   1279  1.1  mrg   if (dump_file && (dump_flags & TDF_DETAILS))
   1280  1.1  mrg     fprintf (dump_file, "Marked reference %u:%u as a nontemporal store.\n",
   1281  1.1  mrg 	     ref->group->uid, ref->uid);
   1282  1.1  mrg 
   1283  1.1  mrg   gimple_assign_set_nontemporal_move (ref->stmt, true);
   1284  1.1  mrg   ref->storent_p = true;
   1285  1.1  mrg 
   1286  1.1  mrg   return true;
   1287  1.1  mrg }
   1288  1.1  mrg 
   1289  1.1  mrg /* Issue a memory fence instruction after LOOP.  */
   1290  1.1  mrg 
   1291  1.1  mrg static void
   1292  1.1  mrg emit_mfence_after_loop (class loop *loop)
   1293  1.1  mrg {
   1294  1.1  mrg   auto_vec<edge> exits = get_loop_exit_edges (loop);
   1295  1.1  mrg   edge exit;
   1296  1.1  mrg   gcall *call;
   1297  1.1  mrg   gimple_stmt_iterator bsi;
   1298  1.1  mrg   unsigned i;
   1299  1.1  mrg 
   1300  1.1  mrg   FOR_EACH_VEC_ELT (exits, i, exit)
   1301  1.1  mrg     {
   1302  1.1  mrg       call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
   1303  1.1  mrg 
   1304  1.1  mrg       if (!single_pred_p (exit->dest)
   1305  1.1  mrg 	  /* If possible, we prefer not to insert the fence on other paths
   1306  1.1  mrg 	     in cfg.  */
   1307  1.1  mrg 	  && !(exit->flags & EDGE_ABNORMAL))
   1308  1.1  mrg 	split_loop_exit_edge (exit);
   1309  1.1  mrg       bsi = gsi_after_labels (exit->dest);
   1310  1.1  mrg 
   1311  1.1  mrg       gsi_insert_before (&bsi, call, GSI_NEW_STMT);
   1312  1.1  mrg     }
   1313  1.1  mrg 
   1314  1.1  mrg   update_ssa (TODO_update_ssa_only_virtuals);
   1315  1.1  mrg }
   1316  1.1  mrg 
   1317  1.1  mrg /* Returns true if we can use storent in loop, false otherwise.  */
   1318  1.1  mrg 
   1319  1.1  mrg static bool
   1320  1.1  mrg may_use_storent_in_loop_p (class loop *loop)
   1321  1.1  mrg {
   1322  1.1  mrg   bool ret = true;
   1323  1.1  mrg 
   1324  1.1  mrg   if (loop->inner != NULL)
   1325  1.1  mrg     return false;
   1326  1.1  mrg 
   1327  1.1  mrg   /* If we must issue a mfence insn after using storent, check that there
   1328  1.1  mrg      is a suitable place for it at each of the loop exits.  */
   1329  1.1  mrg   if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
   1330  1.1  mrg     {
   1331  1.1  mrg       auto_vec<edge> exits = get_loop_exit_edges (loop);
   1332  1.1  mrg       unsigned i;
   1333  1.1  mrg       edge exit;
   1334  1.1  mrg 
   1335  1.1  mrg       FOR_EACH_VEC_ELT (exits, i, exit)
   1336  1.1  mrg 	if ((exit->flags & EDGE_ABNORMAL)
   1337  1.1  mrg 	    && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
   1338  1.1  mrg 	  ret = false;
   1339  1.1  mrg     }
   1340  1.1  mrg 
   1341  1.1  mrg   return ret;
   1342  1.1  mrg }
   1343  1.1  mrg 
   1344  1.1  mrg /* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
   1345  1.1  mrg    references in the loop.  */
   1346  1.1  mrg 
   1347  1.1  mrg static void
   1348  1.1  mrg mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups)
   1349  1.1  mrg {
   1350  1.1  mrg   struct mem_ref *ref;
   1351  1.1  mrg   bool any = false;
   1352  1.1  mrg 
   1353  1.1  mrg   if (!may_use_storent_in_loop_p (loop))
   1354  1.1  mrg     return;
   1355  1.1  mrg 
   1356  1.1  mrg   for (; groups; groups = groups->next)
   1357  1.1  mrg     for (ref = groups->refs; ref; ref = ref->next)
   1358  1.1  mrg       any |= mark_nontemporal_store (ref);
   1359  1.1  mrg 
   1360  1.1  mrg   if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
   1361  1.1  mrg     emit_mfence_after_loop (loop);
   1362  1.1  mrg }
   1363  1.1  mrg 
   1364  1.1  mrg /* Determines whether we can profitably unroll LOOP FACTOR times, and if
   1365  1.1  mrg    this is the case, fill in DESC by the description of number of
   1366  1.1  mrg    iterations.  */
   1367  1.1  mrg 
   1368  1.1  mrg static bool
   1369  1.1  mrg should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc,
   1370  1.1  mrg 		      unsigned factor)
   1371  1.1  mrg {
   1372  1.1  mrg   if (!can_unroll_loop_p (loop, factor, desc))
   1373  1.1  mrg     return false;
   1374  1.1  mrg 
   1375  1.1  mrg   /* We only consider loops without control flow for unrolling.  This is not
   1376  1.1  mrg      a hard restriction -- tree_unroll_loop works with arbitrary loops
   1377  1.1  mrg      as well; but the unrolling/prefetching is usually more profitable for
   1378  1.1  mrg      loops consisting of a single basic block, and we want to limit the
   1379  1.1  mrg      code growth.  */
   1380  1.1  mrg   if (loop->num_nodes > 2)
   1381  1.1  mrg     return false;
   1382  1.1  mrg 
   1383  1.1  mrg   return true;
   1384  1.1  mrg }
   1385  1.1  mrg 
   1386  1.1  mrg /* Determine the coefficient by that unroll LOOP, from the information
   1387  1.1  mrg    contained in the list of memory references REFS.  Description of
   1388  1.1  mrg    number of iterations of LOOP is stored to DESC.  NINSNS is the number of
   1389  1.1  mrg    insns of the LOOP.  EST_NITER is the estimated number of iterations of
   1390  1.1  mrg    the loop, or -1 if no estimate is available.  */
   1391  1.1  mrg 
   1392  1.1  mrg static unsigned
   1393  1.1  mrg determine_unroll_factor (class loop *loop, struct mem_ref_group *refs,
   1394  1.1  mrg 			 unsigned ninsns, class tree_niter_desc *desc,
   1395  1.1  mrg 			 HOST_WIDE_INT est_niter)
   1396  1.1  mrg {
   1397  1.1  mrg   unsigned upper_bound;
   1398  1.1  mrg   unsigned nfactor, factor, mod_constraint;
   1399  1.1  mrg   struct mem_ref_group *agp;
   1400  1.1  mrg   struct mem_ref *ref;
   1401  1.1  mrg 
   1402  1.1  mrg   /* First check whether the loop is not too large to unroll.  We ignore
   1403  1.1  mrg      PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
   1404  1.1  mrg      from unrolling them enough to make exactly one cache line covered by each
   1405  1.1  mrg      iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
   1406  1.1  mrg      us from unrolling the loops too many times in cases where we only expect
   1407  1.1  mrg      gains from better scheduling and decreasing loop overhead, which is not
   1408  1.1  mrg      the case here.  */
   1409  1.1  mrg   upper_bound = param_max_unrolled_insns / ninsns;
   1410  1.1  mrg 
   1411  1.1  mrg   /* If we unrolled the loop more times than it iterates, the unrolled version
   1412  1.1  mrg      of the loop would be never entered.  */
   1413  1.1  mrg   if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
   1414  1.1  mrg     upper_bound = est_niter;
   1415  1.1  mrg 
   1416  1.1  mrg   if (upper_bound <= 1)
   1417  1.1  mrg     return 1;
   1418  1.1  mrg 
   1419  1.1  mrg   /* Choose the factor so that we may prefetch each cache just once,
   1420  1.1  mrg      but bound the unrolling by UPPER_BOUND.  */
   1421  1.1  mrg   factor = 1;
   1422  1.1  mrg   for (agp = refs; agp; agp = agp->next)
   1423  1.1  mrg     for (ref = agp->refs; ref; ref = ref->next)
   1424  1.1  mrg       if (should_issue_prefetch_p (ref))
   1425  1.1  mrg 	{
   1426  1.1  mrg 	  mod_constraint = ref->prefetch_mod;
   1427  1.1  mrg 	  nfactor = least_common_multiple (mod_constraint, factor);
   1428  1.1  mrg 	  if (nfactor <= upper_bound)
   1429  1.1  mrg 	    factor = nfactor;
   1430  1.1  mrg 	}
   1431  1.1  mrg 
   1432  1.1  mrg   if (!should_unroll_loop_p (loop, desc, factor))
   1433  1.1  mrg     return 1;
   1434  1.1  mrg 
   1435  1.1  mrg   return factor;
   1436  1.1  mrg }
   1437  1.1  mrg 
   1438  1.1  mrg /* Returns the total volume of the memory references REFS, taking into account
   1439  1.1  mrg    reuses in the innermost loop and cache line size.  TODO -- we should also
   1440  1.1  mrg    take into account reuses across the iterations of the loops in the loop
   1441  1.1  mrg    nest.  */
   1442  1.1  mrg 
   1443  1.1  mrg static unsigned
   1444  1.1  mrg volume_of_references (struct mem_ref_group *refs)
   1445  1.1  mrg {
   1446  1.1  mrg   unsigned volume = 0;
   1447  1.1  mrg   struct mem_ref_group *gr;
   1448  1.1  mrg   struct mem_ref *ref;
   1449  1.1  mrg 
   1450  1.1  mrg   for (gr = refs; gr; gr = gr->next)
   1451  1.1  mrg     for (ref = gr->refs; ref; ref = ref->next)
   1452  1.1  mrg       {
   1453  1.1  mrg 	/* Almost always reuses another value?  */
   1454  1.1  mrg 	if (ref->prefetch_before != PREFETCH_ALL)
   1455  1.1  mrg 	  continue;
   1456  1.1  mrg 
   1457  1.1  mrg 	/* If several iterations access the same cache line, use the size of
   1458  1.1  mrg 	   the line divided by this number.  Otherwise, a cache line is
   1459  1.1  mrg 	   accessed in each iteration.  TODO -- in the latter case, we should
   1460  1.1  mrg 	   take the size of the reference into account, rounding it up on cache
   1461  1.1  mrg 	   line size multiple.  */
   1462  1.1  mrg 	volume += param_l1_cache_line_size / ref->prefetch_mod;
   1463  1.1  mrg       }
   1464  1.1  mrg   return volume;
   1465  1.1  mrg }
   1466  1.1  mrg 
   1467  1.1  mrg /* Returns the volume of memory references accessed across VEC iterations of
   1468  1.1  mrg    loops, whose sizes are described in the LOOP_SIZES array.  N is the number
   1469  1.1  mrg    of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
   1470  1.1  mrg 
   1471  1.1  mrg static unsigned
   1472  1.1  mrg volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
   1473  1.1  mrg {
   1474  1.1  mrg   unsigned i;
   1475  1.1  mrg 
   1476  1.1  mrg   for (i = 0; i < n; i++)
   1477  1.1  mrg     if (vec[i] != 0)
   1478  1.1  mrg       break;
   1479  1.1  mrg 
   1480  1.1  mrg   if (i == n)
   1481  1.1  mrg     return 0;
   1482  1.1  mrg 
   1483  1.1  mrg   gcc_assert (vec[i] > 0);
   1484  1.1  mrg 
   1485  1.1  mrg   /* We ignore the parts of the distance vector in subloops, since usually
   1486  1.1  mrg      the numbers of iterations are much smaller.  */
   1487  1.1  mrg   return loop_sizes[i] * vec[i];
   1488  1.1  mrg }
   1489  1.1  mrg 
   1490  1.1  mrg /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
   1491  1.1  mrg    at the position corresponding to the loop of the step.  N is the depth
   1492  1.1  mrg    of the considered loop nest, and, LOOP is its innermost loop.  */
   1493  1.1  mrg 
   1494  1.1  mrg static void
   1495  1.1  mrg add_subscript_strides (tree access_fn, unsigned stride,
   1496  1.1  mrg 		       HOST_WIDE_INT *strides, unsigned n, class loop *loop)
   1497  1.1  mrg {
   1498  1.1  mrg   class loop *aloop;
   1499  1.1  mrg   tree step;
   1500  1.1  mrg   HOST_WIDE_INT astep;
   1501  1.1  mrg   unsigned min_depth = loop_depth (loop) - n;
   1502  1.1  mrg 
   1503  1.1  mrg   while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
   1504  1.1  mrg     {
   1505  1.1  mrg       aloop = get_chrec_loop (access_fn);
   1506  1.1  mrg       step = CHREC_RIGHT (access_fn);
   1507  1.1  mrg       access_fn = CHREC_LEFT (access_fn);
   1508  1.1  mrg 
   1509  1.1  mrg       if ((unsigned) loop_depth (aloop) <= min_depth)
   1510  1.1  mrg 	continue;
   1511  1.1  mrg 
   1512  1.1  mrg       if (tree_fits_shwi_p (step))
   1513  1.1  mrg 	astep = tree_to_shwi (step);
   1514  1.1  mrg       else
   1515  1.1  mrg 	astep = param_l1_cache_line_size;
   1516  1.1  mrg 
   1517  1.1  mrg       strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
   1518  1.1  mrg 
   1519  1.1  mrg     }
   1520  1.1  mrg }
   1521  1.1  mrg 
   1522  1.1  mrg /* Returns the volume of memory references accessed between two consecutive
   1523  1.1  mrg    self-reuses of the reference DR.  We consider the subscripts of DR in N
   1524  1.1  mrg    loops, and LOOP_SIZES contains the volumes of accesses in each of the
   1525  1.1  mrg    loops.  LOOP is the innermost loop of the current loop nest.  */
   1526  1.1  mrg 
static unsigned
self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
		     class loop *loop)
{
  tree stride, access_fn;
  HOST_WIDE_INT *strides, astride;
  vec<tree> access_fns;
  tree ref = DR_REF (dr);
  /* ~0u means "no self-reuse found within any loop of the nest".  */
  unsigned i, ret = ~0u;

  /* In the following example:

     for (i = 0; i < N; i++)
       for (j = 0; j < N; j++)
         use (a[j][i]);
     the same cache line is accessed each N steps (except if the change from
     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
     we cannot rely purely on the results of the data dependence analysis.

     Instead, we compute the stride of the reference in each loop, and consider
     the innermost loop in that the stride is less than cache size.  */

  /* One accumulated byte stride per loop level, zero-initialized.  */
  strides = XCNEWVEC (HOST_WIDE_INT, n);
  access_fns = DR_ACCESS_FNS (dr);

  FOR_EACH_VEC_ELT (access_fns, i, access_fn)
    {
      /* Keep track of the reference corresponding to the subscript, so that we
	 know its stride.  */
      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
	ref = TREE_OPERAND (ref, 0);

      if (TREE_CODE (ref) == ARRAY_REF)
	{
	  /* Element size of the array gives the byte stride of this
	     subscript; fall back on the cache line size if it is not a
	     compile-time constant.  */
	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
	  if (tree_fits_uhwi_p (stride))
	    astride = tree_to_uhwi (stride);
	  else
	    astride = param_l1_cache_line_size;

	  ref = TREE_OPERAND (ref, 0);
	}
      else
	astride = 1;

      add_subscript_strides (access_fn, astride, strides, n, loop);
    }

  /* Scan from the innermost loop outwards for the first level whose stride
     stays within a cache line but whose iteration volume is large enough to
     flush the nontemporal fraction of the L1 cache.  */
  for (i = n; i-- > 0; )
    {
      unsigned HOST_WIDE_INT s;

      s = strides[i] < 0 ?  -strides[i] : strides[i];

      if (s < (unsigned) param_l1_cache_line_size
	  && (loop_sizes[i]
	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
	{
	  ret = loop_sizes[i];
	  break;
	}
    }

  free (strides);
  return ret;
}
   1593  1.1  mrg 
   1594  1.1  mrg /* Determines the distance till the first reuse of each reference in REFS
   1595  1.1  mrg    in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
   1596  1.1  mrg    memory references in the loop.  Return false if the analysis fails.  */
   1597  1.1  mrg 
   1598  1.1  mrg static bool
   1599  1.1  mrg determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs,
   1600  1.1  mrg 			   bool no_other_refs)
   1601  1.1  mrg {
   1602  1.1  mrg   class loop *nest, *aloop;
   1603  1.1  mrg   vec<data_reference_p> datarefs = vNULL;
   1604  1.1  mrg   vec<ddr_p> dependences = vNULL;
   1605  1.1  mrg   struct mem_ref_group *gr;
   1606  1.1  mrg   struct mem_ref *ref, *refb;
   1607  1.1  mrg   auto_vec<loop_p> vloops;
   1608  1.1  mrg   unsigned *loop_data_size;
   1609  1.1  mrg   unsigned i, j, n;
   1610  1.1  mrg   unsigned volume, dist, adist;
   1611  1.1  mrg   HOST_WIDE_INT vol;
   1612  1.1  mrg   data_reference_p dr;
   1613  1.1  mrg   ddr_p dep;
   1614  1.1  mrg 
   1615  1.1  mrg   if (loop->inner)
   1616  1.1  mrg     return true;
   1617  1.1  mrg 
   1618  1.1  mrg   /* Find the outermost loop of the loop nest of loop (we require that
   1619  1.1  mrg      there are no sibling loops inside the nest).  */
   1620  1.1  mrg   nest = loop;
   1621  1.1  mrg   while (1)
   1622  1.1  mrg     {
   1623  1.1  mrg       aloop = loop_outer (nest);
   1624  1.1  mrg 
   1625  1.1  mrg       if (aloop == current_loops->tree_root
   1626  1.1  mrg 	  || aloop->inner->next)
   1627  1.1  mrg 	break;
   1628  1.1  mrg 
   1629  1.1  mrg       nest = aloop;
   1630  1.1  mrg     }
   1631  1.1  mrg 
   1632  1.1  mrg   /* For each loop, determine the amount of data accessed in each iteration.
   1633  1.1  mrg      We use this to estimate whether the reference is evicted from the
   1634  1.1  mrg      cache before its reuse.  */
   1635  1.1  mrg   find_loop_nest (nest, &vloops);
   1636  1.1  mrg   n = vloops.length ();
   1637  1.1  mrg   loop_data_size = XNEWVEC (unsigned, n);
   1638  1.1  mrg   volume = volume_of_references (refs);
   1639  1.1  mrg   i = n;
   1640  1.1  mrg   while (i-- != 0)
   1641  1.1  mrg     {
   1642  1.1  mrg       loop_data_size[i] = volume;
   1643  1.1  mrg       /* Bound the volume by the L2 cache size, since above this bound,
   1644  1.1  mrg 	 all dependence distances are equivalent.  */
   1645  1.1  mrg       if (volume > L2_CACHE_SIZE_BYTES)
   1646  1.1  mrg 	continue;
   1647  1.1  mrg 
   1648  1.1  mrg       aloop = vloops[i];
   1649  1.1  mrg       vol = estimated_stmt_executions_int (aloop);
   1650  1.1  mrg       if (vol == -1)
   1651  1.1  mrg 	vol = expected_loop_iterations (aloop);
   1652  1.1  mrg       volume *= vol;
   1653  1.1  mrg     }
   1654  1.1  mrg 
   1655  1.1  mrg   /* Prepare the references in the form suitable for data dependence
   1656  1.1  mrg      analysis.  We ignore unanalyzable data references (the results
   1657  1.1  mrg      are used just as a heuristics to estimate temporality of the
   1658  1.1  mrg      references, hence we do not need to worry about correctness).  */
   1659  1.1  mrg   for (gr = refs; gr; gr = gr->next)
   1660  1.1  mrg     for (ref = gr->refs; ref; ref = ref->next)
   1661  1.1  mrg       {
   1662  1.1  mrg 	dr = create_data_ref (loop_preheader_edge (nest),
   1663  1.1  mrg 			      loop_containing_stmt (ref->stmt),
   1664  1.1  mrg 			      ref->mem, ref->stmt, !ref->write_p, false);
   1665  1.1  mrg 
   1666  1.1  mrg 	if (dr)
   1667  1.1  mrg 	  {
   1668  1.1  mrg 	    ref->reuse_distance = volume;
   1669  1.1  mrg 	    dr->aux = ref;
   1670  1.1  mrg 	    datarefs.safe_push (dr);
   1671  1.1  mrg 	  }
   1672  1.1  mrg 	else
   1673  1.1  mrg 	  no_other_refs = false;
   1674  1.1  mrg       }
   1675  1.1  mrg 
   1676  1.1  mrg   FOR_EACH_VEC_ELT (datarefs, i, dr)
   1677  1.1  mrg     {
   1678  1.1  mrg       dist = self_reuse_distance (dr, loop_data_size, n, loop);
   1679  1.1  mrg       ref = (struct mem_ref *) dr->aux;
   1680  1.1  mrg       if (ref->reuse_distance > dist)
   1681  1.1  mrg 	ref->reuse_distance = dist;
   1682  1.1  mrg 
   1683  1.1  mrg       if (no_other_refs)
   1684  1.1  mrg 	ref->independent_p = true;
   1685  1.1  mrg     }
   1686  1.1  mrg 
   1687  1.1  mrg   if (!compute_all_dependences (datarefs, &dependences, vloops, true))
   1688  1.1  mrg     return false;
   1689  1.1  mrg 
   1690  1.1  mrg   FOR_EACH_VEC_ELT (dependences, i, dep)
   1691  1.1  mrg     {
   1692  1.1  mrg       if (DDR_ARE_DEPENDENT (dep) == chrec_known)
   1693  1.1  mrg 	continue;
   1694  1.1  mrg 
   1695  1.1  mrg       ref = (struct mem_ref *) DDR_A (dep)->aux;
   1696  1.1  mrg       refb = (struct mem_ref *) DDR_B (dep)->aux;
   1697  1.1  mrg 
   1698  1.1  mrg       if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
   1699  1.1  mrg 	  || DDR_COULD_BE_INDEPENDENT_P (dep)
   1700  1.1  mrg 	  || DDR_NUM_DIST_VECTS (dep) == 0)
   1701  1.1  mrg 	{
   1702  1.1  mrg 	  /* If the dependence cannot be analyzed, assume that there might be
   1703  1.1  mrg 	     a reuse.  */
   1704  1.1  mrg 	  dist = 0;
   1705  1.1  mrg 
   1706  1.1  mrg 	  ref->independent_p = false;
   1707  1.1  mrg 	  refb->independent_p = false;
   1708  1.1  mrg 	}
   1709  1.1  mrg       else
   1710  1.1  mrg 	{
   1711  1.1  mrg 	  /* The distance vectors are normalized to be always lexicographically
   1712  1.1  mrg 	     positive, hence we cannot tell just from them whether DDR_A comes
   1713  1.1  mrg 	     before DDR_B or vice versa.  However, it is not important,
   1714  1.1  mrg 	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
   1715  1.1  mrg 	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
   1716  1.1  mrg 	     in cache (and marking it as nontemporal would not affect
   1717  1.1  mrg 	     anything).  */
   1718  1.1  mrg 
   1719  1.1  mrg 	  dist = volume;
   1720  1.1  mrg 	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
   1721  1.1  mrg 	    {
   1722  1.1  mrg 	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
   1723  1.1  mrg 					     loop_data_size, n);
   1724  1.1  mrg 
   1725  1.1  mrg 	      /* If this is a dependence in the innermost loop (i.e., the
   1726  1.1  mrg 		 distances in all superloops are zero) and it is not
   1727  1.1  mrg 		 the trivial self-dependence with distance zero, record that
   1728  1.1  mrg 		 the references are not completely independent.  */
   1729  1.1  mrg 	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
   1730  1.1  mrg 		  && (ref != refb
   1731  1.1  mrg 		      || DDR_DIST_VECT (dep, j)[n-1] != 0))
   1732  1.1  mrg 		{
   1733  1.1  mrg 		  ref->independent_p = false;
   1734  1.1  mrg 		  refb->independent_p = false;
   1735  1.1  mrg 		}
   1736  1.1  mrg 
   1737  1.1  mrg 	      /* Ignore accesses closer than
   1738  1.1  mrg 		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
   1739  1.1  mrg 	      	 so that we use nontemporal prefetches e.g. if single memory
   1740  1.1  mrg 		 location is accessed several times in a single iteration of
   1741  1.1  mrg 		 the loop.  */
   1742  1.1  mrg 	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
   1743  1.1  mrg 		continue;
   1744  1.1  mrg 
   1745  1.1  mrg 	      if (adist < dist)
   1746  1.1  mrg 		dist = adist;
   1747  1.1  mrg 	    }
   1748  1.1  mrg 	}
   1749  1.1  mrg 
   1750  1.1  mrg       if (ref->reuse_distance > dist)
   1751  1.1  mrg 	ref->reuse_distance = dist;
   1752  1.1  mrg       if (refb->reuse_distance > dist)
   1753  1.1  mrg 	refb->reuse_distance = dist;
   1754  1.1  mrg     }
   1755  1.1  mrg 
   1756  1.1  mrg   free_dependence_relations (dependences);
   1757  1.1  mrg   free_data_refs (datarefs);
   1758  1.1  mrg   free (loop_data_size);
   1759  1.1  mrg 
   1760  1.1  mrg   if (dump_file && (dump_flags & TDF_DETAILS))
   1761  1.1  mrg     {
   1762  1.1  mrg       fprintf (dump_file, "Reuse distances:\n");
   1763  1.1  mrg       for (gr = refs; gr; gr = gr->next)
   1764  1.1  mrg 	for (ref = gr->refs; ref; ref = ref->next)
   1765  1.1  mrg 	  fprintf (dump_file, " reference %u:%u distance %u\n",
   1766  1.1  mrg 		   ref->group->uid, ref->uid, ref->reuse_distance);
   1767  1.1  mrg     }
   1768  1.1  mrg 
   1769  1.1  mrg   return true;
   1770  1.1  mrg }
   1771  1.1  mrg 
/* Determine whether or not the trip count to ahead ratio is too small based
   on profitability consideration.
   AHEAD: the iteration ahead distance,
   EST_NITER: the estimated trip count.  */
   1776  1.1  mrg 
   1777  1.1  mrg static bool
   1778  1.1  mrg trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
   1779  1.1  mrg {
   1780  1.1  mrg   /* Assume trip count to ahead ratio is big enough if the trip count could not
   1781  1.1  mrg      be estimated at compile time.  */
   1782  1.1  mrg   if (est_niter < 0)
   1783  1.1  mrg     return false;
   1784  1.1  mrg 
   1785  1.1  mrg   if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
   1786  1.1  mrg     {
   1787  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   1788  1.1  mrg 	fprintf (dump_file,
   1789  1.1  mrg 		 "Not prefetching -- loop estimated to roll only %d times\n",
   1790  1.1  mrg 		 (int) est_niter);
   1791  1.1  mrg       return true;
   1792  1.1  mrg     }
   1793  1.1  mrg 
   1794  1.1  mrg   return false;
   1795  1.1  mrg }
   1796  1.1  mrg 
/* Determine whether or not the number of memory references in the loop is
   reasonable based on the profitability and compilation time considerations.
   NINSNS: estimated number of instructions in the loop,
   MEM_REF_COUNT: total number of memory references in the loop.  */
   1801  1.1  mrg 
   1802  1.1  mrg static bool
   1803  1.1  mrg mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
   1804  1.1  mrg {
   1805  1.1  mrg   int insn_to_mem_ratio;
   1806  1.1  mrg 
   1807  1.1  mrg   if (mem_ref_count == 0)
   1808  1.1  mrg     return false;
   1809  1.1  mrg 
   1810  1.1  mrg   /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
   1811  1.1  mrg      (compute_all_dependences) have high costs based on quadratic complexity.
   1812  1.1  mrg      To avoid huge compilation time, we give up prefetching if mem_ref_count
   1813  1.1  mrg      is too large.  */
   1814  1.1  mrg   if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
   1815  1.1  mrg     return false;
   1816  1.1  mrg 
   1817  1.1  mrg   /* Prefetching improves performance by overlapping cache missing
   1818  1.1  mrg      memory accesses with CPU operations.  If the loop does not have
   1819  1.1  mrg      enough CPU operations to overlap with memory operations, prefetching
   1820  1.1  mrg      won't give a significant benefit.  One approximate way of checking
   1821  1.1  mrg      this is to require the ratio of instructions to memory references to
   1822  1.1  mrg      be above a certain limit.  This approximation works well in practice.
   1823  1.1  mrg      TODO: Implement a more precise computation by estimating the time
   1824  1.1  mrg      for each CPU or memory op in the loop. Time estimates for memory ops
   1825  1.1  mrg      should account for cache misses.  */
   1826  1.1  mrg   insn_to_mem_ratio = ninsns / mem_ref_count;
   1827  1.1  mrg 
   1828  1.1  mrg   if (insn_to_mem_ratio < param_prefetch_min_insn_to_mem_ratio)
   1829  1.1  mrg     {
   1830  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   1831  1.1  mrg         fprintf (dump_file,
   1832  1.1  mrg 		 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
   1833  1.1  mrg 		 insn_to_mem_ratio);
   1834  1.1  mrg       return false;
   1835  1.1  mrg     }
   1836  1.1  mrg 
   1837  1.1  mrg   return true;
   1838  1.1  mrg }
   1839  1.1  mrg 
/* Determine whether or not the instruction to prefetch ratio in the loop is
   too small based on the profitability consideration.
   NINSNS: estimated number of instructions in the loop,
   PREFETCH_COUNT: an estimate of the number of prefetches,
   UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */
   1845  1.1  mrg 
   1846  1.1  mrg static bool
   1847  1.1  mrg insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
   1848  1.1  mrg                                      unsigned unroll_factor)
   1849  1.1  mrg {
   1850  1.1  mrg   int insn_to_prefetch_ratio;
   1851  1.1  mrg 
   1852  1.1  mrg   /* Prefetching most likely causes performance degradation when the instruction
   1853  1.1  mrg      to prefetch ratio is too small.  Too many prefetch instructions in a loop
   1854  1.1  mrg      may reduce the I-cache performance.
   1855  1.1  mrg      (unroll_factor * ninsns) is used to estimate the number of instructions in
   1856  1.1  mrg      the unrolled loop.  This implementation is a bit simplistic -- the number
   1857  1.1  mrg      of issued prefetch instructions is also affected by unrolling.  So,
   1858  1.1  mrg      prefetch_mod and the unroll factor should be taken into account when
   1859  1.1  mrg      determining prefetch_count.  Also, the number of insns of the unrolled
   1860  1.1  mrg      loop will usually be significantly smaller than the number of insns of the
   1861  1.1  mrg      original loop * unroll_factor (at least the induction variable increases
   1862  1.1  mrg      and the exit branches will get eliminated), so it might be better to use
   1863  1.1  mrg      tree_estimate_loop_size + estimated_unrolled_size.  */
   1864  1.1  mrg   insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
   1865  1.1  mrg   if (insn_to_prefetch_ratio < param_min_insn_to_prefetch_ratio)
   1866  1.1  mrg     {
   1867  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   1868  1.1  mrg         fprintf (dump_file,
   1869  1.1  mrg 		 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
   1870  1.1  mrg 		 insn_to_prefetch_ratio);
   1871  1.1  mrg       return true;
   1872  1.1  mrg     }
   1873  1.1  mrg 
   1874  1.1  mrg   return false;
   1875  1.1  mrg }
   1876  1.1  mrg 
   1877  1.1  mrg 
   1878  1.1  mrg /* Issue prefetch instructions for array references in LOOP.  Returns
   1879  1.1  mrg    true if the LOOP was unrolled.  */
   1880  1.1  mrg 
static bool
loop_prefetch_arrays (class loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, time, unroll_factor;
  HOST_WIDE_INT est_niter;
  class tree_niter_desc desc;
  bool unrolled = false, no_other_refs;
  unsigned prefetch_count;
  unsigned mem_ref_count;

  /* Do not prefetch in cold areas -- the code growth is not worth it.  */
  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "  ignored (cold area)\n");
      return false;
    }

  /* FIXME: the time should be weighted by the probabilities of the blocks in
     the loop body.  */
  time = tree_num_loop_insns (loop, &eni_time_weights);
  if (time == 0)
    return false;

  /* AHEAD is the number of iterations needed to cover the prefetch
     latency, rounded up.  */
  ahead = (param_prefetch_latency + time - 1) / time;
  est_niter = estimated_stmt_executions_int (loop);
  if (est_niter == -1)
    est_niter = likely_max_stmt_executions_int (loop);

  /* Prefetching is not likely to be profitable if the trip count to ahead
     ratio is too small.  */
  if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
    return false;

  ninsns = tree_num_loop_insns (loop, &eni_size_weights);

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);

  /* Give up prefetching if the number of memory references in the
     loop is not reasonable based on profitability and compilation time
     considerations.  NOTE: from here on, bail out through FAIL so that
     REFS is released.  */
  if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
    goto fail;

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (nothing_to_prefetch_p (refs))
    goto fail;

  if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
    goto fail;

  /* Step 3: determine unroll factor.  */
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
					   est_niter);

  /* Estimate prefetch count for the unrolled loop.  */
  prefetch_count = estimate_prefetch_count (refs, unroll_factor);
  if (prefetch_count == 0)
    goto fail;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
	     HOST_WIDE_INT_PRINT_DEC "\n"
	     "insn count %d, mem ref count %d, prefetch count %d\n",
	     ahead, unroll_factor, est_niter,
	     ninsns, mem_ref_count, prefetch_count);

  /* Prefetching is not likely to be profitable if the instruction to prefetch
     ratio is too small.  */
  if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
					  unroll_factor))
    goto fail;

  mark_nontemporal_stores (loop, refs);

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loop, unroll_factor, &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

  /* The success path falls through to the cleanup label on purpose.  */
fail:
  release_mem_refs (refs);
  return unrolled;
}
   1978  1.1  mrg 
   1979  1.1  mrg /* Issue prefetch instructions for array references in loops.  */
   1980  1.1  mrg 
   1981  1.1  mrg unsigned int
   1982  1.1  mrg tree_ssa_prefetch_arrays (void)
   1983  1.1  mrg {
   1984  1.1  mrg   bool unrolled = false;
   1985  1.1  mrg   int todo_flags = 0;
   1986  1.1  mrg 
   1987  1.1  mrg   if (!targetm.have_prefetch ()
   1988  1.1  mrg       /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
   1989  1.1  mrg 	 -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
   1990  1.1  mrg 	 of processor costs and i486 does not have prefetch, but
   1991  1.1  mrg 	 -march=pentium4 causes targetm.have_prefetch to be true.  Ugh.  */
   1992  1.1  mrg       || PREFETCH_BLOCK == 0)
   1993  1.1  mrg     return 0;
   1994  1.1  mrg 
   1995  1.1  mrg   if (dump_file && (dump_flags & TDF_DETAILS))
   1996  1.1  mrg     {
   1997  1.1  mrg       fprintf (dump_file, "Prefetching parameters:\n");
   1998  1.1  mrg       fprintf (dump_file, "    simultaneous prefetches: %d\n",
   1999  1.1  mrg 	       param_simultaneous_prefetches);
   2000  1.1  mrg       fprintf (dump_file, "    prefetch latency: %d\n", param_prefetch_latency);
   2001  1.1  mrg       fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
   2002  1.1  mrg       fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
   2003  1.1  mrg 	       L1_CACHE_SIZE_BYTES / param_l1_cache_line_size,
   2004  1.1  mrg 	       param_l1_cache_size);
   2005  1.1  mrg       fprintf (dump_file, "    L1 cache line size: %d\n",
   2006  1.1  mrg 	       param_l1_cache_line_size);
   2007  1.1  mrg       fprintf (dump_file, "    L2 cache size: %d kB\n", param_l2_cache_size);
   2008  1.1  mrg       fprintf (dump_file, "    min insn-to-prefetch ratio: %d \n",
   2009  1.1  mrg 	       param_min_insn_to_prefetch_ratio);
   2010  1.1  mrg       fprintf (dump_file, "    min insn-to-mem ratio: %d \n",
   2011  1.1  mrg 	       param_prefetch_min_insn_to_mem_ratio);
   2012  1.1  mrg       fprintf (dump_file, "\n");
   2013  1.1  mrg     }
   2014  1.1  mrg 
   2015  1.1  mrg   initialize_original_copy_tables ();
   2016  1.1  mrg 
   2017  1.1  mrg   if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
   2018  1.1  mrg     {
   2019  1.1  mrg       tree type = build_function_type_list (void_type_node,
   2020  1.1  mrg 					    const_ptr_type_node, NULL_TREE);
   2021  1.1  mrg       tree decl = add_builtin_function ("__builtin_prefetch", type,
   2022  1.1  mrg 					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
   2023  1.1  mrg 					NULL, NULL_TREE);
   2024  1.1  mrg       DECL_IS_NOVOPS (decl) = true;
   2025  1.1  mrg       set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
   2026  1.1  mrg     }
   2027  1.1  mrg 
   2028  1.1  mrg   for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
   2029  1.1  mrg     {
   2030  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   2031  1.1  mrg 	fprintf (dump_file, "Processing loop %d:\n", loop->num);
   2032  1.1  mrg 
   2033  1.1  mrg       unrolled |= loop_prefetch_arrays (loop);
   2034  1.1  mrg 
   2035  1.1  mrg       if (dump_file && (dump_flags & TDF_DETAILS))
   2036  1.1  mrg 	fprintf (dump_file, "\n\n");
   2037  1.1  mrg     }
   2038  1.1  mrg 
   2039  1.1  mrg   if (unrolled)
   2040  1.1  mrg     {
   2041  1.1  mrg       scev_reset ();
   2042  1.1  mrg       todo_flags |= TODO_cleanup_cfg;
   2043  1.1  mrg     }
   2044  1.1  mrg 
   2045  1.1  mrg   free_original_copy_tables ();
   2046  1.1  mrg   return todo_flags;
   2047  1.1  mrg }
   2048  1.1  mrg 
   2049  1.1  mrg /* Prefetching.  */
   2050  1.1  mrg 
   2051  1.1  mrg namespace {
   2052  1.1  mrg 
   2053  1.1  mrg const pass_data pass_data_loop_prefetch =
   2054  1.1  mrg {
   2055  1.1  mrg   GIMPLE_PASS, /* type */
   2056  1.1  mrg   "aprefetch", /* name */
   2057  1.1  mrg   OPTGROUP_LOOP, /* optinfo_flags */
   2058  1.1  mrg   TV_TREE_PREFETCH, /* tv_id */
   2059  1.1  mrg   ( PROP_cfg | PROP_ssa ), /* properties_required */
   2060  1.1  mrg   0, /* properties_provided */
   2061  1.1  mrg   0, /* properties_destroyed */
   2062  1.1  mrg   0, /* todo_flags_start */
   2063  1.1  mrg   0, /* todo_flags_finish */
   2064  1.1  mrg };
   2065  1.1  mrg 
   2066  1.1  mrg class pass_loop_prefetch : public gimple_opt_pass
   2067  1.1  mrg {
   2068  1.1  mrg public:
   2069  1.1  mrg   pass_loop_prefetch (gcc::context *ctxt)
   2070  1.1  mrg     : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
   2071  1.1  mrg   {}
   2072  1.1  mrg 
   2073  1.1  mrg   /* opt_pass methods: */
   2074  1.1  mrg   virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
   2075  1.1  mrg   virtual unsigned int execute (function *);
   2076  1.1  mrg 
   2077  1.1  mrg }; // class pass_loop_prefetch
   2078  1.1  mrg 
   2079  1.1  mrg unsigned int
   2080  1.1  mrg pass_loop_prefetch::execute (function *fun)
   2081  1.1  mrg {
   2082  1.1  mrg   if (number_of_loops (fun) <= 1)
   2083  1.1  mrg     return 0;
   2084  1.1  mrg 
   2085  1.1  mrg   if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
   2086  1.1  mrg     {
   2087  1.1  mrg       static bool warned = false;
   2088  1.1  mrg 
   2089  1.1  mrg       if (!warned)
   2090  1.1  mrg 	{
   2091  1.1  mrg 	  warning (OPT_Wdisabled_optimization,
   2092  1.1  mrg 		   "%<l1-cache-size%> parameter is not a power of two %d",
   2093  1.1  mrg 		   PREFETCH_BLOCK);
   2094  1.1  mrg 	  warned = true;
   2095  1.1  mrg 	}
   2096  1.1  mrg       return 0;
   2097  1.1  mrg     }
   2098  1.1  mrg 
   2099  1.1  mrg   return tree_ssa_prefetch_arrays ();
   2100  1.1  mrg }
   2101  1.1  mrg 
   2102  1.1  mrg } // anon namespace
   2103  1.1  mrg 
   2104  1.1  mrg gimple_opt_pass *
   2105  1.1  mrg make_pass_loop_prefetch (gcc::context *ctxt)
   2106  1.1  mrg {
   2107  1.1  mrg   return new pass_loop_prefetch (ctxt);
   2108  1.1  mrg }
   2109  1.1  mrg 
   2110  1.1  mrg 
   2111