1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33
34#include "intel_batchbuffer.h"
35#include "brw_context.h"
36#include "brw_state.h"
37#include "brw_defines.h"
38
39#define VS 0
40#define GS 1
41#define CLP 2
42#define SF 3
43#define CS 4
44
45/** @file brw_urb.c
46 *
47 * Manages the division of the URB space between the various fixed-function
48 * units.
49 *
50 * See the Thread Initiation Management section of the GEN4 B-Spec, and
51 * the individual *_STATE structures for restrictions on numbers of
52 * entries and threads.
53 */
54
55/*
56 * Generally, a unit requires a min_nr_entries based on how many entries
57 * it produces before the downstream unit gets unblocked and can use and
58 * dereference some of its handles.
59 *
60 * The SF unit preallocates a PUE at the start of thread dispatch, and only
61 * uses that one.  So it requires one entry per thread.
62 *
63 * For CLIP, the SF unit will hold the previous primitive while the
64 * next is getting assembled, meaning that linestrips require 3 CLIP VUEs
65 * (vertices) to ensure continued processing, trifans require 4, and tristrips
66 * require 5.  There can be 1 or 2 threads, and each has the same requirement.
67 *
68 * GS has the same requirement as CLIP, but it never handles tristrips,
69 * so we can lower the minimum to 4 for the POLYGONs (trifans) it produces.
70 * We only run it single-threaded.
71 *
72 * For VS, the number of entries may be 8, 12, 16, or 32 (or 64 on G4X).
73 * Each thread processes 2 preallocated VUEs (vertices) at a time, and they
74 * get streamed down as soon as threads processing earlier vertices get
75 * theirs accepted.
76 *
77 * Each unit will take the number of URB entries we give it (based on the
78 * entry size calculated in brw_vs_emit.c for VUEs, brw_sf_emit.c for PUEs,
79 * and brw_curbe.c for the CURBEs) and decide its maximum number of
 * threads it can support based on that, in brw_*_state.c.
81 *
82 * XXX: Are the min_entry_size numbers useful?
83 * XXX: Verify min_nr_entries, esp for VS.
84 * XXX: Verify SF min_entry_size.
85 */
/* Per-unit URB allocation limits, indexed by the VS..CS defines above.
 * See the long comment above for where each number comes from.
 */
static const struct {
   GLuint min_nr_entries;       /* fewest entries the unit can operate with */
   GLuint preferred_nr_entries; /* entry count to try first, space permitting */
   GLuint min_entry_size;       /* smallest entry size we will program */
   GLuint max_entry_size;       /* largest entry size we will program */
} limits[CS+1] = {
   { 16, 32, 1, 5 },			/* vs */
   { 4, 8,  1, 5 },			/* gs */
   { 5, 10,  1, 5 },			/* clp */
   { 1, 8,  1, 12 },		        /* sf */
   { 1, 4,  1, 32 }			/* cs */
};
98
99
100static bool check_urb_layout(struct brw_context *brw)
101{
102   brw->urb.vs_start = 0;
103   brw->urb.gs_start = brw->urb.nr_vs_entries * brw->urb.vsize;
104   brw->urb.clip_start = brw->urb.gs_start + brw->urb.nr_gs_entries * brw->urb.vsize;
105   brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize;
106   brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize;
107
108   return brw->urb.cs_start + brw->urb.nr_cs_entries *
109      brw->urb.csize <= brw->urb.size;
110}
111
/* Most minimal update, forces re-emit of URB fence packet after GS
 * unit turned on/off.
 */

/**
 * Choose the number of URB entries for each fixed-function unit given the
 * per-entry sizes, and flag BRW_NEW_URB_FENCE when the division changes.
 *
 * \param csize  CURBE entry size (clamped up to limits[CS].min_entry_size)
 * \param vsize  VUE size, used for the VS/GS/CLIP regions
 * \param sfsize PUE size, used for the SF region
 *
 * Nothing is recalculated unless a size grew, or we are in constrained mode
 * and a size shrank (giving us a chance to escape constrained mode).
 */
void
brw_calculate_urb_fence(struct brw_context *brw, unsigned csize,
                        unsigned vsize, unsigned sfsize)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* Clamp the incoming entry sizes to each unit's minimum. */
   if (csize < limits[CS].min_entry_size)
      csize = limits[CS].min_entry_size;

   if (vsize < limits[VS].min_entry_size)
      vsize = limits[VS].min_entry_size;

   if (sfsize < limits[SF].min_entry_size)
      sfsize = limits[SF].min_entry_size;

   if (brw->urb.vsize < vsize ||
       brw->urb.sfsize < sfsize ||
       brw->urb.csize < csize ||
       (brw->urb.constrained && (brw->urb.vsize > vsize ||
				 brw->urb.sfsize > sfsize ||
				 brw->urb.csize > csize))) {


      brw->urb.csize = csize;
      brw->urb.sfsize = sfsize;
      brw->urb.vsize = vsize;

      /* Start from each unit's preferred entry count and shrink only if
       * the layout doesn't fit.
       */
      brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
      brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries;
      brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
      brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
      brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;

      brw->urb.constrained = 0;

      /* Gen5 has a larger URB, so first try much larger VS/SF counts. */
      if (devinfo->gen == 5) {
         brw->urb.nr_vs_entries = 128;
         brw->urb.nr_sf_entries = 48;
         if (check_urb_layout(brw)) {
            goto done;
         } else {
            brw->urb.constrained = 1;
            brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
            brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
         }
      } else if (devinfo->is_g4x) {
	 /* G4X supports up to 64 VS entries (see the comment above). */
	 brw->urb.nr_vs_entries = 64;
	 if (check_urb_layout(brw)) {
	    goto done;
	 } else {
	    brw->urb.constrained = 1;
	    brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
	 }
      }

      /* Preferred counts didn't fit; fall back to each unit's minimum. */
      if (!check_urb_layout(brw)) {
	 brw->urb.nr_vs_entries = limits[VS].min_nr_entries;
	 brw->urb.nr_gs_entries = limits[GS].min_nr_entries;
	 brw->urb.nr_clip_entries = limits[CLP].min_nr_entries;
	 brw->urb.nr_sf_entries = limits[SF].min_nr_entries;
	 brw->urb.nr_cs_entries = limits[CS].min_nr_entries;

	 /* Mark us as operating with constrained nr_entries, so that next
	  * time we recalculate we'll resize the fences in the hope of
	  * escaping constrained mode and getting back to normal performance.
	  */
	 brw->urb.constrained = 1;

	 if (!check_urb_layout(brw)) {
	    /* This is impossible, given the maximal sizes of urb
	     * entries and the values for minimum nr of entries
	     * provided above.
	     */
	    fprintf(stderr, "couldn't calculate URB layout!\n");
	    exit(1);
	 }

	 if (unlikely(INTEL_DEBUG & (DEBUG_URB|DEBUG_PERF)))
	    fprintf(stderr, "URB CONSTRAINED\n");
      }

done:
      if (unlikely(INTEL_DEBUG & DEBUG_URB))
	 fprintf(stderr,
                 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
                 brw->urb.vs_start,
                 brw->urb.gs_start,
                 brw->urb.clip_start,
                 brw->urb.sf_start,
                 brw->urb.cs_start,
                 brw->urb.size);

      brw->ctx.NewDriverState |= BRW_NEW_URB_FENCE;
   }
}
210
211static void recalculate_urb_fence( struct brw_context *brw )
212{
213   brw_calculate_urb_fence(brw, brw->curbe.total_size,
214                           brw_vue_prog_data(brw->vs.base.prog_data)->urb_entry_size,
215                           brw->sf.prog_data->urb_entry_size);
216}
217
218
/* Tracked-state atom: re-runs recalculate_urb_fence() when any of the
 * inputs it reads may have changed (push constant allocation for the CURBE
 * size, VS/SF program data for the VUE/PUE entry sizes).
 */
const struct brw_tracked_state brw_recalculate_urb_fence = {
   .dirty = {
      .mesa = 0,
      .brw = BRW_NEW_BLORP |
             BRW_NEW_PUSH_CONSTANT_ALLOCATION |
             BRW_NEW_SF_PROG_DATA |
             BRW_NEW_VS_PROG_DATA,
   },
   .emit = recalculate_urb_fence
};
229
230
231
232
233
234void brw_upload_urb_fence(struct brw_context *brw)
235{
236   struct brw_urb_fence uf;
237   memset(&uf, 0, sizeof(uf));
238
239   uf.header.opcode = CMD_URB_FENCE;
240   uf.header.length = sizeof(uf)/4-2;
241   uf.header.vs_realloc = 1;
242   uf.header.gs_realloc = 1;
243   uf.header.clp_realloc = 1;
244   uf.header.sf_realloc = 1;
245   uf.header.vfe_realloc = 1;
246   uf.header.cs_realloc = 1;
247
248   /* The ordering below is correct, not the layout in the
249    * instruction.
250    *
251    * There are 256/384 urb reg pairs in total.
252    */
253   uf.bits0.vs_fence  = brw->urb.gs_start;
254   uf.bits0.gs_fence  = brw->urb.clip_start;
255   uf.bits0.clp_fence = brw->urb.sf_start;
256   uf.bits1.sf_fence  = brw->urb.cs_start;
257   uf.bits1.cs_fence  = brw->urb.size;
258
259   /* erratum: URB_FENCE must not cross a 64byte cacheline */
260   if ((USED_BATCH(brw->batch) & 15) > 12) {
261      int pad = 16 - (USED_BATCH(brw->batch) & 15);
262      do
263         *brw->batch.map_next++ = MI_NOOP;
264      while (--pad);
265   }
266
267   intel_batchbuffer_data(brw, &uf, sizeof(uf));
268}
269