1/*
2 * Copyright 2012 Francisco Jerez
3 * Copyright 2015 Samuel Pitoiset
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining
6 * a copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial
15 * portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
21 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 *
25 */
26
27#include "nv50/nv50_context.h"
28#include "nv50/nv50_compute.xml.h"
29
30#include "codegen/nv50_ir_driver.h"
31
32int
33nv50_screen_compute_setup(struct nv50_screen *screen,
34                          struct nouveau_pushbuf *push)
35{
36   struct nouveau_device *dev = screen->base.device;
37   struct nouveau_object *chan = screen->base.channel;
38   struct nv04_fifo *fifo = (struct nv04_fifo *)chan->data;
39   unsigned obj_class;
40   int i, ret;
41
42   switch (dev->chipset & 0xf0) {
43   case 0x50:
44   case 0x80:
45   case 0x90:
46      obj_class = NV50_COMPUTE_CLASS;
47      break;
48   case 0xa0:
49      switch (dev->chipset) {
50      case 0xa3:
51      case 0xa5:
52      case 0xa8:
53         obj_class = NVA3_COMPUTE_CLASS;
54         break;
55      default:
56         obj_class = NV50_COMPUTE_CLASS;
57         break;
58      }
59      break;
60   default:
61      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
62      return -1;
63   }
64
65   ret = nouveau_object_new(chan, 0xbeef50c0, obj_class, NULL, 0,
66                            &screen->compute);
67   if (ret)
68      return ret;
69
70   BEGIN_NV04(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
71   PUSH_DATA (push, screen->compute->handle);
72
73   BEGIN_NV04(push, NV50_CP(UNK02A0), 1);
74   PUSH_DATA (push, 1);
75   BEGIN_NV04(push, NV50_CP(DMA_STACK), 1);
76   PUSH_DATA (push, fifo->vram);
77   BEGIN_NV04(push, NV50_CP(STACK_ADDRESS_HIGH), 2);
78   PUSH_DATAh(push, screen->stack_bo->offset);
79   PUSH_DATA (push, screen->stack_bo->offset);
80   BEGIN_NV04(push, NV50_CP(STACK_SIZE_LOG), 1);
81   PUSH_DATA (push, 4);
82
83   BEGIN_NV04(push, NV50_CP(UNK0290), 1);
84   PUSH_DATA (push, 1);
85   BEGIN_NV04(push, NV50_CP(LANES32_ENABLE), 1);
86   PUSH_DATA (push, 1);
87   BEGIN_NV04(push, NV50_CP(REG_MODE), 1);
88   PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
89   BEGIN_NV04(push, NV50_CP(UNK0384), 1);
90   PUSH_DATA (push, 0x100);
91   BEGIN_NV04(push, NV50_CP(DMA_GLOBAL), 1);
92   PUSH_DATA (push, fifo->vram);
93
94   for (i = 0; i < 15; i++) {
95      BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(i)), 2);
96      PUSH_DATA (push, 0);
97      PUSH_DATA (push, 0);
98      BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(i)), 1);
99      PUSH_DATA (push, 0);
100      BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(i)), 1);
101      PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
102   }
103
104   BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(15)), 2);
105   PUSH_DATA (push, 0);
106   PUSH_DATA (push, 0);
107   BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(15)), 1);
108   PUSH_DATA (push, ~0);
109   BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(15)), 1);
110   PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
111
112   BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_LOG_ALLOC), 1);
113   PUSH_DATA (push, 7);
114   BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_NO_CLAMP), 1);
115   PUSH_DATA (push, 1);
116   BEGIN_NV04(push, NV50_CP(STACK_WARPS_LOG_ALLOC), 1);
117   PUSH_DATA (push, 7);
118   BEGIN_NV04(push, NV50_CP(STACK_WARPS_NO_CLAMP), 1);
119   PUSH_DATA (push, 1);
120   BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
121   PUSH_DATA (push, 0);
122
123   BEGIN_NV04(push, NV50_CP(DMA_TEXTURE), 1);
124   PUSH_DATA (push, fifo->vram);
125   BEGIN_NV04(push, NV50_CP(TEX_LIMITS), 1);
126   PUSH_DATA (push, 0x54);
127   BEGIN_NV04(push, NV50_CP(LINKED_TSC), 1);
128   PUSH_DATA (push, 0);
129
130   BEGIN_NV04(push, NV50_CP(DMA_TIC), 1);
131   PUSH_DATA (push, fifo->vram);
132   BEGIN_NV04(push, NV50_CP(TIC_ADDRESS_HIGH), 3);
133   PUSH_DATAh(push, screen->txc->offset);
134   PUSH_DATA (push, screen->txc->offset);
135   PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
136
137   BEGIN_NV04(push, NV50_CP(DMA_TSC), 1);
138   PUSH_DATA (push, fifo->vram);
139   BEGIN_NV04(push, NV50_CP(TSC_ADDRESS_HIGH), 3);
140   PUSH_DATAh(push, screen->txc->offset + 65536);
141   PUSH_DATA (push, screen->txc->offset + 65536);
142   PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
143
144   BEGIN_NV04(push, NV50_CP(DMA_CODE_CB), 1);
145   PUSH_DATA (push, fifo->vram);
146
147   BEGIN_NV04(push, NV50_CP(DMA_LOCAL), 1);
148   PUSH_DATA (push, fifo->vram);
149   BEGIN_NV04(push, NV50_CP(LOCAL_ADDRESS_HIGH), 2);
150   PUSH_DATAh(push, screen->tls_bo->offset + 65536);
151   PUSH_DATA (push, screen->tls_bo->offset + 65536);
152   BEGIN_NV04(push, NV50_CP(LOCAL_SIZE_LOG), 1);
153   PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));
154
155   return 0;
156}
157
158static void
159nv50_compute_validate_globals(struct nv50_context *nv50)
160{
161   unsigned i;
162
163   for (i = 0; i < nv50->global_residents.size / sizeof(struct pipe_resource *);
164        ++i) {
165      struct pipe_resource *res = *util_dynarray_element(
166         &nv50->global_residents, struct pipe_resource *, i);
167      if (res)
168         nv50_add_bufctx_resident(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL,
169                                  nv04_resource(res), NOUVEAU_BO_RDWR);
170   }
171}
172
173static struct nv50_state_validate
174validate_list_cp[] = {
175   { nv50_compprog_validate,              NV50_NEW_CP_PROGRAM     },
176   { nv50_compute_validate_globals,       NV50_NEW_CP_GLOBALS     },
177};
178
179static bool
180nv50_state_validate_cp(struct nv50_context *nv50, uint32_t mask)
181{
182   bool ret;
183
184   /* TODO: validate textures, samplers, surfaces */
185   ret = nv50_state_validate(nv50, mask, validate_list_cp,
186                             ARRAY_SIZE(validate_list_cp), &nv50->dirty_cp,
187                             nv50->bufctx_cp);
188
189   if (unlikely(nv50->state.flushed))
190      nv50_bufctx_fence(nv50->bufctx_cp, true);
191   return ret;
192}
193
194static void
195nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
196{
197   struct nv50_screen *screen = nv50->screen;
198   struct nouveau_pushbuf *push = screen->base.pushbuf;
199   unsigned size = align(nv50->compprog->parm_size, 0x4);
200
201   BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
202   PUSH_DATA (push, (size / 4) << 8);
203
204   if (size) {
205      struct nouveau_mm_allocation *mm;
206      struct nouveau_bo *bo = NULL;
207      unsigned offset;
208
209      mm = nouveau_mm_allocate(screen->base.mm_GART, size, &bo, &offset);
210      assert(mm);
211
212      nouveau_bo_map(bo, 0, screen->base.client);
213      memcpy(bo->map + offset, input, size);
214
215      nouveau_bufctx_refn(nv50->bufctx, 0, bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
216      nouveau_pushbuf_bufctx(push, nv50->bufctx);
217      nouveau_pushbuf_validate(push);
218
219      BEGIN_NV04(push, NV50_CP(USER_PARAM(0)), size / 4);
220      nouveau_pushbuf_data(push, bo, offset, size);
221
222      nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
223      nouveau_bo_ref(NULL, &bo);
224      nouveau_bufctx_reset(nv50->bufctx, 0);
225   }
226}
227
228static uint32_t
229nv50_compute_find_symbol(struct nv50_context *nv50, uint32_t label)
230{
231   struct nv50_program *prog = nv50->compprog;
232   const struct nv50_ir_prog_symbol *syms =
233      (const struct nv50_ir_prog_symbol *)prog->cp.syms;
234   unsigned i;
235
236   for (i = 0; i < prog->cp.num_syms; ++i) {
237      if (syms[i].label == label)
238         return prog->code_base + syms[i].offset;
239   }
240   return prog->code_base; /* no symbols or symbol not found */
241}
242
243void
244nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
245{
246   struct nv50_context *nv50 = nv50_context(pipe);
247   struct nouveau_pushbuf *push = nv50->base.pushbuf;
248   unsigned block_size = info->block[0] * info->block[1] * info->block[2];
249   struct nv50_program *cp = nv50->compprog;
250   bool ret;
251
252   ret = !nv50_state_validate_cp(nv50, ~0);
253   if (ret) {
254      NOUVEAU_ERR("Failed to launch grid !\n");
255      return;
256   }
257
258   nv50_compute_upload_input(nv50, info->input);
259
260   BEGIN_NV04(push, NV50_CP(CP_START_ID), 1);
261   PUSH_DATA (push, nv50_compute_find_symbol(nv50, info->pc));
262
263   BEGIN_NV04(push, NV50_CP(SHARED_SIZE), 1);
264   PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
265   BEGIN_NV04(push, NV50_CP(CP_REG_ALLOC_TEMP), 1);
266   PUSH_DATA (push, cp->max_gpr);
267
268   /* grid/block setup */
269   BEGIN_NV04(push, NV50_CP(BLOCKDIM_XY), 2);
270   PUSH_DATA (push, info->block[1] << 16 | info->block[0]);
271   PUSH_DATA (push, info->block[2]);
272   BEGIN_NV04(push, NV50_CP(BLOCK_ALLOC), 1);
273   PUSH_DATA (push, 1 << 16 | block_size);
274   BEGIN_NV04(push, NV50_CP(BLOCKDIM_LATCH), 1);
275   PUSH_DATA (push, 1);
276   BEGIN_NV04(push, NV50_CP(GRIDDIM), 1);
277   PUSH_DATA (push, info->grid[1] << 16 | info->grid[0]);
278   BEGIN_NV04(push, NV50_CP(GRIDID), 1);
279   PUSH_DATA (push, 1);
280
281   /* kernel launching */
282   BEGIN_NV04(push, NV50_CP(LAUNCH), 1);
283   PUSH_DATA (push, 0);
284   BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
285   PUSH_DATA (push, 0);
286
287   /* bind a compute shader clobbers fragment shader state */
288   nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
289}
290