1/* 2 * Copyright 2010 Christoph Bumiller 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 20 * OTHER DEALINGS IN THE SOFTWARE. 21 */ 22 23#include "pipe/p_defines.h" 24 25#include "compiler/nir/nir.h" 26#include "tgsi/tgsi_ureg.h" 27 28#include "nvc0/nvc0_context.h" 29 30#include "codegen/nv50_ir_driver.h" 31#include "nvc0/nve4_compute.h" 32 33/* NOTE: Using a[0x270] in FP may cause an error even if we're using less than 34 * 124 scalar varying values. 
 */
/* Byte offset of TGSI input semantic (sn) instance (si) within the
 * hardware's per-vertex shader input space. */
static uint32_t
nvc0_shader_input_address(unsigned sn, unsigned si)
{
   switch (sn) {
   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
   case TGSI_SEMANTIC_PRIMID:        return 0x060;
   case TGSI_SEMANTIC_LAYER:         return 0x064;
   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
   case TGSI_SEMANTIC_PSIZE:         return 0x06c;
   case TGSI_SEMANTIC_POSITION:      return 0x070;
   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
   case TGSI_SEMANTIC_FOG:           return 0x2e8;
   case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
   case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
   case TGSI_SEMANTIC_PCOORD:        return 0x2e0;
   case TGSI_SEMANTIC_TESSCOORD:     return 0x2f0;
   case TGSI_SEMANTIC_INSTANCEID:    return 0x2f8;
   case TGSI_SEMANTIC_VERTEXID:      return 0x2fc;
   case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
   default:
      assert(!"invalid TGSI input semantic");
      return ~0;
   }
}

/* Byte offset of TGSI output semantic (sn) instance (si) within the
 * hardware's shader output space.  Mostly mirrors the input table above;
 * EDGEFLAG has no addressable slot (~0). */
static uint32_t
nvc0_shader_output_address(unsigned sn, unsigned si)
{
   switch (sn) {
   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
   case TGSI_SEMANTIC_PRIMID:        return 0x060;
   case TGSI_SEMANTIC_LAYER:         return 0x064;
   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
   case TGSI_SEMANTIC_PSIZE:         return 0x06c;
   case TGSI_SEMANTIC_POSITION:      return 0x070;
   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
   case TGSI_SEMANTIC_FOG:           return 0x2e8;
   case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
   case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
   case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
   case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
   /* case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0; */
   case TGSI_SEMANTIC_EDGEFLAG:      return ~0;
   default:
      assert(!"invalid TGSI output semantic");
      return ~0;
   }
}

/* Assign input slots for a vertex shader: system values get their fixed
 * address, everything else is packed contiguously from 0x80 in declaration
 * order (slot values are in 32-bit units, hence the / 4). */
static int
nvc0_vp_assign_input_slots(struct nv50_ir_prog_info *info)
{
   unsigned i, c, n;

   for (n = 0, i = 0; i < info->numInputs; ++i) {
      switch (info->in[i].sn) {
      case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
      case TGSI_SEMANTIC_VERTEXID:
         info->in[i].mask = 0x1;
         info->in[i].slot[0] =
            nvc0_shader_input_address(info->in[i].sn, 0) / 4;
         continue;
      default:
         break;
      }
      /* regular attribute: one 16-byte vec4 per input, packed from 0x80 */
      for (c = 0; c < 4; ++c)
         info->in[i].slot[c] = (0x80 + n * 0x10 + c * 0x4) / 4;
      ++n;
   }

   return 0;
}

/* Assign input slots for non-VS stages: every input lives at its fixed
 * semantic address. */
static int
nvc0_sp_assign_input_slots(struct nv50_ir_prog_info *info)
{
   unsigned offset;
   unsigned i, c;

   for (i = 0; i < info->numInputs; ++i) {
      offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);

      for (c = 0; c < 4; ++c)
         info->in[i].slot[c] = (offset + c * 0x4) / 4;
   }

   return 0;
}

/* Assign fragment shader output registers: colors first (densely packed),
 * then sample mask, then depth. */
static int
nvc0_fp_assign_output_slots(struct nv50_ir_prog_info *info)
{
   unsigned count = info->prop.fp.numColourResults * 4;
   unsigned i, c;

   /* Compute the relative position of each color output, since skipped MRT
    * positions will not have registers allocated to them.
    */
   unsigned colors[8] = {0};
   for (i = 0; i < info->numOutputs; ++i)
      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
         colors[info->out[i].si] = 1;
   for (i = 0, c = 0; i < 8; i++)
      if (colors[i])
         colors[i] = c++;
   for (i = 0; i < info->numOutputs; ++i)
      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
         for (c = 0; c < 4; ++c)
            info->out[i].slot[c] = colors[info->out[i].si] * 4 + c;

   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
      info->out[info->io.sampleMask].slot[0] = count++;
   else
   if (info->target >= 0xe0)
      count++; /* on Kepler, depth is always last colour reg + 2 */

   if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
      info->out[info->io.fragDepth].slot[2] = count;

   return 0;
}

/* Assign output slots for non-FS stages: fixed semantic addresses, as for
 * non-VS inputs. */
static int
nvc0_sp_assign_output_slots(struct nv50_ir_prog_info *info)
{
   unsigned offset;
   unsigned i, c;

   for (i = 0; i < info->numOutputs; ++i) {
      offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);

      for (c = 0; c < 4; ++c)
         info->out[i].slot[c] = (offset + c * 0x4) / 4;
   }

   return 0;
}

/* Callback handed to the code generator (info->assignSlots): picks the
 * right slot-assignment strategy for the shader stage. */
static int
nvc0_program_assign_varying_slots(struct nv50_ir_prog_info *info)
{
   int ret;

   if (info->type == PIPE_SHADER_VERTEX)
      ret = nvc0_vp_assign_input_slots(info);
   else
      ret = nvc0_sp_assign_input_slots(info);
   if (ret)
      return ret;

   if (info->type == PIPE_SHADER_FRAGMENT)
      ret = nvc0_fp_assign_output_slots(info);
   else
      ret = nvc0_sp_assign_output_slots(info);
   return ret;
}

/* Widen the min/max "parallel output read" range stored in hdr[4]
 * (min at bits 12..19, max at bits 24..31) to include slot. */
static inline void
nvc0_vtgp_hdr_update_oread(struct nvc0_program *vp, uint8_t slot)
{
   uint8_t min = (vp->hdr[4] >> 12) & 0xff;
   uint8_t max = (vp->hdr[4] >> 24);

   min = MIN2(min, slot);
   max = MAX2(max, slot);

   vp->hdr[4] = (max << 24) | (min << 12);
}

/* Common part of header generation for VP, TCP, TEP and GP.
 */
static int
nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
{
   unsigned i, c, a;

   /* Input read mask: one bit per 32-bit slot, starting at hdr[5].
    * Per-patch inputs are described elsewhere and skipped here. */
   for (i = 0; i < info->numInputs; ++i) {
      if (info->in[i].patch)
         continue;
      for (c = 0; c < 4; ++c) {
         a = info->in[i].slot[c];
         if (info->in[i].mask & (1 << c))
            vp->hdr[5 + a / 32] |= 1 << (a % 32);
      }
   }

   /* Output write mask starting at hdr[13]; slots below 0x40 bytes are not
    * representable here (hence the assert and the - 0x40/4 bias). */
   for (i = 0; i < info->numOutputs; ++i) {
      if (info->out[i].patch)
         continue;
      for (c = 0; c < 4; ++c) {
         if (!(info->out[i].mask & (1 << c)))
            continue;
         assert(info->out[i].slot[c] >= 0x40 / 4);
         a = info->out[i].slot[c] - 0x40 / 4;
         vp->hdr[13 + a / 32] |= 1 << (a % 32);
         if (info->out[i].oread)
            nvc0_vtgp_hdr_update_oread(vp, info->out[i].slot[c]);
      }
   }

   /* System values read by the shader set dedicated header bits. */
   for (i = 0; i < info->numSysVals; ++i) {
      switch (info->sv[i].sn) {
      case TGSI_SEMANTIC_PRIMID:
         vp->hdr[5] |= 1 << 24;
         break;
      case TGSI_SEMANTIC_INSTANCEID:
         vp->hdr[10] |= 1 << 30;
         break;
      case TGSI_SEMANTIC_VERTEXID:
         vp->hdr[10] |= 1 << 31;
         break;
      case TGSI_SEMANTIC_TESSCOORD:
         /* We don't have the mask, nor the slots populated. While this could
          * be achieved, the vast majority of the time if either of the coords
          * are read, then both will be read.
          */
         nvc0_vtgp_hdr_update_oread(vp, 0x2f0 / 4);
         nvc0_vtgp_hdr_update_oread(vp, 0x2f4 / 4);
         break;
      default:
         break;
      }
   }

   /* Clip/cull enable bitmasks: clip distances occupy the low bits, cull
    * distances follow; clip_mode marks each cull distance with a nibble. */
   vp->vp.clip_enable = (1 << info->io.clipDistances) - 1;
   vp->vp.cull_enable =
      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
   for (i = 0; i < info->io.cullDistances; ++i)
      vp->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);

   if (info->io.genUserClip < 0)
      vp->vp.num_ucps = PIPE_MAX_CLIP_PLANES + 1; /* prevent rebuilding */

   return 0;
}

/* Vertex program header: stage id 1, plus the common VTG parts. */
static int
nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
{
   vp->hdr[0] = 0x20061 | (1 << 10);
   vp->hdr[4] = 0xff000; /* initial min/max parallel output read address */

   return nvc0_vtgp_gen_header(vp, info);
}

/* Derive the TESS_MODE method value (domain, connectivity, winding,
 * spacing) from the tessellation properties; ~0 means "not yet known"
 * (e.g. a TCS with no associated TES information). */
static void
nvc0_tp_get_tess_mode(struct nvc0_program *tp, struct nv50_ir_prog_info *info)
{
   if (info->prop.tp.outputPrim == PIPE_PRIM_MAX) {
      tp->tp.tess_mode = ~0;
      return;
   }
   switch (info->prop.tp.domain) {
   case PIPE_PRIM_LINES:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_ISOLINES;
      break;
   case PIPE_PRIM_TRIANGLES:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_TRIANGLES;
      break;
   case PIPE_PRIM_QUADS:
      tp->tp.tess_mode = NVC0_3D_TESS_MODE_PRIM_QUADS;
      break;
   default:
      tp->tp.tess_mode = ~0;
      return;
   }

   /* It seems like lines want the "CW" bit to indicate they're connected, and
    * spit out errors in dmesg when the "CONNECTED" bit is set.
    */
   if (info->prop.tp.outputPrim != PIPE_PRIM_POINTS) {
      if (info->prop.tp.domain == PIPE_PRIM_LINES)
         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;
      else
         tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CONNECTED;
   }

   /* Winding only matters for triangles/quads, not lines.
 */
   if (info->prop.tp.domain != PIPE_PRIM_LINES &&
       info->prop.tp.outputPrim != PIPE_PRIM_POINTS &&
       info->prop.tp.winding > 0)
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_CW;

   switch (info->prop.tp.partitioning) {
   case PIPE_TESS_SPACING_EQUAL:
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_EQUAL;
      break;
   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_ODD;
      break;
   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
      tp->tp.tess_mode |= NVC0_3D_TESS_MODE_SPACING_FRACTIONAL_EVEN;
      break;
   default:
      assert(!"invalid tessellator partitioning");
      break;
   }
}

/* Tess control program header: stage id 2, output patch constant count
 * in hdr[1], output patch size in hdr[2]. */
static int
nvc0_tcp_gen_header(struct nvc0_program *tcp, struct nv50_ir_prog_info *info)
{
   unsigned opcs = 6; /* output patch constants (at least the TessFactors) */

   tcp->tp.input_patch_size = info->prop.tp.inputPatchSize;

   if (info->numPatchConstants)
      opcs = 8 + info->numPatchConstants * 4;

   tcp->hdr[0] = 0x20061 | (2 << 10);

   tcp->hdr[1] = opcs << 24;
   tcp->hdr[2] = info->prop.tp.outputPatchSize << 24;

   tcp->hdr[4] = 0xff000; /* initial min/max parallel output read address */

   nvc0_vtgp_gen_header(tcp, info);

   if (info->target >= NVISA_GM107_CHIPSET) {
      /* On GM107+, the number of output patch components has moved in the TCP
       * header, but it seems like blob still also uses the old position.
       * Also, the high 8-bits are located inbetween the min/max parallel
       * field and has to be set after updating the outputs.
       */
      tcp->hdr[3] = (opcs & 0x0f) << 28;
      tcp->hdr[4] |= (opcs & 0xf0) << 16;
   }

   nvc0_tp_get_tess_mode(tcp, info);

   return 0;
}

/* Tess eval program header: stage id 3. */
static int
nvc0_tep_gen_header(struct nvc0_program *tep, struct nv50_ir_prog_info *info)
{
   tep->tp.input_patch_size = ~0;

   tep->hdr[0] = 0x20061 | (3 << 10);
   tep->hdr[4] = 0xff000; /* initial min/max parallel output read address */

   nvc0_vtgp_gen_header(tep, info);

   nvc0_tp_get_tess_mode(tep, info);

   tep->hdr[18] |= 0x3 << 12; /* ? */

   return 0;
}

/* Geometry program header: stage id 4; hdr[3] encodes the output primitive
 * type, hdr[2] the instance count, hdr[4] the max vertex count. */
static int
nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
{
   gp->hdr[0] = 0x20061 | (4 << 10);

   gp->hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;

   switch (info->prop.gp.outputPrim) {
   case PIPE_PRIM_POINTS:
      gp->hdr[3] = 0x01000000;
      gp->hdr[0] |= 0xf0000000;
      break;
   case PIPE_PRIM_LINE_STRIP:
      gp->hdr[3] = 0x06000000;
      gp->hdr[0] |= 0x10000000;
      break;
   case PIPE_PRIM_TRIANGLE_STRIP:
      gp->hdr[3] = 0x07000000;
      gp->hdr[0] |= 0x10000000;
      break;
   default:
      assert(0);
      break;
   }

   gp->hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024);

   return nvc0_vtgp_gen_header(gp, info);
}

/* 2-bit interpolation mode as encoded in the FP header's per-slot map. */
#define NVC0_INTERP_FLAT          (1 << 0)
#define NVC0_INTERP_PERSPECTIVE   (2 << 0)
#define NVC0_INTERP_LINEAR        (3 << 0)
#define NVC0_INTERP_CENTROID      (1 << 2)

/* Interpolation mode for a varying; perspective unless declared flat or
 * (noperspective) linear. */
static uint8_t
nvc0_hdr_interp_mode(const struct nv50_ir_varying *var)
{
   if (var->linear)
      return NVC0_INTERP_LINEAR;
   if (var->flat)
      return NVC0_INTERP_FLAT;
   return NVC0_INTERP_PERSPECTIVE;
}

/* Fragment program header: stage id 5, plus discard/MRT/sample-mask/depth
 * flags and the per-input interpolation map. */
static int
nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
{
   unsigned i, c, a, m;

   /* just 00062 on Kepler */
   fp->hdr[0] = 0x20062 | (5 << 10);
   fp->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */

   if (info->prop.fp.usesDiscard)
      fp->hdr[0] |= 0x8000;
   if (info->prop.fp.numColourResults > 1)
      fp->hdr[0] |= 0x4000;
   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS)
      fp->hdr[19] |= 0x1;
   if (info->prop.fp.writesDepth) {
      fp->hdr[19] |= 0x2;
      fp->flags[0] = 0x11; /* deactivate ZCULL */
   }

   for (i = 0; i < info->numInputs; ++i) {
      m = nvc0_hdr_interp_mode(&info->in[i]);
      if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
         fp->fp.colors |= 1 << info->in[i].si;
         if (info->in[i].sc)
            fp->fp.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
      }
      for (c = 0; c < 4; ++c) {
         if (!(info->in[i].mask & (1 << c)))
            continue;
         a = info->in[i].slot[c];
         /* slots 0x060..0x07c (PrimID/Layer/Viewport/PSize/Position):
          * single read-enable bits in hdr[5] */
         if (info->in[i].slot[0] >= (0x060 / 4) &&
             info->in[i].slot[0] <= (0x07c / 4)) {
            fp->hdr[5] |= 1 << (24 + (a - 0x060 / 4));
         } else
         /* slots 0x2c0..0x2fc (clip distances etc.): bits in hdr[14] */
         if (info->in[i].slot[0] >= (0x2c0 / 4) &&
             info->in[i].slot[0] <= (0x2fc / 4)) {
            fp->hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;
         } else {
            /* generic varyings: 2-bit interp mode per slot from hdr[4] on */
            if (info->in[i].slot[c] < (0x040 / 4) ||
                info->in[i].slot[c] > (0x380 / 4))
               continue;
            a *= 2;
            if (info->in[i].slot[0] >= (0x300 / 4))
               a -= 32;
            fp->hdr[4 + a / 32] |= m << (a % 32);
         }
      }
   }
   /* GM20x+ needs TGSI_SEMANTIC_POSITION to access sample locations */
   if (info->prop.fp.readsSampleLocations && info->target >= NVISA_GM200_CHIPSET)
      fp->hdr[5] |= 0x30000000;

   for (i = 0; i < info->numOutputs; ++i) {
      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
         fp->hdr[18] |= 0xf << (4 * info->out[i].si);
   }

   /* There are no "regular" attachments, but the shader still needs to be
    * executed. It seems like it wants to think that it has some color
    * outputs in order to actually run.
497 */ 498 if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth) 499 fp->hdr[18] |= 0xf; 500 501 fp->fp.early_z = info->prop.fp.earlyFragTests; 502 fp->fp.sample_mask_in = info->prop.fp.usesSampleMaskIn; 503 fp->fp.reads_framebuffer = info->prop.fp.readsFramebuffer; 504 fp->fp.post_depth_coverage = info->prop.fp.postDepthCoverage; 505 506 /* Mark position xy and layer as read */ 507 if (fp->fp.reads_framebuffer) 508 fp->hdr[5] |= 0x32000000; 509 510 return 0; 511} 512 513static struct nvc0_transform_feedback_state * 514nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info, 515 const struct pipe_stream_output_info *pso) 516{ 517 struct nvc0_transform_feedback_state *tfb; 518 unsigned b, i, c; 519 520 tfb = MALLOC_STRUCT(nvc0_transform_feedback_state); 521 if (!tfb) 522 return NULL; 523 for (b = 0; b < 4; ++b) { 524 tfb->stride[b] = pso->stride[b] * 4; 525 tfb->varying_count[b] = 0; 526 } 527 memset(tfb->varying_index, 0xff, sizeof(tfb->varying_index)); /* = skip */ 528 529 for (i = 0; i < pso->num_outputs; ++i) { 530 unsigned s = pso->output[i].start_component; 531 unsigned p = pso->output[i].dst_offset; 532 const unsigned r = pso->output[i].register_index; 533 b = pso->output[i].output_buffer; 534 535 if (r >= info->numOutputs) 536 continue; 537 538 for (c = 0; c < pso->output[i].num_components; ++c) 539 tfb->varying_index[b][p++] = info->out[r].slot[s + c]; 540 541 tfb->varying_count[b] = MAX2(tfb->varying_count[b], p); 542 tfb->stream[b] = pso->output[i].stream; 543 } 544 for (b = 0; b < 4; ++b) // zero unused indices (looks nicer) 545 for (c = tfb->varying_count[b]; c & 3; ++c) 546 tfb->varying_index[b][c] = 0; 547 548 return tfb; 549} 550 551#ifdef DEBUG 552static void 553nvc0_program_dump(struct nvc0_program *prog) 554{ 555 unsigned pos; 556 557 if (prog->type != PIPE_SHADER_COMPUTE) { 558 debug_printf("dumping HDR for type %i\n", prog->type); 559 for (pos = 0; pos < ARRAY_SIZE(prog->hdr); ++pos) 560 
debug_printf("HDR[%02"PRIxPTR"] = 0x%08x\n", 561 pos * sizeof(prog->hdr[0]), prog->hdr[pos]); 562 } 563 debug_printf("shader binary code (0x%x bytes):", prog->code_size); 564 for (pos = 0; pos < prog->code_size / 4; ++pos) { 565 if ((pos % 8) == 0) 566 debug_printf("\n"); 567 debug_printf("%08x ", prog->code[pos]); 568 } 569 debug_printf("\n"); 570} 571#endif 572 573bool 574nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, 575 struct pipe_debug_callback *debug) 576{ 577 struct nv50_ir_prog_info *info; 578 int ret; 579 580 info = CALLOC_STRUCT(nv50_ir_prog_info); 581 if (!info) 582 return false; 583 584 info->type = prog->type; 585 info->target = chipset; 586 587 info->bin.sourceRep = prog->pipe.type; 588 switch (prog->pipe.type) { 589 case PIPE_SHADER_IR_TGSI: 590 info->bin.source = (void *)prog->pipe.tokens; 591 break; 592 case PIPE_SHADER_IR_NIR: 593 info->bin.source = (void *)nir_shader_clone(NULL, prog->pipe.ir.nir); 594 break; 595 default: 596 assert(!"unsupported IR!"); 597 free(info); 598 return false; 599 } 600 601#ifdef DEBUG 602 info->target = debug_get_num_option("NV50_PROG_CHIPSET", chipset); 603 info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3); 604 info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0); 605 info->omitLineNum = debug_get_num_option("NV50_PROG_DEBUG_OMIT_LINENUM", 0); 606#else 607 info->optLevel = 3; 608#endif 609 610 info->bin.smemSize = prog->cp.smem_size; 611 info->io.genUserClip = prog->vp.num_ucps; 612 info->io.auxCBSlot = 15; 613 info->io.msInfoCBSlot = 15; 614 info->io.ucpBase = NVC0_CB_AUX_UCP_INFO; 615 info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO; 616 info->io.msInfoBase = NVC0_CB_AUX_MS_INFO; 617 info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0); 618 info->io.suInfoBase = NVC0_CB_AUX_SU_INFO(0); 619 if (info->target >= NVISA_GK104_CHIPSET) { 620 info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0); 621 info->io.fbtexBindBase = NVC0_CB_AUX_FB_TEX_INFO; 622 info->io.bindlessBase = 
NVC0_CB_AUX_BINDLESS_INFO(0); 623 } 624 625 if (prog->type == PIPE_SHADER_COMPUTE) { 626 if (info->target >= NVISA_GK104_CHIPSET) { 627 info->io.auxCBSlot = 7; 628 info->io.msInfoCBSlot = 7; 629 info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0); 630 } 631 info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO(0); 632 } else { 633 info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO; 634 } 635 636 info->assignSlots = nvc0_program_assign_varying_slots; 637 638 ret = nv50_ir_generate_code(info); 639 if (ret) { 640 NOUVEAU_ERR("shader translation failed: %i\n", ret); 641 goto out; 642 } 643 if (prog->type != PIPE_SHADER_COMPUTE) 644 FREE(info->bin.syms); 645 646 prog->code = info->bin.code; 647 prog->code_size = info->bin.codeSize; 648 prog->relocs = info->bin.relocData; 649 prog->fixups = info->bin.fixupData; 650 prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1)); 651 prog->cp.smem_size = info->bin.smemSize; 652 prog->num_barriers = info->numBarriers; 653 654 prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS; 655 prog->vp.need_draw_parameters = info->prop.vp.usesDrawParameters; 656 657 if (info->io.edgeFlagOut < PIPE_MAX_ATTRIBS) 658 info->out[info->io.edgeFlagOut].mask = 0; /* for headergen */ 659 prog->vp.edgeflag = info->io.edgeFlagIn; 660 661 switch (prog->type) { 662 case PIPE_SHADER_VERTEX: 663 ret = nvc0_vp_gen_header(prog, info); 664 break; 665 case PIPE_SHADER_TESS_CTRL: 666 ret = nvc0_tcp_gen_header(prog, info); 667 break; 668 case PIPE_SHADER_TESS_EVAL: 669 ret = nvc0_tep_gen_header(prog, info); 670 break; 671 case PIPE_SHADER_GEOMETRY: 672 ret = nvc0_gp_gen_header(prog, info); 673 break; 674 case PIPE_SHADER_FRAGMENT: 675 ret = nvc0_fp_gen_header(prog, info); 676 break; 677 case PIPE_SHADER_COMPUTE: 678 prog->cp.syms = info->bin.syms; 679 prog->cp.num_syms = info->bin.numSyms; 680 break; 681 default: 682 ret = -1; 683 NOUVEAU_ERR("unknown program type: %u\n", prog->type); 684 break; 685 } 686 if (ret) 687 goto out; 688 689 if (info->bin.tlsSpace) { 
      assert(info->bin.tlsSpace < (1 << 24));
      prog->hdr[0] |= 1 << 26;
      prog->hdr[1] |= align(info->bin.tlsSpace, 0x10); /* l[] size */
      prog->need_tls = true;
   }
   /* TODO: factor 2 only needed where joinat/precont is used,
    * and we only have to count non-uniform branches
    */
   /*
   if ((info->maxCFDepth * 2) > 16) {
      prog->hdr[2] |= (((info->maxCFDepth * 2) + 47) / 48) * 0x200;
      prog->need_tls = true;
   }
   */
   /* global memory access also requires the "uses local memory" bit */
   if (info->io.globalAccess)
      prog->hdr[0] |= 1 << 26;
   if (info->io.globalAccess & 0x2)
      prog->hdr[0] |= 1 << 16;
   if (info->io.fp64)
      prog->hdr[0] |= 1 << 27;

   if (prog->pipe.stream_output.num_outputs)
      prog->tfb = nvc0_program_create_tfb_state(info,
                                                &prog->pipe.stream_output);

   pipe_debug_message(debug, SHADER_INFO,
                      "type: %d, local: %d, shared: %d, gpr: %d, inst: %d, bytes: %d",
                      prog->type, info->bin.tlsSpace, info->bin.smemSize,
                      prog->num_gprs, info->bin.instructions,
                      info->bin.codeSize);

#ifdef DEBUG
   if (debug_get_option("NV50_PROG_CHIPSET", NULL) && info->dbgFlags)
      nvc0_program_dump(prog);
#endif

out:
   /* the NIR clone made above is owned by us; TGSI tokens are not */
   if (info->bin.sourceRep == PIPE_SHADER_IR_NIR)
      ralloc_free((void *)info->bin.source);
   FREE(info);
   return !ret;
}

/* Reserve space in the code segment heap for prog (header + code) and
 * compute prog->code_base honouring the per-chipset start alignment.
 * Returns 0 on success, non-zero if the heap is full. */
static inline int
nvc0_program_alloc_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
{
   struct nvc0_screen *screen = nvc0->screen;
   const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
   int ret;
   uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);

   /* On Fermi, SP_START_ID must be aligned to 0x40.
    * On Kepler, the first instruction must be aligned to 0x80 because
    * latency information is expected only at certain positions.
    */
   if (screen->base.class_3d >= NVE4_3D_CLASS)
      size = size + (is_cp ? 0x40 : 0x70); /* worst-case alignment padding */
   size = align(size, 0x40);

   ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);
   if (ret)
      return ret;
   prog->code_base = prog->mem->start;

   if (!is_cp) {
      if (screen->base.class_3d >= NVE4_3D_CLASS) {
         /* place the header so that code (header + 0x20 bytes into the
          * header area) starts on an 0x80 boundary */
         switch (prog->mem->start & 0xff) {
         case 0x40: prog->code_base += 0x70; break;
         case 0x80: prog->code_base += 0x30; break;
         case 0xc0: prog->code_base += 0x70; break;
         default:
            prog->code_base += 0x30;
            assert((prog->mem->start & 0xff) == 0x00);
            break;
         }
      }
   } else {
      if (screen->base.class_3d >= NVE4_3D_CLASS) {
         /* compute has no header; the code itself must be 0x80-aligned */
         if (prog->mem->start & 0x40)
            prog->code_base += 0x40;
         assert((prog->code_base & 0x7f) == 0x00);
      }
   }

   return 0;
}

/* Copy prog's header and (relocated/fixed-up) code into the code segment
 * at the previously allocated position. */
static inline void
nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
{
   struct nvc0_screen *screen = nvc0->screen;
   const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
   uint32_t code_pos = prog->code_base + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);

   if (prog->relocs)
      nv50_ir_relocate_code(prog->relocs, prog->code, code_pos,
                            screen->lib_code->start, 0);
   if (prog->fixups) {
      nv50_ir_apply_fixups(prog->fixups, prog->code,
                           prog->fp.force_persample_interp,
                           prog->fp.flatshade,
                           0 /* alphatest */);
      /* Rewrite the FP header's COLOR0/COLOR1 interpolation nibbles to match
       * the current flatshade state (color_interp: mask in high 4 bits,
       * mode in low 2). */
      for (int i = 0; i < 2; i++) {
         unsigned mask = prog->fp.color_interp[i] >> 4;
         unsigned interp = prog->fp.color_interp[i] & 3;
         if (!mask)
            continue;
         prog->hdr[14] &= ~(0xff << (8 * i));
         if (prog->fp.flatshade)
            interp = NVC0_INTERP_FLAT;
         for (int c = 0; c < 4; c++)
            if (mask & (1 << c))
               prog->hdr[14] |= interp << (2 * (4 * i + c));
      }
   }

   if (!is_cp)
      nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
                           NV_VRAM_DOMAIN(&screen->base),
                           NVC0_SHADER_HEADER_SIZE, prog->hdr);

   nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
                        NV_VRAM_DOMAIN(&screen->base), prog->code_size,
                        prog->code);
}

/* Ensure prog is resident in the code segment, evicting and re-uploading
 * all bound shaders (and growing the segment) if it does not fit. */
bool
nvc0_program_upload(struct nvc0_context *nvc0, struct nvc0_program *prog)
{
   struct nvc0_screen *screen = nvc0->screen;
   const bool is_cp = prog->type == PIPE_SHADER_COMPUTE;
   int ret;
   uint32_t size = prog->code_size + (is_cp ? 0 : NVC0_SHADER_HEADER_SIZE);

   ret = nvc0_program_alloc_code(nvc0, prog);
   if (ret) {
      struct nouveau_heap *heap = screen->text_heap;
      struct nvc0_program *progs[] = { /* Sorted accordingly to SP_START_ID */
         nvc0->compprog, nvc0->vertprog, nvc0->tctlprog,
         nvc0->tevlprog, nvc0->gmtyprog, nvc0->fragprog
      };

      /* Note that the code library, which is allocated before anything else,
       * does not have a priv pointer. We can stop once we hit it.
       */
      while (heap->next && heap->next->priv) {
         struct nvc0_program *evict = heap->next->priv;
         nouveau_heap_free(&evict->mem);
      }
      debug_printf("WARNING: out of code space, evicting all shaders.\n");

      /* Make sure to synchronize before deleting the code segment. */
      IMMED_NVC0(nvc0->base.pushbuf, NVC0_3D(SERIALIZE), 0);

      /* try to double the code segment, capped at 8 MiB */
      if ((screen->text->size << 1) <= (1 << 23)) {
         ret = nvc0_screen_resize_text_area(screen, screen->text->size << 1);
         if (ret) {
            NOUVEAU_ERR("Error allocating TEXT area: %d\n", ret);
            return false;
         }

         /* Re-upload the builtin function into the new code segment. */
         nvc0_program_library_upload(nvc0);
      }

      ret = nvc0_program_alloc_code(nvc0, prog);
      if (ret) {
         NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
         return false;
      }

      /* All currently bound shaders have to be reuploaded. */
      for (int i = 0; i < ARRAY_SIZE(progs); i++) {
         if (!progs[i] || progs[i] == prog)
            continue;

         ret = nvc0_program_alloc_code(nvc0, progs[i]);
         if (ret) {
            NOUVEAU_ERR("failed to re-upload a shader after code eviction.\n");
            return false;
         }
         nvc0_program_upload_code(nvc0, progs[i]);

         if (progs[i]->type == PIPE_SHADER_COMPUTE) {
            /* Caches have to be invalidated but the CP_START_ID will be
             * updated in the launch_grid functions.
             */
            BEGIN_NVC0(nvc0->base.pushbuf, NVC0_CP(FLUSH), 1);
            PUSH_DATA (nvc0->base.pushbuf, NVC0_COMPUTE_FLUSH_CODE);
         } else {
            BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(SP_START_ID(i)), 1);
            PUSH_DATA (nvc0->base.pushbuf, progs[i]->code_base);
         }
      }
   }

   nvc0_program_upload_code(nvc0, prog);

#ifdef DEBUG
   if (debug_get_bool_option("NV50_PROG_DEBUG", false))
      nvc0_program_dump(prog);
#endif

   /* flush the instruction cache so the new code is fetched */
   BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
   PUSH_DATA (nvc0->base.pushbuf, 0x1011);

   return true;
}

/* Upload code for builtin functions like integer division emulation. */
void
nvc0_program_library_upload(struct nvc0_context *nvc0)
{
   struct nvc0_screen *screen = nvc0->screen;
   int ret;
   uint32_t size;
   const uint32_t *code;

   if (screen->lib_code)
      return;

   nv50_ir_get_target_library(screen->base.device->chipset, &code, &size);
   if (!size)
      return;

   ret = nouveau_heap_alloc(screen->text_heap, align(size, 0x100), NULL,
                            &screen->lib_code);
   if (ret)
      return;

   nvc0->base.push_data(&nvc0->base,
      screen->text, screen->lib_code->start, NV_VRAM_DOMAIN(&screen->base),
      size, code);
   /* no need for a memory barrier, will be emitted with first program */
}

/* Release all resources held by prog, then reset it to an empty program
 * that keeps only the original pipe shader state and type (so it can be
 * re-translated later). */
void
nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
{
   const struct pipe_shader_state pipe = prog->pipe;
   const ubyte type = prog->type;

   if (prog->mem)
      nouveau_heap_free(&prog->mem);
   FREE(prog->code); /* may be 0 for hardcoded shaders */
   FREE(prog->relocs);
   FREE(prog->fixups);
   if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
      FREE(prog->cp.syms);
   if (prog->tfb) {
      if (nvc0->state.tfb == prog->tfb)
         nvc0->state.tfb = NULL;
      FREE(prog->tfb);
   }

   memset(prog, 0, sizeof(*prog));

   prog->pipe = pipe;
   prog->type = type;
}

/* Resolve a compute symbol label to its absolute offset in the code
 * segment; falls back to the program start if the label is unknown. */
uint32_t
nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
{
   const struct nv50_ir_prog_symbol *syms =
      (const struct nv50_ir_prog_symbol *)prog->cp.syms;
   unsigned base = 0;
   unsigned i;
   if (prog->type != PIPE_SHADER_COMPUTE)
      base = NVC0_SHADER_HEADER_SIZE;
   for (i = 0; i < prog->cp.num_syms; ++i)
      if (syms[i].label == label)
         return prog->code_base + base + syms[i].offset;
   return prog->code_base; /* no symbols or symbol not found */
}

/* Create the pass-through TCS used when the application binds a TES
 * without a TCS of its own. */
void
nvc0_program_init_tcp_empty(struct nvc0_context *nvc0)
{
   struct ureg_program *ureg;

   ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
   if (!ureg)
      return;

   ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1);
   ureg_END(ureg);

   nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe);
}