1/* 2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 */ 23#include "r600_sq.h" 24#include "r600_opcodes.h" 25#include "r600_formats.h" 26#include "r600_shader.h" 27#include "r600d.h" 28 29#include <errno.h> 30#include "util/u_bitcast.h" 31#include "util/u_dump.h" 32#include "util/u_memory.h" 33#include "util/u_math.h" 34#include "pipe/p_shader_tokens.h" 35 36#include "sb/sb_public.h" 37 38#define NUM_OF_CYCLES 3 39#define NUM_OF_COMPONENTS 4 40 41static inline bool alu_writes(struct r600_bytecode_alu *alu) 42{ 43 return alu->dst.write || alu->is_op3; 44} 45 46static inline unsigned int r600_bytecode_get_num_operands(const struct r600_bytecode_alu *alu) 47{ 48 return r600_isa_alu(alu->op)->src_count; 49} 50 51static struct r600_bytecode_cf *r600_bytecode_cf(void) 52{ 53 struct r600_bytecode_cf *cf = CALLOC_STRUCT(r600_bytecode_cf); 54 55 if (!cf) 56 return NULL; 57 list_inithead(&cf->list); 58 list_inithead(&cf->alu); 59 list_inithead(&cf->vtx); 60 list_inithead(&cf->tex); 61 list_inithead(&cf->gds); 62 return cf; 63} 64 65static struct r600_bytecode_alu *r600_bytecode_alu(void) 66{ 67 struct r600_bytecode_alu *alu = CALLOC_STRUCT(r600_bytecode_alu); 68 69 if (!alu) 70 return NULL; 71 list_inithead(&alu->list); 72 return alu; 73} 74 75static struct r600_bytecode_vtx *r600_bytecode_vtx(void) 76{ 77 struct r600_bytecode_vtx *vtx = CALLOC_STRUCT(r600_bytecode_vtx); 78 79 if (!vtx) 80 return NULL; 81 list_inithead(&vtx->list); 82 return vtx; 83} 84 85static struct r600_bytecode_tex *r600_bytecode_tex(void) 86{ 87 struct r600_bytecode_tex *tex = CALLOC_STRUCT(r600_bytecode_tex); 88 89 if (!tex) 90 return NULL; 91 list_inithead(&tex->list); 92 return tex; 93} 94 95static struct r600_bytecode_gds *r600_bytecode_gds(void) 96{ 97 struct r600_bytecode_gds *gds = CALLOC_STRUCT(r600_bytecode_gds); 98 99 if (gds == NULL) 100 return NULL; 101 list_inithead(&gds->list); 102 return gds; 103} 104 105static unsigned stack_entry_size(enum radeon_family chip) { 106 /* Wavefront size: 107 * 64: 
R600/RV670/RV770/Cypress/R740/Barts/Turks/Caicos/ 108 * Aruba/Sumo/Sumo2/redwood/juniper 109 * 32: R630/R730/R710/Palm/Cedar 110 * 16: R610/Rs780 111 * 112 * Stack row size: 113 * Wavefront Size 16 32 48 64 114 * Columns per Row (R6xx/R7xx/R8xx only) 8 8 4 4 115 * Columns per Row (R9xx+) 8 4 4 4 */ 116 117 switch (chip) { 118 /* FIXME: are some chips missing here? */ 119 /* wavefront size 16 */ 120 case CHIP_RV610: 121 case CHIP_RS780: 122 case CHIP_RV620: 123 case CHIP_RS880: 124 /* wavefront size 32 */ 125 case CHIP_RV630: 126 case CHIP_RV635: 127 case CHIP_RV730: 128 case CHIP_RV710: 129 case CHIP_PALM: 130 case CHIP_CEDAR: 131 return 8; 132 133 /* wavefront size 64 */ 134 default: 135 return 4; 136 } 137} 138 139void r600_bytecode_init(struct r600_bytecode *bc, 140 enum chip_class chip_class, 141 enum radeon_family family, 142 bool has_compressed_msaa_texturing) 143{ 144 static unsigned next_shader_id = 0; 145 146 bc->debug_id = ++next_shader_id; 147 148 if ((chip_class == R600) && 149 (family != CHIP_RV670 && family != CHIP_RS780 && family != CHIP_RS880)) { 150 bc->ar_handling = AR_HANDLE_RV6XX; 151 bc->r6xx_nop_after_rel_dst = 1; 152 } else { 153 bc->ar_handling = AR_HANDLE_NORMAL; 154 bc->r6xx_nop_after_rel_dst = 0; 155 } 156 157 list_inithead(&bc->cf); 158 bc->chip_class = chip_class; 159 bc->family = family; 160 bc->has_compressed_msaa_texturing = has_compressed_msaa_texturing; 161 bc->stack.entry_size = stack_entry_size(family); 162} 163 164int r600_bytecode_add_cf(struct r600_bytecode *bc) 165{ 166 struct r600_bytecode_cf *cf = r600_bytecode_cf(); 167 168 if (!cf) 169 return -ENOMEM; 170 list_addtail(&cf->list, &bc->cf); 171 if (bc->cf_last) { 172 cf->id = bc->cf_last->id + 2; 173 if (bc->cf_last->eg_alu_extended) { 174 /* take into account extended alu size */ 175 cf->id += 2; 176 bc->ndw += 2; 177 } 178 } 179 bc->cf_last = cf; 180 bc->ncf++; 181 bc->ndw += 2; 182 bc->force_add_cf = 0; 183 bc->ar_loaded = 0; 184 return 0; 185} 186 187int 
r600_bytecode_add_output(struct r600_bytecode *bc, 188 const struct r600_bytecode_output *output) 189{ 190 int r; 191 192 if (output->gpr >= bc->ngpr) 193 bc->ngpr = output->gpr + 1; 194 195 if (bc->cf_last && (bc->cf_last->op == output->op || 196 (bc->cf_last->op == CF_OP_EXPORT && 197 output->op == CF_OP_EXPORT_DONE)) && 198 output->type == bc->cf_last->output.type && 199 output->elem_size == bc->cf_last->output.elem_size && 200 output->swizzle_x == bc->cf_last->output.swizzle_x && 201 output->swizzle_y == bc->cf_last->output.swizzle_y && 202 output->swizzle_z == bc->cf_last->output.swizzle_z && 203 output->swizzle_w == bc->cf_last->output.swizzle_w && 204 output->comp_mask == bc->cf_last->output.comp_mask && 205 (output->burst_count + bc->cf_last->output.burst_count) <= 16) { 206 207 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr && 208 (output->array_base + output->burst_count) == bc->cf_last->output.array_base) { 209 210 bc->cf_last->op = bc->cf_last->output.op = output->op; 211 bc->cf_last->output.gpr = output->gpr; 212 bc->cf_last->output.array_base = output->array_base; 213 bc->cf_last->output.burst_count += output->burst_count; 214 return 0; 215 216 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) && 217 output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) { 218 219 bc->cf_last->op = bc->cf_last->output.op = output->op; 220 bc->cf_last->output.burst_count += output->burst_count; 221 return 0; 222 } 223 } 224 225 r = r600_bytecode_add_cf(bc); 226 if (r) 227 return r; 228 bc->cf_last->op = output->op; 229 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bytecode_output)); 230 bc->cf_last->barrier = 1; 231 return 0; 232} 233 234int r600_bytecode_add_pending_output(struct r600_bytecode *bc, 235 const struct r600_bytecode_output *output) 236{ 237 assert(bc->n_pending_outputs + 1 < ARRAY_SIZE(bc->pending_outputs)); 238 bc->pending_outputs[bc->n_pending_outputs++] 
= *output; 239 240 return 0; 241} 242 243void r600_bytecode_need_wait_ack(struct r600_bytecode *bc, boolean need_wait_ack) 244{ 245 bc->need_wait_ack = need_wait_ack; 246} 247 248boolean r600_bytecode_get_need_wait_ack(struct r600_bytecode *bc) 249{ 250 return bc->need_wait_ack; 251} 252 253/* alu instructions that can ony exits once per group */ 254static int is_alu_once_inst(struct r600_bytecode_alu *alu) 255{ 256 return r600_isa_alu(alu->op)->flags & (AF_KILL | AF_PRED) || alu->is_lds_idx_op || alu->op == ALU_OP0_GROUP_BARRIER; 257} 258 259static int is_alu_reduction_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 260{ 261 return (r600_isa_alu(alu->op)->flags & AF_REPL) && 262 (r600_isa_alu_slots(bc->isa->hw_class, alu->op) == AF_4V); 263} 264 265static int is_alu_mova_inst(struct r600_bytecode_alu *alu) 266{ 267 return r600_isa_alu(alu->op)->flags & AF_MOVA; 268} 269 270static int alu_uses_rel(struct r600_bytecode_alu *alu) 271{ 272 unsigned num_src = r600_bytecode_get_num_operands(alu); 273 unsigned src; 274 275 if (alu->dst.rel) { 276 return 1; 277 } 278 279 for (src = 0; src < num_src; ++src) { 280 if (alu->src[src].rel) { 281 return 1; 282 } 283 } 284 return 0; 285} 286 287static int is_lds_read(int sel) 288{ 289 return sel == EG_V_SQ_ALU_SRC_LDS_OQ_A_POP || sel == EG_V_SQ_ALU_SRC_LDS_OQ_B_POP; 290} 291 292static int alu_uses_lds(struct r600_bytecode_alu *alu) 293{ 294 unsigned num_src = r600_bytecode_get_num_operands(alu); 295 unsigned src; 296 297 for (src = 0; src < num_src; ++src) { 298 if (is_lds_read(alu->src[src].sel)) { 299 return 1; 300 } 301 } 302 return 0; 303} 304 305static int is_alu_64bit_inst(struct r600_bytecode_alu *alu) 306{ 307 const struct alu_op_info *op = r600_isa_alu(alu->op); 308 return (op->flags & AF_64); 309} 310 311static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 312{ 313 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 314 return !(slots & AF_S); 315} 316 
317static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 318{ 319 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 320 return !(slots & AF_V); 321} 322 323/* alu instructions that can execute on any unit */ 324static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) 325{ 326 unsigned slots = r600_isa_alu_slots(bc->isa->hw_class, alu->op); 327 return slots == AF_VS; 328} 329 330static int is_nop_inst(struct r600_bytecode_alu *alu) 331{ 332 return alu->op == ALU_OP0_NOP; 333} 334 335static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first, 336 struct r600_bytecode_alu *assignment[5]) 337{ 338 struct r600_bytecode_alu *alu; 339 unsigned i, chan, trans; 340 int max_slots = bc->chip_class == CAYMAN ? 4 : 5; 341 342 for (i = 0; i < max_slots; i++) 343 assignment[i] = NULL; 344 345 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bytecode_alu, alu->list.next, list)) { 346 chan = alu->dst.chan; 347 if (max_slots == 4) 348 trans = 0; 349 else if (is_alu_trans_unit_inst(bc, alu)) 350 trans = 1; 351 else if (is_alu_vec_unit_inst(bc, alu)) 352 trans = 0; 353 else if (assignment[chan]) 354 trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */ 355 else 356 trans = 0; 357 358 if (trans) { 359 if (assignment[4]) { 360 assert(0); /* ALU.Trans has already been allocated. */ 361 return -1; 362 } 363 assignment[4] = alu; 364 } else { 365 if (assignment[chan]) { 366 assert(0); /* ALU.chan has already been allocated. 
*/ 367 return -1; 368 } 369 assignment[chan] = alu; 370 } 371 372 if (alu->last) 373 break; 374 } 375 return 0; 376} 377 378struct alu_bank_swizzle { 379 int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS]; 380 int hw_cfile_addr[4]; 381 int hw_cfile_elem[4]; 382}; 383 384static const unsigned cycle_for_bank_swizzle_vec[][3] = { 385 [SQ_ALU_VEC_012] = { 0, 1, 2 }, 386 [SQ_ALU_VEC_021] = { 0, 2, 1 }, 387 [SQ_ALU_VEC_120] = { 1, 2, 0 }, 388 [SQ_ALU_VEC_102] = { 1, 0, 2 }, 389 [SQ_ALU_VEC_201] = { 2, 0, 1 }, 390 [SQ_ALU_VEC_210] = { 2, 1, 0 } 391}; 392 393static const unsigned cycle_for_bank_swizzle_scl[][3] = { 394 [SQ_ALU_SCL_210] = { 2, 1, 0 }, 395 [SQ_ALU_SCL_122] = { 1, 2, 2 }, 396 [SQ_ALU_SCL_212] = { 2, 1, 2 }, 397 [SQ_ALU_SCL_221] = { 2, 2, 1 } 398}; 399 400static void init_bank_swizzle(struct alu_bank_swizzle *bs) 401{ 402 int i, cycle, component; 403 /* set up gpr use */ 404 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++) 405 for (component = 0; component < NUM_OF_COMPONENTS; component++) 406 bs->hw_gpr[cycle][component] = -1; 407 for (i = 0; i < 4; i++) 408 bs->hw_cfile_addr[i] = -1; 409 for (i = 0; i < 4; i++) 410 bs->hw_cfile_elem[i] = -1; 411} 412 413static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle) 414{ 415 if (bs->hw_gpr[cycle][chan] == -1) 416 bs->hw_gpr[cycle][chan] = sel; 417 else if (bs->hw_gpr[cycle][chan] != (int)sel) { 418 /* Another scalar operation has already used the GPR read port for the channel. 
*/ 419 return -1; 420 } 421 return 0; 422} 423 424static int reserve_cfile(const struct r600_bytecode *bc, 425 struct alu_bank_swizzle *bs, unsigned sel, unsigned chan) 426{ 427 int res, num_res = 4; 428 if (bc->chip_class >= R700) { 429 num_res = 2; 430 chan /= 2; 431 } 432 for (res = 0; res < num_res; ++res) { 433 if (bs->hw_cfile_addr[res] == -1) { 434 bs->hw_cfile_addr[res] = sel; 435 bs->hw_cfile_elem[res] = chan; 436 return 0; 437 } else if (bs->hw_cfile_addr[res] == sel && 438 bs->hw_cfile_elem[res] == chan) 439 return 0; /* Read for this scalar element already reserved, nothing to do here. */ 440 } 441 /* All cfile read ports are used, cannot reference vector element. */ 442 return -1; 443} 444 445static int is_gpr(unsigned sel) 446{ 447 return (sel <= 127); 448} 449 450/* CB constants start at 512, and get translated to a kcache index when ALU 451 * clauses are constructed. Note that we handle kcache constants the same way 452 * as (the now gone) cfile constants, is that really required? */ 453static int is_cfile(unsigned sel) 454{ 455 return (sel > 255 && sel < 512) || 456 (sel > 511 && sel < 4607) || /* Kcache before translation. */ 457 (sel > 127 && sel < 192); /* Kcache after translation. */ 458} 459 460static int is_const(int sel) 461{ 462 return is_cfile(sel) || 463 (sel >= V_SQ_ALU_SRC_0 && 464 sel <= V_SQ_ALU_SRC_LITERAL); 465} 466 467static int check_vector(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, 468 struct alu_bank_swizzle *bs, int bank_swizzle) 469{ 470 int r, src, num_src, sel, elem, cycle; 471 472 num_src = r600_bytecode_get_num_operands(alu); 473 for (src = 0; src < num_src; src++) { 474 sel = alu->src[src].sel; 475 elem = alu->src[src].chan; 476 if (is_gpr(sel)) { 477 cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src]; 478 if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan) 479 /* Nothing to do; special-case optimization, 480 * second source uses first source’s reservation. 
*/ 481 continue; 482 else { 483 r = reserve_gpr(bs, sel, elem, cycle); 484 if (r) 485 return r; 486 } 487 } else if (is_cfile(sel)) { 488 r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem); 489 if (r) 490 return r; 491 } 492 /* No restrictions on PV, PS, literal or special constants. */ 493 } 494 return 0; 495} 496 497static int check_scalar(const struct r600_bytecode *bc, const struct r600_bytecode_alu *alu, 498 struct alu_bank_swizzle *bs, int bank_swizzle) 499{ 500 int r, src, num_src, const_count, sel, elem, cycle; 501 502 num_src = r600_bytecode_get_num_operands(alu); 503 for (const_count = 0, src = 0; src < num_src; ++src) { 504 sel = alu->src[src].sel; 505 elem = alu->src[src].chan; 506 if (is_const(sel)) { /* Any constant, including literal and inline constants. */ 507 if (const_count >= 2) 508 /* More than two references to a constant in 509 * transcendental operation. */ 510 return -1; 511 else 512 const_count++; 513 } 514 if (is_cfile(sel)) { 515 r = reserve_cfile(bc, bs, (alu->src[src].kc_bank<<16) + sel, elem); 516 if (r) 517 return r; 518 } 519 } 520 for (src = 0; src < num_src; ++src) { 521 sel = alu->src[src].sel; 522 elem = alu->src[src].chan; 523 if (is_gpr(sel)) { 524 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src]; 525 if (cycle < const_count) 526 /* Cycle for GPR load conflicts with 527 * constant load in transcendental operation. */ 528 return -1; 529 r = reserve_gpr(bs, sel, elem, cycle); 530 if (r) 531 return r; 532 } 533 /* PV PS restrictions */ 534 if (const_count && (sel == 254 || sel == 255)) { 535 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src]; 536 if (cycle < const_count) 537 return -1; 538 } 539 } 540 return 0; 541} 542 543static int check_and_set_bank_swizzle(const struct r600_bytecode *bc, 544 struct r600_bytecode_alu *slots[5]) 545{ 546 struct alu_bank_swizzle bs; 547 int bank_swizzle[5]; 548 int i, r = 0, forced = 1; 549 boolean scalar_only = bc->chip_class == CAYMAN ? 
false : true; 550 int max_slots = bc->chip_class == CAYMAN ? 4 : 5; 551 552 for (i = 0; i < max_slots; i++) { 553 if (slots[i]) { 554 if (slots[i]->bank_swizzle_force) { 555 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force; 556 } else { 557 forced = 0; 558 } 559 } 560 561 if (i < 4 && slots[i]) 562 scalar_only = false; 563 } 564 if (forced) 565 return 0; 566 567 /* Just check every possible combination of bank swizzle. 568 * Not very efficent, but works on the first try in most of the cases. */ 569 for (i = 0; i < 4; i++) 570 if (!slots[i] || !slots[i]->bank_swizzle_force) 571 bank_swizzle[i] = SQ_ALU_VEC_012; 572 else 573 bank_swizzle[i] = slots[i]->bank_swizzle; 574 575 bank_swizzle[4] = SQ_ALU_SCL_210; 576 while(bank_swizzle[4] <= SQ_ALU_SCL_221) { 577 578 init_bank_swizzle(&bs); 579 if (scalar_only == false) { 580 for (i = 0; i < 4; i++) { 581 if (slots[i]) { 582 r = check_vector(bc, slots[i], &bs, bank_swizzle[i]); 583 if (r) 584 break; 585 } 586 } 587 } else 588 r = 0; 589 590 if (!r && max_slots == 5 && slots[4]) { 591 r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]); 592 } 593 if (!r) { 594 for (i = 0; i < max_slots; i++) { 595 if (slots[i]) 596 slots[i]->bank_swizzle = bank_swizzle[i]; 597 } 598 return 0; 599 } 600 601 if (scalar_only) { 602 bank_swizzle[4]++; 603 } else { 604 for (i = 0; i < max_slots; i++) { 605 if (!slots[i] || !slots[i]->bank_swizzle_force) { 606 bank_swizzle[i]++; 607 if (bank_swizzle[i] <= SQ_ALU_VEC_210) 608 break; 609 else if (i < max_slots - 1) 610 bank_swizzle[i] = SQ_ALU_VEC_012; 611 else 612 return -1; 613 } 614 } 615 } 616 } 617 618 /* Couldn't find a working swizzle. */ 619 return -1; 620} 621 622static int replace_gpr_with_pv_ps(struct r600_bytecode *bc, 623 struct r600_bytecode_alu *slots[5], struct r600_bytecode_alu *alu_prev) 624{ 625 struct r600_bytecode_alu *prev[5]; 626 int gpr[5], chan[5]; 627 int i, j, r, src, num_src; 628 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 629 630 r = assign_alu_units(bc, alu_prev, prev); 631 if (r) 632 return r; 633 634 for (i = 0; i < max_slots; ++i) { 635 if (prev[i] && alu_writes(prev[i]) && !prev[i]->dst.rel) { 636 637 if (is_alu_64bit_inst(prev[i])) { 638 gpr[i] = -1; 639 continue; 640 } 641 642 gpr[i] = prev[i]->dst.sel; 643 /* cube writes more than PV.X */ 644 if (is_alu_reduction_inst(bc, prev[i])) 645 chan[i] = 0; 646 else 647 chan[i] = prev[i]->dst.chan; 648 } else 649 gpr[i] = -1; 650 } 651 652 for (i = 0; i < max_slots; ++i) { 653 struct r600_bytecode_alu *alu = slots[i]; 654 if (!alu) 655 continue; 656 657 if (is_alu_64bit_inst(alu)) 658 continue; 659 num_src = r600_bytecode_get_num_operands(alu); 660 for (src = 0; src < num_src; ++src) { 661 if (!is_gpr(alu->src[src].sel) || alu->src[src].rel) 662 continue; 663 664 if (bc->chip_class < CAYMAN) { 665 if (alu->src[src].sel == gpr[4] && 666 alu->src[src].chan == chan[4] && 667 alu_prev->pred_sel == alu->pred_sel) { 668 alu->src[src].sel = V_SQ_ALU_SRC_PS; 669 alu->src[src].chan = 0; 670 continue; 671 } 672 } 673 674 for (j = 0; j < 4; ++j) { 675 if (alu->src[src].sel == gpr[j] && 676 alu->src[src].chan == j && 677 alu_prev->pred_sel == alu->pred_sel) { 678 alu->src[src].sel = V_SQ_ALU_SRC_PV; 679 alu->src[src].chan = chan[j]; 680 break; 681 } 682 } 683 } 684 } 685 686 return 0; 687} 688 689void r600_bytecode_special_constants(uint32_t value, unsigned *sel) 690{ 691 switch(value) { 692 case 0: 693 *sel = V_SQ_ALU_SRC_0; 694 break; 695 case 1: 696 *sel = V_SQ_ALU_SRC_1_INT; 697 break; 698 case -1: 699 *sel = V_SQ_ALU_SRC_M_1_INT; 700 break; 701 case 0x3F800000: /* 1.0f */ 702 *sel = V_SQ_ALU_SRC_1; 703 break; 704 case 0x3F000000: /* 0.5f */ 705 *sel = V_SQ_ALU_SRC_0_5; 706 break; 707 default: 708 *sel = V_SQ_ALU_SRC_LITERAL; 709 break; 710 } 711} 712 713/* compute how many literal are needed */ 714static int r600_bytecode_alu_nliterals(struct r600_bytecode_alu *alu, 715 uint32_t literal[4], unsigned *nliteral) 716{ 717 unsigned 
num_src = r600_bytecode_get_num_operands(alu); 718 unsigned i, j; 719 720 for (i = 0; i < num_src; ++i) { 721 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 722 uint32_t value = alu->src[i].value; 723 unsigned found = 0; 724 for (j = 0; j < *nliteral; ++j) { 725 if (literal[j] == value) { 726 found = 1; 727 break; 728 } 729 } 730 if (!found) { 731 if (*nliteral >= 4) 732 return -EINVAL; 733 literal[(*nliteral)++] = value; 734 } 735 } 736 } 737 return 0; 738} 739 740static void r600_bytecode_alu_adjust_literals(struct r600_bytecode_alu *alu, 741 uint32_t literal[4], unsigned nliteral) 742{ 743 unsigned num_src = r600_bytecode_get_num_operands(alu); 744 unsigned i, j; 745 746 for (i = 0; i < num_src; ++i) { 747 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) { 748 uint32_t value = alu->src[i].value; 749 for (j = 0; j < nliteral; ++j) { 750 if (literal[j] == value) { 751 alu->src[i].chan = j; 752 break; 753 } 754 } 755 } 756 } 757} 758 759static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu *slots[5], 760 struct r600_bytecode_alu *alu_prev) 761{ 762 struct r600_bytecode_alu *prev[5]; 763 struct r600_bytecode_alu *result[5] = { NULL }; 764 765 uint8_t interp_xz = 0; 766 767 uint32_t literal[4], prev_literal[4]; 768 unsigned nliteral = 0, prev_nliteral = 0; 769 770 int i, j, r, src, num_src; 771 int num_once_inst = 0; 772 int have_mova = 0, have_rel = 0; 773 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 774 775 r = assign_alu_units(bc, alu_prev, prev); 776 if (r) 777 return r; 778 779 for (i = 0; i < max_slots; ++i) { 780 if (prev[i]) { 781 if (prev[i]->pred_sel) 782 return 0; 783 if (is_alu_once_inst(prev[i])) 784 return 0; 785 786 if (prev[i]->op == ALU_OP2_INTERP_X) 787 interp_xz |= 1; 788 if (prev[i]->op == ALU_OP2_INTERP_Z) 789 interp_xz |= 2; 790 } 791 if (slots[i]) { 792 if (slots[i]->pred_sel) 793 return 0; 794 if (is_alu_once_inst(slots[i])) 795 return 0; 796 if (slots[i]->op == ALU_OP2_INTERP_X) 797 interp_xz |= 1; 798 if (slots[i]->op == ALU_OP2_INTERP_Z) 799 interp_xz |= 2; 800 } 801 if (interp_xz == 3) 802 return 0; 803 } 804 805 for (i = 0; i < max_slots; ++i) { 806 struct r600_bytecode_alu *alu; 807 808 if (num_once_inst > 0) 809 return 0; 810 811 /* check number of literals */ 812 if (prev[i]) { 813 if (r600_bytecode_alu_nliterals(prev[i], literal, &nliteral)) 814 return 0; 815 if (r600_bytecode_alu_nliterals(prev[i], prev_literal, &prev_nliteral)) 816 return 0; 817 if (is_alu_mova_inst(prev[i])) { 818 if (have_rel) 819 return 0; 820 have_mova = 1; 821 } 822 823 if (alu_uses_rel(prev[i])) { 824 if (have_mova) { 825 return 0; 826 } 827 have_rel = 1; 828 } 829 if (alu_uses_lds(prev[i])) 830 return 0; 831 832 num_once_inst += is_alu_once_inst(prev[i]); 833 } 834 if (slots[i] && r600_bytecode_alu_nliterals(slots[i], literal, &nliteral)) 835 return 0; 836 837 /* Let's check used slots. */ 838 if (prev[i] && !slots[i]) { 839 result[i] = prev[i]; 840 continue; 841 } else if (prev[i] && slots[i]) { 842 if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) { 843 /* Trans unit is still free try to use it. 
*/ 844 if (is_alu_any_unit_inst(bc, slots[i]) && !alu_uses_lds(slots[i])) { 845 result[i] = prev[i]; 846 result[4] = slots[i]; 847 } else if (is_alu_any_unit_inst(bc, prev[i])) { 848 if (slots[i]->dst.sel == prev[i]->dst.sel && 849 alu_writes(slots[i]) && 850 alu_writes(prev[i])) 851 return 0; 852 853 result[i] = slots[i]; 854 result[4] = prev[i]; 855 } else 856 return 0; 857 } else 858 return 0; 859 } else if(!slots[i]) { 860 continue; 861 } else { 862 if (max_slots == 5 && slots[i] && prev[4] && 863 slots[i]->dst.sel == prev[4]->dst.sel && 864 slots[i]->dst.chan == prev[4]->dst.chan && 865 alu_writes(slots[i]) && 866 alu_writes(prev[4])) 867 return 0; 868 869 result[i] = slots[i]; 870 } 871 872 alu = slots[i]; 873 num_once_inst += is_alu_once_inst(alu); 874 875 /* don't reschedule NOPs */ 876 if (is_nop_inst(alu)) 877 return 0; 878 879 if (is_alu_mova_inst(alu)) { 880 if (have_rel) { 881 return 0; 882 } 883 have_mova = 1; 884 } 885 886 if (alu_uses_rel(alu)) { 887 if (have_mova) { 888 return 0; 889 } 890 have_rel = 1; 891 } 892 893 if (alu->op == ALU_OP0_SET_CF_IDX0 || 894 alu->op == ALU_OP0_SET_CF_IDX1) 895 return 0; /* data hazard with MOVA */ 896 897 /* Let's check source gprs */ 898 num_src = r600_bytecode_get_num_operands(alu); 899 for (src = 0; src < num_src; ++src) { 900 901 /* Constants don't matter. */ 902 if (!is_gpr(alu->src[src].sel)) 903 continue; 904 905 for (j = 0; j < max_slots; ++j) { 906 if (!prev[j] || !alu_writes(prev[j])) 907 continue; 908 909 /* If it's relative then we can't determin which gpr is really used. */ 910 if (prev[j]->dst.chan == alu->src[src].chan && 911 (prev[j]->dst.sel == alu->src[src].sel || 912 prev[j]->dst.rel || alu->src[src].rel)) 913 return 0; 914 } 915 } 916 } 917 918 /* more than one PRED_ or KILL_ ? 
*/ 919 if (num_once_inst > 1) 920 return 0; 921 922 /* check if the result can still be swizzlet */ 923 r = check_and_set_bank_swizzle(bc, result); 924 if (r) 925 return 0; 926 927 /* looks like everything worked out right, apply the changes */ 928 929 /* undo adding previus literals */ 930 bc->cf_last->ndw -= align(prev_nliteral, 2); 931 932 /* sort instructions */ 933 for (i = 0; i < max_slots; ++i) { 934 slots[i] = result[i]; 935 if (result[i]) { 936 list_del(&result[i]->list); 937 result[i]->last = 0; 938 list_addtail(&result[i]->list, &bc->cf_last->alu); 939 } 940 } 941 942 /* determine new last instruction */ 943 LIST_ENTRY(struct r600_bytecode_alu, bc->cf_last->alu.prev, list)->last = 1; 944 945 /* determine new first instruction */ 946 for (i = 0; i < max_slots; ++i) { 947 if (result[i]) { 948 bc->cf_last->curr_bs_head = result[i]; 949 break; 950 } 951 } 952 953 bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head; 954 bc->cf_last->prev2_bs_head = NULL; 955 956 return 0; 957} 958 959/* we'll keep kcache sets sorted by bank & addr */ 960static int r600_bytecode_alloc_kcache_line(struct r600_bytecode *bc, 961 struct r600_bytecode_kcache *kcache, 962 unsigned bank, unsigned line, unsigned index_mode) 963{ 964 int i, kcache_banks = bc->chip_class >= EVERGREEN ? 
4 : 2; 965 966 for (i = 0; i < kcache_banks; i++) { 967 if (kcache[i].mode) { 968 int d; 969 970 if (kcache[i].bank < bank) 971 continue; 972 973 if ((kcache[i].bank == bank && kcache[i].addr > line+1) || 974 kcache[i].bank > bank) { 975 /* try to insert new line */ 976 if (kcache[kcache_banks-1].mode) { 977 /* all sets are in use */ 978 return -ENOMEM; 979 } 980 981 memmove(&kcache[i+1],&kcache[i], (kcache_banks-i-1)*sizeof(struct r600_bytecode_kcache)); 982 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; 983 kcache[i].bank = bank; 984 kcache[i].addr = line; 985 kcache[i].index_mode = index_mode; 986 return 0; 987 } 988 989 d = line - kcache[i].addr; 990 991 if (d == -1) { 992 kcache[i].addr--; 993 if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_2) { 994 /* we are prepending the line to the current set, 995 * discarding the existing second line, 996 * so we'll have to insert line+2 after it */ 997 line += 2; 998 continue; 999 } else if (kcache[i].mode == V_SQ_CF_KCACHE_LOCK_1) { 1000 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; 1001 return 0; 1002 } else { 1003 /* V_SQ_CF_KCACHE_LOCK_LOOP_INDEX is not supported */ 1004 return -ENOMEM; 1005 } 1006 } else if (d == 1) { 1007 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_2; 1008 return 0; 1009 } else if (d == 0) 1010 return 0; 1011 } else { /* free kcache set - use it */ 1012 kcache[i].mode = V_SQ_CF_KCACHE_LOCK_1; 1013 kcache[i].bank = bank; 1014 kcache[i].addr = line; 1015 kcache[i].index_mode = index_mode; 1016 return 0; 1017 } 1018 } 1019 return -ENOMEM; 1020} 1021 1022static int r600_bytecode_alloc_inst_kcache_lines(struct r600_bytecode *bc, 1023 struct r600_bytecode_kcache *kcache, 1024 struct r600_bytecode_alu *alu) 1025{ 1026 int i, r; 1027 1028 for (i = 0; i < 3; i++) { 1029 unsigned bank, line, sel = alu->src[i].sel, index_mode; 1030 1031 if (sel < 512) 1032 continue; 1033 1034 bank = alu->src[i].kc_bank; 1035 assert(bank < R600_MAX_HW_CONST_BUFFERS); 1036 line = (sel-512)>>4; 1037 index_mode = alu->src[i].kc_rel ? 
1 : 0; // V_SQ_CF_INDEX_0 / V_SQ_CF_INDEX_NONE 1038 1039 if ((r = r600_bytecode_alloc_kcache_line(bc, kcache, bank, line, index_mode))) 1040 return r; 1041 } 1042 return 0; 1043} 1044 1045static int r600_bytecode_assign_kcache_banks( 1046 struct r600_bytecode_alu *alu, 1047 struct r600_bytecode_kcache * kcache) 1048{ 1049 int i, j; 1050 1051 /* Alter the src operands to refer to the kcache. */ 1052 for (i = 0; i < 3; ++i) { 1053 static const unsigned int base[] = {128, 160, 256, 288}; 1054 unsigned int line, sel = alu->src[i].sel, found = 0; 1055 1056 if (sel < 512) 1057 continue; 1058 1059 sel -= 512; 1060 line = sel>>4; 1061 1062 for (j = 0; j < 4 && !found; ++j) { 1063 switch (kcache[j].mode) { 1064 case V_SQ_CF_KCACHE_NOP: 1065 case V_SQ_CF_KCACHE_LOCK_LOOP_INDEX: 1066 R600_ERR("unexpected kcache line mode\n"); 1067 return -ENOMEM; 1068 default: 1069 if (kcache[j].bank == alu->src[i].kc_bank && 1070 kcache[j].addr <= line && 1071 line < kcache[j].addr + kcache[j].mode) { 1072 alu->src[i].sel = sel - (kcache[j].addr<<4); 1073 alu->src[i].sel += base[j]; 1074 found=1; 1075 } 1076 } 1077 } 1078 } 1079 return 0; 1080} 1081 1082static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, 1083 struct r600_bytecode_alu *alu, 1084 unsigned type) 1085{ 1086 struct r600_bytecode_kcache kcache_sets[4]; 1087 struct r600_bytecode_kcache *kcache = kcache_sets; 1088 int r; 1089 1090 memcpy(kcache, bc->cf_last->kcache, 4 * sizeof(struct r600_bytecode_kcache)); 1091 1092 if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { 1093 /* can't alloc, need to start new clause */ 1094 if ((r = r600_bytecode_add_cf(bc))) { 1095 return r; 1096 } 1097 bc->cf_last->op = type; 1098 1099 /* retry with the new clause */ 1100 kcache = bc->cf_last->kcache; 1101 if ((r = r600_bytecode_alloc_inst_kcache_lines(bc, kcache, alu))) { 1102 /* can't alloc again- should never happen */ 1103 return r; 1104 } 1105 } else { 1106 /* update kcache sets */ 1107 
memcpy(bc->cf_last->kcache, kcache, 4 * sizeof(struct r600_bytecode_kcache)); 1108 } 1109 1110 /* if we actually used more than 2 kcache sets, or have relative indexing - use ALU_EXTENDED on eg+ */ 1111 if (kcache[2].mode != V_SQ_CF_KCACHE_NOP || 1112 kcache[0].index_mode || kcache[1].index_mode || kcache[2].index_mode || kcache[3].index_mode) { 1113 if (bc->chip_class < EVERGREEN) 1114 return -ENOMEM; 1115 bc->cf_last->eg_alu_extended = 1; 1116 } 1117 1118 return 0; 1119} 1120 1121static int insert_nop_r6xx(struct r600_bytecode *bc) 1122{ 1123 struct r600_bytecode_alu alu; 1124 int r, i; 1125 1126 for (i = 0; i < 4; i++) { 1127 memset(&alu, 0, sizeof(alu)); 1128 alu.op = ALU_OP0_NOP; 1129 alu.src[0].chan = i; 1130 alu.dst.chan = i; 1131 alu.last = (i == 3); 1132 r = r600_bytecode_add_alu(bc, &alu); 1133 if (r) 1134 return r; 1135 } 1136 return 0; 1137} 1138 1139/* load AR register from gpr (bc->ar_reg) with MOVA_INT */ 1140static int load_ar_r6xx(struct r600_bytecode *bc) 1141{ 1142 struct r600_bytecode_alu alu; 1143 int r; 1144 1145 if (bc->ar_loaded) 1146 return 0; 1147 1148 /* hack to avoid making MOVA the last instruction in the clause */ 1149 if ((bc->cf_last->ndw>>1) >= 110) 1150 bc->force_add_cf = 1; 1151 1152 memset(&alu, 0, sizeof(alu)); 1153 alu.op = ALU_OP1_MOVA_GPR_INT; 1154 alu.src[0].sel = bc->ar_reg; 1155 alu.src[0].chan = bc->ar_chan; 1156 alu.last = 1; 1157 alu.index_mode = INDEX_MODE_LOOP; 1158 r = r600_bytecode_add_alu(bc, &alu); 1159 if (r) 1160 return r; 1161 1162 /* no requirement to set uses waterfall on MOVA_GPR_INT */ 1163 bc->ar_loaded = 1; 1164 return 0; 1165} 1166 1167/* load AR register from gpr (bc->ar_reg) with MOVA_INT */ 1168static int load_ar(struct r600_bytecode *bc) 1169{ 1170 struct r600_bytecode_alu alu; 1171 int r; 1172 1173 if (bc->ar_handling) 1174 return load_ar_r6xx(bc); 1175 1176 if (bc->ar_loaded) 1177 return 0; 1178 1179 /* hack to avoid making MOVA the last instruction in the clause */ 1180 if ((bc->cf_last->ndw>>1) 
>= 110) 1181 bc->force_add_cf = 1; 1182 1183 memset(&alu, 0, sizeof(alu)); 1184 alu.op = ALU_OP1_MOVA_INT; 1185 alu.src[0].sel = bc->ar_reg; 1186 alu.src[0].chan = bc->ar_chan; 1187 alu.last = 1; 1188 r = r600_bytecode_add_alu(bc, &alu); 1189 if (r) 1190 return r; 1191 1192 bc->cf_last->r6xx_uses_waterfall = 1; 1193 bc->ar_loaded = 1; 1194 return 0; 1195} 1196 1197int r600_bytecode_add_alu_type(struct r600_bytecode *bc, 1198 const struct r600_bytecode_alu *alu, unsigned type) 1199{ 1200 struct r600_bytecode_alu *nalu = r600_bytecode_alu(); 1201 struct r600_bytecode_alu *lalu; 1202 int i, r; 1203 1204 if (!nalu) 1205 return -ENOMEM; 1206 memcpy(nalu, alu, sizeof(struct r600_bytecode_alu)); 1207 1208 if (alu->is_op3) { 1209 /* will fail later since alu does not support it. */ 1210 assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs); 1211 } 1212 1213 if (bc->cf_last != NULL && bc->cf_last->op != type) { 1214 /* check if we could add it anyway */ 1215 if (bc->cf_last->op == CF_OP_ALU && 1216 type == CF_OP_ALU_PUSH_BEFORE) { 1217 LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) { 1218 if (lalu->execute_mask) { 1219 bc->force_add_cf = 1; 1220 break; 1221 } 1222 } 1223 } else 1224 bc->force_add_cf = 1; 1225 } 1226 1227 /* cf can contains only alu or only vtx or only tex */ 1228 if (bc->cf_last == NULL || bc->force_add_cf) { 1229 r = r600_bytecode_add_cf(bc); 1230 if (r) { 1231 free(nalu); 1232 return r; 1233 } 1234 } 1235 bc->cf_last->op = type; 1236 1237 /* Load index register if required */ 1238 if (bc->chip_class >= EVERGREEN) { 1239 for (i = 0; i < 3; i++) 1240 if (nalu->src[i].kc_bank && nalu->src[i].kc_rel) 1241 egcm_load_index_reg(bc, 0, true); 1242 } 1243 1244 /* Check AR usage and load it if required */ 1245 for (i = 0; i < 3; i++) 1246 if (nalu->src[i].rel && !bc->ar_loaded) 1247 load_ar(bc); 1248 1249 if (nalu->dst.rel && !bc->ar_loaded) 1250 load_ar(bc); 1251 1252 /* Setup the kcache for this ALU instruction. 
This will start a new 1253 * ALU clause if needed. */ 1254 if ((r = r600_bytecode_alloc_kcache_lines(bc, nalu, type))) { 1255 free(nalu); 1256 return r; 1257 } 1258 1259 if (!bc->cf_last->curr_bs_head) { 1260 bc->cf_last->curr_bs_head = nalu; 1261 } 1262 /* number of gpr == the last gpr used in any alu */ 1263 for (i = 0; i < 3; i++) { 1264 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) { 1265 bc->ngpr = nalu->src[i].sel + 1; 1266 } 1267 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL) 1268 r600_bytecode_special_constants(nalu->src[i].value, 1269 &nalu->src[i].sel); 1270 } 1271 if (nalu->dst.sel >= bc->ngpr) { 1272 bc->ngpr = nalu->dst.sel + 1; 1273 } 1274 list_addtail(&nalu->list, &bc->cf_last->alu); 1275 /* each alu use 2 dwords */ 1276 bc->cf_last->ndw += 2; 1277 bc->ndw += 2; 1278 1279 /* process cur ALU instructions for bank swizzle */ 1280 if (nalu->last) { 1281 uint32_t literal[4]; 1282 unsigned nliteral; 1283 struct r600_bytecode_alu *slots[5]; 1284 int max_slots = bc->chip_class == CAYMAN ? 
4 : 5; 1285 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots); 1286 if (r) 1287 return r; 1288 1289 if (bc->cf_last->prev_bs_head) { 1290 r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head); 1291 if (r) 1292 return r; 1293 } 1294 1295 if (bc->cf_last->prev_bs_head) { 1296 r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head); 1297 if (r) 1298 return r; 1299 } 1300 1301 r = check_and_set_bank_swizzle(bc, slots); 1302 if (r) 1303 return r; 1304 1305 for (i = 0, nliteral = 0; i < max_slots; i++) { 1306 if (slots[i]) { 1307 r = r600_bytecode_alu_nliterals(slots[i], literal, &nliteral); 1308 if (r) 1309 return r; 1310 } 1311 } 1312 bc->cf_last->ndw += align(nliteral, 2); 1313 1314 /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots) 1315 * worst case */ 1316 if ((bc->cf_last->ndw >> 1) >= 120) { 1317 bc->force_add_cf = 1; 1318 } 1319 1320 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head; 1321 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head; 1322 bc->cf_last->curr_bs_head = NULL; 1323 } 1324 1325 if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst) 1326 insert_nop_r6xx(bc); 1327 1328 /* Might need to insert spill write ops after current clause */ 1329 if (nalu->last && bc->n_pending_outputs) { 1330 while (bc->n_pending_outputs) { 1331 r = r600_bytecode_add_output(bc, &bc->pending_outputs[--bc->n_pending_outputs]); 1332 if (r) 1333 return r; 1334 } 1335 } 1336 1337 return 0; 1338} 1339 1340int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct r600_bytecode_alu *alu) 1341{ 1342 return r600_bytecode_add_alu_type(bc, alu, CF_OP_ALU); 1343} 1344 1345static unsigned r600_bytecode_num_tex_and_vtx_instructions(const struct r600_bytecode *bc) 1346{ 1347 switch (bc->chip_class) { 1348 case R600: 1349 return 8; 1350 1351 case R700: 1352 case EVERGREEN: 1353 case CAYMAN: 1354 return 16; 1355 1356 default: 1357 R600_ERR("Unknown chip class %d.\n", bc->chip_class); 1358 return 8; 1359 } 1360} 1361 1362static inline 
boolean last_inst_was_not_vtx_fetch(struct r600_bytecode *bc) 1363{ 1364 return !((r600_isa_cf(bc->cf_last->op)->flags & CF_FETCH) && 1365 bc->cf_last->op != CF_OP_GDS && 1366 (bc->chip_class == CAYMAN || 1367 bc->cf_last->op != CF_OP_TEX)); 1368} 1369 1370static int r600_bytecode_add_vtx_internal(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx, 1371 bool use_tc) 1372{ 1373 struct r600_bytecode_vtx *nvtx = r600_bytecode_vtx(); 1374 int r; 1375 1376 if (!nvtx) 1377 return -ENOMEM; 1378 memcpy(nvtx, vtx, sizeof(struct r600_bytecode_vtx)); 1379 1380 /* Load index register if required */ 1381 if (bc->chip_class >= EVERGREEN) { 1382 if (vtx->buffer_index_mode) 1383 egcm_load_index_reg(bc, vtx->buffer_index_mode - 1, false); 1384 } 1385 1386 /* cf can contains only alu or only vtx or only tex */ 1387 if (bc->cf_last == NULL || 1388 last_inst_was_not_vtx_fetch(bc) || 1389 bc->force_add_cf) { 1390 r = r600_bytecode_add_cf(bc); 1391 if (r) { 1392 free(nvtx); 1393 return r; 1394 } 1395 switch (bc->chip_class) { 1396 case R600: 1397 case R700: 1398 bc->cf_last->op = CF_OP_VTX; 1399 break; 1400 case EVERGREEN: 1401 if (use_tc) 1402 bc->cf_last->op = CF_OP_TEX; 1403 else 1404 bc->cf_last->op = CF_OP_VTX; 1405 break; 1406 case CAYMAN: 1407 bc->cf_last->op = CF_OP_TEX; 1408 break; 1409 default: 1410 R600_ERR("Unknown chip class %d.\n", bc->chip_class); 1411 free(nvtx); 1412 return -EINVAL; 1413 } 1414 } 1415 list_addtail(&nvtx->list, &bc->cf_last->vtx); 1416 /* each fetch use 4 dwords */ 1417 bc->cf_last->ndw += 4; 1418 bc->ndw += 4; 1419 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1420 bc->force_add_cf = 1; 1421 1422 bc->ngpr = MAX2(bc->ngpr, vtx->src_gpr + 1); 1423 bc->ngpr = MAX2(bc->ngpr, vtx->dst_gpr + 1); 1424 1425 return 0; 1426} 1427 1428int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx) 1429{ 1430 return r600_bytecode_add_vtx_internal(bc, vtx, false); 1431} 1432 1433int 
r600_bytecode_add_vtx_tc(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx) 1434{ 1435 return r600_bytecode_add_vtx_internal(bc, vtx, true); 1436} 1437 1438int r600_bytecode_add_tex(struct r600_bytecode *bc, const struct r600_bytecode_tex *tex) 1439{ 1440 struct r600_bytecode_tex *ntex = r600_bytecode_tex(); 1441 int r; 1442 1443 if (!ntex) 1444 return -ENOMEM; 1445 memcpy(ntex, tex, sizeof(struct r600_bytecode_tex)); 1446 1447 /* Load index register if required */ 1448 if (bc->chip_class >= EVERGREEN) { 1449 if (tex->sampler_index_mode || tex->resource_index_mode) 1450 egcm_load_index_reg(bc, 1, false); 1451 } 1452 1453 /* we can't fetch data und use it as texture lookup address in the same TEX clause */ 1454 if (bc->cf_last != NULL && 1455 bc->cf_last->op == CF_OP_TEX) { 1456 struct r600_bytecode_tex *ttex; 1457 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) { 1458 if (ttex->dst_gpr == ntex->src_gpr && 1459 (ttex->dst_sel_x < 4 || ttex->dst_sel_y < 4 || 1460 ttex->dst_sel_z < 4 || ttex->dst_sel_w < 4)) { 1461 bc->force_add_cf = 1; 1462 break; 1463 } 1464 } 1465 /* slight hack to make gradients always go into same cf */ 1466 if (ntex->op == FETCH_OP_SET_GRADIENTS_H) 1467 bc->force_add_cf = 1; 1468 } 1469 1470 /* cf can contains only alu or only vtx or only tex */ 1471 if (bc->cf_last == NULL || 1472 bc->cf_last->op != CF_OP_TEX || 1473 bc->force_add_cf) { 1474 r = r600_bytecode_add_cf(bc); 1475 if (r) { 1476 free(ntex); 1477 return r; 1478 } 1479 bc->cf_last->op = CF_OP_TEX; 1480 } 1481 if (ntex->src_gpr >= bc->ngpr) { 1482 bc->ngpr = ntex->src_gpr + 1; 1483 } 1484 if (ntex->dst_gpr >= bc->ngpr) { 1485 bc->ngpr = ntex->dst_gpr + 1; 1486 } 1487 list_addtail(&ntex->list, &bc->cf_last->tex); 1488 /* each texture fetch use 4 dwords */ 1489 bc->cf_last->ndw += 4; 1490 bc->ndw += 4; 1491 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1492 bc->force_add_cf = 1; 1493 return 0; 1494} 1495 1496int r600_bytecode_add_gds(struct 
r600_bytecode *bc, const struct r600_bytecode_gds *gds) 1497{ 1498 struct r600_bytecode_gds *ngds = r600_bytecode_gds(); 1499 int r; 1500 1501 if (ngds == NULL) 1502 return -ENOMEM; 1503 memcpy(ngds, gds, sizeof(struct r600_bytecode_gds)); 1504 1505 if (bc->chip_class >= EVERGREEN) { 1506 if (gds->uav_index_mode) 1507 egcm_load_index_reg(bc, gds->uav_index_mode - 1, false); 1508 } 1509 1510 if (bc->cf_last == NULL || 1511 bc->cf_last->op != CF_OP_GDS || 1512 bc->force_add_cf) { 1513 r = r600_bytecode_add_cf(bc); 1514 if (r) { 1515 free(ngds); 1516 return r; 1517 } 1518 bc->cf_last->op = CF_OP_GDS; 1519 } 1520 1521 list_addtail(&ngds->list, &bc->cf_last->gds); 1522 bc->cf_last->ndw += 4; /* each GDS uses 4 dwords */ 1523 if ((bc->cf_last->ndw / 4) >= r600_bytecode_num_tex_and_vtx_instructions(bc)) 1524 bc->force_add_cf = 1; 1525 return 0; 1526} 1527 1528int r600_bytecode_add_cfinst(struct r600_bytecode *bc, unsigned op) 1529{ 1530 int r; 1531 1532 /* Emit WAIT_ACK before control flow to ensure pending writes are always acked. 
*/ 1533 if (op != CF_OP_MEM_SCRATCH && bc->need_wait_ack) { 1534 bc->need_wait_ack = false; 1535 r = r600_bytecode_add_cfinst(bc, CF_OP_WAIT_ACK); 1536 } 1537 1538 r = r600_bytecode_add_cf(bc); 1539 if (r) 1540 return r; 1541 1542 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE; 1543 bc->cf_last->op = op; 1544 return 0; 1545} 1546 1547int cm_bytecode_add_cf_end(struct r600_bytecode *bc) 1548{ 1549 return r600_bytecode_add_cfinst(bc, CF_OP_CF_END); 1550} 1551 1552/* common to all 3 families */ 1553static int r600_bytecode_vtx_build(struct r600_bytecode *bc, struct r600_bytecode_vtx *vtx, unsigned id) 1554{ 1555 if (r600_isa_fetch(vtx->op)->flags & FF_MEM) 1556 return r700_bytecode_fetch_mem_build(bc, vtx, id); 1557 bc->bytecode[id] = S_SQ_VTX_WORD0_VTX_INST(r600_isa_fetch_opcode(bc->isa->hw_class, vtx->op)) | 1558 S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) | 1559 S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) | 1560 S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) | 1561 S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x); 1562 if (bc->chip_class < CAYMAN) 1563 bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count); 1564 id++; 1565 bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) | 1566 S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) | 1567 S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) | 1568 S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) | 1569 S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) | 1570 S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) | 1571 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) | 1572 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) | 1573 S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) | 1574 S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr); 1575 bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)| 1576 S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian); 1577 if (bc->chip_class >= EVERGREEN) 1578 bc->bytecode[id] |= ((vtx->buffer_index_mode & 0x3) << 21); // S_SQ_VTX_WORD2_BIM(vtx->buffer_index_mode); 1579 if (bc->chip_class < CAYMAN) 1580 bc->bytecode[id] |= 
S_SQ_VTX_WORD2_MEGA_FETCH(1); 1581 id++; 1582 bc->bytecode[id++] = 0; 1583 return 0; 1584} 1585 1586/* common to all 3 families */ 1587static int r600_bytecode_tex_build(struct r600_bytecode *bc, struct r600_bytecode_tex *tex, unsigned id) 1588{ 1589 bc->bytecode[id] = S_SQ_TEX_WORD0_TEX_INST( 1590 r600_isa_fetch_opcode(bc->isa->hw_class, tex->op)) | 1591 EG_S_SQ_TEX_WORD0_INST_MOD(tex->inst_mod) | 1592 S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) | 1593 S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) | 1594 S_SQ_TEX_WORD0_SRC_REL(tex->src_rel); 1595 if (bc->chip_class >= EVERGREEN) 1596 bc->bytecode[id] |= ((tex->sampler_index_mode & 0x3) << 27) | // S_SQ_TEX_WORD0_SIM(tex->sampler_index_mode); 1597 ((tex->resource_index_mode & 0x3) << 25); // S_SQ_TEX_WORD0_RIM(tex->resource_index_mode) 1598 id++; 1599 bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) | 1600 S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) | 1601 S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) | 1602 S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) | 1603 S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) | 1604 S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) | 1605 S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) | 1606 S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) | 1607 S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) | 1608 S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) | 1609 S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w); 1610 bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) | 1611 S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) | 1612 S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) | 1613 S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) | 1614 S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) | 1615 S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) | 1616 S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) | 1617 S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w); 1618 bc->bytecode[id++] = 0; 1619 return 0; 1620} 1621 1622/* r600 only, r700/eg bits in r700_asm.c */ 1623static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecode_alu *alu, unsigned id) 1624{ 1625 unsigned 
opcode = r600_isa_alu_opcode(bc->isa->hw_class, alu->op); 1626 1627 /* don't replace gpr by pv or ps for destination register */ 1628 bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) | 1629 S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) | 1630 S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) | 1631 S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) | 1632 S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) | 1633 S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) | 1634 S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) | 1635 S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) | 1636 S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) | 1637 S_SQ_ALU_WORD0_PRED_SEL(alu->pred_sel) | 1638 S_SQ_ALU_WORD0_LAST(alu->last); 1639 1640 if (alu->is_op3) { 1641 assert(!alu->src[0].abs && !alu->src[1].abs && !alu->src[2].abs); 1642 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) | 1643 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) | 1644 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) | 1645 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) | 1646 S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) | 1647 S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) | 1648 S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) | 1649 S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) | 1650 S_SQ_ALU_WORD1_OP3_ALU_INST(opcode) | 1651 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle); 1652 } else { 1653 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) | 1654 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) | 1655 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) | 1656 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) | 1657 S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) | 1658 S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) | 1659 S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) | 1660 S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) | 1661 S_SQ_ALU_WORD1_OP2_ALU_INST(opcode) | 1662 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) | 1663 S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->execute_mask) | 1664 S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->update_pred); 1665 } 1666 return 0; 1667} 1668 1669static void r600_bytecode_cf_vtx_build(uint32_t 
*bytecode, const struct r600_bytecode_cf *cf) 1670{ 1671 *bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1); 1672 *bytecode++ = S_SQ_CF_WORD1_CF_INST(r600_isa_cf_opcode(ISA_CC_R600, cf->op)) | 1673 S_SQ_CF_WORD1_BARRIER(1) | 1674 S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1)| 1675 S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program); 1676} 1677 1678/* common for r600/r700 - eg in eg_asm.c */ 1679static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf) 1680{ 1681 unsigned id = cf->id; 1682 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 1683 unsigned opcode = r600_isa_cf_opcode(bc->isa->hw_class, cf->op); 1684 1685 1686 if (cf->op == CF_NATIVE) { 1687 bc->bytecode[id++] = cf->isa[0]; 1688 bc->bytecode[id++] = cf->isa[1]; 1689 } else if (cfop->flags & CF_ALU) { 1690 bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) | 1691 S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) | 1692 S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) | 1693 S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank); 1694 1695 bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(opcode) | 1696 S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) | 1697 S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) | 1698 S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) | 1699 S_SQ_CF_ALU_WORD1_BARRIER(1) | 1700 S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chip_class == R600 ? 
cf->r6xx_uses_waterfall : 0) | 1701 S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1); 1702 } else if (cfop->flags & CF_FETCH) { 1703 if (bc->chip_class == R700) 1704 r700_bytecode_cf_vtx_build(&bc->bytecode[id], cf); 1705 else 1706 r600_bytecode_cf_vtx_build(&bc->bytecode[id], cf); 1707 } else if (cfop->flags & CF_EXP) { 1708 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | 1709 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | 1710 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | 1711 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) | 1712 S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr); 1713 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | 1714 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) | 1715 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) | 1716 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) | 1717 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) | 1718 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) | 1719 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) | 1720 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program); 1721 } else if (cfop->flags & CF_MEM) { 1722 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | 1723 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | 1724 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | 1725 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type) | 1726 S_SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR(cf->output.index_gpr); 1727 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | 1728 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->barrier) | 1729 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(opcode) | 1730 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->end_of_program) | 1731 S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) | 1732 S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask); 1733 } else { 1734 
bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1); 1735 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(opcode) | 1736 S_SQ_CF_WORD1_BARRIER(1) | 1737 S_SQ_CF_WORD1_COND(cf->cond) | 1738 S_SQ_CF_WORD1_POP_COUNT(cf->pop_count) | 1739 S_SQ_CF_WORD1_END_OF_PROGRAM(cf->end_of_program); 1740 } 1741 return 0; 1742} 1743 1744int r600_bytecode_build(struct r600_bytecode *bc) 1745{ 1746 struct r600_bytecode_cf *cf; 1747 struct r600_bytecode_alu *alu; 1748 struct r600_bytecode_vtx *vtx; 1749 struct r600_bytecode_tex *tex; 1750 struct r600_bytecode_gds *gds; 1751 uint32_t literal[4]; 1752 unsigned nliteral; 1753 unsigned addr; 1754 int i, r; 1755 1756 if (!bc->nstack) { // If not 0, Stack_size already provided by llvm 1757 if (bc->stack.max_entries) 1758 bc->nstack = bc->stack.max_entries; 1759 else if (bc->type == PIPE_SHADER_VERTEX || 1760 bc->type == PIPE_SHADER_TESS_EVAL || 1761 bc->type == PIPE_SHADER_TESS_CTRL) 1762 bc->nstack = 1; 1763 } 1764 1765 /* first path compute addr of each CF block */ 1766 /* addr start after all the CF instructions */ 1767 addr = bc->cf_last->id + 2; 1768 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 1769 if (r600_isa_cf(cf->op)->flags & CF_FETCH) { 1770 addr += 3; 1771 addr &= 0xFFFFFFFCUL; 1772 } 1773 cf->addr = addr; 1774 addr += cf->ndw; 1775 bc->ndw = cf->addr + cf->ndw; 1776 } 1777 free(bc->bytecode); 1778 bc->bytecode = calloc(4, bc->ndw); 1779 if (bc->bytecode == NULL) 1780 return -ENOMEM; 1781 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 1782 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 1783 addr = cf->addr; 1784 if (bc->chip_class >= EVERGREEN) 1785 r = eg_bytecode_cf_build(bc, cf); 1786 else 1787 r = r600_bytecode_cf_build(bc, cf); 1788 if (r) 1789 return r; 1790 if (cfop->flags & CF_ALU) { 1791 nliteral = 0; 1792 memset(literal, 0, sizeof(literal)); 1793 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) { 1794 r = r600_bytecode_alu_nliterals(alu, literal, &nliteral); 1795 if (r) 1796 return r; 1797 r600_bytecode_alu_adjust_literals(alu, 
literal, nliteral); 1798 r600_bytecode_assign_kcache_banks(alu, cf->kcache); 1799 1800 switch(bc->chip_class) { 1801 case R600: 1802 r = r600_bytecode_alu_build(bc, alu, addr); 1803 break; 1804 case R700: 1805 r = r700_bytecode_alu_build(bc, alu, addr); 1806 break; 1807 case EVERGREEN: 1808 case CAYMAN: 1809 r = eg_bytecode_alu_build(bc, alu, addr); 1810 break; 1811 default: 1812 R600_ERR("unknown chip class %d.\n", bc->chip_class); 1813 return -EINVAL; 1814 } 1815 if (r) 1816 return r; 1817 addr += 2; 1818 if (alu->last) { 1819 for (i = 0; i < align(nliteral, 2); ++i) { 1820 bc->bytecode[addr++] = literal[i]; 1821 } 1822 nliteral = 0; 1823 memset(literal, 0, sizeof(literal)); 1824 } 1825 } 1826 } else if (cf->op == CF_OP_VTX) { 1827 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 1828 r = r600_bytecode_vtx_build(bc, vtx, addr); 1829 if (r) 1830 return r; 1831 addr += 4; 1832 } 1833 } else if (cf->op == CF_OP_GDS) { 1834 assert(bc->chip_class >= EVERGREEN); 1835 LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) { 1836 r = eg_bytecode_gds_build(bc, gds, addr); 1837 if (r) 1838 return r; 1839 addr += 4; 1840 } 1841 } else if (cf->op == CF_OP_TEX) { 1842 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 1843 assert(bc->chip_class >= EVERGREEN); 1844 r = r600_bytecode_vtx_build(bc, vtx, addr); 1845 if (r) 1846 return r; 1847 addr += 4; 1848 } 1849 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) { 1850 r = r600_bytecode_tex_build(bc, tex, addr); 1851 if (r) 1852 return r; 1853 addr += 4; 1854 } 1855 } 1856 } 1857 return 0; 1858} 1859 1860void r600_bytecode_clear(struct r600_bytecode *bc) 1861{ 1862 struct r600_bytecode_cf *cf = NULL, *next_cf; 1863 1864 free(bc->bytecode); 1865 bc->bytecode = NULL; 1866 1867 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) { 1868 struct r600_bytecode_alu *alu = NULL, *next_alu; 1869 struct r600_bytecode_tex *tex = NULL, *next_tex; 1870 struct r600_bytecode_tex *vtx = NULL, *next_vtx; 1871 struct r600_bytecode_gds *gds = NULL, *next_gds; 1872 1873 
LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) { 1874 free(alu); 1875 } 1876 1877 list_inithead(&cf->alu); 1878 1879 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) { 1880 free(tex); 1881 } 1882 1883 list_inithead(&cf->tex); 1884 1885 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) { 1886 free(vtx); 1887 } 1888 1889 list_inithead(&cf->vtx); 1890 1891 LIST_FOR_EACH_ENTRY_SAFE(gds, next_gds, &cf->gds, list) { 1892 free(gds); 1893 } 1894 1895 list_inithead(&cf->gds); 1896 1897 free(cf); 1898 } 1899 1900 list_inithead(&cf->list); 1901} 1902 1903static int print_swizzle(unsigned swz) 1904{ 1905 const char * swzchars = "xyzw01?_"; 1906 assert(swz<8 && swz != 6); 1907 return fprintf(stderr, "%c", swzchars[swz]); 1908} 1909 1910static int print_sel(unsigned sel, unsigned rel, unsigned index_mode, 1911 unsigned need_brackets) 1912{ 1913 int o = 0; 1914 if (rel && index_mode >= 5 && sel < 128) 1915 o += fprintf(stderr, "G"); 1916 if (rel || need_brackets) { 1917 o += fprintf(stderr, "["); 1918 } 1919 o += fprintf(stderr, "%d", sel); 1920 if (rel) { 1921 if (index_mode == 0 || index_mode == 6) 1922 o += fprintf(stderr, "+AR"); 1923 else if (index_mode == 4) 1924 o += fprintf(stderr, "+AL"); 1925 } 1926 if (rel || need_brackets) { 1927 o += fprintf(stderr, "]"); 1928 } 1929 return o; 1930} 1931 1932static int print_dst(struct r600_bytecode_alu *alu) 1933{ 1934 int o = 0; 1935 unsigned sel = alu->dst.sel; 1936 char reg_char = 'R'; 1937 if (sel > 128 - 4) { /* clause temporary gpr */ 1938 sel -= 128 - 4; 1939 reg_char = 'T'; 1940 } 1941 1942 if (alu_writes(alu)) { 1943 o += fprintf(stderr, "%c", reg_char); 1944 o += print_sel(alu->dst.sel, alu->dst.rel, alu->index_mode, 0); 1945 } else { 1946 o += fprintf(stderr, "__"); 1947 } 1948 o += fprintf(stderr, "."); 1949 o += print_swizzle(alu->dst.chan); 1950 return o; 1951} 1952 1953static int print_src(struct r600_bytecode_alu *alu, unsigned idx) 1954{ 1955 int o = 0; 1956 struct r600_bytecode_alu_src *src = 
&alu->src[idx]; 1957 unsigned sel = src->sel, need_sel = 1, need_chan = 1, need_brackets = 0; 1958 1959 if (src->neg) 1960 o += fprintf(stderr,"-"); 1961 if (src->abs) 1962 o += fprintf(stderr,"|"); 1963 1964 if (sel < 128 - 4) { 1965 o += fprintf(stderr, "R"); 1966 } else if (sel < 128) { 1967 o += fprintf(stderr, "T"); 1968 sel -= 128 - 4; 1969 } else if (sel < 160) { 1970 o += fprintf(stderr, "KC0"); 1971 need_brackets = 1; 1972 sel -= 128; 1973 } else if (sel < 192) { 1974 o += fprintf(stderr, "KC1"); 1975 need_brackets = 1; 1976 sel -= 160; 1977 } else if (sel >= 512) { 1978 o += fprintf(stderr, "C%d", src->kc_bank); 1979 need_brackets = 1; 1980 sel -= 512; 1981 } else if (sel >= 448) { 1982 o += fprintf(stderr, "Param"); 1983 sel -= 448; 1984 need_chan = 0; 1985 } else if (sel >= 288) { 1986 o += fprintf(stderr, "KC3"); 1987 need_brackets = 1; 1988 sel -= 288; 1989 } else if (sel >= 256) { 1990 o += fprintf(stderr, "KC2"); 1991 need_brackets = 1; 1992 sel -= 256; 1993 } else { 1994 need_sel = 0; 1995 need_chan = 0; 1996 switch (sel) { 1997 case EG_V_SQ_ALU_SRC_LDS_DIRECT_A: 1998 o += fprintf(stderr, "LDS_A[0x%08X]", src->value); 1999 break; 2000 case EG_V_SQ_ALU_SRC_LDS_DIRECT_B: 2001 o += fprintf(stderr, "LDS_B[0x%08X]", src->value); 2002 break; 2003 case EG_V_SQ_ALU_SRC_LDS_OQ_A: 2004 o += fprintf(stderr, "LDS_OQ_A"); 2005 need_chan = 1; 2006 break; 2007 case EG_V_SQ_ALU_SRC_LDS_OQ_B: 2008 o += fprintf(stderr, "LDS_OQ_B"); 2009 need_chan = 1; 2010 break; 2011 case EG_V_SQ_ALU_SRC_LDS_OQ_A_POP: 2012 o += fprintf(stderr, "LDS_OQ_A_POP"); 2013 need_chan = 1; 2014 break; 2015 case EG_V_SQ_ALU_SRC_LDS_OQ_B_POP: 2016 o += fprintf(stderr, "LDS_OQ_B_POP"); 2017 need_chan = 1; 2018 break; 2019 case EG_V_SQ_ALU_SRC_TIME_LO: 2020 o += fprintf(stderr, "TIME_LO"); 2021 break; 2022 case EG_V_SQ_ALU_SRC_TIME_HI: 2023 o += fprintf(stderr, "TIME_HI"); 2024 break; 2025 case EG_V_SQ_ALU_SRC_SE_ID: 2026 o += fprintf(stderr, "SE_ID"); 2027 break; 2028 case 
EG_V_SQ_ALU_SRC_SIMD_ID: 2029 o += fprintf(stderr, "SIMD_ID"); 2030 break; 2031 case EG_V_SQ_ALU_SRC_HW_WAVE_ID: 2032 o += fprintf(stderr, "HW_WAVE_ID"); 2033 break; 2034 case V_SQ_ALU_SRC_PS: 2035 o += fprintf(stderr, "PS"); 2036 break; 2037 case V_SQ_ALU_SRC_PV: 2038 o += fprintf(stderr, "PV"); 2039 need_chan = 1; 2040 break; 2041 case V_SQ_ALU_SRC_LITERAL: 2042 o += fprintf(stderr, "[0x%08X %f]", src->value, u_bitcast_u2f(src->value)); 2043 break; 2044 case V_SQ_ALU_SRC_0_5: 2045 o += fprintf(stderr, "0.5"); 2046 break; 2047 case V_SQ_ALU_SRC_M_1_INT: 2048 o += fprintf(stderr, "-1"); 2049 break; 2050 case V_SQ_ALU_SRC_1_INT: 2051 o += fprintf(stderr, "1"); 2052 break; 2053 case V_SQ_ALU_SRC_1: 2054 o += fprintf(stderr, "1.0"); 2055 break; 2056 case V_SQ_ALU_SRC_0: 2057 o += fprintf(stderr, "0"); 2058 break; 2059 default: 2060 o += fprintf(stderr, "??IMM_%d", sel); 2061 break; 2062 } 2063 } 2064 2065 if (need_sel) 2066 o += print_sel(sel, src->rel, alu->index_mode, need_brackets); 2067 2068 if (need_chan) { 2069 o += fprintf(stderr, "."); 2070 o += print_swizzle(src->chan); 2071 } 2072 2073 if (src->abs) 2074 o += fprintf(stderr,"|"); 2075 2076 return o; 2077} 2078 2079static int print_indent(int p, int c) 2080{ 2081 int o = 0; 2082 while (p++ < c) 2083 o += fprintf(stderr, " "); 2084 return o; 2085} 2086 2087void r600_bytecode_disasm(struct r600_bytecode *bc) 2088{ 2089 const char *index_mode[] = {"CF_INDEX_NONE", "CF_INDEX_0", "CF_INDEX_1"}; 2090 static int index = 0; 2091 struct r600_bytecode_cf *cf = NULL; 2092 struct r600_bytecode_alu *alu = NULL; 2093 struct r600_bytecode_vtx *vtx = NULL; 2094 struct r600_bytecode_tex *tex = NULL; 2095 struct r600_bytecode_gds *gds = NULL; 2096 2097 unsigned i, id, ngr = 0, last; 2098 uint32_t literal[4]; 2099 unsigned nliteral; 2100 char chip = '6'; 2101 2102 switch (bc->chip_class) { 2103 case R700: 2104 chip = '7'; 2105 break; 2106 case EVERGREEN: 2107 chip = 'E'; 2108 break; 2109 case CAYMAN: 2110 chip = 'C'; 2111 
break; 2112 case R600: 2113 default: 2114 chip = '6'; 2115 break; 2116 } 2117 fprintf(stderr, "bytecode %d dw -- %d gprs -- %d nstack -------------\n", 2118 bc->ndw, bc->ngpr, bc->nstack); 2119 fprintf(stderr, "shader %d -- %c\n", index++, chip); 2120 2121 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { 2122 id = cf->id; 2123 if (cf->op == CF_NATIVE) { 2124 fprintf(stderr, "%04d %08X %08X CF_NATIVE\n", id, bc->bytecode[id], 2125 bc->bytecode[id + 1]); 2126 } else { 2127 const struct cf_op_info *cfop = r600_isa_cf(cf->op); 2128 if (cfop->flags & CF_ALU) { 2129 if (cf->eg_alu_extended) { 2130 fprintf(stderr, "%04d %08X %08X %s\n", id, bc->bytecode[id], 2131 bc->bytecode[id + 1], "ALU_EXT"); 2132 id += 2; 2133 } 2134 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2135 bc->bytecode[id + 1], cfop->name); 2136 fprintf(stderr, "%d @%d ", cf->ndw / 2, cf->addr); 2137 for (i = 0; i < 4; ++i) { 2138 if (cf->kcache[i].mode) { 2139 int c_start = (cf->kcache[i].addr << 4); 2140 int c_end = c_start + (cf->kcache[i].mode << 4); 2141 fprintf(stderr, "KC%d[CB%d:%d-%d%s%s] ", 2142 i, cf->kcache[i].bank, c_start, c_end, 2143 cf->kcache[i].index_mode ? " " : "", 2144 cf->kcache[i].index_mode ? 
index_mode[cf->kcache[i].index_mode] : ""); 2145 } 2146 } 2147 fprintf(stderr, "\n"); 2148 } else if (cfop->flags & CF_FETCH) { 2149 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2150 bc->bytecode[id + 1], cfop->name); 2151 fprintf(stderr, "%d @%d ", cf->ndw / 4, cf->addr); 2152 if (cf->vpm) 2153 fprintf(stderr, "VPM "); 2154 if (cf->end_of_program) 2155 fprintf(stderr, "EOP "); 2156 fprintf(stderr, "\n"); 2157 2158 } else if (cfop->flags & CF_EXP) { 2159 int o = 0; 2160 const char *exp_type[] = {"PIXEL", "POS ", "PARAM"}; 2161 o += fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2162 bc->bytecode[id + 1], cfop->name); 2163 o += print_indent(o, 43); 2164 o += fprintf(stderr, "%s ", exp_type[cf->output.type]); 2165 if (cf->output.burst_count > 1) { 2166 o += fprintf(stderr, "%d-%d ", cf->output.array_base, 2167 cf->output.array_base + cf->output.burst_count - 1); 2168 2169 o += print_indent(o, 55); 2170 o += fprintf(stderr, "R%d-%d.", cf->output.gpr, 2171 cf->output.gpr + cf->output.burst_count - 1); 2172 } else { 2173 o += fprintf(stderr, "%d ", cf->output.array_base); 2174 o += print_indent(o, 55); 2175 o += fprintf(stderr, "R%d.", cf->output.gpr); 2176 } 2177 2178 o += print_swizzle(cf->output.swizzle_x); 2179 o += print_swizzle(cf->output.swizzle_y); 2180 o += print_swizzle(cf->output.swizzle_z); 2181 o += print_swizzle(cf->output.swizzle_w); 2182 2183 print_indent(o, 67); 2184 2185 fprintf(stderr, " ES:%X ", cf->output.elem_size); 2186 if (cf->mark) 2187 fprintf(stderr, "MARK "); 2188 if (!cf->barrier) 2189 fprintf(stderr, "NO_BARRIER "); 2190 if (cf->end_of_program) 2191 fprintf(stderr, "EOP "); 2192 fprintf(stderr, "\n"); 2193 } else if (r600_isa_cf(cf->op)->flags & CF_MEM) { 2194 int o = 0; 2195 const char *exp_type[] = {"WRITE", "WRITE_IND", "WRITE_ACK", 2196 "WRITE_IND_ACK"}; 2197 o += fprintf(stderr, "%04d %08X %08X %s ", id, 2198 bc->bytecode[id], bc->bytecode[id + 1], cfop->name); 2199 o += print_indent(o, 43); 2200 o += 
fprintf(stderr, "%s ", exp_type[cf->output.type]); 2201 2202 if (r600_isa_cf(cf->op)->flags & CF_RAT) { 2203 o += fprintf(stderr, "RAT%d", cf->rat.id); 2204 if (cf->rat.index_mode) { 2205 o += fprintf(stderr, "[IDX%d]", cf->rat.index_mode - 1); 2206 } 2207 o += fprintf(stderr, " INST: %d ", cf->rat.inst); 2208 } 2209 2210 if (cf->output.burst_count > 1) { 2211 o += fprintf(stderr, "%d-%d ", cf->output.array_base, 2212 cf->output.array_base + cf->output.burst_count - 1); 2213 o += print_indent(o, 55); 2214 o += fprintf(stderr, "R%d-%d.", cf->output.gpr, 2215 cf->output.gpr + cf->output.burst_count - 1); 2216 } else { 2217 o += fprintf(stderr, "%d ", cf->output.array_base); 2218 o += print_indent(o, 55); 2219 o += fprintf(stderr, "R%d.", cf->output.gpr); 2220 } 2221 for (i = 0; i < 4; ++i) { 2222 if (cf->output.comp_mask & (1 << i)) 2223 o += print_swizzle(i); 2224 else 2225 o += print_swizzle(7); 2226 } 2227 2228 if (cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND || 2229 cf->output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND) 2230 o += fprintf(stderr, " R%d", cf->output.index_gpr); 2231 2232 o += print_indent(o, 67); 2233 2234 fprintf(stderr, " ES:%i ", cf->output.elem_size); 2235 if (cf->output.array_size != 0xFFF) 2236 fprintf(stderr, "AS:%i ", cf->output.array_size); 2237 if (cf->mark) 2238 fprintf(stderr, "MARK "); 2239 if (!cf->barrier) 2240 fprintf(stderr, "NO_BARRIER "); 2241 if (cf->end_of_program) 2242 fprintf(stderr, "EOP "); 2243 2244 if (cf->output.mark) 2245 fprintf(stderr, "MARK "); 2246 2247 fprintf(stderr, "\n"); 2248 } else { 2249 fprintf(stderr, "%04d %08X %08X %s ", id, bc->bytecode[id], 2250 bc->bytecode[id + 1], cfop->name); 2251 fprintf(stderr, "@%d ", cf->cf_addr); 2252 if (cf->cond) 2253 fprintf(stderr, "CND:%X ", cf->cond); 2254 if (cf->pop_count) 2255 fprintf(stderr, "POP:%X ", cf->pop_count); 2256 if (cf->count && (cfop->flags & CF_EMIT)) 2257 fprintf(stderr, "STREAM%d ", cf->count); 2258 if (cf->vpm) 2259 
fprintf(stderr, "VPM "); 2260 if (cf->end_of_program) 2261 fprintf(stderr, "EOP "); 2262 fprintf(stderr, "\n"); 2263 } 2264 } 2265 2266 id = cf->addr; 2267 nliteral = 0; 2268 last = 1; 2269 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) { 2270 const char *omod_str[] = {"","*2","*4","/2"}; 2271 const struct alu_op_info *aop = r600_isa_alu(alu->op); 2272 int o = 0; 2273 2274 r600_bytecode_alu_nliterals(alu, literal, &nliteral); 2275 o += fprintf(stderr, " %04d %08X %08X ", id, bc->bytecode[id], bc->bytecode[id+1]); 2276 if (last) 2277 o += fprintf(stderr, "%4d ", ++ngr); 2278 else 2279 o += fprintf(stderr, " "); 2280 o += fprintf(stderr, "%c%c %c ", alu->execute_mask ? 'M':' ', 2281 alu->update_pred ? 'P':' ', 2282 alu->pred_sel ? alu->pred_sel==2 ? '0':'1':' '); 2283 2284 o += fprintf(stderr, "%s%s%s ", aop->name, 2285 omod_str[alu->omod], alu->dst.clamp ? "_sat":""); 2286 2287 o += print_indent(o,60); 2288 o += print_dst(alu); 2289 for (i = 0; i < aop->src_count; ++i) { 2290 o += fprintf(stderr, i == 0 ? 
", ": ", "); 2291 o += print_src(alu, i); 2292 } 2293 2294 if (alu->bank_swizzle) { 2295 o += print_indent(o,75); 2296 o += fprintf(stderr, " BS:%d", alu->bank_swizzle); 2297 } 2298 2299 fprintf(stderr, "\n"); 2300 id += 2; 2301 2302 if (alu->last) { 2303 for (i = 0; i < nliteral; i++, id++) { 2304 float *f = (float*)(bc->bytecode + id); 2305 o = fprintf(stderr, " %04d %08X", id, bc->bytecode[id]); 2306 print_indent(o, 60); 2307 fprintf(stderr, " %f (%d)\n", *f, *(bc->bytecode + id)); 2308 } 2309 id += nliteral & 1; 2310 nliteral = 0; 2311 } 2312 last = alu->last; 2313 } 2314 2315 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) { 2316 int o = 0; 2317 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2318 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2319 2320 o += fprintf(stderr, "%s ", r600_isa_fetch(tex->op)->name); 2321 2322 o += print_indent(o, 50); 2323 2324 o += fprintf(stderr, "R%d.", tex->dst_gpr); 2325 o += print_swizzle(tex->dst_sel_x); 2326 o += print_swizzle(tex->dst_sel_y); 2327 o += print_swizzle(tex->dst_sel_z); 2328 o += print_swizzle(tex->dst_sel_w); 2329 2330 o += fprintf(stderr, ", R%d.", tex->src_gpr); 2331 o += print_swizzle(tex->src_sel_x); 2332 o += print_swizzle(tex->src_sel_y); 2333 o += print_swizzle(tex->src_sel_z); 2334 o += print_swizzle(tex->src_sel_w); 2335 2336 o += fprintf(stderr, ", RID:%d", tex->resource_id); 2337 o += fprintf(stderr, ", SID:%d ", tex->sampler_id); 2338 2339 if (tex->sampler_index_mode) 2340 fprintf(stderr, "SQ_%s ", index_mode[tex->sampler_index_mode]); 2341 2342 if (tex->lod_bias) 2343 fprintf(stderr, "LB:%d ", tex->lod_bias); 2344 2345 fprintf(stderr, "CT:%c%c%c%c ", 2346 tex->coord_type_x ? 'N' : 'U', 2347 tex->coord_type_y ? 'N' : 'U', 2348 tex->coord_type_z ? 'N' : 'U', 2349 tex->coord_type_w ? 
'N' : 'U'); 2350 2351 if (tex->offset_x) 2352 fprintf(stderr, "OX:%d ", tex->offset_x); 2353 if (tex->offset_y) 2354 fprintf(stderr, "OY:%d ", tex->offset_y); 2355 if (tex->offset_z) 2356 fprintf(stderr, "OZ:%d ", tex->offset_z); 2357 2358 id += 4; 2359 fprintf(stderr, "\n"); 2360 } 2361 2362 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) { 2363 int o = 0; 2364 const char * fetch_type[] = {"VERTEX", "INSTANCE", ""}; 2365 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2366 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2367 2368 o += fprintf(stderr, "%s ", r600_isa_fetch(vtx->op)->name); 2369 2370 o += print_indent(o, 50); 2371 2372 o += fprintf(stderr, "R%d.", vtx->dst_gpr); 2373 o += print_swizzle(vtx->dst_sel_x); 2374 o += print_swizzle(vtx->dst_sel_y); 2375 o += print_swizzle(vtx->dst_sel_z); 2376 o += print_swizzle(vtx->dst_sel_w); 2377 2378 o += fprintf(stderr, ", R%d.", vtx->src_gpr); 2379 o += print_swizzle(vtx->src_sel_x); 2380 if (r600_isa_fetch(vtx->op)->flags & FF_MEM) 2381 o += print_swizzle(vtx->src_sel_y); 2382 2383 if (vtx->offset) 2384 fprintf(stderr, " +%db", vtx->offset); 2385 2386 o += print_indent(o, 55); 2387 2388 fprintf(stderr, ", RID:%d ", vtx->buffer_id); 2389 2390 fprintf(stderr, "%s ", fetch_type[vtx->fetch_type]); 2391 2392 if (bc->chip_class < CAYMAN && vtx->mega_fetch_count) 2393 fprintf(stderr, "MFC:%d ", vtx->mega_fetch_count); 2394 2395 if (bc->chip_class >= EVERGREEN && vtx->buffer_index_mode) 2396 fprintf(stderr, "SQ_%s ", index_mode[vtx->buffer_index_mode]); 2397 2398 if (r600_isa_fetch(vtx->op)->flags & FF_MEM) { 2399 if (vtx->uncached) 2400 fprintf(stderr, "UNCACHED "); 2401 if (vtx->indexed) 2402 fprintf(stderr, "INDEXED:%d ", vtx->indexed); 2403 2404 fprintf(stderr, "ELEM_SIZE:%d ", vtx->elem_size); 2405 if (vtx->burst_count) 2406 fprintf(stderr, "BURST_COUNT:%d ", vtx->burst_count); 2407 fprintf(stderr, "ARRAY_BASE:%d ", vtx->array_base); 2408 fprintf(stderr, "ARRAY_SIZE:%d ", vtx->array_size); 2409 } 2410 2411 
fprintf(stderr, "UCF:%d ", vtx->use_const_fields); 2412 fprintf(stderr, "FMT(DTA:%d ", vtx->data_format); 2413 fprintf(stderr, "NUM:%d ", vtx->num_format_all); 2414 fprintf(stderr, "COMP:%d ", vtx->format_comp_all); 2415 fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all); 2416 2417 id += 4; 2418 } 2419 2420 LIST_FOR_EACH_ENTRY(gds, &cf->gds, list) { 2421 int o = 0; 2422 o += fprintf(stderr, " %04d %08X %08X %08X ", id, bc->bytecode[id], 2423 bc->bytecode[id + 1], bc->bytecode[id + 2]); 2424 2425 o += fprintf(stderr, "%s ", r600_isa_fetch(gds->op)->name); 2426 2427 if (gds->op != FETCH_OP_TF_WRITE) { 2428 o += fprintf(stderr, "R%d.", gds->dst_gpr); 2429 o += print_swizzle(gds->dst_sel_x); 2430 o += print_swizzle(gds->dst_sel_y); 2431 o += print_swizzle(gds->dst_sel_z); 2432 o += print_swizzle(gds->dst_sel_w); 2433 } 2434 2435 o += fprintf(stderr, ", R%d.", gds->src_gpr); 2436 o += print_swizzle(gds->src_sel_x); 2437 o += print_swizzle(gds->src_sel_y); 2438 o += print_swizzle(gds->src_sel_z); 2439 2440 if (gds->op != FETCH_OP_TF_WRITE) { 2441 o += fprintf(stderr, ", R%d.", gds->src_gpr2); 2442 } 2443 if (gds->alloc_consume) { 2444 o += fprintf(stderr, " UAV: %d", gds->uav_id); 2445 if (gds->uav_index_mode) 2446 o += fprintf(stderr, "[%s]", index_mode[gds->uav_index_mode]); 2447 } 2448 fprintf(stderr, "\n"); 2449 id += 4; 2450 } 2451 } 2452 2453 fprintf(stderr, "--------------------------------------\n"); 2454} 2455 

/**
 * Translate a gallium vertex format into the values programmed into a
 * vertex fetch instruction.
 *
 * \param pformat      gallium format to translate
 * \param format       receives the FMT_* data format; stays 0 when no
 *                     mapping is found
 * \param num_format   receives 1 for pure-integer channels, 2 for other
 *                     non-normalized integer channels, 0 otherwise
 * \param format_comp  receives 1 when the first non-VOID channel is signed
 * \param endian       receives the r600_endian_swap() result for the
 *                     channel size, or ENDIAN_NONE
 *
 * All outputs are reset before any lookup.  Formats that cannot be
 * translated are reported through R600_ERR; the function has no other way
 * to signal failure.
 */
void r600_vertex_data_type(enum pipe_format pformat,
			   unsigned *format,
			   unsigned *num_format, unsigned *format_comp, unsigned *endian)
{
	const struct util_format_description *desc;
	unsigned i;

	*format = 0;
	*num_format = 0;
	*format_comp = 0;
	*endian = ENDIAN_NONE;

	/* Packed formats are matched by name before the generic channel
	 * walk below. */
	if (pformat == PIPE_FORMAT_R11G11B10_FLOAT) {
		*format = FMT_10_11_11_FLOAT;
		*endian = r600_endian_swap(32);
		return;
	}

	if (pformat == PIPE_FORMAT_B5G6R5_UNORM) {
		*format = FMT_5_6_5;
		*endian = r600_endian_swap(16);
		return;
	}

	if (pformat == PIPE_FORMAT_B5G5R5A1_UNORM) {
		*format = FMT_1_5_5_5;
		*endian = r600_endian_swap(16);
		return;
	}

	if (pformat == PIPE_FORMAT_A1B5G5R5_UNORM) {
		*format = FMT_5_5_5_1;
		return;
	}

	desc = util_format_description(pformat);
	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
		goto out_unknown;
	}

	/* Find the first non-VOID channel. */
	for (i = 0; i < 4; i++) {
		if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
			break;
		}
	}

	/* Every channel is VOID: there is nothing to describe, and indexing
	 * channel[4] below would read past the end of the array. */
	if (i == 4) {
		goto out_unknown;
	}

	*endian = r600_endian_swap(desc->channel[i].size);

	switch (desc->channel[i].type) {
	/* Half-floats, floats, ints */
	case UTIL_FORMAT_TYPE_FLOAT:
		switch (desc->channel[i].size) {
		case 16:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_16_FLOAT;
				break;
			case 2:
				*format = FMT_16_16_FLOAT;
				break;
			case 3:
			case 4:
				*format = FMT_16_16_16_16_FLOAT;
				break;
			}
			break;
		case 32:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_32_FLOAT;
				break;
			case 2:
				*format = FMT_32_32_FLOAT;
				break;
			case 3:
				*format = FMT_32_32_32_FLOAT;
				break;
			case 4:
				*format = FMT_32_32_32_32_FLOAT;
				break;
			}
			break;
		default:
			goto out_unknown;
		}
		break;
	/* Unsigned ints */
	case UTIL_FORMAT_TYPE_UNSIGNED:
	/* Signed ints */
	case UTIL_FORMAT_TYPE_SIGNED:
		switch (desc->channel[i].size) {
		case 4:
			switch (desc->nr_channels) {
			case 2:
				*format = FMT_4_4;
				break;
			case 4:
				*format = FMT_4_4_4_4;
				break;
			}
			break;
		case 8:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_8;
				break;
			case 2:
				*format = FMT_8_8;
				break;
			case 3:
			case 4:
				*format = FMT_8_8_8_8;
				break;
			}
			break;
		case 10:
			if (desc->nr_channels != 4)
				goto out_unknown;

			*format = FMT_2_10_10_10;
			break;
		case 16:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_16;
				break;
			case 2:
				*format = FMT_16_16;
				break;
			case 3:
			case 4:
				*format = FMT_16_16_16_16;
				break;
			}
			break;
		case 32:
			switch (desc->nr_channels) {
			case 1:
				*format = FMT_32;
				break;
			case 2:
				*format = FMT_32_32;
				break;
			case 3:
				*format = FMT_32_32_32;
				break;
			case 4:
				*format = FMT_32_32_32_32;
				break;
			}
			break;
		default:
			goto out_unknown;
		}
		break;
	default:
		goto out_unknown;
	}

	if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
		*format_comp = 1;
	}

	/* *num_format is still 0 from the reset above; only non-normalized
	 * integer channels need a different value. */
	if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
	    desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
		if (!desc->channel[i].normalized) {
			if (desc->channel[i].pure_integer)
				*num_format = 1;
			else
				*num_format = 2;
		}
	}
	return;
out_unknown:
	R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
}

/**
 * Build the fetch shader for a set of vertex elements and upload it.
 *
 * For every element with an instance divisor greater than one, a
 * MULHI_UINT by a literal reciprocal is emitted first, writing R(i+1).w;
 * then one vertex fetch per element loads into R(i+1), followed by a
 * CF RET.
 *
 * \param ctx       pipe context (an r600_context)
 * \param count     number of entries in \p elements; must be below 32
 * \param elements  vertex element descriptions
 * \return the new r600_fetch_shader, or NULL on any failure (bytecode
 *         construction, allocation, or buffer mapping)
 */
void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
				      unsigned count,
				      const struct pipe_vertex_element *elements)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_bytecode bc;
	struct r600_bytecode_vtx vtx;
	const struct util_format_description *desc;
	unsigned fetch_resource_start = rctx->b.chip_class >= EVERGREEN ? 0 : 160;
	unsigned format, num_format, format_comp, endian;
	uint32_t *bytecode;
	unsigned i, j, fs_size;
	struct r600_fetch_shader *shader;
	unsigned no_sb = (rctx->screen->b.debug_flags & DBG_NO_SB) ||
			 (rctx->screen->b.debug_flags & DBG_NIR);
	unsigned sb_disasm = !no_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	/* Cayman gets the instruction in all four slots with only the last
	 * one (w) writing; other chips get a single instruction on w. */
	const unsigned first_chan = rctx->b.chip_class == CAYMAN ? 0 : 3;

	assert(count < 32);

	memset(&bc, 0, sizeof(bc));
	r600_bytecode_init(&bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	bc.isa = rctx->isa;

	for (i = 0; i < count; i++) {
		if (elements[i].instance_divisor <= 1)
			continue;

		for (j = first_chan; j < 4; j++) {
			struct r600_bytecode_alu alu;

			memset(&alu, 0, sizeof(alu));
			alu.op = ALU_OP2_MULHI_UINT;
			alu.src[0].sel = 0;
			alu.src[0].chan = 3;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
			alu.dst.sel = i + 1;
			alu.dst.chan = j;
			alu.dst.write = j == 3;
			alu.last = j == 3;
			if (r600_bytecode_add_alu(&bc, &alu))
				goto fail;
		}
	}

	for (i = 0; i < count; i++) {
		/* Validate the description before the format is translated. */
		desc = util_format_description(elements[i].src_format);
		if (!desc) {
			R600_ERR("unknown format %d\n", elements[i].src_format);
			goto fail;
		}

		if (elements[i].src_offset > 65535) {
			R600_ERR("too big src_offset: %u\n", elements[i].src_offset);
			goto fail;
		}

		r600_vertex_data_type(elements[i].src_format,
				      &format, &num_format, &format_comp, &endian);

		memset(&vtx, 0, sizeof(vtx));
		vtx.buffer_id = elements[i].vertex_buffer_index + fetch_resource_start;
		vtx.fetch_type = elements[i].instance_divisor ? SQ_VTX_FETCH_INSTANCE_DATA : SQ_VTX_FETCH_VERTEX_DATA;
		vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
		vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
		vtx.mega_fetch_count = 0x1F;
		vtx.dst_gpr = i + 1;
		vtx.dst_sel_x = desc->swizzle[0];
		vtx.dst_sel_y = desc->swizzle[1];
		vtx.dst_sel_z = desc->swizzle[2];
		vtx.dst_sel_w = desc->swizzle[3];
		vtx.data_format = format;
		vtx.num_format_all = num_format;
		vtx.format_comp_all = format_comp;
		vtx.offset = elements[i].src_offset;
		vtx.endian = endian;

		if (r600_bytecode_add_vtx(&bc, &vtx))
			goto fail;
	}

	/* A fetch shader without its RET must not be built or uploaded. */
	if (r600_bytecode_add_cfinst(&bc, CF_OP_RET))
		goto fail;

	if (r600_bytecode_build(&bc))
		goto fail;

	if (rctx->screen->b.debug_flags & DBG_FS) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		fprintf(stderr, "Vertex elements state:\n");
		for (i = 0; i < count; i++) {
			fprintf(stderr, " ");
			util_dump_vertex_element(stderr, elements+i);
			fprintf(stderr, "\n");
		}

		if (!sb_disasm) {
			r600_bytecode_disasm(&bc);

			fprintf(stderr, "______________________________________________________________\n");
		} else {
			r600_sb_bytecode_process(rctx, &bc, NULL, 1 /*dump*/, 0 /*optimize*/);
		}
	}

	fs_size = bc.ndw*4;

	/* Allocate the CSO. */
	shader = CALLOC_STRUCT(r600_fetch_shader);
	if (!shader)
		goto fail;

	u_suballocator_alloc(&rctx->allocator_fetch_shader, fs_size, 256,
			     &shader->offset,
			     (struct pipe_resource**)&shader->buffer);
	if (!shader->buffer) {
		FREE(shader);
		goto fail;
	}

	bytecode = r600_buffer_map_sync_with_rings
		(&rctx->b, shader->buffer,
		PIPE_MAP_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
	if (!bytecode) {
		/* Mapping failed: drop the suballocated buffer instead of
		 * writing through a NULL pointer. */
		r600_resource_reference(&shader->buffer, NULL);
		FREE(shader);
		goto fail;
	}
	bytecode += shader->offset / 4;

	if (R600_BIG_ENDIAN) {
		for (i = 0; i < fs_size / 4; ++i) {
			bytecode[i] = util_cpu_to_le32(bc.bytecode[i]);
		}
	} else {
		memcpy(bytecode, bc.bytecode, fs_size);
	}
	rctx->b.ws->buffer_unmap(rctx->b.ws, shader->buffer->buf);

	r600_bytecode_clear(&bc);
	return shader;

fail:
	r600_bytecode_clear(&bc);
	return NULL;
}

/**
 * Decode the two dwords of an ALU instruction into \p alu.
 *
 * \param bc     bytecode context; only bc->isa is used, for opcode lookup
 * \param alu    instruction to fill in; fields not covered by the
 *               detected encoding are left untouched
 * \param word0  first instruction dword (sources, index mode, predicate,
 *               last bit)
 * \param word1  second instruction dword (destination, bank swizzle and
 *               the OP2- or OP3-specific fields)
 */
void r600_bytecode_alu_read(struct r600_bytecode *bc,
		struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1)
{
	/* WORD0 */
	alu->src[0].sel = G_SQ_ALU_WORD0_SRC0_SEL(word0);
	alu->src[0].rel = G_SQ_ALU_WORD0_SRC0_REL(word0);
	alu->src[0].chan = G_SQ_ALU_WORD0_SRC0_CHAN(word0);
	alu->src[0].neg = G_SQ_ALU_WORD0_SRC0_NEG(word0);
	alu->src[1].sel = G_SQ_ALU_WORD0_SRC1_SEL(word0);
	alu->src[1].rel = G_SQ_ALU_WORD0_SRC1_REL(word0);
	alu->src[1].chan = G_SQ_ALU_WORD0_SRC1_CHAN(word0);
	alu->src[1].neg = G_SQ_ALU_WORD0_SRC1_NEG(word0);
	alu->index_mode = G_SQ_ALU_WORD0_INDEX_MODE(word0);
	alu->pred_sel = G_SQ_ALU_WORD0_PRED_SEL(word0);
	alu->last = G_SQ_ALU_WORD0_LAST(word0);

	/* WORD1 */
	alu->bank_swizzle = G_SQ_ALU_WORD1_BANK_SWIZZLE(word1);
	/* A non-zero swizzle read from the bytecode is also recorded as
	 * forced. */
	if (alu->bank_swizzle)
		alu->bank_swizzle_force = alu->bank_swizzle;
	alu->dst.sel = G_SQ_ALU_WORD1_DST_GPR(word1);
	alu->dst.rel = G_SQ_ALU_WORD1_DST_REL(word1);
	alu->dst.chan = G_SQ_ALU_WORD1_DST_CHAN(word1);
	alu->dst.clamp = G_SQ_ALU_WORD1_CLAMP(word1);
	if (G_SQ_ALU_WORD1_ENCODING(word1)) /*ALU_DWORD1_OP3*/
	{
		alu->is_op3 = 1;
		alu->src[2].sel = G_SQ_ALU_WORD1_OP3_SRC2_SEL(word1);
		alu->src[2].rel = G_SQ_ALU_WORD1_OP3_SRC2_REL(word1);
		alu->src[2].chan = G_SQ_ALU_WORD1_OP3_SRC2_CHAN(word1);
		alu->src[2].neg = G_SQ_ALU_WORD1_OP3_SRC2_NEG(word1);
		alu->op = r600_isa_alu_by_opcode(bc->isa,
				G_SQ_ALU_WORD1_OP3_ALU_INST(word1), /* is_op3 = */ 1);

	}
	else /*ALU_DWORD1_OP2*/
	{
		alu->src[0].abs = G_SQ_ALU_WORD1_OP2_SRC0_ABS(word1);
		alu->src[1].abs = G_SQ_ALU_WORD1_OP2_SRC1_ABS(word1);
		alu->op = r600_isa_alu_by_opcode(bc->isa,
				G_SQ_ALU_WORD1_OP2_ALU_INST(word1), /* is_op3 = */ 0);
		alu->omod = G_SQ_ALU_WORD1_OP2_OMOD(word1);
		alu->dst.write = G_SQ_ALU_WORD1_OP2_WRITE_MASK(word1);
		alu->update_pred = G_SQ_ALU_WORD1_OP2_UPDATE_PRED(word1);
		alu->execute_mask =
			G_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(word1);
	}
}

/* Disabled: decoder for export CF instructions, kept for reference. */
#if 0
void r600_bytecode_export_read(struct r600_bytecode *bc,
		struct r600_bytecode_output *output, uint32_t word0, uint32_t word1)
{
	output->array_base = G_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(word0);
	output->type = G_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(word0);
	output->gpr = G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(word0);
	output->elem_size = G_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(word0);

	output->swizzle_x = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(word1);
	output->swizzle_y = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(word1);
	output->swizzle_z = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(word1);
	output->swizzle_w = G_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(word1);
	output->burst_count = G_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(word1);
	output->end_of_program = G_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(word1);
	output->op = r600_isa_cf_by_opcode(bc->isa,
			G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(word1), 0);
	output->barrier = G_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(word1);
	output->array_size = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(word1);
	output->comp_mask = G_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(word1);
}
#endif