1/*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24/**
25 * \file brw_vec4_tes.cpp
26 *
27 * Tessellaton evaluation shader specific code derived from the vec4_visitor class.
28 */
29
30#include "brw_vec4_tes.h"
31#include "brw_cfg.h"
32#include "dev/gen_debug.h"
33
34namespace brw {
35
/**
 * Create a visitor that compiles a tessellation evaluation shader (the
 * hardware DS stage) down to vec4 IR.
 *
 * All state lives in the base vec4_visitor; nothing TES-specific is
 * initialized here.  NOTE(review): the hard-coded "false" is presumably the
 * base class's no-spills flag — confirm against the vec4_visitor
 * constructor declaration.
 */
vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler,
                                  void *log_data,
                                  const struct brw_tes_prog_key *key,
                                  struct brw_tes_prog_data *prog_data,
                                  const nir_shader *shader,
                                  void *mem_ctx,
                                  int shader_time_index)
   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
                  shader, mem_ctx, false, shader_time_index)
{
}
47
/**
 * Lay out the thread payload and rewrite every ATTR source into a direct
 * GRF reference now that the location of pushed input data is known.
 *
 * Payload layout: r0-r1 (fixed header / URB handles), then push constants,
 * then pushed URB input slots (two vec4 slots per GRF).
 */
void
vec4_tes_visitor::setup_payload()
{
   int reg = 0;

   /* The payload always contains important data in r0 and r1, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread.
    */
   reg += 2;

   reg = setup_uniforms(reg);

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         bool is_64bit = type_sz(inst->src[i].type) == 8;

         /* Each GRF holds two vec4 slots: slot / 2 selects the register,
          * slot % 2 selects its first or second half (subnr 0 or 4).
          */
         unsigned slot = inst->src[i].nr + inst->src[i].offset / 16;
         struct brw_reg grf = brw_vec4_grf(reg + slot / 2, 4 * (slot % 2));
         grf = stride(grf, 0, is_64bit ? 2 : 4, 1);
         grf.swizzle = inst->src[i].swizzle;
         grf.type = inst->src[i].type;
         grf.abs = inst->src[i].abs;
         grf.negate = inst->src[i].negate;

         /* For 64-bit attributes we can end up with components XY in the
          * second half of a register and components ZW in the first half
          * of the next. Fix it up here.
          */
         if (is_64bit && grf.subnr > 0) {
            /* We can't do swizzles that mix XY and ZW channels in this case.
             * Such cases should have been handled by the scalarization pass.
             */
            assert((brw_mask_for_swizzle(grf.swizzle) & 0x3) ^
                   (brw_mask_for_swizzle(grf.swizzle) & 0xc));
            if (brw_mask_for_swizzle(grf.swizzle) & 0xc) {
               /* Rebase a ZW-only swizzle so it reads the XY half of the
                * following register instead.
                */
               grf.subnr = 0;
               grf.nr++;
               grf.swizzle -= BRW_SWIZZLE_ZZZZ;
            }
         }

         inst->src[i] = grf;
      }
   }

   /* Skip past the pushed URB input data.  NOTE(review): the factor of 8
    * implies urb_read_length is counted in units of 8 GRFs here — verify
    * against how urb_read_length is accumulated in nir_emit_intrinsic().
    */
   reg += 8 * prog_data->urb_read_length;

   this->first_non_payload_grf = reg;
}
101
102
103void
104vec4_tes_visitor::emit_prolog()
105{
106   input_read_header = src_reg(this, glsl_type::uvec4_type);
107   emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header));
108
109   this->current_annotation = NULL;
110}
111
112
113void
114vec4_tes_visitor::emit_urb_write_header(int mrf)
115{
116   /* No need to do anything for DS; an implied write to this MRF will be
117    * performed by VS_OPCODE_URB_WRITE.
118    */
119   (void) mrf;
120}
121
122
123vec4_instruction *
124vec4_tes_visitor::emit_urb_write_opcode(bool complete)
125{
126   /* For DS, the URB writes end the thread. */
127   if (complete) {
128      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
129         emit_shader_time_end();
130   }
131
132   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
133   inst->urb_write_flags = complete ?
134      BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
135
136   return inst;
137}
138
/**
 * Emit vec4 IR for TES-specific NIR intrinsics (tess coord/levels,
 * primitive ID, and input loads); everything else is deferred to the
 * generic vec4_visitor implementation.
 */
void
vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   const struct brw_tes_prog_data *tes_prog_data =
      (const struct brw_tes_prog_data *) prog_data;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_tess_coord:
      /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
               src_reg(brw_vec8_grf(1, 0))));
      break;
   case nir_intrinsic_load_tess_level_outer:
      /* The tess levels sit in the patch header; the swizzles reorder the
       * stored components into gl_TessLevelOuter[] order for each domain.
       */
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_ZWZW)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      }
      break;
   case nir_intrinsic_load_tess_level_inner:
      /* Only the quad domain has two inner levels; tri has one scalar. */
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 0, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  src_reg(ATTR, 1, glsl_type::float_type)));
      }
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TES_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];
      src_reg header = input_read_header;
      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
      unsigned first_component = nir_intrinsic_component(instr);
      /* A 64-bit component occupies two 32-bit slots. */
      if (is_64bit)
         first_component /= 2;

      if (indirect_offset.file != BAD_FILE) {
         /* Indirect access: build a per-instruction read header with the
          * (clamped) offset added in, then fall through to the URB read.
          */
         src_reg clamped_indirect_offset = src_reg(this, glsl_type::uvec4_type);

         /* Page 190 of "Volume 7: 3D Media GPGPU Engine (Haswell)" says the
          * valid range of the offset is [0, 0FFFFFFFh].
          */
         emit_minmax(BRW_CONDITIONAL_L,
                     dst_reg(clamped_indirect_offset),
                     retype(indirect_offset, BRW_REGISTER_TYPE_UD),
                     brw_imm_ud(0x0fffffffu));

         header = src_reg(this, glsl_type::uvec4_type);
         emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
              input_read_header, clamped_indirect_offset);
      } else {
         /* Arbitrarily only push up to 24 vec4 slots worth of data,
          * which is 12 registers (since each holds 2 vec4 slots).
          */
         const unsigned max_push_slots = 24;
         if (imm_offset < max_push_slots) {
            /* Pushed path: read directly from the ATTR file; setup_payload()
             * later rewrites this into a GRF reference.
             */
            const glsl_type *src_glsl_type =
               is_64bit ? glsl_type::dvec4_type : glsl_type::ivec4_type;
            src_reg src = src_reg(ATTR, imm_offset, src_glsl_type);
            src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

            const brw_reg_type dst_reg_type =
               is_64bit ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_D;
            emit(MOV(get_nir_dest(instr->dest, dst_reg_type), src));

            /* Grow the pushed range to cover this slot (64-bit data spans
             * two slots); DIV_ROUND_UP converts slots to slot pairs.
             */
            prog_data->urb_read_length =
               MAX2(prog_data->urb_read_length,
                    DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2));
            break;
         }
      }

      /* Pull path: fetch the data with explicit URB read messages. */
      if (!is_64bit) {
         dst_reg temp(this, glsl_type::ivec4_type);
         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         src_reg src = src_reg(temp);
         src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         /* Copy to target.  We might end up with some funky writemasks landing
          * in here, but we really don't want them in the above pseudo-ops.
          */
         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src));
      } else {
         /* For 64-bit we need to load twice as many 32-bit components, and for
          * dvec3/4 we need to emit 2 URB Read messages
          */
         dst_reg temp(this, glsl_type::dvec4_type);
         dst_reg temp_d = retype(temp, BRW_REGISTER_TYPE_D);

         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp_d, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         if (instr->num_components > 2) {
            /* Second message reads the following vec4 slot into the second
             * register of the destination pair.
             */
            read = emit(VEC4_OPCODE_URB_READ, byte_offset(temp_d, REG_SIZE),
                        src_reg(header));
            read->offset = imm_offset + 1;
            read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
         }

         src_reg temp_as_src = src_reg(temp);
         temp_as_src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         /* Convert from the URB's split 32-bit layout into packed doubles. */
         dst_reg shuffled(this, glsl_type::dvec4_type);
         shuffle_64bit_data(shuffled, temp_as_src, false);

         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src_reg(shuffled)));
      }
      break;
   }
   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
274
275
276void
277vec4_tes_visitor::emit_thread_end()
278{
279   /* For DS, we always end the thread by emitting a single vertex.
280    * emit_urb_write_opcode() will take care of setting the eot flag on the
281    * SEND instruction.
282    */
283   emit_vertex();
284}
285
286} /* namespace brw */
287