1/**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28/**
29 * @file
30 * Texture sampling -- SoA.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36#include "pipe/p_defines.h"
37#include "pipe/p_state.h"
38#include "pipe/p_shader_tokens.h"
39#include "util/u_debug.h"
40#include "util/u_dump.h"
41#include "util/u_memory.h"
42#include "util/u_math.h"
43#include "util/u_format.h"
44#include "util/u_cpu_detect.h"
45#include "util/format_rgb9e5.h"
46#include "lp_bld_debug.h"
47#include "lp_bld_type.h"
48#include "lp_bld_const.h"
49#include "lp_bld_conv.h"
50#include "lp_bld_arit.h"
51#include "lp_bld_bitarit.h"
52#include "lp_bld_logic.h"
53#include "lp_bld_printf.h"
54#include "lp_bld_swizzle.h"
55#include "lp_bld_flow.h"
56#include "lp_bld_gather.h"
57#include "lp_bld_format.h"
58#include "lp_bld_sample.h"
59#include "lp_bld_sample_aos.h"
60#include "lp_bld_struct.h"
61#include "lp_bld_quad.h"
62#include "lp_bld_pack.h"
63#include "lp_bld_intr.h"
64
65
66/**
67 * Generate code to fetch a texel from a texture at int coords (x, y, z).
68 * The computation depends on whether the texture is 1D, 2D or 3D.
69 * The result, texel, will be float vectors:
70 *   texel[0] = red values
71 *   texel[1] = green values
72 *   texel[2] = blue values
73 *   texel[3] = alpha values
74 */
75static void
76lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
77                          LLVMValueRef width,
78                          LLVMValueRef height,
79                          LLVMValueRef depth,
80                          LLVMValueRef x,
81                          LLVMValueRef y,
82                          LLVMValueRef z,
83                          LLVMValueRef y_stride,
84                          LLVMValueRef z_stride,
85                          LLVMValueRef data_ptr,
86                          LLVMValueRef mipoffsets,
87                          LLVMValueRef texel_out[4])
88{
89   const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
90   const unsigned dims = bld->dims;
91   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
92   LLVMBuilderRef builder = bld->gallivm->builder;
93   LLVMValueRef offset;
94   LLVMValueRef i, j;
95   LLVMValueRef use_border = NULL;
96
97   /* use_border = x < 0 || x >= width || y < 0 || y >= height */
98   if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
99                                              static_state->min_img_filter,
100                                              static_state->mag_img_filter)) {
101      LLVMValueRef b1, b2;
102      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
103      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
104      use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
105   }
106
107   if (dims >= 2 &&
108       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
109                                              static_state->min_img_filter,
110                                              static_state->mag_img_filter)) {
111      LLVMValueRef b1, b2;
112      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
113      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
114      if (use_border) {
115         use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
116         use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
117      }
118      else {
119         use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
120      }
121   }
122
123   if (dims == 3 &&
124       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
125                                              static_state->min_img_filter,
126                                              static_state->mag_img_filter)) {
127      LLVMValueRef b1, b2;
128      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
129      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
130      if (use_border) {
131         use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
132         use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
133      }
134      else {
135         use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
136      }
137   }
138
139   /* convert x,y,z coords to linear offset from start of texture, in bytes */
140   lp_build_sample_offset(&bld->int_coord_bld,
141                          bld->format_desc,
142                          x, y, z, y_stride, z_stride,
143                          &offset, &i, &j);
144   if (mipoffsets) {
145      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
146   }
147
148   if (use_border) {
149      /* If we can sample the border color, it means that texcoords may
150       * lie outside the bounds of the texture image.  We need to do
151       * something to prevent reading out of bounds and causing a segfault.
152       *
153       * Simply AND the texture coords with !use_border.  This will cause
154       * coords which are out of bounds to become zero.  Zero's guaranteed
155       * to be inside the texture image.
156       */
157      offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
158   }
159
160   lp_build_fetch_rgba_soa(bld->gallivm,
161                           bld->format_desc,
162                           bld->texel_type, TRUE,
163                           data_ptr, offset,
164                           i, j,
165                           bld->cache,
166                           texel_out);
167
168   /*
169    * Note: if we find an app which frequently samples the texture border
170    * we might want to implement a true conditional here to avoid sampling
171    * the texture whenever possible (since that's quite a bit of code).
172    * Ex:
173    *   if (use_border) {
174    *      texel = border_color;
175    *   }
176    *   else {
177    *      texel = sample_texture(coord);
178    *   }
179    * As it is now, we always sample the texture, then selectively replace
180    * the texel color results with the border color.
181    */
182
183   if (use_border) {
184      /* select texel color or border color depending on use_border. */
185      const struct util_format_description *format_desc = bld->format_desc;
186      int chan;
187      struct lp_type border_type = bld->texel_type;
188      border_type.length = 4;
189      /*
190       * Only replace channels which are actually present. The others should
191       * get optimized away eventually by sampler_view swizzle anyway but it's
192       * easier too.
193       */
194      for (chan = 0; chan < 4; chan++) {
195         unsigned chan_s;
196         /* reverse-map channel... */
197         for (chan_s = 0; chan_s < 4; chan_s++) {
198            if (chan_s == format_desc->swizzle[chan]) {
199               break;
200            }
201         }
202         if (chan_s <= 3) {
203            /* use the already clamped color */
204            LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
205            LLVMValueRef border_chan;
206
207            border_chan = lp_build_extract_broadcast(bld->gallivm,
208                                                     border_type,
209                                                     bld->texel_type,
210                                                     bld->border_color_clamped,
211                                                     idx);
212            texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
213                                              border_chan, texel_out[chan]);
214         }
215      }
216   }
217}
218
219
220/**
221 * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
222 * (Note that with pot sizes could do this much more easily post-scale
223 * with some bit arithmetic.)
224 */
225static LLVMValueRef
226lp_build_coord_mirror(struct lp_build_sample_context *bld,
227                      LLVMValueRef coord, boolean posOnly)
228{
229   struct lp_build_context *coord_bld = &bld->coord_bld;
230   LLVMValueRef fract;
231   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
232
233   /*
234    * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
235    * it all works out. (The result is in range [-1, 1.0], negative if
236    * the coord is in the "odd" section, otherwise positive.)
237    */
238
239   coord = lp_build_mul(coord_bld, coord, half);
240   fract = lp_build_round(coord_bld, coord);
241   fract = lp_build_sub(coord_bld, coord, fract);
242   coord = lp_build_add(coord_bld, fract, fract);
243
244   if (posOnly) {
245      /*
246       * Theoretically it's not quite 100% accurate because the spec says
247       * that ultimately a scaled coord of -x.0 should map to int coord
248       * -x + 1 with mirroring, not -x (this does not matter for bilinear
249       * filtering).
250       */
251      coord = lp_build_abs(coord_bld, coord);
252      /* kill off NaNs */
253      /* XXX: not safe without arch rounding, fract can be anything. */
254      coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
255                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
256   }
257
258   return coord;
259}
260
261
262/**
263 * Helper to compute the first coord and the weight for
264 * linear wrap repeat npot textures
265 */
266void
267lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
268                                  LLVMValueRef coord_f,
269                                  LLVMValueRef length_i,
270                                  LLVMValueRef length_f,
271                                  LLVMValueRef *coord0_i,
272                                  LLVMValueRef *weight_f)
273{
274   struct lp_build_context *coord_bld = &bld->coord_bld;
275   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
276   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
277   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
278                                                int_coord_bld->one);
279   LLVMValueRef mask;
280   /* wrap with normalized floats is just fract */
281   coord_f = lp_build_fract(coord_bld, coord_f);
282   /* mul by size and subtract 0.5 */
283   coord_f = lp_build_mul(coord_bld, coord_f, length_f);
284   coord_f = lp_build_sub(coord_bld, coord_f, half);
285   /*
286    * we avoided the 0.5/length division before the repeat wrap,
287    * now need to fix up edge cases with selects
288    */
289   /*
290    * Note we do a float (unordered) compare so we can eliminate NaNs.
291    * (Otherwise would need fract_safe above).
292    */
293   mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
294                           PIPE_FUNC_LESS, coord_f, coord_bld->zero);
295
296   /* convert to int, compute lerp weight */
297   lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
298   *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
299}
300
301
302/**
303 * Build LLVM code for texture wrap mode for linear filtering.
304 * \param x0_out  returns first integer texcoord
305 * \param x1_out  returns second integer texcoord
306 * \param weight_out  returns linear interpolation weight
307 */
308static void
309lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
310                            boolean is_gather,
311                            LLVMValueRef coord,
312                            LLVMValueRef length,
313                            LLVMValueRef length_f,
314                            LLVMValueRef offset,
315                            boolean is_pot,
316                            unsigned wrap_mode,
317                            LLVMValueRef *x0_out,
318                            LLVMValueRef *x1_out,
319                            LLVMValueRef *weight_out)
320{
321   struct lp_build_context *coord_bld = &bld->coord_bld;
322   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
323   LLVMBuilderRef builder = bld->gallivm->builder;
324   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
325   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
326   LLVMValueRef coord0, coord1, weight;
327
328   switch(wrap_mode) {
329   case PIPE_TEX_WRAP_REPEAT:
330      if (is_pot) {
331         /* mul by size and subtract 0.5 */
332         coord = lp_build_mul(coord_bld, coord, length_f);
333         coord = lp_build_sub(coord_bld, coord, half);
334         if (offset) {
335            offset = lp_build_int_to_float(coord_bld, offset);
336            coord = lp_build_add(coord_bld, coord, offset);
337         }
338         /* convert to int, compute lerp weight */
339         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
340         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
341         /* repeat wrap */
342         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
343         coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
344      }
345      else {
346         LLVMValueRef mask;
347         if (offset) {
348            offset = lp_build_int_to_float(coord_bld, offset);
349            offset = lp_build_div(coord_bld, offset, length_f);
350            coord = lp_build_add(coord_bld, coord, offset);
351         }
352         lp_build_coord_repeat_npot_linear(bld, coord,
353                                           length, length_f,
354                                           &coord0, &weight);
355         mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
356                                 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
357         coord1 = LLVMBuildAnd(builder,
358                               lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
359                               mask, "");
360      }
361      break;
362
363   case PIPE_TEX_WRAP_CLAMP:
364      if (bld->static_sampler_state->normalized_coords) {
365         /* scale coord to length */
366         coord = lp_build_mul(coord_bld, coord, length_f);
367      }
368      if (offset) {
369         offset = lp_build_int_to_float(coord_bld, offset);
370         coord = lp_build_add(coord_bld, coord, offset);
371      }
372
373      /*
374       * clamp to [0, length]
375       *
376       * Unlike some other wrap modes, this should be correct for gather
377       * too. GL_CLAMP explicitly does this clamp on the coord prior to
378       * actual wrapping (which is per sample).
379       */
380      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
381
382      coord = lp_build_sub(coord_bld, coord, half);
383
384      /* convert to int, compute lerp weight */
385      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
386      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
387      break;
388
389   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
390      {
391         struct lp_build_context abs_coord_bld = bld->coord_bld;
392         abs_coord_bld.type.sign = FALSE;
393
394         if (bld->static_sampler_state->normalized_coords) {
395            /* mul by tex size */
396            coord = lp_build_mul(coord_bld, coord, length_f);
397         }
398         if (offset) {
399            offset = lp_build_int_to_float(coord_bld, offset);
400            coord = lp_build_add(coord_bld, coord, offset);
401         }
402
403         /* clamp to length max */
404         coord = lp_build_min_ext(coord_bld, coord, length_f,
405                                  GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
406         if (!is_gather) {
407            /* subtract 0.5 */
408            coord = lp_build_sub(coord_bld, coord, half);
409            /* clamp to [0, length - 0.5] */
410            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
411            /* convert to int, compute lerp weight */
412            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
413            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
414         } else {
415            /*
416             * The non-gather path will end up with coords 0, 1 if coord was
417             * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
418             * really matter what the second coord is). But for gather, we
419             * really need to end up with coords 0, 0.
420             */
421            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
422            coord0 = lp_build_sub(coord_bld, coord, half);
423            coord1 = lp_build_add(coord_bld, coord, half);
424            /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5] */
425            coord0 = lp_build_itrunc(coord_bld, coord0);
426            coord1 = lp_build_itrunc(coord_bld, coord1);
427            weight = coord_bld->undef;
428         }
429         /* coord1 = min(coord1, length-1) */
430         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
431         break;
432      }
433
434   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435      if (bld->static_sampler_state->normalized_coords) {
436         /* scale coord to length */
437         coord = lp_build_mul(coord_bld, coord, length_f);
438      }
439      if (offset) {
440         offset = lp_build_int_to_float(coord_bld, offset);
441         coord = lp_build_add(coord_bld, coord, offset);
442      }
443      /*
444       * We don't need any clamp. Technically, for very large (pos or neg)
445       * (or infinite) values, clamp against [-length, length] would be
446       * correct, but we don't need to guarantee any specific
447       * result for such coords (the ifloor will be undefined, but for modes
448       * requiring border all resulting coords are safe).
449       */
450      coord = lp_build_sub(coord_bld, coord, half);
451      /* convert to int, compute lerp weight */
452      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
453      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
454      break;
455
456   case PIPE_TEX_WRAP_MIRROR_REPEAT:
457      if (offset) {
458         offset = lp_build_int_to_float(coord_bld, offset);
459         offset = lp_build_div(coord_bld, offset, length_f);
460         coord = lp_build_add(coord_bld, coord, offset);
461      }
462      if (!is_gather) {
463         /* compute mirror function */
464         coord = lp_build_coord_mirror(bld, coord, TRUE);
465
466         /* scale coord to length */
467         coord = lp_build_mul(coord_bld, coord, length_f);
468         coord = lp_build_sub(coord_bld, coord, half);
469
470         /* convert to int, compute lerp weight */
471         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
472         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
473
474         /* coord0 = max(coord0, 0) */
475         coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
476         /* coord1 = min(coord1, length-1) */
477         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
478      } else {
479         /*
480          * This is pretty reasonable in the end,  all what the tests care
481          * about is nasty edge cases (scaled coords x.5, so the individual
482          * coords are actually integers, which is REALLY tricky to get right
483          * due to this working differently both for negative numbers as well
484          * as for even/odd cases). But with enough magic it's not too complex
485          * after all.
486          * Maybe should try a bit arithmetic one though for POT textures...
487          */
488         LLVMValueRef isNeg;
489         /*
490          * Wrapping just once still works, even though it means we can
491          * get "wrong" sign due to performing mirror in the middle of the
492          * two coords (because this can only happen very near the odd/even
493          * edges, so both coords will actually end up as 0 or length - 1
494          * in the end).
495          * For GL4 gather with per-sample offsets we'd need to the mirroring
496          * per coord too.
497          */
498         coord = lp_build_coord_mirror(bld, coord, FALSE);
499         coord = lp_build_mul(coord_bld, coord, length_f);
500
501         /*
502          * NaNs should be safe here, we'll do away with them with
503          * the ones' complement plus min.
504          */
505         coord0 = lp_build_sub(coord_bld, coord, half);
506         coord0 = lp_build_ifloor(coord_bld, coord0);
507         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
508         /* ones complement for neg numbers (mirror(negX) = X - 1)  */
509         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
510                              coord0, int_coord_bld->zero);
511         coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
512         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
513                              coord1, int_coord_bld->zero);
514         coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
515         coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
516         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
517
518         weight = coord_bld->undef;
519      }
520      break;
521
522   case PIPE_TEX_WRAP_MIRROR_CLAMP:
523      if (bld->static_sampler_state->normalized_coords) {
524         /* scale coord to length */
525         coord = lp_build_mul(coord_bld, coord, length_f);
526      }
527      if (offset) {
528         offset = lp_build_int_to_float(coord_bld, offset);
529         coord = lp_build_add(coord_bld, coord, offset);
530      }
531      /*
532       * XXX: probably not correct for gather, albeit I'm not
533       * entirely sure as it's poorly specified. The wrapping looks
534       * correct according to the spec which is against gl 1.2.1,
535       * however negative values will be swapped - gl re-specified
536       * wrapping with newer versions (no more pre-clamp except with
537       * GL_CLAMP).
538       */
539      coord = lp_build_abs(coord_bld, coord);
540
541      /* clamp to [0, length] */
542      coord = lp_build_min_ext(coord_bld, coord, length_f,
543                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
544
545      coord = lp_build_sub(coord_bld, coord, half);
546
547      /* convert to int, compute lerp weight */
548      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
549      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
550      break;
551
552   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
553      {
554         struct lp_build_context abs_coord_bld = bld->coord_bld;
555         abs_coord_bld.type.sign = FALSE;
556
557         if (bld->static_sampler_state->normalized_coords) {
558            /* scale coord to length */
559            coord = lp_build_mul(coord_bld, coord, length_f);
560         }
561         if (offset) {
562            offset = lp_build_int_to_float(coord_bld, offset);
563            coord = lp_build_add(coord_bld, coord, offset);
564         }
565         if (!is_gather) {
566            coord = lp_build_abs(coord_bld, coord);
567
568            /* clamp to length max */
569            coord = lp_build_min_ext(coord_bld, coord, length_f,
570                                     GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
571            /* subtract 0.5 */
572            coord = lp_build_sub(coord_bld, coord, half);
573            /* clamp to [0, length - 0.5] */
574            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
575
576            /* convert to int, compute lerp weight */
577            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
578            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
579            /* coord1 = min(coord1, length-1) */
580            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
581         } else {
582            /*
583             * The non-gather path will swap coord0/1 if coord was negative,
584             * which is ok for filtering since the filter weight matches
585             * accordingly. Also, if coord is close to zero, coord0/1 will
586             * be 0 and 1, instead of 0 and 0 (again ok due to filter
587             * weight being 0.0). Both issues need to be fixed for gather.
588             */
589            LLVMValueRef isNeg;
590
591            /*
592             * Actually wanted to cheat here and use:
593             * coord1 = lp_build_iround(coord_bld, coord);
594             * but it's not good enough for some tests (even piglit
595             * textureGather is set up in a way so the coords area always
596             * .5, that is right at the crossover points).
597             * So do ordinary sub/floor, then do ones' complement
598             * for negative numbers.
599             * (Note can't just do sub|add/abs/itrunc per coord neither -
600             * because the spec demands that mirror(3.0) = 3 but
601             * mirror(-3.0) = 2.)
602             */
603            coord = lp_build_sub(coord_bld, coord, half);
604            coord0 = lp_build_ifloor(coord_bld, coord);
605            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
606            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
607                                 int_coord_bld->zero);
608            coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
609            coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
610
611            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
612                                 int_coord_bld->zero);
613            coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
614            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
615
616            weight = coord_bld->undef;
617         }
618      }
619      break;
620
621   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
622      {
623         if (bld->static_sampler_state->normalized_coords) {
624            /* scale coord to length */
625            coord = lp_build_mul(coord_bld, coord, length_f);
626         }
627         if (offset) {
628            offset = lp_build_int_to_float(coord_bld, offset);
629            coord = lp_build_add(coord_bld, coord, offset);
630         }
631         /*
632          * XXX: probably not correct for gather due to swapped
633          * order if coord is negative (same rationale as for
634          * MIRROR_CLAMP).
635          */
636         coord = lp_build_abs(coord_bld, coord);
637
638         /*
639          * We don't need any clamp. Technically, for very large
640          * (or infinite) values, clamp against length would be
641          * correct, but we don't need to guarantee any specific
642          * result for such coords (the ifloor will be undefined, but
643          * for modes requiring border all resulting coords are safe).
644          */
645         coord = lp_build_sub(coord_bld, coord, half);
646
647         /* convert to int, compute lerp weight */
648         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
649         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
650      }
651      break;
652
653   default:
654      assert(0);
655      coord0 = NULL;
656      coord1 = NULL;
657      weight = NULL;
658   }
659
660   *x0_out = coord0;
661   *x1_out = coord1;
662   *weight_out = weight;
663}
664
665
666/**
667 * Build LLVM code for texture wrap mode for nearest filtering.
668 * \param coord  the incoming texcoord (nominally in [0,1])
669 * \param length  the texture size along one dimension, as int vector
670 * \param length_f  the texture size along one dimension, as float vector
671 * \param offset  texel offset along one dimension (as int vector)
672 * \param is_pot  if TRUE, length is a power of two
673 * \param wrap_mode  one of PIPE_TEX_WRAP_x
674 */
675static LLVMValueRef
676lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
677                             LLVMValueRef coord,
678                             LLVMValueRef length,
679                             LLVMValueRef length_f,
680                             LLVMValueRef offset,
681                             boolean is_pot,
682                             unsigned wrap_mode)
683{
684   struct lp_build_context *coord_bld = &bld->coord_bld;
685   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
686   LLVMBuilderRef builder = bld->gallivm->builder;
687   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
688   LLVMValueRef icoord;
689
690   switch(wrap_mode) {
691   case PIPE_TEX_WRAP_REPEAT:
692      if (is_pot) {
693         coord = lp_build_mul(coord_bld, coord, length_f);
694         icoord = lp_build_ifloor(coord_bld, coord);
695         if (offset) {
696            icoord = lp_build_add(int_coord_bld, icoord, offset);
697         }
698         icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
699      }
700      else {
701          if (offset) {
702             offset = lp_build_int_to_float(coord_bld, offset);
703             offset = lp_build_div(coord_bld, offset, length_f);
704             coord = lp_build_add(coord_bld, coord, offset);
705          }
706          /* take fraction, unnormalize */
707          coord = lp_build_fract_safe(coord_bld, coord);
708          coord = lp_build_mul(coord_bld, coord, length_f);
709          icoord = lp_build_itrunc(coord_bld, coord);
710      }
711      break;
712
713   case PIPE_TEX_WRAP_CLAMP:
714   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
715      if (bld->static_sampler_state->normalized_coords) {
716         /* scale coord to length */
717         coord = lp_build_mul(coord_bld, coord, length_f);
718      }
719
720      if (offset) {
721         offset = lp_build_int_to_float(coord_bld, offset);
722         coord = lp_build_add(coord_bld, coord, offset);
723      }
724      /* floor */
725      /* use itrunc instead since we clamp to 0 anyway */
726      icoord = lp_build_itrunc(coord_bld, coord);
727
728      /* clamp to [0, length - 1]. */
729      icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
730                              length_minus_one);
731      break;
732
733   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
734      if (bld->static_sampler_state->normalized_coords) {
735         /* scale coord to length */
736         coord = lp_build_mul(coord_bld, coord, length_f);
737      }
738      /* no clamp necessary, border masking will handle this */
739      icoord = lp_build_ifloor(coord_bld, coord);
740      if (offset) {
741         icoord = lp_build_add(int_coord_bld, icoord, offset);
742      }
743      break;
744
745   case PIPE_TEX_WRAP_MIRROR_REPEAT:
746      if (offset) {
747         offset = lp_build_int_to_float(coord_bld, offset);
748         offset = lp_build_div(coord_bld, offset, length_f);
749         coord = lp_build_add(coord_bld, coord, offset);
750      }
751      /* compute mirror function */
752      coord = lp_build_coord_mirror(bld, coord, TRUE);
753
754      /* scale coord to length */
755      assert(bld->static_sampler_state->normalized_coords);
756      coord = lp_build_mul(coord_bld, coord, length_f);
757
758      /* itrunc == ifloor here */
759      icoord = lp_build_itrunc(coord_bld, coord);
760
761      /* clamp to [0, length - 1] */
762      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
763      break;
764
765   case PIPE_TEX_WRAP_MIRROR_CLAMP:
766   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
767      if (bld->static_sampler_state->normalized_coords) {
768         /* scale coord to length */
769         coord = lp_build_mul(coord_bld, coord, length_f);
770      }
771      if (offset) {
772         offset = lp_build_int_to_float(coord_bld, offset);
773         coord = lp_build_add(coord_bld, coord, offset);
774      }
775      coord = lp_build_abs(coord_bld, coord);
776
777      /* itrunc == ifloor here */
778      icoord = lp_build_itrunc(coord_bld, coord);
779      /*
780       * Use unsigned min due to possible undef values (NaNs, overflow)
781       */
782      {
783         struct lp_build_context abs_coord_bld = *int_coord_bld;
784         abs_coord_bld.type.sign = FALSE;
785         /* clamp to [0, length - 1] */
786         icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
787      }
788      break;
789
790   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
791      if (bld->static_sampler_state->normalized_coords) {
792         /* scale coord to length */
793         coord = lp_build_mul(coord_bld, coord, length_f);
794      }
795      if (offset) {
796         offset = lp_build_int_to_float(coord_bld, offset);
797         coord = lp_build_add(coord_bld, coord, offset);
798      }
799      coord = lp_build_abs(coord_bld, coord);
800
801      /* itrunc == ifloor here */
802      icoord = lp_build_itrunc(coord_bld, coord);
803      break;
804
805   default:
806      assert(0);
807      icoord = NULL;
808   }
809
810   return icoord;
811}
812
813
814/**
815 * Do shadow test/comparison.
816 * \param p shadow ref value
817 * \param texel  the texel to compare against
818 */
819static LLVMValueRef
820lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
821                            LLVMValueRef p,
822                            LLVMValueRef texel)
823{
824   struct lp_build_context *texel_bld = &bld->texel_bld;
825   LLVMValueRef res;
826
827   if (0) {
828      //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
829      lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
830   }
831
832   /* result = (p FUNC texel) ? 1 : 0 */
833   /*
834    * honor d3d10 floating point rules here, which state that comparisons
835    * are ordered except NOT_EQUAL which is unordered.
836    */
837   if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
838      res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
839                                 p, texel);
840   }
841   else {
842      res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
843                         p, texel);
844   }
845   return res;
846}
847
848
849/**
850 * Generate code to sample a mipmap level with nearest filtering.
851 * If sampling a cube texture, r = cube face in [0,5].
852 */
853static void
854lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
855                              LLVMValueRef size,
856                              LLVMValueRef row_stride_vec,
857                              LLVMValueRef img_stride_vec,
858                              LLVMValueRef data_ptr,
859                              LLVMValueRef mipoffsets,
860                              const LLVMValueRef *coords,
861                              const LLVMValueRef *offsets,
862                              LLVMValueRef colors_out[4])
863{
864   const unsigned dims = bld->dims;
865   LLVMValueRef width_vec;
866   LLVMValueRef height_vec;
867   LLVMValueRef depth_vec;
868   LLVMValueRef flt_size;
869   LLVMValueRef flt_width_vec;
870   LLVMValueRef flt_height_vec;
871   LLVMValueRef flt_depth_vec;
872   LLVMValueRef x, y = NULL, z = NULL;
873
874   lp_build_extract_image_sizes(bld,
875                                &bld->int_size_bld,
876                                bld->int_coord_type,
877                                size,
878                                &width_vec, &height_vec, &depth_vec);
879
880   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
881
882   lp_build_extract_image_sizes(bld,
883                                &bld->float_size_bld,
884                                bld->coord_type,
885                                flt_size,
886                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);
887
888   /*
889    * Compute integer texcoords.
890    */
891   x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
892                                    flt_width_vec, offsets[0],
893                                    bld->static_texture_state->pot_width,
894                                    bld->static_sampler_state->wrap_s);
895   lp_build_name(x, "tex.x.wrapped");
896
897   if (dims >= 2) {
898      y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
899                                       flt_height_vec, offsets[1],
900                                       bld->static_texture_state->pot_height,
901                                       bld->static_sampler_state->wrap_t);
902      lp_build_name(y, "tex.y.wrapped");
903
904      if (dims == 3) {
905         z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
906                                          flt_depth_vec, offsets[2],
907                                          bld->static_texture_state->pot_depth,
908                                          bld->static_sampler_state->wrap_r);
909         lp_build_name(z, "tex.z.wrapped");
910      }
911   }
912   if (has_layer_coord(bld->static_texture_state->target)) {
913      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
914         /* add cube layer to face */
915         z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
916      }
917      else {
918         z = coords[2];
919      }
920      lp_build_name(z, "tex.z.layer");
921   }
922
923   /*
924    * Get texture colors.
925    */
926   lp_build_sample_texel_soa(bld,
927                             width_vec, height_vec, depth_vec,
928                             x, y, z,
929                             row_stride_vec, img_stride_vec,
930                             data_ptr, mipoffsets, colors_out);
931
932   if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
933      LLVMValueRef cmpval;
934      cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
935      /* this is really just a AND 1.0, cmpval but llvm is clever enough */
936      colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
937                                      bld->texel_bld.one, bld->texel_bld.zero);
938      colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
939   }
940
941}
942
943
944/**
945 * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
946 */
947static LLVMValueRef
948lp_build_masklerp(struct lp_build_context *bld,
949                 LLVMValueRef weight,
950                 LLVMValueRef mask0,
951                 LLVMValueRef mask1)
952{
953   struct gallivm_state *gallivm = bld->gallivm;
954   LLVMBuilderRef builder = gallivm->builder;
955   LLVMValueRef weight2;
956
957   weight2 = lp_build_sub(bld, bld->one, weight);
958   weight = LLVMBuildBitCast(builder, weight,
959                              lp_build_int_vec_type(gallivm, bld->type), "");
960   weight2 = LLVMBuildBitCast(builder, weight2,
961                              lp_build_int_vec_type(gallivm, bld->type), "");
962   weight = LLVMBuildAnd(builder, weight, mask1, "");
963   weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
964   weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
965   weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
966   return lp_build_add(bld, weight, weight2);
967}
968
969/**
970 * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
971 */
972static LLVMValueRef
973lp_build_masklerp2d(struct lp_build_context *bld,
974                    LLVMValueRef weight0,
975                    LLVMValueRef weight1,
976                    LLVMValueRef mask00,
977                    LLVMValueRef mask01,
978                    LLVMValueRef mask10,
979                    LLVMValueRef mask11)
980{
981   LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
982   LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
983   return lp_build_lerp(bld, weight1, val0, val1, 0);
984}
985
986/*
987 * this is a bit excessive code for something OpenGL just recommends
988 * but does not require.
989 */
990#define ACCURATE_CUBE_CORNERS 1
991
992/**
993 * Generate code to sample a mipmap level with linear filtering.
994 * If sampling a cube texture, r = cube face in [0,5].
995 * If linear_mask is present, only pixels having their mask set
996 * will receive linear filtering, the rest will use nearest.
997 */
998static void
999lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1000                             boolean is_gather,
1001                             LLVMValueRef size,
1002                             LLVMValueRef linear_mask,
1003                             LLVMValueRef row_stride_vec,
1004                             LLVMValueRef img_stride_vec,
1005                             LLVMValueRef data_ptr,
1006                             LLVMValueRef mipoffsets,
1007                             const LLVMValueRef *coords,
1008                             const LLVMValueRef *offsets,
1009                             LLVMValueRef colors_out[4])
1010{
1011   LLVMBuilderRef builder = bld->gallivm->builder;
1012   struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1013   struct lp_build_context *coord_bld = &bld->coord_bld;
1014   struct lp_build_context *texel_bld = &bld->texel_bld;
1015   const unsigned dims = bld->dims;
1016   LLVMValueRef width_vec;
1017   LLVMValueRef height_vec;
1018   LLVMValueRef depth_vec;
1019   LLVMValueRef flt_size;
1020   LLVMValueRef flt_width_vec;
1021   LLVMValueRef flt_height_vec;
1022   LLVMValueRef flt_depth_vec;
1023   LLVMValueRef fall_off[4], have_corners;
1024   LLVMValueRef z1 = NULL;
1025   LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1026   LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1027   LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1028   LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1029   LLVMValueRef xs[4], ys[4], zs[4];
1030   LLVMValueRef neighbors[2][2][4];
1031   int chan, texel_index;
1032   boolean seamless_cube_filter, accurate_cube_corners;
1033   unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1034
1035   seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1036                           bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1037                          bld->static_sampler_state->seamless_cube_map;
1038
1039   accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
1040
1041   lp_build_extract_image_sizes(bld,
1042                                &bld->int_size_bld,
1043                                bld->int_coord_type,
1044                                size,
1045                                &width_vec, &height_vec, &depth_vec);
1046
1047   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1048
1049   lp_build_extract_image_sizes(bld,
1050                                &bld->float_size_bld,
1051                                bld->coord_type,
1052                                flt_size,
1053                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1054
1055   /*
1056    * Compute integer texcoords.
1057    */
1058
1059   if (!seamless_cube_filter) {
1060      lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1061                                  flt_width_vec, offsets[0],
1062                                  bld->static_texture_state->pot_width,
1063                                  bld->static_sampler_state->wrap_s,
1064                                  &x00, &x01, &s_fpart);
1065      lp_build_name(x00, "tex.x0.wrapped");
1066      lp_build_name(x01, "tex.x1.wrapped");
1067      x10 = x00;
1068      x11 = x01;
1069
1070      if (dims >= 2) {
1071         lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1072                                     flt_height_vec, offsets[1],
1073                                     bld->static_texture_state->pot_height,
1074                                     bld->static_sampler_state->wrap_t,
1075                                     &y00, &y10, &t_fpart);
1076         lp_build_name(y00, "tex.y0.wrapped");
1077         lp_build_name(y10, "tex.y1.wrapped");
1078         y01 = y00;
1079         y11 = y10;
1080
1081         if (dims == 3) {
1082            lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1083                                        flt_depth_vec, offsets[2],
1084                                        bld->static_texture_state->pot_depth,
1085                                        bld->static_sampler_state->wrap_r,
1086                                        &z00, &z1, &r_fpart);
1087            z01 = z10 = z11 = z00;
1088            lp_build_name(z00, "tex.z0.wrapped");
1089            lp_build_name(z1, "tex.z1.wrapped");
1090         }
1091      }
1092      if (has_layer_coord(bld->static_texture_state->target)) {
1093         if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1094            /* add cube layer to face */
1095            z00 = z01 = z10 = z11 = z1 =
1096               lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1097         }
1098         else {
1099            z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
1100         }
1101         lp_build_name(z00, "tex.z0.layer");
1102         lp_build_name(z1, "tex.z1.layer");
1103      }
1104   }
1105   else {
1106      struct lp_build_if_state edge_if;
1107      LLVMTypeRef int1t;
1108      LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1109      LLVMValueRef coord0, coord1, have_edge, have_corner;
1110      LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1111      LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1112      LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1113      LLVMValueRef face = coords[2];
1114      LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1115      LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1116      /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1117      height_vec = width_vec;
1118      flt_height_vec = flt_width_vec;
1119
1120      /* XXX the overflow logic is actually sort of duplicated with trilinear,
1121       * since an overflow in one mip should also have a corresponding overflow
1122       * in another.
1123       */
1124      /* should always have normalized coords, and offsets are undefined */
1125      assert(bld->static_sampler_state->normalized_coords);
1126      /*
1127       * The coords should all be between [0,1] however we can have NaNs,
1128       * which will wreak havoc. In particular the y1_clamped value below
1129       * can be -INT_MAX (on x86) and be propagated right through (probably
1130       * other values might be bogus in the end too).
1131       * So kill off the NaNs here.
1132       */
1133      coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1134                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1135      coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1136      /* instead of clamp, build mask if overflowed */
1137      coord0 = lp_build_sub(coord_bld, coord0, half);
1138      /* convert to int, compute lerp weight */
1139      /* not ideal with AVX (and no AVX2) */
1140      lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1141      x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1142      coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1143                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1144      coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1145      coord1 = lp_build_sub(coord_bld, coord1, half);
1146      lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1147      y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1148
1149      fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1150      fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1151      fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1152      fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1153
1154      fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1155      fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1156      have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1157      have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1158
1159      /* needed for accurate corner filtering branch later, rely on 0 init */
1160      int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1161      have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1162
1163      for (texel_index = 0; texel_index < 4; texel_index++) {
1164         xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1165         ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1166         zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1167      }
1168
1169      lp_build_if(&edge_if, bld->gallivm, have_edge);
1170
1171      have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1172      have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1173      LLVMBuildStore(builder, have_corner, have_corners);
1174
1175      /*
1176       * Need to feed clamped values here for cheap corner handling,
1177       * but only for y coord (as when falling off both edges we only
1178       * fall off the x one) - this should be sufficient.
1179       */
1180      y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1181      y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1182
1183      /*
1184       * Get all possible new coords.
1185       */
1186      lp_build_cube_new_coords(ivec_bld, face,
1187                               x0, x1, y0_clamped, y1_clamped,
1188                               length_minus_one,
1189                               new_faces, new_xcoords, new_ycoords);
1190
1191      /* handle fall off x-, x+ direction */
1192      /* determine new coords, face (not both fall_off vars can be true at same time) */
1193      x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1194      y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1195      x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1196      y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1197      x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1198      y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1199      x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1200      y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1201
1202      z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1203      z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1204
1205      /* handle fall off y-, y+ direction */
1206      /*
1207       * Cheap corner logic: just hack up things so a texel doesn't fall
1208       * off both sides (which means filter weights will be wrong but we'll only
1209       * use valid texels in the filter).
1210       * This means however (y) coords must additionally be clamped (see above).
1211       * This corner handling should be fully OpenGL (but not d3d10) compliant.
1212       */
1213      fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1214      fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1215      fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1216      fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1217
1218      x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1219      y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1220      x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1221      y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1222      x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1223      y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1224      x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1225      y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1226
1227      z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1228      z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1229      z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1230      z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1231
1232      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1233         /* now can add cube layer to face (per sample) */
1234         z00 = lp_build_add(ivec_bld, z00, coords[3]);
1235         z01 = lp_build_add(ivec_bld, z01, coords[3]);
1236         z10 = lp_build_add(ivec_bld, z10, coords[3]);
1237         z11 = lp_build_add(ivec_bld, z11, coords[3]);
1238      }
1239
1240      LLVMBuildStore(builder, x00, xs[0]);
1241      LLVMBuildStore(builder, x01, xs[1]);
1242      LLVMBuildStore(builder, x10, xs[2]);
1243      LLVMBuildStore(builder, x11, xs[3]);
1244      LLVMBuildStore(builder, y00, ys[0]);
1245      LLVMBuildStore(builder, y01, ys[1]);
1246      LLVMBuildStore(builder, y10, ys[2]);
1247      LLVMBuildStore(builder, y11, ys[3]);
1248      LLVMBuildStore(builder, z00, zs[0]);
1249      LLVMBuildStore(builder, z01, zs[1]);
1250      LLVMBuildStore(builder, z10, zs[2]);
1251      LLVMBuildStore(builder, z11, zs[3]);
1252
1253      lp_build_else(&edge_if);
1254
1255      LLVMBuildStore(builder, x0, xs[0]);
1256      LLVMBuildStore(builder, x1, xs[1]);
1257      LLVMBuildStore(builder, x0, xs[2]);
1258      LLVMBuildStore(builder, x1, xs[3]);
1259      LLVMBuildStore(builder, y0, ys[0]);
1260      LLVMBuildStore(builder, y0, ys[1]);
1261      LLVMBuildStore(builder, y1, ys[2]);
1262      LLVMBuildStore(builder, y1, ys[3]);
1263      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1264         LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1265         LLVMBuildStore(builder, cube_layer, zs[0]);
1266         LLVMBuildStore(builder, cube_layer, zs[1]);
1267         LLVMBuildStore(builder, cube_layer, zs[2]);
1268         LLVMBuildStore(builder, cube_layer, zs[3]);
1269      }
1270      else {
1271         LLVMBuildStore(builder, face, zs[0]);
1272         LLVMBuildStore(builder, face, zs[1]);
1273         LLVMBuildStore(builder, face, zs[2]);
1274         LLVMBuildStore(builder, face, zs[3]);
1275      }
1276
1277      lp_build_endif(&edge_if);
1278
1279      x00 = LLVMBuildLoad(builder, xs[0], "");
1280      x01 = LLVMBuildLoad(builder, xs[1], "");
1281      x10 = LLVMBuildLoad(builder, xs[2], "");
1282      x11 = LLVMBuildLoad(builder, xs[3], "");
1283      y00 = LLVMBuildLoad(builder, ys[0], "");
1284      y01 = LLVMBuildLoad(builder, ys[1], "");
1285      y10 = LLVMBuildLoad(builder, ys[2], "");
1286      y11 = LLVMBuildLoad(builder, ys[3], "");
1287      z00 = LLVMBuildLoad(builder, zs[0], "");
1288      z01 = LLVMBuildLoad(builder, zs[1], "");
1289      z10 = LLVMBuildLoad(builder, zs[2], "");
1290      z11 = LLVMBuildLoad(builder, zs[3], "");
1291   }
1292
1293   if (linear_mask) {
1294      /*
1295       * Whack filter weights into place. Whatever texel had more weight is
1296       * the one which should have been selected by nearest filtering hence
1297       * just use 100% weight for it.
1298       */
1299      struct lp_build_context *c_bld = &bld->coord_bld;
1300      LLVMValueRef w1_mask, w1_weight;
1301      LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1302
1303      w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1304      /* this select is really just a "and" */
1305      w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1306      s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1307      if (dims >= 2) {
1308         w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1309         w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1310         t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1311         if (dims == 3) {
1312            w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1313            w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1314            r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1315         }
1316      }
1317   }
1318
1319   /*
1320    * Get texture colors.
1321    */
1322   /* get x0/x1 texels */
1323   lp_build_sample_texel_soa(bld,
1324                             width_vec, height_vec, depth_vec,
1325                             x00, y00, z00,
1326                             row_stride_vec, img_stride_vec,
1327                             data_ptr, mipoffsets, neighbors[0][0]);
1328   lp_build_sample_texel_soa(bld,
1329                             width_vec, height_vec, depth_vec,
1330                             x01, y01, z01,
1331                             row_stride_vec, img_stride_vec,
1332                             data_ptr, mipoffsets, neighbors[0][1]);
1333
1334   if (dims == 1) {
1335      assert(!is_gather);
1336      if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1337         /* Interpolate two samples from 1D image to produce one color */
1338         for (chan = 0; chan < 4; chan++) {
1339            colors_out[chan] = lp_build_lerp(texel_bld, s_fpart,
1340                                             neighbors[0][0][chan],
1341                                             neighbors[0][1][chan],
1342                                             0);
1343         }
1344      }
1345      else {
1346         LLVMValueRef cmpval0, cmpval1;
1347         cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1348         cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1349         /* simplified lerp, AND mask with weight and add */
1350         colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1351                                           cmpval0, cmpval1);
1352         colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1353      }
1354   }
1355   else {
1356      /* 2D/3D texture */
1357      struct lp_build_if_state corner_if;
1358      LLVMValueRef colors0[4], colorss[4];
1359
1360      /* get x0/x1 texels at y1 */
1361      lp_build_sample_texel_soa(bld,
1362                                width_vec, height_vec, depth_vec,
1363                                x10, y10, z10,
1364                                row_stride_vec, img_stride_vec,
1365                                data_ptr, mipoffsets, neighbors[1][0]);
1366      lp_build_sample_texel_soa(bld,
1367                                width_vec, height_vec, depth_vec,
1368                                x11, y11, z11,
1369                                row_stride_vec, img_stride_vec,
1370                                data_ptr, mipoffsets, neighbors[1][1]);
1371
1372      /*
1373       * To avoid having to duplicate linear_mask / fetch code use
1374       * another branch (with corner condition though edge would work
1375       * as well) here.
1376       */
1377      if (accurate_cube_corners) {
1378         LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1379         LLVMValueRef have_corner, one_third;
1380
1381         colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1382         colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1383         colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1384         colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1385
1386         have_corner = LLVMBuildLoad(builder, have_corners, "");
1387
1388         lp_build_if(&corner_if, bld->gallivm, have_corner);
1389
1390         one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1391                                        1.0f/3.0f);
1392
1393         /* find corner */
1394         c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1395         c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1396         c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1397         c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1398         c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1399         c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1400         c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1401         c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1402
1403         if (!is_gather) {
1404            /*
1405             * we can't use standard 2d lerp as we need per-element weight
1406             * in case of corners, so just calculate bilinear result as
1407             * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1408             * (This is actually less work than using 2d lerp, 7 vs. 9
1409             * instructions, however calculating the weights needs another 6,
1410             * so actually probably not slower than 2d lerp only for 4 channels
1411             * as weights only need to be calculated once - of course fixing
1412             * the weights has additional cost.)
1413             */
1414            LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1415            wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1416            wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1417            w00 = lp_build_mul(coord_bld, wx0, wy0);
1418            w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1419            w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1420            w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1421
1422            /* find corner weight */
1423            c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1424            c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1425            c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1426            c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1427
1428            /*
1429             * add 1/3 of the corner weight to the weight of the 3 other
1430             * samples and null out corner weight.
1431             */
1432            c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1433            w00 = lp_build_add(coord_bld, w00, c_weight);
1434            w00 = lp_build_andnot(coord_bld, w00, c00f);
1435            w01 = lp_build_add(coord_bld, w01, c_weight);
1436            w01 = lp_build_andnot(coord_bld, w01, c01f);
1437            w10 = lp_build_add(coord_bld, w10, c_weight);
1438            w10 = lp_build_andnot(coord_bld, w10, c10f);
1439            w11 = lp_build_add(coord_bld, w11, c_weight);
1440            w11 = lp_build_andnot(coord_bld, w11, c11f);
1441
1442            if (bld->static_sampler_state->compare_mode ==
1443                PIPE_TEX_COMPARE_NONE) {
1444               for (chan = 0; chan < 4; chan++) {
1445                  colors0[chan] = lp_build_mul(coord_bld, w00,
1446                                               neighbors[0][0][chan]);
1447                  tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1448                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1449                  tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1450                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1451                  tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1452                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1453               }
1454            }
1455            else {
1456               LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1457               cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1458                                                      neighbors[0][0][0]);
1459               cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1460                                                      neighbors[0][1][0]);
1461               cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1462                                                      neighbors[1][0][0]);
1463               cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1464                                                      neighbors[1][1][0]);
1465               /*
1466                * inputs to interpolation are just masks so just add
1467                * masked weights together
1468                */
1469               cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1470                                           coord_bld->vec_type, "");
1471               cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1472                                           coord_bld->vec_type, "");
1473               cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1474                                           coord_bld->vec_type, "");
1475               cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1476                                           coord_bld->vec_type, "");
1477               colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1478               tmp = lp_build_and(coord_bld, w01, cmpval01);
1479               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1480               tmp = lp_build_and(coord_bld, w10, cmpval10);
1481               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1482               tmp = lp_build_and(coord_bld, w11, cmpval11);
1483               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1484               colors0[1] = colors0[2] = colors0[3] = colors0[0];
1485            }
1486         }
1487         else {
1488            /*
1489             * We don't have any weights to adjust, so instead calculate
1490             * the fourth texel as simply the average of the other 3.
1491             * (This would work for non-gather too, however we'd have
1492             * a boatload more of the select stuff due to there being
1493             * 4 times as many colors as weights.)
1494             */
1495            LLVMValueRef col00, col01, col10, col11;
1496            LLVMValueRef colc, colc0, colc1;
1497            col10 = lp_build_swizzle_soa_channel(texel_bld,
1498                                                 neighbors[1][0], chan_swiz);
1499            col11 = lp_build_swizzle_soa_channel(texel_bld,
1500                                                 neighbors[1][1], chan_swiz);
1501            col01 = lp_build_swizzle_soa_channel(texel_bld,
1502                                                 neighbors[0][1], chan_swiz);
1503            col00 = lp_build_swizzle_soa_channel(texel_bld,
1504                                                 neighbors[0][0], chan_swiz);
1505
1506            /*
1507             * The spec says for comparison filtering, the comparison
1508             * must happen before synthesizing the new value.
1509             * This means all gathered values are always 0 or 1,
1510             * except for the non-existing texel, which can be 0,1/3,2/3,1...
1511             * Seems like we'd be allowed to just return 0 or 1 too, so we
1512             * could simplify and pass down the compare mask values to the
1513             * end (using int arithmetic/compare on the mask values to
1514             * construct the fourth texel) and only there convert to floats
1515             * but it's probably not worth it (it might be easier for the cpu
1516             * but not for the code)...
1517             */
1518            if (bld->static_sampler_state->compare_mode !=
1519                PIPE_TEX_COMPARE_NONE) {
1520               LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1521               cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1522               cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1523               cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1524               cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1525               col00 = lp_build_select(texel_bld, cmpval00,
1526                                       texel_bld->one, texel_bld->zero);
1527               col01 = lp_build_select(texel_bld, cmpval01,
1528                                       texel_bld->one, texel_bld->zero);
1529               col10 = lp_build_select(texel_bld, cmpval10,
1530                                       texel_bld->one, texel_bld->zero);
1531               col11 = lp_build_select(texel_bld, cmpval11,
1532                                       texel_bld->one, texel_bld->zero);
1533            }
1534
1535            /*
1536             * Null out corner color.
1537             */
1538            col00 = lp_build_andnot(coord_bld, col00, c00f);
1539            col01 = lp_build_andnot(coord_bld, col01, c01f);
1540            col10 = lp_build_andnot(coord_bld, col10, c10f);
1541            col11 = lp_build_andnot(coord_bld, col11, c11f);
1542
1543            /*
1544             * New corner texel color is all colors added / 3.
1545             */
1546            colc0 = lp_build_add(coord_bld, col00, col01);
1547            colc1 = lp_build_add(coord_bld, col10, col11);
1548            colc = lp_build_add(coord_bld, colc0, colc1);
1549            colc = lp_build_mul(coord_bld, one_third, colc);
1550
1551            /*
1552             * Replace the corner texel color with the new value.
1553             */
1554            col00 = lp_build_select(coord_bld, c00, colc, col00);
1555            col01 = lp_build_select(coord_bld, c01, colc, col01);
1556            col10 = lp_build_select(coord_bld, c10, colc, col10);
1557            col11 = lp_build_select(coord_bld, c11, colc, col11);
1558
1559            colors0[0] = col10;
1560            colors0[1] = col11;
1561            colors0[2] = col01;
1562            colors0[3] = col00;
1563         }
1564
1565         LLVMBuildStore(builder, colors0[0], colorss[0]);
1566         LLVMBuildStore(builder, colors0[1], colorss[1]);
1567         LLVMBuildStore(builder, colors0[2], colorss[2]);
1568         LLVMBuildStore(builder, colors0[3], colorss[3]);
1569
1570         lp_build_else(&corner_if);
1571      }
1572
1573      if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1574         if (is_gather) {
1575            /*
1576             * Just assign the red channel (no component selection yet).
1577             * This is a bit hackish, we usually do the swizzle at the
1578             * end of sampling (much less values to swizzle), but this
1579             * obviously cannot work when using gather.
1580             */
1581            colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1582                                                      neighbors[1][0],
1583                                                      chan_swiz);
1584            colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1585                                                      neighbors[1][1],
1586                                                      chan_swiz);
1587            colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1588                                                      neighbors[0][1],
1589                                                      chan_swiz);
1590            colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1591                                                      neighbors[0][0],
1592                                                      chan_swiz);
1593         }
1594         else {
1595            /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1596            for (chan = 0; chan < 4; chan++) {
1597               colors0[chan] = lp_build_lerp_2d(texel_bld,
1598                                                s_fpart, t_fpart,
1599                                                neighbors[0][0][chan],
1600                                                neighbors[0][1][chan],
1601                                                neighbors[1][0][chan],
1602                                                neighbors[1][1][chan],
1603                                                0);
1604            }
1605         }
1606      }
1607      else {
1608         LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1609         cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1610         cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1611         cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1612         cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1613
1614         if (is_gather) {
1615            /* more hacks for swizzling, should be X, ONE or ZERO... */
1616            colors0[0] = lp_build_select(texel_bld, cmpval10,
1617                                         texel_bld->one, texel_bld->zero);
1618            colors0[1] = lp_build_select(texel_bld, cmpval11,
1619                                         texel_bld->one, texel_bld->zero);
1620            colors0[2] = lp_build_select(texel_bld, cmpval01,
1621                                         texel_bld->one, texel_bld->zero);
1622            colors0[3] = lp_build_select(texel_bld, cmpval00,
1623                                         texel_bld->one, texel_bld->zero);
1624         }
1625         else {
1626            colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1627                                             cmpval00, cmpval01, cmpval10, cmpval11);
1628            colors0[1] = colors0[2] = colors0[3] = colors0[0];
1629         }
1630      }
1631
1632      if (accurate_cube_corners) {
1633         LLVMBuildStore(builder, colors0[0], colorss[0]);
1634         LLVMBuildStore(builder, colors0[1], colorss[1]);
1635         LLVMBuildStore(builder, colors0[2], colorss[2]);
1636         LLVMBuildStore(builder, colors0[3], colorss[3]);
1637
1638         lp_build_endif(&corner_if);
1639
1640         colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1641         colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1642         colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1643         colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1644      }
1645
1646      if (dims == 3) {
1647         LLVMValueRef neighbors1[2][2][4];
1648         LLVMValueRef colors1[4];
1649
1650         assert(!is_gather);
1651
1652         /* get x0/x1/y0/y1 texels at z1 */
1653         lp_build_sample_texel_soa(bld,
1654                                   width_vec, height_vec, depth_vec,
1655                                   x00, y00, z1,
1656                                   row_stride_vec, img_stride_vec,
1657                                   data_ptr, mipoffsets, neighbors1[0][0]);
1658         lp_build_sample_texel_soa(bld,
1659                                   width_vec, height_vec, depth_vec,
1660                                   x01, y01, z1,
1661                                   row_stride_vec, img_stride_vec,
1662                                   data_ptr, mipoffsets, neighbors1[0][1]);
1663         lp_build_sample_texel_soa(bld,
1664                                   width_vec, height_vec, depth_vec,
1665                                   x10, y10, z1,
1666                                   row_stride_vec, img_stride_vec,
1667                                   data_ptr, mipoffsets, neighbors1[1][0]);
1668         lp_build_sample_texel_soa(bld,
1669                                   width_vec, height_vec, depth_vec,
1670                                   x11, y11, z1,
1671                                   row_stride_vec, img_stride_vec,
1672                                   data_ptr, mipoffsets, neighbors1[1][1]);
1673
1674         if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1675            /* Bilinear interpolate the four samples from the second Z slice */
1676            for (chan = 0; chan < 4; chan++) {
1677               colors1[chan] = lp_build_lerp_2d(texel_bld,
1678                                                s_fpart, t_fpart,
1679                                                neighbors1[0][0][chan],
1680                                                neighbors1[0][1][chan],
1681                                                neighbors1[1][0][chan],
1682                                                neighbors1[1][1][chan],
1683                                                0);
1684            }
1685            /* Linearly interpolate the two samples from the two 3D slices */
1686            for (chan = 0; chan < 4; chan++) {
1687               colors_out[chan] = lp_build_lerp(texel_bld,
1688                                                r_fpart,
1689                                                colors0[chan], colors1[chan],
1690                                                0);
1691            }
1692         }
1693         else {
1694            LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1695            cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1696            cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1697            cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1698            cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1699            colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1700                                             cmpval00, cmpval01, cmpval10, cmpval11);
1701            /* Linearly interpolate the two samples from the two 3D slices */
1702            colors_out[0] = lp_build_lerp(texel_bld,
1703                                          r_fpart,
1704                                          colors0[0], colors1[0],
1705                                          0);
1706            colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1707         }
1708      }
1709      else {
1710         /* 2D tex */
1711         for (chan = 0; chan < 4; chan++) {
1712            colors_out[chan] = colors0[chan];
1713         }
1714      }
1715   }
1716   if (is_gather) {
1717      /*
1718       * For gather, we can't do our usual channel swizzling done later,
1719       * so do it here. It only really matters for 0/1 swizzles in case
1720       * of comparison filtering, since in this case the results would be
1721       * wrong, without comparison it should all work out alright but it
1722       * can't hurt to do that here, since it will instantly drop all
1723       * calculations above, though it's a rather stupid idea to do
1724       * gather on a channel which will always return 0 or 1 in any case...
1725       */
1726      if (chan_swiz == PIPE_SWIZZLE_1) {
1727         for (chan = 0; chan < 4; chan++) {
1728            colors_out[chan] = texel_bld->one;
1729         }
1730      } else if (chan_swiz == PIPE_SWIZZLE_0) {
1731         for (chan = 0; chan < 4; chan++) {
1732            colors_out[chan] = texel_bld->zero;
1733         }
1734      }
1735   }
1736}
1737
1738
1739/**
1740 * Sample the texture/mipmap using given image filter and mip filter.
1741 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1742 * from (vectors or scalars).
1743 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1744 */
1745static void
1746lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1747                       unsigned img_filter,
1748                       unsigned mip_filter,
1749                       boolean is_gather,
1750                       const LLVMValueRef *coords,
1751                       const LLVMValueRef *offsets,
1752                       LLVMValueRef ilevel0,
1753                       LLVMValueRef ilevel1,
1754                       LLVMValueRef lod_fpart,
1755                       LLVMValueRef *colors_out)
1756{
1757   LLVMBuilderRef builder = bld->gallivm->builder;
1758   LLVMValueRef size0 = NULL;
1759   LLVMValueRef size1 = NULL;
1760   LLVMValueRef row_stride0_vec = NULL;
1761   LLVMValueRef row_stride1_vec = NULL;
1762   LLVMValueRef img_stride0_vec = NULL;
1763   LLVMValueRef img_stride1_vec = NULL;
1764   LLVMValueRef data_ptr0 = NULL;
1765   LLVMValueRef data_ptr1 = NULL;
1766   LLVMValueRef mipoff0 = NULL;
1767   LLVMValueRef mipoff1 = NULL;
1768   LLVMValueRef colors0[4], colors1[4];
1769   unsigned chan;
1770
1771   /* sample the first mipmap level */
1772   lp_build_mipmap_level_sizes(bld, ilevel0,
1773                               &size0,
1774                               &row_stride0_vec, &img_stride0_vec);
1775   if (bld->num_mips == 1) {
1776      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1777   }
1778   else {
1779      /* This path should work for num_lods 1 too but slightly less efficient */
1780      data_ptr0 = bld->base_ptr;
1781      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1782   }
1783   if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1784      lp_build_sample_image_nearest(bld, size0,
1785                                    row_stride0_vec, img_stride0_vec,
1786                                    data_ptr0, mipoff0, coords, offsets,
1787                                    colors0);
1788   }
1789   else {
1790      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1791      lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1792                                   row_stride0_vec, img_stride0_vec,
1793                                   data_ptr0, mipoff0, coords, offsets,
1794                                   colors0);
1795   }
1796
1797   /* Store the first level's colors in the output variables */
1798   for (chan = 0; chan < 4; chan++) {
1799       LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1800   }
1801
1802   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1803      struct lp_build_if_state if_ctx;
1804      LLVMValueRef need_lerp;
1805
1806      /* need_lerp = lod_fpart > 0 */
1807      if (bld->num_lods == 1) {
1808         need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1809                                   lod_fpart, bld->lodf_bld.zero,
1810                                   "need_lerp");
1811      }
1812      else {
1813         /*
1814          * We'll do mip filtering if any of the quads (or individual
1815          * pixel in case of per-pixel lod) need it.
1816          * It might be better to split the vectors here and only fetch/filter
1817          * quads which need it (if there's one lod per quad).
1818          */
1819         need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1820                                      PIPE_FUNC_GREATER,
1821                                      lod_fpart, bld->lodf_bld.zero);
1822         need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1823         lp_build_name(need_lerp, "need_lerp");
1824      }
1825
1826      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1827      {
1828         /*
1829          * We unfortunately need to clamp lod_fpart here since we can get
1830          * negative values which would screw up filtering if not all
1831          * lod_fpart values have same sign.
1832          */
1833         lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1834                                  bld->lodf_bld.zero);
1835         /* sample the second mipmap level */
1836         lp_build_mipmap_level_sizes(bld, ilevel1,
1837                                     &size1,
1838                                     &row_stride1_vec, &img_stride1_vec);
1839         if (bld->num_mips == 1) {
1840            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1841         }
1842         else {
1843            data_ptr1 = bld->base_ptr;
1844            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1845         }
1846         if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1847            lp_build_sample_image_nearest(bld, size1,
1848                                          row_stride1_vec, img_stride1_vec,
1849                                          data_ptr1, mipoff1, coords, offsets,
1850                                          colors1);
1851         }
1852         else {
1853            lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1854                                         row_stride1_vec, img_stride1_vec,
1855                                         data_ptr1, mipoff1, coords, offsets,
1856                                         colors1);
1857         }
1858
1859         /* interpolate samples from the two mipmap levels */
1860
1861         if (bld->num_lods != bld->coord_type.length)
1862            lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1863                                                              bld->lodf_bld.type,
1864                                                              bld->texel_bld.type,
1865                                                              lod_fpart);
1866
1867         for (chan = 0; chan < 4; chan++) {
1868            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1869                                          colors0[chan], colors1[chan],
1870                                          0);
1871            LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1872         }
1873      }
1874      lp_build_endif(&if_ctx);
1875   }
1876}
1877
1878
1879/**
1880 * Sample the texture/mipmap using given mip filter, and using
1881 * both nearest and linear filtering at the same time depending
1882 * on linear_mask.
1883 * lod can be per quad but linear_mask is always per pixel.
1884 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1885 * from (vectors or scalars).
1886 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1887 */
1888static void
1889lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1890                            LLVMValueRef linear_mask,
1891                            unsigned mip_filter,
1892                            const LLVMValueRef *coords,
1893                            const LLVMValueRef *offsets,
1894                            LLVMValueRef ilevel0,
1895                            LLVMValueRef ilevel1,
1896                            LLVMValueRef lod_fpart,
1897                            LLVMValueRef lod_positive,
1898                            LLVMValueRef *colors_out)
1899{
1900   LLVMBuilderRef builder = bld->gallivm->builder;
1901   LLVMValueRef size0 = NULL;
1902   LLVMValueRef size1 = NULL;
1903   LLVMValueRef row_stride0_vec = NULL;
1904   LLVMValueRef row_stride1_vec = NULL;
1905   LLVMValueRef img_stride0_vec = NULL;
1906   LLVMValueRef img_stride1_vec = NULL;
1907   LLVMValueRef data_ptr0 = NULL;
1908   LLVMValueRef data_ptr1 = NULL;
1909   LLVMValueRef mipoff0 = NULL;
1910   LLVMValueRef mipoff1 = NULL;
1911   LLVMValueRef colors0[4], colors1[4];
1912   unsigned chan;
1913
1914   /* sample the first mipmap level */
1915   lp_build_mipmap_level_sizes(bld, ilevel0,
1916                               &size0,
1917                               &row_stride0_vec, &img_stride0_vec);
1918   if (bld->num_mips == 1) {
1919      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1920   }
1921   else {
1922      /* This path should work for num_lods 1 too but slightly less efficient */
1923      data_ptr0 = bld->base_ptr;
1924      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1925   }
1926
1927   lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1928                                row_stride0_vec, img_stride0_vec,
1929                                data_ptr0, mipoff0, coords, offsets,
1930                                colors0);
1931
1932   /* Store the first level's colors in the output variables */
1933   for (chan = 0; chan < 4; chan++) {
1934       LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1935   }
1936
1937   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1938      struct lp_build_if_state if_ctx;
1939      LLVMValueRef need_lerp;
1940
1941      /*
1942       * We'll do mip filtering if any of the quads (or individual
1943       * pixel in case of per-pixel lod) need it.
1944       * Note using lod_positive here not lod_fpart since it may be the same
1945       * condition as that used in the outer "if" in the caller hence llvm
1946       * should be able to merge the branches in this case.
1947       */
1948      need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1949      lp_build_name(need_lerp, "need_lerp");
1950
1951      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1952      {
1953         /*
1954          * We unfortunately need to clamp lod_fpart here since we can get
1955          * negative values which would screw up filtering if not all
1956          * lod_fpart values have same sign.
1957          */
1958         lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1959                                  bld->lodf_bld.zero);
1960         /* sample the second mipmap level */
1961         lp_build_mipmap_level_sizes(bld, ilevel1,
1962                                     &size1,
1963                                     &row_stride1_vec, &img_stride1_vec);
1964         if (bld->num_mips == 1) {
1965            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1966         }
1967         else {
1968            data_ptr1 = bld->base_ptr;
1969            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1970         }
1971
1972         lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
1973                                      row_stride1_vec, img_stride1_vec,
1974                                      data_ptr1, mipoff1, coords, offsets,
1975                                      colors1);
1976
1977         /* interpolate samples from the two mipmap levels */
1978
1979         if (bld->num_lods != bld->coord_type.length)
1980            lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1981                                                              bld->lodf_bld.type,
1982                                                              bld->texel_bld.type,
1983                                                              lod_fpart);
1984
1985         for (chan = 0; chan < 4; chan++) {
1986            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1987                                          colors0[chan], colors1[chan],
1988                                          0);
1989            LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1990         }
1991      }
1992      lp_build_endif(&if_ctx);
1993   }
1994}
1995
1996
1997/**
1998 * Build (per-coord) layer value.
1999 * Either clamp layer to valid values or fill in optional out_of_bounds
2000 * value and just return value unclamped.
2001 */
2002static LLVMValueRef
2003lp_build_layer_coord(struct lp_build_sample_context *bld,
2004                     unsigned texture_unit,
2005                     boolean is_cube_array,
2006                     LLVMValueRef layer,
2007                     LLVMValueRef *out_of_bounds)
2008{
2009   LLVMValueRef num_layers;
2010   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2011
2012   num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2013                                          bld->context_ptr, texture_unit);
2014
2015   if (out_of_bounds) {
2016      LLVMValueRef out1, out;
2017      assert(!is_cube_array);
2018      num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2019      out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2020      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2021      *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2022      return layer;
2023   }
2024   else {
2025      LLVMValueRef maxlayer;
2026      LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2027                                       bld->int_bld.one;
2028      maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2029      maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2030      return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2031   }
2032}
2033
2034
2035/**
2036 * Calculate cube face, lod, mip levels.
2037 */
2038static void
2039lp_build_sample_common(struct lp_build_sample_context *bld,
2040                       boolean is_lodq,
2041                       unsigned texture_index,
2042                       unsigned sampler_index,
2043                       LLVMValueRef *coords,
2044                       const struct lp_derivatives *derivs, /* optional */
2045                       LLVMValueRef lod_bias, /* optional */
2046                       LLVMValueRef explicit_lod, /* optional */
2047                       LLVMValueRef *lod_pos_or_zero,
2048                       LLVMValueRef *lod,
2049                       LLVMValueRef *lod_fpart,
2050                       LLVMValueRef *ilevel0,
2051                       LLVMValueRef *ilevel1)
2052{
2053   const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2054   const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2055   const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2056   const unsigned target = bld->static_texture_state->target;
2057   LLVMValueRef first_level, cube_rho = NULL;
2058   LLVMValueRef lod_ipart = NULL;
2059   struct lp_derivatives cube_derivs;
2060
2061   /*
2062   printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
2063          mip_filter, min_filter, mag_filter);
2064   */
2065
2066   /*
2067    * Choose cube face, recompute texcoords for the chosen face and
2068    * compute rho here too (as it requires transform of derivatives).
2069    */
2070   if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2071      boolean need_derivs;
2072      need_derivs = ((min_filter != mag_filter ||
2073                      mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2074                      !bld->static_sampler_state->min_max_lod_equal &&
2075                      !explicit_lod);
2076      lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2077      derivs = &cube_derivs;
2078      if (target == PIPE_TEXTURE_CUBE_ARRAY) {
2079         /* calculate cube layer coord now */
2080         LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2081         LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2082         layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2083         coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2084         /* because of seamless filtering can't add it to face (coords[2]) here. */
2085      }
2086   }
2087   else if (target == PIPE_TEXTURE_1D_ARRAY ||
2088            target == PIPE_TEXTURE_2D_ARRAY) {
2089      coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2090      coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2091   }
2092
2093   if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2094      /*
2095       * Clamp p coords to [0,1] for fixed function depth texture format here.
2096       * Technically this is not entirely correct for unorm depth as the ref value
2097       * should be converted to the depth format (quantization!) and comparison
2098       * then done in texture format. This would actually help performance (since
2099       * only need to do it once and could save the per-sample conversion of texels
2100       * to floats instead), but it would need more messy code (would need to push
2101       * at least some bits down to actual fetch so conversion could be skipped,
2102       * and would have ugly interaction with border color, would need to convert
2103       * border color to that format too or do some other tricks to make it work).
2104       */
2105      const struct util_format_description *format_desc = bld->format_desc;
2106      unsigned chan_type;
2107      /* not entirely sure we couldn't end up with non-valid swizzle here */
2108      chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2109                     format_desc->channel[format_desc->swizzle[0]].type :
2110                     UTIL_FORMAT_TYPE_FLOAT;
2111      if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2112         coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2113                                    bld->coord_bld.zero, bld->coord_bld.one);
2114      }
2115   }
2116
2117   /*
2118    * Compute the level of detail (float).
2119    */
2120   if (min_filter != mag_filter ||
2121       mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2122      /* Need to compute lod either to choose mipmap levels or to
2123       * distinguish between minification/magnification with one mipmap level.
2124       */
2125      lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2126                            coords[0], coords[1], coords[2], cube_rho,
2127                            derivs, lod_bias, explicit_lod,
2128                            mip_filter, lod,
2129                            &lod_ipart, lod_fpart, lod_pos_or_zero);
2130      if (is_lodq) {
2131         LLVMValueRef last_level;
2132         last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2133                                                     bld->gallivm,
2134                                                     bld->context_ptr,
2135                                                     texture_index);
2136         first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2137                                                       bld->gallivm,
2138                                                       bld->context_ptr,
2139                                                       texture_index);
2140         last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2141         last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2142         last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2143
2144         switch (mip_filter) {
2145         case PIPE_TEX_MIPFILTER_NONE:
2146            *lod_fpart = bld->lodf_bld.zero;
2147            break;
2148         case PIPE_TEX_MIPFILTER_NEAREST:
2149             *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2150             /* fallthrough */
2151         case PIPE_TEX_MIPFILTER_LINEAR:
2152            *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2153                                        bld->lodf_bld.zero, last_level);
2154            break;
2155         }
2156         return;
2157      }
2158
2159   } else {
2160      lod_ipart = bld->lodi_bld.zero;
2161      *lod_pos_or_zero = bld->lodi_bld.zero;
2162   }
2163
2164   if (bld->num_lods != bld->num_mips) {
2165      /* only makes sense if there's just a single mip level */
2166      assert(bld->num_mips == 1);
2167      lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2168   }
2169
2170   /*
2171    * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2172    */
2173   switch (mip_filter) {
2174   default:
2175      assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2176      /* fall-through */
2177   case PIPE_TEX_MIPFILTER_NONE:
2178      /* always use mip level 0 */
2179      first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2180                                                    bld->gallivm, bld->context_ptr,
2181                                                    texture_index);
2182      first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2183      *ilevel0 = first_level;
2184      break;
2185   case PIPE_TEX_MIPFILTER_NEAREST:
2186      assert(lod_ipart);
2187      lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2188      break;
2189   case PIPE_TEX_MIPFILTER_LINEAR:
2190      assert(lod_ipart);
2191      assert(*lod_fpart);
2192      lp_build_linear_mip_levels(bld, texture_index,
2193                                 lod_ipart, lod_fpart,
2194                                 ilevel0, ilevel1);
2195      break;
2196   }
2197}
2198
/**
 * Load the border color for the given sampler unit and clamp it to the
 * representable range of the bound texture format, storing the resulting
 * 4-element vector in bld->border_color_clamped.
 *
 * Clamping is derived from the format description (normalized signed /
 * unsigned, pure integer channel sizes), with special cases for mixed
 * plain formats, compressed formats and a few "other" layouts.
 */
static void
lp_build_clamp_border_color(struct lp_build_sample_context *bld,
                            unsigned sampler_unit)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef border_color_ptr =
      bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
                                       bld->context_ptr, sampler_unit);
   LLVMValueRef border_color;
   const struct util_format_description *format_desc = bld->format_desc;
   struct lp_type vec4_type = bld->texel_type;
   struct lp_build_context vec4_bld;
   LLVMValueRef min_clamp = NULL;
   LLVMValueRef max_clamp = NULL;

   /*
    * For normalized format need to clamp border color (technically
    * probably should also quantize the data). Really sucks doing this
    * here but can't avoid at least for now since this is part of
    * sampler state and texture format is part of sampler_view state.
    * GL expects also expects clamping for uint/sint formats too so
    * do that as well (d3d10 can't end up here with uint/sint since it
    * only supports them with ld).
    */
   vec4_type.length = 4;
   lp_build_context_init(&vec4_bld, gallivm, vec4_type);

   /*
    * Vectorized clamping of border color. Loading is a bit of a hack since
    * we just cast the pointer to float array to pointer to vec4
    * (int or float).
    */
   border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
                                             lp_build_const_int32(gallivm, 0));
   border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
                                       LLVMPointerType(vec4_bld.vec_type, 0), "");
   border_color = LLVMBuildLoad(builder, border_color_ptr, "");
   /* we don't have aligned type in the dynamic state unfortunately */
   LLVMSetAlignment(border_color, 4);

   /*
    * Instead of having some incredibly complex logic which will try to figure out
    * clamping necessary for each channel, simply use the first channel, and treat
    * mixed signed/unsigned normalized formats specially.
    * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
    * good reason.)
    */
   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
      int chan;
      /* d/s needs special handling because both present means just sampling depth */
      if (util_format_is_depth_and_stencil(format_desc->format)) {
         chan = format_desc->swizzle[0];
      }
      else {
         chan = util_format_get_first_non_void_channel(format_desc->format);
      }
      if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
         unsigned chan_type = format_desc->channel[chan].type;
         unsigned chan_norm = format_desc->channel[chan].normalized;
         unsigned chan_pure = format_desc->channel[chan].pure_integer;
         if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
            if (chan_norm) {
               /* snorm: clamp to [-1,1] */
               min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
               max_clamp = vec4_bld.one;
            }
            else if (chan_pure) {
               /*
                * Border color was stored as int, hence need min/max clamp
                * only if chan has less than 32 bits..
                */
               unsigned chan_size = format_desc->channel[chan].size;
               if (chan_size < 32) {
                  min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
                                                     0 - (1 << (chan_size - 1)));
                  max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
                                                     (1 << (chan_size - 1)) - 1);
               }
            }
            /* TODO: no idea about non-pure, non-normalized! */
         }
         else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
            if (chan_norm) {
               /* unorm: clamp to [0,1] */
               min_clamp = vec4_bld.zero;
               max_clamp = vec4_bld.one;
            }
            /*
             * Need a ugly hack here, because we don't have Z32_FLOAT_X8X24
             * we use Z32_FLOAT_S8X24 to imply sampling depth component
             * and ignoring stencil, which will blow up here if we try to
             * do a uint clamp in a float texel build...
             * And even if we had that format, mesa st also thinks using z24s8
             * means depth sampling ignoring stencil.
             */
            else if (chan_pure) {
               /*
                * Border color was stored as uint, hence never need min
                * clamp, and only need max clamp if chan has less than 32 bits.
                */
               unsigned chan_size = format_desc->channel[chan].size;
               if (chan_size < 32) {
                  max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
                                                     (1 << chan_size) - 1);
               }
               /* TODO: no idea about non-pure, non-normalized! */
            }
         }
         else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
            /* TODO: I have no idea what clamp this would need if any! */
         }
      }
      /* mixed plain formats (or different pure size) */
      switch (format_desc->format) {
      case PIPE_FORMAT_B10G10R10A2_UINT:
      case PIPE_FORMAT_R10G10B10A2_UINT:
      {
         unsigned max10 = (1 << 10) - 1;
         max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
                                        max10, (1 << 2) - 1, NULL);
      }
         break;
      case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
         min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
                                        -1.0F, 0.0F, NULL);
         max_clamp = vec4_bld.one;
         break;
      case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
      case PIPE_FORMAT_R5SG5SB6U_NORM:
         min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
                                        0.0F, 0.0F, NULL);
         max_clamp = vec4_bld.one;
         break;
      default:
         break;
      }
   }
   else {
      /* cannot figure this out from format description */
      if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
         /* s3tc formats are always unorm */
         min_clamp = vec4_bld.zero;
         max_clamp = vec4_bld.one;
      }
      else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
               format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
         switch (format_desc->format) {
         case PIPE_FORMAT_RGTC1_UNORM:
         case PIPE_FORMAT_RGTC2_UNORM:
         case PIPE_FORMAT_LATC1_UNORM:
         case PIPE_FORMAT_LATC2_UNORM:
         case PIPE_FORMAT_ETC1_RGB8:
            min_clamp = vec4_bld.zero;
            max_clamp = vec4_bld.one;
            break;
         case PIPE_FORMAT_RGTC1_SNORM:
         case PIPE_FORMAT_RGTC2_SNORM:
         case PIPE_FORMAT_LATC1_SNORM:
         case PIPE_FORMAT_LATC2_SNORM:
            min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
            max_clamp = vec4_bld.one;
            break;
         default:
            assert(0);
            break;
         }
      }
      /*
       * all others from subsampled/other group, though we don't care
       * about yuv (and should not have any from zs here)
       */
      else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
         switch (format_desc->format) {
         case PIPE_FORMAT_R8G8_B8G8_UNORM:
         case PIPE_FORMAT_G8R8_G8B8_UNORM:
         case PIPE_FORMAT_G8R8_B8R8_UNORM:
         case PIPE_FORMAT_R8G8_R8B8_UNORM:
         case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
            min_clamp = vec4_bld.zero;
            max_clamp = vec4_bld.one;
            break;
         case PIPE_FORMAT_R8G8Bx_SNORM:
            min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
            max_clamp = vec4_bld.one;
            break;
            /*
             * Note smallfloat formats usually don't need clamping
             * (they still have infinite range) however this is not
             * true for r11g11b10 and r9g9b9e5, which can't represent
             * negative numbers (and additionally r9g9b9e5 can't represent
             * very large numbers). d3d10 seems happy without clamping in
             * this case, but gl spec is pretty clear: "for floating
             * point and integer formats, border values are clamped to
             * the representable range of the format" so do that here.
             */
         case PIPE_FORMAT_R11G11B10_FLOAT:
            min_clamp = vec4_bld.zero;
            break;
         case PIPE_FORMAT_R9G9B9E5_FLOAT:
            min_clamp = vec4_bld.zero;
            max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
            break;
         default:
            assert(0);
            break;
         }
      }
   }

   /* apply whatever clamps were determined above */
   if (min_clamp) {
      border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
   }
   if (max_clamp) {
      border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
   }

   bld->border_color_clamped = border_color;
}
2416
2417
2418/**
2419 * General texture sampling codegen.
2420 * This function handles texture sampling for all texture targets (1D,
2421 * 2D, 3D, cube) and all filtering modes.
2422 */
2423static void
2424lp_build_sample_general(struct lp_build_sample_context *bld,
2425                        unsigned sampler_unit,
2426                        boolean is_gather,
2427                        const LLVMValueRef *coords,
2428                        const LLVMValueRef *offsets,
2429                        LLVMValueRef lod_positive,
2430                        LLVMValueRef lod_fpart,
2431                        LLVMValueRef ilevel0,
2432                        LLVMValueRef ilevel1,
2433                        LLVMValueRef *colors_out)
2434{
2435   LLVMBuilderRef builder = bld->gallivm->builder;
2436   const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2437   const unsigned mip_filter = sampler_state->min_mip_filter;
2438   const unsigned min_filter = sampler_state->min_img_filter;
2439   const unsigned mag_filter = sampler_state->mag_img_filter;
2440   LLVMValueRef texels[4];
2441   unsigned chan;
2442
2443   /* if we need border color, (potentially) clamp it now */
2444   if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2445                                              min_filter,
2446                                              mag_filter) ||
2447       (bld->dims > 1 &&
2448           lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2449                                                  min_filter,
2450                                                  mag_filter)) ||
2451       (bld->dims > 2 &&
2452           lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2453                                                  min_filter,
2454                                                  mag_filter))) {
2455      lp_build_clamp_border_color(bld, sampler_unit);
2456   }
2457
2458
2459   /*
2460    * Get/interpolate texture colors.
2461    */
2462
2463   for (chan = 0; chan < 4; ++chan) {
2464     texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2465     lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2466   }
2467
2468   if (min_filter == mag_filter) {
2469      /* no need to distinguish between minification and magnification */
2470      lp_build_sample_mipmap(bld, min_filter, mip_filter,
2471                             is_gather,
2472                             coords, offsets,
2473                             ilevel0, ilevel1, lod_fpart,
2474                             texels);
2475   }
2476   else {
2477      /*
2478       * Could also get rid of the if-logic and always use mipmap_both, both
2479       * for the single lod and multi-lod case if nothing really uses this.
2480       */
2481      if (bld->num_lods == 1) {
2482         /* Emit conditional to choose min image filter or mag image filter
2483          * depending on the lod being > 0 or <= 0, respectively.
2484          */
2485         struct lp_build_if_state if_ctx;
2486
2487         lod_positive = LLVMBuildTrunc(builder, lod_positive,
2488                                       LLVMInt1TypeInContext(bld->gallivm->context),
2489                                       "lod_pos");
2490
2491         lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2492         {
2493            /* Use the minification filter */
2494            lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
2495                                   coords, offsets,
2496                                   ilevel0, ilevel1, lod_fpart,
2497                                   texels);
2498         }
2499         lp_build_else(&if_ctx);
2500         {
2501            /* Use the magnification filter */
2502            lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2503                                   FALSE,
2504                                   coords, offsets,
2505                                   ilevel0, NULL, NULL,
2506                                   texels);
2507         }
2508         lp_build_endif(&if_ctx);
2509      }
2510      else {
2511         LLVMValueRef need_linear, linear_mask;
2512         unsigned mip_filter_for_nearest;
2513         struct lp_build_if_state if_ctx;
2514
2515         if (min_filter == PIPE_TEX_FILTER_LINEAR) {
2516            linear_mask = lod_positive;
2517            mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
2518         }
2519         else {
2520            linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
2521            mip_filter_for_nearest = mip_filter;
2522         }
2523         need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
2524                                               linear_mask);
2525         lp_build_name(need_linear, "need_linear");
2526
2527         if (bld->num_lods != bld->coord_type.length) {
2528            linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2529                                                                bld->lodi_type,
2530                                                                bld->int_coord_type,
2531                                                                linear_mask);
2532         }
2533
2534         lp_build_if(&if_ctx, bld->gallivm, need_linear);
2535         {
2536            /*
2537             * Do sampling with both filters simultaneously. This means using
2538             * a linear filter and doing some tricks (with weights) for the pixels
2539             * which need nearest filter.
2540             * Note that it's probably rare some pixels need nearest and some
2541             * linear filter but the fixups required for the nearest pixels
2542             * aren't all that complicated so just always run a combined path
2543             * if at least some pixels require linear.
2544             */
2545            lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
2546                                        coords, offsets,
2547                                        ilevel0, ilevel1,
2548                                        lod_fpart, lod_positive,
2549                                        texels);
2550         }
2551         lp_build_else(&if_ctx);
2552         {
2553            /*
2554             * All pixels require just nearest filtering, which is way
2555             * cheaper than linear, hence do a separate path for that.
2556             */
2557            lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
2558                                   mip_filter_for_nearest, FALSE,
2559                                   coords, offsets,
2560                                   ilevel0, ilevel1, lod_fpart,
2561                                   texels);
2562         }
2563         lp_build_endif(&if_ctx);
2564      }
2565   }
2566
2567   for (chan = 0; chan < 4; ++chan) {
2568     colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
2569     lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
2570   }
2571}
2572
2573
2574/**
2575 * Texel fetch function.
2576 * In contrast to general sampling there is no filtering, no coord minification,
2577 * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
2578 * directly to be applied to the selected mip level (after adding texel offsets).
2579 * This function handles texel fetch for all targets where texel fetch is supported
2580 * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
2581 */
2582static void
2583lp_build_fetch_texel(struct lp_build_sample_context *bld,
2584                     unsigned texture_unit,
2585                     const LLVMValueRef *coords,
2586                     LLVMValueRef explicit_lod,
2587                     const LLVMValueRef *offsets,
2588                     LLVMValueRef *colors_out)
2589{
2590   struct lp_build_context *perquadi_bld = &bld->lodi_bld;
2591   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2592   unsigned dims = bld->dims, chan;
2593   unsigned target = bld->static_texture_state->target;
2594   boolean out_of_bound_ret_zero = TRUE;
2595   LLVMValueRef size, ilevel;
2596   LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
2597   LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
2598   LLVMValueRef width, height, depth, i, j;
2599   LLVMValueRef offset, out_of_bounds, out1;
2600
2601   out_of_bounds = int_coord_bld->zero;
2602
2603   if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
2604      if (bld->num_mips != int_coord_bld->type.length) {
2605         ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
2606                                            perquadi_bld->type, explicit_lod, 0);
2607      }
2608      else {
2609         ilevel = explicit_lod;
2610      }
2611      lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
2612                                 out_of_bound_ret_zero ? &out_of_bounds : NULL);
2613   }
2614   else {
2615      assert(bld->num_mips == 1);
2616      if (bld->static_texture_state->target != PIPE_BUFFER) {
2617         ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
2618                                                  bld->context_ptr, texture_unit);
2619      }
2620      else {
2621         ilevel = lp_build_const_int32(bld->gallivm, 0);
2622      }
2623   }
2624   lp_build_mipmap_level_sizes(bld, ilevel,
2625                               &size,
2626                               &row_stride_vec, &img_stride_vec);
2627   lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
2628                                size, &width, &height, &depth);
2629
2630   if (target == PIPE_TEXTURE_1D_ARRAY ||
2631       target == PIPE_TEXTURE_2D_ARRAY) {
2632      if (out_of_bound_ret_zero) {
2633         z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
2634         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2635      }
2636      else {
2637         z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
2638      }
2639   }
2640
2641   /* This is a lot like border sampling */
2642   if (offsets[0]) {
2643      /*
2644       * coords are really unsigned, offsets are signed, but I don't think
2645       * exceeding 31 bits is possible
2646       */
2647      x = lp_build_add(int_coord_bld, x, offsets[0]);
2648   }
2649   out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
2650   out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2651   out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
2652   out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2653
2654   if (dims >= 2) {
2655      if (offsets[1]) {
2656         y = lp_build_add(int_coord_bld, y, offsets[1]);
2657      }
2658      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
2659      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2660      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
2661      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2662
2663      if (dims >= 3) {
2664         if (offsets[2]) {
2665            z = lp_build_add(int_coord_bld, z, offsets[2]);
2666         }
2667         out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
2668         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2669         out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
2670         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2671      }
2672   }
2673
2674   lp_build_sample_offset(int_coord_bld,
2675                          bld->format_desc,
2676                          x, y, z, row_stride_vec, img_stride_vec,
2677                          &offset, &i, &j);
2678
2679   if (bld->static_texture_state->target != PIPE_BUFFER) {
2680      offset = lp_build_add(int_coord_bld, offset,
2681                            lp_build_get_mip_offsets(bld, ilevel));
2682   }
2683
2684   offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
2685
2686   lp_build_fetch_rgba_soa(bld->gallivm,
2687                           bld->format_desc,
2688                           bld->texel_type, TRUE,
2689                           bld->base_ptr, offset,
2690                           i, j,
2691                           bld->cache,
2692                           colors_out);
2693
2694   if (out_of_bound_ret_zero) {
2695      /*
2696       * Only needed for ARB_robust_buffer_access_behavior and d3d10.
2697       * Could use min/max above instead of out-of-bounds comparisons
2698       * if we don't care about the result returned for out-of-bounds.
2699       */
2700      for (chan = 0; chan < 4; chan++) {
2701         colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
2702                                            bld->texel_bld.zero, colors_out[chan]);
2703      }
2704   }
2705}
2706
2707
2708/**
2709 * Just set texels to white instead of actually sampling the texture.
2710 * For debugging.
2711 */
2712void
2713lp_build_sample_nop(struct gallivm_state *gallivm,
2714                    struct lp_type type,
2715                    const LLVMValueRef *coords,
2716                    LLVMValueRef texel_out[4])
2717{
2718   LLVMValueRef one = lp_build_one(gallivm, type);
2719   unsigned chan;
2720
2721   for (chan = 0; chan < 4; chan++) {
2722      texel_out[chan] = one;
2723   }
2724}
2725
2726
2727/**
2728 * Build the actual texture sampling code.
2729 * 'texel' will return a vector of four LLVMValueRefs corresponding to
2730 * R, G, B, A.
2731 * \param type  vector float type to use for coords, etc.
2732 * \param sample_key
2733 * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
2734 */
static void
lp_build_sample_soa_code(struct gallivm_state *gallivm,
                         const struct lp_static_texture_state *static_texture_state,
                         const struct lp_static_sampler_state *static_sampler_state,
                         struct lp_sampler_dynamic_state *dynamic_state,
                         struct lp_type type,
                         unsigned sample_key,
                         unsigned texture_index,
                         unsigned sampler_index,
                         LLVMValueRef context_ptr,
                         LLVMValueRef thread_data_ptr,
                         const LLVMValueRef *coords,
                         const LLVMValueRef *offsets,
                         const struct lp_derivatives *derivs, /* optional */
                         LLVMValueRef lod, /* optional */
                         LLVMValueRef texel_out[4])
{
   unsigned target = static_texture_state->target;
   unsigned dims = texture_dims(target);
   unsigned num_quads = type.length / 4;
   unsigned mip_filter, min_img_filter, mag_img_filter, i;
   struct lp_build_sample_context bld;
   /* Local, mutable copy of the sampler state; several fields are
    * overridden below (mip filter, wrap modes) before use. */
   struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tex_width, newcoords[5];
   enum lp_sampler_lod_property lod_property;
   enum lp_sampler_lod_control lod_control;
   enum lp_sampler_op_type op_type;
   LLVMValueRef lod_bias = NULL;
   LLVMValueRef explicit_lod = NULL;
   boolean op_is_tex, op_is_lodq, op_is_gather;

   /* For debugging: dump the format being sampled (disabled). */
   if (0) {
      enum pipe_format fmt = static_texture_state->format;
      debug_printf("Sample from %s\n", util_format_name(fmt));
   }

   /* Decode the relevant fields out of the packed sample key. */
   lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
                     LP_SAMPLER_LOD_PROPERTY_SHIFT;
   lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                    LP_SAMPLER_LOD_CONTROL_SHIFT;
   op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
                 LP_SAMPLER_OP_TYPE_SHIFT;

   op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
   op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
   op_is_gather = op_type == LP_SAMPLER_OP_GATHER;

   /* The optional lod / derivative arguments must match the lod control
    * mode encoded in the key; route `lod` to the appropriate local. */
   if (lod_control == LP_SAMPLER_LOD_BIAS) {
      lod_bias = lod;
      assert(lod);
      assert(derivs == NULL);
   }
   else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
      explicit_lod = lod;
      assert(lod);
      assert(derivs == NULL);
   }
   else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
      assert(derivs);
      assert(lod == NULL);
   }
   else {
      assert(derivs == NULL);
      assert(lod == NULL);
   }

   if (static_texture_state->format == PIPE_FORMAT_NONE) {
      /*
       * If there's nothing bound, format is NONE, and we must return
       * all zero as mandated by d3d10 in this case.
       */
      unsigned chan;
      LLVMValueRef zero = lp_build_zero(gallivm, type);
      for (chan = 0; chan < 4; chan++) {
         texel_out[chan] = zero;
      }
      return;
   }

   assert(type.floating);

   /* Setup our build context */
   memset(&bld, 0, sizeof bld);
   bld.gallivm = gallivm;
   bld.context_ptr = context_ptr;
   bld.static_sampler_state = &derived_sampler_state;
   bld.static_texture_state = static_texture_state;
   bld.dynamic_state = dynamic_state;
   bld.format_desc = util_format_description(static_texture_state->format);
   bld.dims = dims;

   /* Lod queries always take the exact (non-approximated, per-pixel) lod
    * paths regardless of the perf flags. */
   if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
      bld.no_quad_lod = TRUE;
   }
   if (gallivm_perf & GALLIVM_PERF_NO_RHO_APPROX || op_is_lodq) {
      bld.no_rho_approx = TRUE;
   }
   if (gallivm_perf & GALLIVM_PERF_NO_BRILINEAR || op_is_lodq) {
      bld.no_brilinear = TRUE;
   }

   bld.vector_width = lp_type_width(type);

   bld.float_type = lp_type_float(32);
   bld.int_type = lp_type_int(32);
   bld.coord_type = type;
   bld.int_coord_type = lp_int_type(type);
   /* size vectors hold w/h/d (length 4 for multi-dim, 1 for 1D/buffers) */
   bld.float_size_in_type = lp_type_float(32);
   bld.float_size_in_type.length = dims > 1 ? 4 : 1;
   bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
   bld.texel_type = type;

   /* always using the first channel hopefully should be safe,
    * if not things WILL break in other places anyway.
    */
   if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       bld.format_desc->channel[0].pure_integer) {
      if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
         bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
      }
      else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
      }
   }
   else if (util_format_has_stencil(bld.format_desc) &&
       !util_format_has_depth(bld.format_desc)) {
      /* for stencil only formats, sample stencil (uint) */
      bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
   }

   /* If only level zero can ever be accessed, mip filtering is a no-op and
    * can be dropped — except for lod queries, which must still report it. */
   if (!static_texture_state->level_zero_only ||
       !static_sampler_state->max_lod_pos || op_is_lodq) {
      derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
   } else {
      derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
   }
   if (op_is_gather) {
      /*
       * gather4 is exactly like GL_LINEAR filtering but in the end skipping
       * the actual filtering. Using mostly the same paths, so cube face
       * selection, coord wrapping etc. all naturally uses the same code.
       */
      derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
      derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
      derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
   }
   mip_filter = derived_sampler_state.min_mip_filter;

   /* For debugging: dump the effective mip filter (disabled). */
   if (0) {
      debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
   }

   if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
       static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
   {
      /*
       * Seamless filtering ignores wrap modes.
       * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
       * bilinear it's not correct but way better than using for instance repeat.
       * Note we even set this for non-seamless. Technically GL allows any wrap
       * mode, which made sense when supporting true borders (can get seamless
       * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
       * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
       * up the sampler state (as it makes it texture dependent).
       */
      derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
      derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
   }
   /*
    * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
    * so AoS path could be used. Not sure it's worth the trouble...
    */

   min_img_filter = derived_sampler_state.min_img_filter;
   mag_img_filter = derived_sampler_state.mag_img_filter;


   /*
    * This is all a bit complicated different paths are chosen for performance
    * reasons.
    * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
    * everything (the last two options are equivalent for 4-wide case).
    * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
    * lod is calculated then the lod value extracted afterwards so making this
    * case basically the same as far as lod handling is concerned for the
    * further sample/filter code as the 1 lod for everything case.
    * Different lod handling mostly shows up when building mipmap sizes
    * (lp_build_mipmap_level_sizes() and friends) and also in filtering
    * (getting the fractional part of the lod to the right texels).
    */

   /*
    * There are other situations where at least the multiple int lods could be
    * avoided like min and max lod being equal.
    */
   bld.num_mips = bld.num_lods = 1;

   if (bld.no_quad_lod && bld.no_rho_approx &&
       ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
         (static_texture_state->target == PIPE_TEXTURE_CUBE ||
          static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
        op_is_lodq)) {
      /*
       * special case for using per-pixel lod even for implicit lod,
       * which is generally never required (ok by APIs) except to please
       * some (somewhat broken imho) tests (because per-pixel face selection
       * can cause derivatives to be different for pixels outside the primitive
       * due to the major axis division even if pre-project derivatives are
       * looking normal).
       * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
       * cube maps we do indeed get per-pixel lod values).
       */
      bld.num_mips = type.length;
      bld.num_lods = type.length;
   }
   else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
       (explicit_lod || lod_bias || derivs)) {
      if ((!op_is_tex && target != PIPE_BUFFER) ||
          (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
         bld.num_mips = type.length;
         bld.num_lods = type.length;
      }
      else if (op_is_tex && min_img_filter != mag_img_filter) {
         /* no mipmapping, but still need per-element lod sign to pick
          * between min and mag filter */
         bld.num_mips = 1;
         bld.num_lods = type.length;
      }
   }
   /* TODO: for true scalar_lod should only use 1 lod value */
   else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
            (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
      bld.num_mips = num_quads;
      bld.num_lods = num_quads;
   }
   else if (op_is_tex && min_img_filter != mag_img_filter) {
      bld.num_mips = 1;
      bld.num_lods = num_quads;
   }


   bld.lodf_type = type;
   /* we want native vector size to be able to use our intrinsics */
   if (bld.num_lods != type.length) {
      /* TODO: this currently always has to be per-quad or per-element */
      bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
   }
   bld.lodi_type = lp_int_type(bld.lodf_type);
   bld.levelf_type = bld.lodf_type;
   if (bld.num_mips == 1) {
      bld.levelf_type.length = 1;
   }
   bld.leveli_type = lp_int_type(bld.levelf_type);
   bld.float_size_type = bld.float_size_in_type;
   /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
    * with per-element lod that is w0/h0/d0/_/w1/h1/d1_/... so up to 8x4f32 */
   if (bld.num_mips > 1) {
      bld.float_size_type.length = bld.num_mips == type.length ?
                                      bld.num_mips * bld.float_size_in_type.length :
                                      type.length;
   }
   bld.int_size_type = lp_int_type(bld.float_size_type);

   lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
   lp_build_context_init(&bld.float_vec_bld, gallivm, type);
   lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
   lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
   lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
   lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
   lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
   lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
   lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
   lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
   lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
   lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
   lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
   lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);

   /* Get the dynamic state */
   tex_width = dynamic_state->width(dynamic_state, gallivm,
                                    context_ptr, texture_index);
   bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
                                                    context_ptr, texture_index);
   bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
                                                    context_ptr, texture_index);
   bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
                                          context_ptr, texture_index);
   bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
                                                context_ptr, texture_index);
   /* Note that mip_offsets is an array[level] of offsets to texture images */

   /* per-thread texel cache (only used for compressed formats) */
   if (dynamic_state->cache_ptr && thread_data_ptr) {
      bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
                                           thread_data_ptr, texture_index);
   }

   /* width, height, depth as single int vector */
   if (dims <= 1) {
      bld.int_size = tex_width;
   }
   else {
      bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
                                            tex_width,
                                            LLVMConstInt(i32t, 0, 0), "");
      if (dims >= 2) {
         LLVMValueRef tex_height =
            dynamic_state->height(dynamic_state, gallivm,
                                  context_ptr, texture_index);
         bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
                                               tex_height,
                                               LLVMConstInt(i32t, 1, 0), "");
         if (dims >= 3) {
            LLVMValueRef tex_depth =
               dynamic_state->depth(dynamic_state, gallivm, context_ptr,
                                    texture_index);
            bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
                                                  tex_depth,
                                                  LLVMConstInt(i32t, 2, 0), "");
         }
      }
   }

   /* Work on a local copy of the coords since they may get modified below. */
   for (i = 0; i < 5; i++) {
      newcoords[i] = coords[i];
   }

   if (util_format_is_pure_integer(static_texture_state->format) &&
       !util_format_has_depth(bld.format_desc) && op_is_tex &&
       (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
        static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
        static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
      /*
       * Bail if impossible filtering is specified (the awkard additional
       * depth check is because it is legal in gallium to have things like S8Z24
       * here which would say it's pure int despite such formats should sample
       * the depth component).
       * In GL such filters make the texture incomplete, this makes it robust
       * against state trackers which set this up regardless (we'd crash in the
       * lerp later otherwise).
       * At least in some apis it may be legal to use such filters with lod
       * queries and/or gather (at least for gather d3d10 says only the wrap
       * bits are really used hence filter bits are likely simply ignored).
       * For fetch, we don't get valid samplers either way here.
       */
      unsigned chan;
      LLVMValueRef zero = lp_build_zero(gallivm, type);
      for (chan = 0; chan < 4; chan++) {
         texel_out[chan] = zero;
      }
      return;
   }

   if (0) {
      /* For debug: no-op texture sampling */
      lp_build_sample_nop(gallivm,
                          bld.texel_type,
                          newcoords,
                          texel_out);
   }

   else if (op_type == LP_SAMPLER_OP_FETCH) {
      /* texelFetch: direct addressing, no filtering/wrapping needed */
      lp_build_fetch_texel(&bld, texture_index, newcoords,
                           lod, offsets,
                           texel_out);
   }

   else {
      LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
      LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
      boolean use_aos;

      /* Decide whether the faster fixed-point AoS filtering path applies. */
      use_aos = util_format_fits_8unorm(bld.format_desc) &&
                op_is_tex &&
                /* not sure this is strictly needed or simply impossible */
                derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
                lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);

      use_aos &= bld.num_lods <= num_quads ||
                 derived_sampler_state.min_img_filter ==
                    derived_sampler_state.mag_img_filter;

      if(gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
         use_aos = 0;
      }

      if (dims > 1) {
         use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
         if (dims > 2) {
            use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
         }
      }
      if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
           static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
          derived_sampler_state.seamless_cube_map &&
          (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
           derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
         /* theoretically possible with AoS filtering but not implemented (complex!) */
         use_aos = 0;
      }

      if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
          !use_aos && util_format_fits_8unorm(bld.format_desc)) {
         debug_printf("%s: using floating point linear filtering for %s\n",
                      __FUNCTION__, bld.format_desc->short_name);
         debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless %d"
                      "  wraps %d  wrapt %d  wrapr %d\n",
                      derived_sampler_state.min_img_filter,
                      derived_sampler_state.mag_img_filter,
                      derived_sampler_state.min_mip_filter,
                      static_texture_state->target,
                      derived_sampler_state.seamless_cube_map,
                      derived_sampler_state.wrap_s,
                      derived_sampler_state.wrap_t,
                      derived_sampler_state.wrap_r);
      }

      /* Compute lod values and the two integer mip levels to sample from. */
      lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
                             newcoords,
                             derivs, lod_bias, explicit_lod,
                             &lod_positive, &lod, &lod_fpart,
                             &ilevel0, &ilevel1);

      if (op_is_lodq) {
         /* lod query returns (clamped lod, raw lod, 0, 0) */
         texel_out[0] = lod_fpart;
         texel_out[1] = lod;
         texel_out[2] = texel_out[3] = bld.coord_bld.zero;
         return;
      }

      if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
         /* The aos path doesn't do seamless filtering so simply add cube layer
          * to face now.
          */
         newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
      }

      /*
       * we only try 8-wide sampling with soa or if we have AVX2
       * as it appears to be a loss with just AVX)
       */
      if (num_quads == 1 || !use_aos ||
          (util_cpu_caps.has_avx2 &&
           (bld.num_lods == 1 ||
            derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
         if (use_aos) {
            /* do sampling/filtering with fixed pt arithmetic */
            lp_build_sample_aos(&bld, sampler_index,
                                newcoords[0], newcoords[1],
                                newcoords[2],
                                offsets, lod_positive, lod_fpart,
                                ilevel0, ilevel1,
                                texel_out);
         }

         else {
            lp_build_sample_general(&bld, sampler_index,
                                    op_type == LP_SAMPLER_OP_GATHER,
                                    newcoords, offsets,
                                    lod_positive, lod_fpart,
                                    ilevel0, ilevel1,
                                    texel_out);
         }
      }
      else {
         /* Split the wide vectors into 4-wide pieces and sample each quad
          * separately with a 4-wide build context, then concatenate. */
         unsigned j;
         struct lp_build_sample_context bld4;
         struct lp_type type4 = type;
         unsigned i;
         LLVMValueRef texelout4[4];
         LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];

         type4.length = 4;

         /* Setup our build context */
         memset(&bld4, 0, sizeof bld4);
         bld4.no_quad_lod = bld.no_quad_lod;
         bld4.no_rho_approx = bld.no_rho_approx;
         bld4.no_brilinear = bld.no_brilinear;
         bld4.gallivm = bld.gallivm;
         bld4.context_ptr = bld.context_ptr;
         bld4.static_texture_state = bld.static_texture_state;
         bld4.static_sampler_state = bld.static_sampler_state;
         bld4.dynamic_state = bld.dynamic_state;
         bld4.format_desc = bld.format_desc;
         bld4.dims = bld.dims;
         bld4.row_stride_array = bld.row_stride_array;
         bld4.img_stride_array = bld.img_stride_array;
         bld4.base_ptr = bld.base_ptr;
         bld4.mip_offsets = bld.mip_offsets;
         bld4.int_size = bld.int_size;
         bld4.cache = bld.cache;

         bld4.vector_width = lp_type_width(type4);

         bld4.float_type = lp_type_float(32);
         bld4.int_type = lp_type_int(32);
         bld4.coord_type = type4;
         bld4.int_coord_type = lp_int_type(type4);
         bld4.float_size_in_type = lp_type_float(32);
         bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
         bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
         bld4.texel_type = bld.texel_type;
         bld4.texel_type.length = 4;

         /* Same num_mips/num_lods decision logic as above, but relative to
          * the 4-wide type. */
         bld4.num_mips = bld4.num_lods = 1;
         if (bld4.no_quad_lod && bld4.no_rho_approx &&
             (static_texture_state->target == PIPE_TEXTURE_CUBE ||
              static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
             (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
            bld4.num_mips = type4.length;
            bld4.num_lods = type4.length;
         }
         if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
             (explicit_lod || lod_bias || derivs)) {
            if ((!op_is_tex && target != PIPE_BUFFER) ||
                (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
               bld4.num_mips = type4.length;
               bld4.num_lods = type4.length;
            }
            else if (op_is_tex && min_img_filter != mag_img_filter) {
               bld4.num_mips = 1;
               bld4.num_lods = type4.length;
            }
         }

         /* we want native vector size to be able to use our intrinsics */
         bld4.lodf_type = type4;
         if (bld4.num_lods != type4.length) {
            bld4.lodf_type.length = 1;
         }
         bld4.lodi_type = lp_int_type(bld4.lodf_type);
         bld4.levelf_type = type4;
         if (bld4.num_mips != type4.length) {
            bld4.levelf_type.length = 1;
         }
         bld4.leveli_type = lp_int_type(bld4.levelf_type);
         bld4.float_size_type = bld4.float_size_in_type;
         if (bld4.num_mips > 1) {
            bld4.float_size_type.length = bld4.num_mips == type4.length ?
                                            bld4.num_mips * bld4.float_size_in_type.length :
                                            type4.length;
         }
         bld4.int_size_type = lp_int_type(bld4.float_size_type);

         lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
         lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
         lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
         lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
         lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
         lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
         lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
         lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
         lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
         lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
         lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
         lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
         lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
         lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);

         for (i = 0; i < num_quads; i++) {
            LLVMValueRef s4, t4, r4;
            LLVMValueRef lod_positive4, lod_fpart4 = NULL;
            LLVMValueRef ilevel04, ilevel14 = NULL;
            LLVMValueRef offsets4[4] = { NULL };
            unsigned num_lods = bld4.num_lods;

            /* extract this quad's slice of the coords / offsets / lods */
            s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
            t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
            r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);

            if (offsets[0]) {
               offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
               if (dims > 1) {
                  offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
                  if (dims > 2) {
                     offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
                  }
               }
            }
            lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
            ilevel04 = bld.num_mips == 1 ? ilevel0 :
                          lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
               ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
               lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
            }

            if (use_aos) {
               /* do sampling/filtering with fixed pt arithmetic */
               lp_build_sample_aos(&bld4, sampler_index,
                                   s4, t4, r4, offsets4,
                                   lod_positive4, lod_fpart4,
                                   ilevel04, ilevel14,
                                   texelout4);
            }

            else {
               /* this path is currently unreachable and hence might break easily... */
               LLVMValueRef newcoords4[5];
               newcoords4[0] = s4;
               newcoords4[1] = t4;
               newcoords4[2] = r4;
               newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
               newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);

               lp_build_sample_general(&bld4, sampler_index,
                                       op_type == LP_SAMPLER_OP_GATHER,
                                       newcoords4, offsets4,
                                       lod_positive4, lod_fpart4,
                                       ilevel04, ilevel14,
                                       texelout4);
            }
            for (j = 0; j < 4; j++) {
               texelouttmp[j][i] = texelout4[j];
            }
         }

         /* stitch the per-quad results back into full-width vectors */
         for (j = 0; j < 4; j++) {
            texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
         }
      }
   }

   if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
      apply_sampler_swizzle(&bld, texel_out);
   }

   /*
    * texel type can be a (32bit) int/uint (for pure int formats only),
    * however we are expected to always return floats (storage is untyped).
    */
   if (!bld.texel_type.floating) {
      unsigned chan;
      for (chan = 0; chan < 4; chan++) {
         texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
                                            lp_build_vec_type(gallivm, type), "");
      }
   }
}
3374
3375
3376#define USE_TEX_FUNC_CALL 1
3377
3378#define LP_MAX_TEX_FUNC_ARGS 32
3379
3380static inline void
3381get_target_info(enum pipe_texture_target target,
3382                unsigned *num_coords, unsigned *num_derivs,
3383                unsigned *num_offsets, unsigned *layer)
3384{
3385   unsigned dims = texture_dims(target);
3386   *num_coords = dims;
3387   *num_offsets = dims;
3388   *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3389                  target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3390   *layer = has_layer_coord(target) ? 2: 0;
3391   if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3392      /*
3393       * dims doesn't include r coord for cubes - this is handled
3394       * by layer instead, but need to fix up for cube arrays...
3395       */
3396      *layer = 3;
3397      *num_coords = 3;
3398   }
3399}
3400
3401
3402/**
3403 * Generate the function body for a texture sampling function.
3404 */
3405static void
3406lp_build_sample_gen_func(struct gallivm_state *gallivm,
3407                         const struct lp_static_texture_state *static_texture_state,
3408                         const struct lp_static_sampler_state *static_sampler_state,
3409                         struct lp_sampler_dynamic_state *dynamic_state,
3410                         struct lp_type type,
3411                         unsigned texture_index,
3412                         unsigned sampler_index,
3413                         LLVMValueRef function,
3414                         unsigned num_args,
3415                         unsigned sample_key)
3416{
3417   LLVMBuilderRef old_builder;
3418   LLVMBasicBlockRef block;
3419   LLVMValueRef coords[5];
3420   LLVMValueRef offsets[3] = { NULL };
3421   LLVMValueRef lod = NULL;
3422   LLVMValueRef context_ptr;
3423   LLVMValueRef thread_data_ptr = NULL;
3424   LLVMValueRef texel_out[4];
3425   struct lp_derivatives derivs;
3426   struct lp_derivatives *deriv_ptr = NULL;
3427   unsigned num_param = 0;
3428   unsigned i, num_coords, num_derivs, num_offsets, layer;
3429   enum lp_sampler_lod_control lod_control;
3430   boolean need_cache = FALSE;
3431
3432   lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3433                    LP_SAMPLER_LOD_CONTROL_SHIFT;
3434
3435   get_target_info(static_texture_state->target,
3436                   &num_coords, &num_derivs, &num_offsets, &layer);
3437
3438   if (dynamic_state->cache_ptr) {
3439      const struct util_format_description *format_desc;
3440      format_desc = util_format_description(static_texture_state->format);
3441      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3442         need_cache = TRUE;
3443      }
3444   }
3445
3446   /* "unpack" arguments */
3447   context_ptr = LLVMGetParam(function, num_param++);
3448   if (need_cache) {
3449      thread_data_ptr = LLVMGetParam(function, num_param++);
3450   }
3451   for (i = 0; i < num_coords; i++) {
3452      coords[i] = LLVMGetParam(function, num_param++);
3453   }
3454   for (i = num_coords; i < 5; i++) {
3455      /* This is rather unfortunate... */
3456      coords[i] = lp_build_undef(gallivm, type);
3457   }
3458   if (layer) {
3459      coords[layer] = LLVMGetParam(function, num_param++);
3460   }
3461   if (sample_key & LP_SAMPLER_SHADOW) {
3462      coords[4] = LLVMGetParam(function, num_param++);
3463   }
3464   if (sample_key & LP_SAMPLER_OFFSETS) {
3465      for (i = 0; i < num_offsets; i++) {
3466         offsets[i] = LLVMGetParam(function, num_param++);
3467      }
3468   }
3469   if (lod_control == LP_SAMPLER_LOD_BIAS ||
3470       lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3471      lod = LLVMGetParam(function, num_param++);
3472   }
3473   else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3474      for (i = 0; i < num_derivs; i++) {
3475         derivs.ddx[i] = LLVMGetParam(function, num_param++);
3476         derivs.ddy[i] = LLVMGetParam(function, num_param++);
3477      }
3478      deriv_ptr = &derivs;
3479   }
3480
3481   assert(num_args == num_param);
3482
3483   /*
3484    * Function body
3485    */
3486
3487   old_builder = gallivm->builder;
3488   block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3489   gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
3490   LLVMPositionBuilderAtEnd(gallivm->builder, block);
3491
3492   lp_build_sample_soa_code(gallivm,
3493                            static_texture_state,
3494                            static_sampler_state,
3495                            dynamic_state,
3496                            type,
3497                            sample_key,
3498                            texture_index,
3499                            sampler_index,
3500                            context_ptr,
3501                            thread_data_ptr,
3502                            coords,
3503                            offsets,
3504                            deriv_ptr,
3505                            lod,
3506                            texel_out);
3507
3508   LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
3509
3510   LLVMDisposeBuilder(gallivm->builder);
3511   gallivm->builder = old_builder;
3512
3513   gallivm_verify_function(gallivm, function);
3514}
3515
3516
3517/**
3518 * Call the matching function for texture sampling.
3519 * If there's no match, generate a new one.
3520 */
3521static void
3522lp_build_sample_soa_func(struct gallivm_state *gallivm,
3523                         const struct lp_static_texture_state *static_texture_state,
3524                         const struct lp_static_sampler_state *static_sampler_state,
3525                         struct lp_sampler_dynamic_state *dynamic_state,
3526                         const struct lp_sampler_params *params)
3527{
3528   LLVMBuilderRef builder = gallivm->builder;
3529   LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
3530                             LLVMGetInsertBlock(builder)));
3531   LLVMValueRef function, inst;
3532   LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
3533   LLVMBasicBlockRef bb;
3534   LLVMValueRef tex_ret;
3535   unsigned num_args = 0;
3536   char func_name[64];
3537   unsigned i, num_coords, num_derivs, num_offsets, layer;
3538   unsigned texture_index = params->texture_index;
3539   unsigned sampler_index = params->sampler_index;
3540   unsigned sample_key = params->sample_key;
3541   const LLVMValueRef *coords = params->coords;
3542   const LLVMValueRef *offsets = params->offsets;
3543   const struct lp_derivatives *derivs = params->derivs;
3544   enum lp_sampler_lod_control lod_control;
3545   boolean need_cache = FALSE;
3546
3547   lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3548                    LP_SAMPLER_LOD_CONTROL_SHIFT;
3549
3550   get_target_info(static_texture_state->target,
3551                   &num_coords, &num_derivs, &num_offsets, &layer);
3552
3553   if (dynamic_state->cache_ptr) {
3554      const struct util_format_description *format_desc;
3555      format_desc = util_format_description(static_texture_state->format);
3556      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3557         need_cache = TRUE;
3558      }
3559   }
3560   /*
3561    * texture function matches are found by name.
3562    * Thus the name has to include both the texture and sampler unit
3563    * (which covers all static state) plus the actual texture function
3564    * (including things like offsets, shadow coord, lod control).
3565    * Additionally lod_property has to be included too.
3566    */
3567
3568   util_snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
3569                 texture_index, sampler_index, sample_key);
3570
3571   function = LLVMGetNamedFunction(module, func_name);
3572
3573   if(!function) {
3574      LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
3575      LLVMTypeRef ret_type;
3576      LLVMTypeRef function_type;
3577      LLVMTypeRef val_type[4];
3578      unsigned num_param = 0;
3579
3580      /*
3581       * Generate the function prototype.
3582       */
3583
3584      arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
3585      if (need_cache) {
3586         arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
3587      }
3588      for (i = 0; i < num_coords; i++) {
3589         arg_types[num_param++] = LLVMTypeOf(coords[0]);
3590         assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
3591      }
3592      if (layer) {
3593         arg_types[num_param++] = LLVMTypeOf(coords[layer]);
3594         assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
3595      }
3596      if (sample_key & LP_SAMPLER_SHADOW) {
3597         arg_types[num_param++] = LLVMTypeOf(coords[0]);
3598      }
3599      if (sample_key & LP_SAMPLER_OFFSETS) {
3600         for (i = 0; i < num_offsets; i++) {
3601            arg_types[num_param++] = LLVMTypeOf(offsets[0]);
3602            assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
3603         }
3604      }
3605      if (lod_control == LP_SAMPLER_LOD_BIAS ||
3606          lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3607         arg_types[num_param++] = LLVMTypeOf(params->lod);
3608      }
3609      else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3610         for (i = 0; i < num_derivs; i++) {
3611            arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
3612            arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
3613            assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
3614            assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
3615         }
3616      }
3617
3618      val_type[0] = val_type[1] = val_type[2] = val_type[3] =
3619         lp_build_vec_type(gallivm, params->type);
3620      ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
3621      function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
3622      function = LLVMAddFunction(module, func_name, function_type);
3623
3624      for (i = 0; i < num_param; ++i) {
3625         if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
3626
3627            lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3628         }
3629      }
3630
3631      LLVMSetFunctionCallConv(function, LLVMFastCallConv);
3632      LLVMSetLinkage(function, LLVMInternalLinkage);
3633
3634      lp_build_sample_gen_func(gallivm,
3635                               static_texture_state,
3636                               static_sampler_state,
3637                               dynamic_state,
3638                               params->type,
3639                               texture_index,
3640                               sampler_index,
3641                               function,
3642                               num_param,
3643                               sample_key);
3644   }
3645
3646   num_args = 0;
3647   args[num_args++] = params->context_ptr;
3648   if (need_cache) {
3649      args[num_args++] = params->thread_data_ptr;
3650   }
3651   for (i = 0; i < num_coords; i++) {
3652      args[num_args++] = coords[i];
3653   }
3654   if (layer) {
3655      args[num_args++] = coords[layer];
3656   }
3657   if (sample_key & LP_SAMPLER_SHADOW) {
3658      args[num_args++] = coords[4];
3659   }
3660   if (sample_key & LP_SAMPLER_OFFSETS) {
3661      for (i = 0; i < num_offsets; i++) {
3662         args[num_args++] = offsets[i];
3663      }
3664   }
3665   if (lod_control == LP_SAMPLER_LOD_BIAS ||
3666       lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3667      args[num_args++] = params->lod;
3668   }
3669   else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3670      for (i = 0; i < num_derivs; i++) {
3671         args[num_args++] = derivs->ddx[i];
3672         args[num_args++] = derivs->ddy[i];
3673      }
3674   }
3675
3676   assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
3677
3678   tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
3679   bb = LLVMGetInsertBlock(builder);
3680   inst = LLVMGetLastInstruction(bb);
3681   LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
3682
3683   for (i = 0; i < 4; i++) {
3684      params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
3685   }
3686}
3687
3688
3689/**
3690 * Build texture sampling code.
3691 * Either via a function call or inline it directly.
3692 */
3693void
3694lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
3695                    const struct lp_static_sampler_state *static_sampler_state,
3696                    struct lp_sampler_dynamic_state *dynamic_state,
3697                    struct gallivm_state *gallivm,
3698                    const struct lp_sampler_params *params)
3699{
3700   boolean use_tex_func = FALSE;
3701
3702   /*
3703    * Do not use a function call if the sampling is "simple enough".
3704    * We define this by
3705    * a) format
3706    * b) no mips (either one level only or no mip filter)
3707    * No mips will definitely make the code smaller, though
3708    * the format requirement is a bit iffy - there's some (SoA) formats
3709    * which definitely generate less code. This does happen to catch
3710    * some important cases though which are hurt quite a bit by using
3711    * a call (though not really because of the call overhead but because
3712    * they are reusing the same texture unit with some of the same
3713    * parameters).
3714    * Ideally we'd let llvm recognize this stuff by doing IPO passes.
3715    */
3716
3717   if (USE_TEX_FUNC_CALL) {
3718      const struct util_format_description *format_desc;
3719      boolean simple_format;
3720      boolean simple_tex;
3721      enum lp_sampler_op_type op_type;
3722      format_desc = util_format_description(static_texture_state->format);
3723      simple_format = !format_desc ||
3724                         (util_format_is_rgba8_variant(format_desc) &&
3725                          format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
3726
3727      op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3728                    LP_SAMPLER_OP_TYPE_SHIFT;
3729      simple_tex =
3730         op_type != LP_SAMPLER_OP_TEXTURE ||
3731           ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
3732             static_texture_state->level_zero_only == TRUE) &&
3733            static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
3734
3735      use_tex_func = format_desc && !(simple_format && simple_tex);
3736   }
3737
3738   if (use_tex_func) {
3739      lp_build_sample_soa_func(gallivm,
3740                               static_texture_state,
3741                               static_sampler_state,
3742                               dynamic_state,
3743                               params);
3744   }
3745   else {
3746      lp_build_sample_soa_code(gallivm,
3747                               static_texture_state,
3748                               static_sampler_state,
3749                               dynamic_state,
3750                               params->type,
3751                               params->sample_key,
3752                               params->texture_index,
3753                               params->sampler_index,
3754                               params->context_ptr,
3755                               params->thread_data_ptr,
3756                               params->coords,
3757                               params->offsets,
3758                               params->derivs,
3759                               params->lod,
3760                               params->texel);
3761   }
3762}
3763
3764
/**
 * Generate code for a texture size / num-levels query (e.g. TGSI SVIEWINFO
 * or TXQ-style resinfo).
 *
 * Writes the per-dimension sizes (minified by the requested lod, plus the
 * array layer count if applicable) into params->sizes_out[0..dims-1], and
 * for sviewinfo queries the number of mip levels into params->sizes_out[3].
 */
void
lp_build_size_query_soa(struct gallivm_state *gallivm,
                        const struct lp_static_texture_state *static_state,
                        struct lp_sampler_dynamic_state *dynamic_state,
                        const struct lp_sampler_size_query_params *params)
{
   LLVMValueRef lod, level = 0, size;
   LLVMValueRef first_level = NULL;
   int dims, i;
   boolean has_array;
   unsigned num_lods = 1;
   struct lp_build_context bld_int_vec4;
   LLVMValueRef context_ptr = params->context_ptr;
   unsigned texture_unit = params->texture_unit;
   unsigned target = params->target;

   if (static_state->format == PIPE_FORMAT_NONE) {
      /*
       * If there's nothing bound, format is NONE, and we must return
       * all zero as mandated by d3d10 in this case.
       */
      unsigned chan;
      LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
      for (chan = 0; chan < 4; chan++) {
         params->sizes_out[chan] = zero;
      }
      return;
   }

   /*
    * Do some sanity verification about bound texture and shader dcl target.
    * Not entirely sure what's possible but assume array/non-array
    * always compatible (probably not ok for OpenGL but d3d10 has no
    * distinction of arrays at the resource level).
    * Everything else looks bogus (though not entirely sure about rect/2d).
    * Currently disabled because it causes assertion failures if there's
    * nothing bound (or rather a dummy texture, not that this case would
    * return the right values).
    */
   if (0 && static_state->target != target) {
      if (static_state->target == PIPE_TEXTURE_1D)
         assert(target == PIPE_TEXTURE_1D_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
         assert(target == PIPE_TEXTURE_1D);
      else if (static_state->target == PIPE_TEXTURE_2D)
         assert(target == PIPE_TEXTURE_2D_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
         assert(target == PIPE_TEXTURE_2D);
      else if (static_state->target == PIPE_TEXTURE_CUBE)
         assert(target == PIPE_TEXTURE_CUBE_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
         assert(target == PIPE_TEXTURE_CUBE);
      else
         assert(0);
   }

   dims = texture_dims(target);

   /* array targets return the layer count in the slot after the dims */
   switch (target) {
   case PIPE_TEXTURE_1D_ARRAY:
   case PIPE_TEXTURE_2D_ARRAY:
   case PIPE_TEXTURE_CUBE_ARRAY:
      has_array = TRUE;
      break;
   default:
      has_array = FALSE;
      break;
   }

   assert(!params->int_type.floating);

   lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));

   if (params->explicit_lod) {
      /* FIXME: this needs to honor per-element lod */
      lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
                                    lp_build_const_int32(gallivm, 0), "");
      /* the given lod is relative to the view's first mip level */
      first_level = dynamic_state->first_level(dynamic_state, gallivm,
                                               context_ptr, texture_unit);
      level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
      lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
   } else {
      lod = bld_int_vec4.zero;
   }

   /* gather width/height/depth into one int4 so minify works on all at once */
   size = bld_int_vec4.undef;

   size = LLVMBuildInsertElement(gallivm->builder, size,
                                 dynamic_state->width(dynamic_state, gallivm,
                                                      context_ptr, texture_unit),
                                 lp_build_const_int32(gallivm, 0), "");

   if (dims >= 2) {
      size = LLVMBuildInsertElement(gallivm->builder, size,
                                    dynamic_state->height(dynamic_state, gallivm,
                                                          context_ptr, texture_unit),
                                    lp_build_const_int32(gallivm, 1), "");
   }

   if (dims >= 3) {
      size = LLVMBuildInsertElement(gallivm->builder, size,
                                    dynamic_state->depth(dynamic_state, gallivm,
                                                         context_ptr, texture_unit),
                                    lp_build_const_int32(gallivm, 2), "");
   }

   size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);

   if (has_array) {
      /* layer count is stored in the depth callback and is not minified */
      LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
                                                 context_ptr, texture_unit);
      if (target == PIPE_TEXTURE_CUBE_ARRAY) {
         /*
          * It looks like GL wants number of cubes, d3d10.1 has it undefined?
          * Could avoid this by passing in number of cubes instead of total
          * number of layers (might make things easier elsewhere too).
          */
         LLVMValueRef six = lp_build_const_int32(gallivm, 6);
         layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
      }
      size = LLVMBuildInsertElement(gallivm->builder, size, layers,
                                    lp_build_const_int32(gallivm, dims), "");
   }

   /*
    * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
    * if level is out of bounds (note this can't cover unbound texture
    * here, which also requires returning zero).
    */
   if (params->explicit_lod && params->is_sviewinfo) {
      LLVMValueRef last_level, out, out1;
      struct lp_build_context leveli_bld;

      /* everything is scalar for now */
      lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
      last_level = dynamic_state->last_level(dynamic_state, gallivm,
                                             context_ptr, texture_unit);

      /* out-of-range mask: level < first_level || level > last_level */
      out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
      out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
      out = lp_build_or(&leveli_bld, out, out1);
      if (num_lods == 1) {
         out = lp_build_broadcast_scalar(&bld_int_vec4, out);
      }
      else {
         /* TODO */
         assert(0);
      }
      /* zero the size components where the level was out of range */
      size = lp_build_andnot(&bld_int_vec4, size, out);
   }
   /* scatter the packed int4 into the (possibly wider) per-channel outputs */
   for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
      params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
                                                size,
                                                lp_build_const_int32(gallivm, i));
   }
   if (params->is_sviewinfo) {
      /* remaining channels (up to w) are zero for sviewinfo queries */
      for (; i < 4; i++) {
         params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
      }
   }

   /*
    * if there's no explicit_lod (buffers, rects) queries requiring nr of
    * mips would be illegal.
    */
   if (params->is_sviewinfo && params->explicit_lod) {
      struct lp_build_context bld_int_scalar;
      LLVMValueRef num_levels;
      lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));

      if (static_state->level_zero_only) {
         num_levels = bld_int_scalar.one;
      }
      else {
         LLVMValueRef last_level;

         /* num_levels = last_level - first_level + 1 */
         last_level = dynamic_state->last_level(dynamic_state, gallivm,
                                                context_ptr, texture_unit);
         num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
         num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
      }
      params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
                                        num_levels);
   }
}
3950