/**************************************************************************
 *
 * Copyright 2010-2021 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


#include "pipe/p_config.h"

#include "util/u_math.h"
#include "util/u_cpu_detect.h"
#include "util/u_pack_color.h"
#include "util/u_surface.h"
#include "util/u_sse.h"

#include "lp_jit.h"
#include "lp_rast.h"
#include "lp_debug.h"
#include "lp_state_fs.h"
#include "lp_linear_priv.h"


#if defined(PIPE_ARCH_SSE)

#include <emmintrin.h>


struct nearest_sampler {
   PIPE_ALIGN_VAR(16) uint32_t out[64];

   const struct lp_jit_texture *texture;
   float fsrc_x;                /* src_x0 */
   float fsrc_y;                /* src_y0 */
   float fdsdx;                 /* ds/dx in texels */
   float fdsdy;                 /* ds/dy in texels */
   float fdtdx;                 /* dt/dx in texels */
   float fdtdy;                 /* dt/dy in texels */
   int width;
   int y;

   const uint32_t *(*fetch)(struct nearest_sampler *samp);
};


struct linear_interp {
   PIPE_ALIGN_VAR(16) uint32_t out[64];
   __m128i a0;
   __m128i dadx;
   __m128i dady;
   int width;                   /* rounded up to multiple of 4 */
   boolean is_constant;
};

/* Organize all the information needed for blending in one place.
 * Could have a blend function pointer here, but we currently always
 * know which one we want to call.
 */
struct color_blend {
   const uint32_t *src;
   uint8_t *color;
   int stride;
   int width;                   /* the exact width */
};


/* Organize all the information needed for running each of the shaders
 * in one place.
 */
struct shader {
   PIPE_ALIGN_VAR(16) uint32_t out0[64];
   const uint32_t *src0;
   const uint32_t *src1;
   __m128i const0;
   int width;                   /* rounded up to multiple of 4 */
};
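
/* Note: the 64-entry "out" buffers above are sized for the longest
 * span the linear path processes at once: one row of a 64x64 tile.
 */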


/* For a row of pixels, perform add/one/inv_src_alpha (i.e.
 * premultiplied alpha) blending between the incoming pixels and the
 * destination buffer.
 *
 * Used to implement the BLIT_RGBA + blend shader.  There are no
 * operations from the pixel shader left to implement at this level -
 * effectively the pixel shader was just a texture fetch which has
 * already been performed.  This routine then purely implements
 * blending.
 */
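
/* For reference, the per-channel blend math performed below is:
 *
 *    dst' = src + dst * (1 - src.alpha)
 *
 * e.g. (illustrative 8-bit values) src RGBA = (128, 0, 0, 128)
 * composited over dst = (0, 255, 0, 255) gives roughly
 * (128, 127, 0, 255): half-opaque red over opaque green.
 */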
static void
blend_premul(struct color_blend *blend)
{
   const uint32_t *src = blend->src;  /* aligned */
   uint32_t *dst = (uint32_t *)blend->color;      /* unaligned */
   int width = blend->width;
   int i;
   __m128i tmp;
   union { __m128i m128; uint32_t ui[4]; } dstreg;

   blend->color += blend->stride;

   for (i = 0; i + 3 < width; i += 4) {
      tmp = _mm_loadu_si128((const __m128i *)&dst[i]);  /* UNALIGNED READ */
      dstreg.m128 = util_sse2_blend_premul_4(*(const __m128i *)&src[i],
                                             tmp);
      _mm_storeu_si128((__m128i *)&dst[i], dstreg.m128); /* UNALIGNED WRITE */
   }
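
   /* Ragged tail: gather the last 1-3 pixels into an SSE register one
    * lane at a time, blend all four lanes, then write back only the
    * valid lanes (i & 3 selects the lane holding pixel i).
    */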
   if (i < width) {
      int j;
      for (j = 0; j < width - i; j++) {
         dstreg.ui[j] = dst[i+j];
      }
      dstreg.m128 = util_sse2_blend_premul_4(*(const __m128i *)&src[i],
                                             dstreg.m128);
      for (; i < width; i++)
         dst[i] = dstreg.ui[i&3];
   }
}


static void
blend_noop(struct color_blend *blend)
{
   memcpy(blend->color, blend->src, blend->width * sizeof(unsigned));
   blend->color += blend->stride;
}


static void
init_blend(struct color_blend *blend,
           int x, int y, int width, int height,
           uint8_t *color,
           int stride)
{
   blend->color = color + x * 4 + y * stride;
   blend->stride = stride;
   blend->width = width;
}


/*
 * Perform nearest filtered lookup of a row of texels.  Texture lookup
 * is assumed to be axis aligned but with arbitrary scaling.
 *
 * Texture coordinate interpolation is performed in 24.8 fixed point.
 * Note that the longest span we will encounter is 64 pixels long,
 * meaning that 8 fractional bits is more than sufficient to represent
 * the shallowest gradient possible within this span.
 *
 * After 64 pixels (i.e. in the next tile), the starting point will be
 * recalculated with floating point arithmetic.
 *
 * XXX: migrate this to use Jose's quad blitter texture fetch routines.
 */
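/* Worked example of the 24.8 stepping below: a 2x minification has
 * fdsdx == 2.0, so iscale_x == 512.  With fsrc_x == 0.0, acc starts at
 * 128 (texel 0, sampled at its center) and then steps to 640 (>> 8 ==
 * 2), 1152 (>> 8 == 4), ..., fetching every other texel.
 */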
static const uint32_t *
fetch_row(struct nearest_sampler *samp)
{
   int y = samp->y++;
   uint32_t *row = samp->out;
   const struct lp_jit_texture *texture = samp->texture;
   int yy = util_iround(samp->fsrc_y + samp->fdtdy * y);
   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         yy * texture->row_stride[0]);
   int iscale_x = samp->fdsdx * 256;
   int acc      = samp->fsrc_x * 256 + 128;
   int width    = samp->width;
   int i;

   for (i = 0; i < width; i++) {
      row[i] = src_row[acc >> 8];
      acc += iscale_x;
   }

   return row;
}

/* Version of fetch_row which can cope with texture edges.  In
 * practice, aero never triggers this.
 */
static const uint32_t *
fetch_row_clamped(struct nearest_sampler *samp)
{
   int y = samp->y++;
   uint32_t *row = samp->out;
   const struct lp_jit_texture *texture = samp->texture;

   int yy = util_iround(samp->fsrc_y + samp->fdtdy * y);

   const uint32_t *src_row =
      (const uint32_t *)((const uint8_t *)texture->base +
                         CLAMP(yy, 0, texture->height-1) *
                         texture->row_stride[0]);
   float src_x0 = samp->fsrc_x;
   float scale_x = samp->fdsdx;
   int width    = samp->width;
   int i;

   for (i = 0; i < width; i++) {
      row[i] = src_row[CLAMP(util_iround(src_x0 + i*scale_x),
                             0, texture->width-1)];
   }

   return row;
}

/* It very rarely happens that some non-axis-aligned texturing creeps
 * into the linear path.  Handle it here.  The alternative would be
 * more pre-checking, or an option to fall back by returning false from
 * jit_linear.
 */
static const uint32_t *
fetch_row_xy_clamped(struct nearest_sampler *samp)
{
   int y = samp->y++;
   uint32_t *row = samp->out;
   const struct lp_jit_texture *texture = samp->texture;
   float yrow = samp->fsrc_y + samp->fdtdy * y;
   float xrow = samp->fsrc_x + samp->fdsdy * y;
   int width  = samp->width;
   int i;

   for (i = 0; i < width; i++) {
      int yy = util_iround(yrow + samp->fdtdx * i);
      int xx = util_iround(xrow + samp->fdsdx * i);

      const uint32_t *src_row =
         (const uint32_t *)((const uint8_t *)texture->base +
                            CLAMP(yy, 0, texture->height-1) *
                            texture->row_stride[0]);

      row[i] = src_row[CLAMP(xx, 0, texture->width-1)];
   }

   return row;
}


static boolean
init_nearest_sampler(struct nearest_sampler *samp,
                     const struct lp_jit_texture *texture,
                     int x0, int y0,
                     int width, int height,
                     float s0, float dsdx, float dsdy,
                     float t0, float dtdx, float dtdy,
                     float w0, float dwdx, float dwdy)
{
   int i;
   float oow = 1.0f / w0;

   if (dwdx != 0.0 || dwdy != 0.0)
      return FALSE;

   samp->texture = texture;
   samp->width = width;
   samp->fdsdx = dsdx * texture->width * oow;
   samp->fdsdy = dsdy * texture->width * oow;
   samp->fdtdx = dtdx * texture->height * oow;
   samp->fdtdy = dtdy * texture->height * oow;
   samp->fsrc_x = (samp->fdsdx * x0 +
                   samp->fdsdy * y0 +
                   s0 * texture->width * oow - 0.5f);

   samp->fsrc_y = (samp->fdtdx * x0 +
                   samp->fdtdy * y0 +
                   t0 * texture->height * oow - 0.5f);
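
   /* For reference: an unscaled blit has w0 == 1.0 and dsdx == 1.0f /
    * texture->width, so fdsdx comes out as exactly 1.0 and (with dsdy
    * == 0) fsrc_x reduces to x0 + s0 * texture->width - 0.5f.
    */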
   samp->y = 0;

   /* Because we want to permit consumers of this data to round up to
    * the next multiple of 4, and because we don't want valgrind to
    * complain about uninitialized reads, zero out the tail of the
    * buffer:
    */
   for (i = width; i & 3; i++)
      samp->out[i] = 0;

   if (dsdy != 0 || dtdx != 0)
   {
      /* Arbitrary texture lookup:
       */
      samp->fetch = fetch_row_xy_clamped;
   }
   else
   {
      /* Axis-aligned stretch blit, arbitrary scaling factors including
       * flipped, minifying and magnifying:
       */
      int isrc_x = util_iround(samp->fsrc_x);
      int isrc_y = util_iround(samp->fsrc_y);
      int isrc_x1 = util_iround(samp->fsrc_x + width * samp->fdsdx);
      int isrc_y1 = util_iround(samp->fsrc_y + height * samp->fdtdy);

      /* Look at the maximum and minimum texture coordinates we will be
       * fetching and figure out if we need to use clamping.  There is
       * similar code in u_blit_sw.c which takes a better approach to
       * this and could be substituted later.
       */
      if (isrc_x  <= texture->width  && isrc_x  >= 0 &&
          isrc_y  <= texture->height && isrc_y  >= 0 &&
          isrc_x1 <= texture->width  && isrc_x1 >= 0 &&
          isrc_y1 <= texture->height && isrc_y1 >= 0)
      {
         samp->fetch = fetch_row;
      }
      else {
         samp->fetch = fetch_row_clamped;
      }
   }

   return TRUE;
}
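

/* Shader for the BLIT_RGB1 variants: copy each B8G8R8A8 texel but
 * force alpha to 1.0 by OR-ing in 0xff000000, four pixels per SSE
 * operation.  Relies on shader->width being padded to a multiple of
 * four by init_shader().
 */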
static const uint32_t *
shade_rgb1(struct shader *shader)
{
   const __m128i rgb1 = _mm_set1_epi32(0xff000000);
   const uint32_t *src0 = shader->src0;
   uint32_t *dst = shader->out0;
   int width = shader->width;
   int i;

   for (i = 0; i + 3 < width; i += 4) {
      __m128i s = *(const __m128i *)&src0[i];
      *(__m128i *)&dst[i] = _mm_or_si128(s, rgb1);
   }

   return shader->out0;
}


static void
init_shader(struct shader *shader,
            int x, int y, int width, int height)
{
   shader->width = align(width, 4);
}


/* Linear shader which implements the BLIT_RGBA shader with the
 * additional constraints imposed by lp_setup_is_blit().
 */
static boolean
blit_rgba_blit(const struct lp_rast_state *state,
               unsigned x, unsigned y,
               unsigned width, unsigned height,
               const float (*a0)[4],
               const float (*dadx)[4],
               const float (*dady)[4],
               uint8_t *color,
               unsigned stride)
{
   const struct lp_jit_context *context = &state->jit_context;
   const struct lp_jit_texture *texture = &context->textures[0];
   const uint8_t *src;
   unsigned src_stride;
   int src_x, src_y;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   /* Require w==1.0:
    */
   if (a0[0][3] != 1.0 ||
       dadx[0][3] != 0.0 ||
       dady[0][3] != 0.0)
      return FALSE;

   src_x = x + util_iround(a0[1][0]*texture->width - 0.5f);
   src_y = y + util_iround(a0[1][1]*texture->height - 0.5f);
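
   /* (a0[1] holds the interpolated s/t at the origin; scaling by the
    * texture size and undoing the -0.5f nearest-sampling offset
    * recovers the blit's integer source origin.)
    */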

   src = texture->base;
   src_stride = texture->row_stride[0];

   /* Fall back to blit_rgba() if clamping required:
    */
   if (src_x < 0 ||
       src_y < 0 ||
       src_x + width > texture->width ||
       src_y + height > texture->height)
      return FALSE;

   util_copy_rect(color, PIPE_FORMAT_B8G8R8A8_UNORM, stride,
                  x, y,
                  width, height,
                  src, src_stride,
                  src_x, src_y);

   return TRUE;
}


/* Linear shader which implements the BLIT_RGB1 shader, with the
 * additional constraints imposed by lp_setup_is_blit().
 */
static boolean
blit_rgb1_blit(const struct lp_rast_state *state,
               unsigned x, unsigned y,
               unsigned width, unsigned height,
               const float (*a0)[4],
               const float (*dadx)[4],
               const float (*dady)[4],
               uint8_t *color,
               unsigned stride)
{
   const struct lp_jit_context *context = &state->jit_context;
   const struct lp_jit_texture *texture = &context->textures[0];
   const uint8_t *src;
   unsigned src_stride;
   int src_x, src_y;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   /* Require w==1.0:
    */
   if (a0[0][3] != 1.0 ||
       dadx[0][3] != 0.0 ||
       dady[0][3] != 0.0)
      return FALSE;

   color += x * 4 + y * stride;

   src_x = x + util_iround(a0[1][0]*texture->width - 0.5f);
   src_y = y + util_iround(a0[1][1]*texture->height - 0.5f);

   src = texture->base;
   src_stride = texture->row_stride[0];
   src += src_x * 4;
   src += src_y * src_stride;

   if (src_x < 0 ||
       src_y < 0 ||
       src_x + width > texture->width ||
       src_y + height > texture->height)
      return FALSE;

   for (y = 0; y < height; y++) {
      const uint32_t *src_row = (const uint32_t *)src;
      uint32_t *dst_row = (uint32_t *)color;

      for (x = 0; x < width; x++) {
         *dst_row++ = *src_row++ | 0xff000000;
      }

      color += stride;
      src += src_stride;
   }

   return TRUE;
}


/* Linear shader variant implementing the BLIT_RGBA shader without
 * blending.
 */
static boolean
blit_rgba(const struct lp_rast_state *state,
          unsigned x, unsigned y,
          unsigned width, unsigned height,
          const float (*a0)[4],
          const float (*dadx)[4],
          const float (*dady)[4],
          uint8_t *color,
          unsigned stride)
{
   const struct lp_jit_context *context = &state->jit_context;
   struct nearest_sampler samp;
   struct color_blend blend;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   if (!init_nearest_sampler(&samp,
                             &context->textures[0],
                             x, y, width, height,
                             a0[1][0], dadx[1][0], dady[1][0],
                             a0[1][1], dadx[1][1], dady[1][1],
                             a0[0][3], dadx[0][3], dady[0][3]))
      return FALSE;

   init_blend(&blend,
              x, y, width, height,
              color, stride);

   /* Rasterize the rectangle and run the shader:
    */
   for (y = 0; y < height; y++) {
      blend.src = samp.fetch(&samp);
      blend_noop(&blend);
   }

   return TRUE;
}


static boolean
blit_rgb1(const struct lp_rast_state *state,
          unsigned x, unsigned y,
          unsigned width, unsigned height,
          const float (*a0)[4],
          const float (*dadx)[4],
          const float (*dady)[4],
          uint8_t *color,
          unsigned stride)
{
   const struct lp_jit_context *context = &state->jit_context;
   struct nearest_sampler samp;
   struct color_blend blend;
   struct shader shader;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   if (!init_nearest_sampler(&samp,
                             &context->textures[0],
                             x, y, width, height,
                             a0[1][0], dadx[1][0], dady[1][0],
                             a0[1][1], dadx[1][1], dady[1][1],
                             a0[0][3], dadx[0][3], dady[0][3]))
      return FALSE;

   init_blend(&blend,
              x, y, width, height,
              color, stride);

   init_shader(&shader,
               x, y, width, height);

   /* Rasterize the rectangle and run the shader:
    */
   for (y = 0; y < height; y++) {
      shader.src0 = samp.fetch(&samp);
      blend.src = shade_rgb1(&shader);
      blend_noop(&blend);
   }

   return TRUE;
}


/* Linear shader variant implementing the BLIT_RGBA shader with
 * one/inv_src_alpha blending.
 */
static boolean
blit_rgba_blend_premul(const struct lp_rast_state *state,
                       unsigned x, unsigned y,
                       unsigned width, unsigned height,
                       const float (*a0)[4],
                       const float (*dadx)[4],
                       const float (*dady)[4],
                       uint8_t *color,
                       unsigned stride)
{
   const struct lp_jit_context *context = &state->jit_context;
   struct nearest_sampler samp;
   struct color_blend blend;

   LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   if (!init_nearest_sampler(&samp,
                             &context->textures[0],
                             x, y, width, height,
                             a0[1][0], dadx[1][0], dady[1][0],
                             a0[1][1], dadx[1][1], dady[1][1],
                             a0[0][3], dadx[0][3], dady[0][3]))
      return FALSE;

   init_blend(&blend,
              x, y, width, height,
              color, stride);

   /* Rasterize the rectangle and run the shader:
    */
   for (y = 0; y < height; y++) {
      blend.src = samp.fetch(&samp);
      blend_premul(&blend);
   }

   return TRUE;
}


/* Linear shader which always emits red.  Used for debugging.
 */
static boolean
linear_red(const struct lp_rast_state *state,
           unsigned x, unsigned y,
           unsigned width, unsigned height,
           const float (*a0)[4],
           const float (*dadx)[4],
           const float (*dady)[4],
           uint8_t *color,
           unsigned stride)
{
   union util_color uc;

   util_pack_color_ub(0xff, 0, 0, 0xff,
                      PIPE_FORMAT_B8G8R8A8_UNORM, &uc);

   util_fill_rect(color,
                  PIPE_FORMAT_B8G8R8A8_UNORM,
                  stride,
                  x,
                  y,
                  width,
                  height,
                  &uc);

   return TRUE;
}


/* Noop linear shader variant, for debugging.
 */
static boolean
linear_no_op(const struct lp_rast_state *state,
             unsigned x, unsigned y,
             unsigned width, unsigned height,
             const float (*a0)[4],
             const float (*dadx)[4],
             const float (*dady)[4],
             uint8_t *color,
             unsigned stride)
{
   return TRUE;
}

/* Check for ADD/ONE/INV_SRC_ALPHA, i.e. premultiplied-alpha blending.
 */
static boolean
is_one_inv_src_alpha_blend(const struct lp_fragment_shader_variant *variant)
{
   return
      !variant->key.blend.logicop_enable &&
      variant->key.blend.rt[0].blend_enable &&
      variant->key.blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
      variant->key.blend.rt[0].rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
      variant->key.blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
      variant->key.blend.rt[0].alpha_func == PIPE_BLEND_ADD &&
      variant->key.blend.rt[0].alpha_src_factor == PIPE_BLENDFACTOR_ONE &&
      variant->key.blend.rt[0].alpha_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
      variant->key.blend.rt[0].colormask == 0xf;
}


/* Examine the fragment shader variant and determine whether we can
 * substitute a fastpath linear shader implementation.
 */
void
llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant)
{
   struct lp_sampler_static_state *samp0 =
      lp_fs_variant_key_sampler_idx(&variant->key, 0);

   if (LP_PERF & PERF_NO_SHADE) {
      variant->jit_linear                   = linear_red;
      return;
   }

   if (!samp0)
      return;

   enum pipe_format tex_format = samp0->texture_state.format;
   if (variant->shader->kind == LP_FS_KIND_BLIT_RGBA &&
       tex_format == PIPE_FORMAT_B8G8R8A8_UNORM &&
       is_nearest_clamp_sampler(samp0)) {
      if (variant->opaque) {
         variant->jit_linear_blit             = blit_rgba_blit;
         variant->jit_linear                  = blit_rgba;
      }
      else if (is_one_inv_src_alpha_blend(variant) &&
               util_get_cpu_caps()->has_sse2) {
         variant->jit_linear                  = blit_rgba_blend_premul;
      }
      return;
   }

   if (variant->shader->kind == LP_FS_KIND_BLIT_RGB1 &&
       variant->opaque &&
       (tex_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
        tex_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
       is_nearest_clamp_sampler(samp0)) {
      variant->jit_linear_blit             = blit_rgb1_blit;
      variant->jit_linear                  = blit_rgb1;
      return;
   }
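
   /* Debug hook: change this to "if (1)" to stub out all linear
    * shading with the no-op shader.
    */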
   if (0) {
      variant->jit_linear                   = linear_no_op;
      return;
   }
}
#else
void
llvmpipe_fs_variant_linear_fastpath(struct lp_fragment_shader_variant *variant)
{
   /* don't bother if there is no SSE */
}
#endif