1/*
2 * Mesa 3-D graphics library
3 *
4 * Copyright 2012 Intel Corporation
5 * Copyright 2013 Google
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 * Authors:
28 *    Chad Versace <chad.versace@linux.intel.com>
29 *    Frank Henigman <fjhenigman@google.com>
30 */
31
32#include <string.h>
33
34#include "util/macros.h"
35#include "main/macros.h"
36
37#include "isl_priv.h"
38
39#if defined(__SSSE3__)
40#include <tmmintrin.h>
41#elif defined(__SSE2__)
42#include <emmintrin.h>
43#endif
44
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

/* Local aliases over util/main macros so the tile-boundary math below reads
 * as a symmetric ALIGN_DOWN/ALIGN_UP pair.
 */
#define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
#define ALIGN_UP(a, b) ALIGN(a, b)

/* Tile dimensions.  Width and span are in bytes, height is in pixels (i.e.
 * unitless).  A "span" is the most number of bytes we can copy from linear
 * to tiled without needing to calculate a new destination address.
 */
static const uint32_t xtile_width = 512;   /* bytes per X-tile row */
static const uint32_t xtile_height = 8;    /* rows per X tile */
static const uint32_t xtile_span = 64;     /* contiguous bytes per X-tile span */
static const uint32_t ytile_width = 128;   /* bytes per Y-tile row */
static const uint32_t ytile_height = 32;   /* rows per Y tile */
static const uint32_t ytile_span = 16;     /* contiguous bytes per Y-tile span (column width) */
60
/* Rotate 'n' right by 'd' bits.
 *
 * The shift counts are masked to [0, 31]: the plain `n << (32 - d)` form
 * invokes undefined behavior when d == 0 (shift by the full type width).
 * With masking, d == 0 yields n unchanged; for d in [1, 31] the result is
 * identical to the unmasked form.  Compilers recognize this masked pattern
 * and emit a single rotate instruction.
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   return (n >> (d & 31)) | (n << ((32 - d) & 31));
}
66
/* Handle conflicting declaration and conflicting macro in netbsd.
 *
 * The previous shift-based fallback implementation was disabled with
 * `#if 0` and thus dead; it has been removed.  This file already depends
 * unconditionally on __builtin_bswap32 (GCC/Clang).
 */
#undef bswap32
#define bswap32(n) __builtin_bswap32(n)
84
85/**
86 * Copy RGBA to BGRA - swap R and B.
87 */
88static inline void *
89rgba8_copy(void *dst, const void *src, size_t bytes)
90{
91   uint32_t *d = dst;
92   uint32_t const *s = src;
93
94   assert(bytes % 4 == 0);
95
96   while (bytes >= 4) {
97      *d = ror(bswap32(*s), 8);
98      d += 1;
99      s += 1;
100      bytes -= 4;
101   }
102   return dst;
103}
104
#ifdef __SSSE3__
/* PSHUFB control that swaps bytes 0<->2 (R<->B) within each 32-bit pixel. */
static const uint8_t rgba8_permutation[16] =
   { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };

/* Swap R/B in 16 bytes (4 pixels); 'dst' must be 16-byte aligned,
 * 'src' may be unaligned.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   _mm_store_si128(dst,
                   _mm_shuffle_epi8(_mm_loadu_si128(src),
                                    *(__m128i *)rgba8_permutation));
}

/* Swap R/B in 16 bytes (4 pixels); 'src' must be 16-byte aligned,
 * 'dst' may be unaligned.
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   _mm_storeu_si128(dst,
                    _mm_shuffle_epi8(_mm_load_si128(src),
                                     *(__m128i *)rgba8_permutation));
}

#elif defined(__SSE2__)
/* SSE2 fallback (no PSHUFB): mask the pixel into its A/G bytes and its
 * R/B bytes, swap R and B by exchanging the 8-bit halves of each 16-bit
 * lane with word shuffles, then OR the halves back together.
 * 'dst' must be 16-byte aligned; 'src' may be unaligned.
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_loadu_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_store_si128((__m128i *)dst, dstreg);
}

/* Same R/B swap as above, but 'src' must be 16-byte aligned and 'dst'
 * may be unaligned.
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   __m128i srcreg, dstreg, agmask, ag, rb, br;

   agmask = _mm_set1_epi32(0xFF00FF00);
   srcreg = _mm_load_si128((__m128i *)src);

   rb = _mm_andnot_si128(agmask, srcreg);
   ag = _mm_and_si128(agmask, srcreg);
   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
                            _MM_SHUFFLE(2, 3, 0, 1));
   dstreg = _mm_or_si128(ag, br);

   _mm_storeu_si128((__m128i *)dst, dstreg);
}
#endif
160
/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * Returns 'dst', memcpy-style.  'bytes' need not be a multiple of 16;
 * any remainder is handled by the scalar rgba8_copy (which requires a
 * multiple of 4).
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   /* Fast path for a whole Y-tile span worth of data: fully unrolled.
    * Note: arithmetic on void* is a GCC/Clang extension (byte-sized).
    */
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(dst +  0, src +  0);
      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
      return dst;
   }

   /* 16 bytes (4 pixels) per iteration; 'dst' stays 16-byte aligned. */
   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   /* Scalar tail — and the entire copy when SSE isn't compiled in. */
   rgba8_copy(dst, src, bytes);

   return dst;
}
190
/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 *
 * Mirror image of rgba8_copy_aligned_dst: here the *source* side carries
 * the alignment guarantee.  Returns 'dst', memcpy-style.
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   /* Fast path for a whole span (64 bytes): fully unrolled.
    * Note: arithmetic on void* is a GCC/Clang extension (byte-sized).
    */
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(dst +  0, src +  0);
      rgba8_copy_16_aligned_src(dst + 16, src + 16);
      rgba8_copy_16_aligned_src(dst + 32, src + 32);
      rgba8_copy_16_aligned_src(dst + 48, src + 48);
      return dst;
   }

   /* 16 bytes (4 pixels) per iteration; 'src' stays 16-byte aligned. */
   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(dst, src);
      src += 16;
      dst += 16;
      bytes -= 16;
   }
#endif

   /* Scalar tail — and the entire copy when SSE isn't compiled in. */
   rgba8_copy(dst, src, bytes);

   return dst;
}
220
/**
 * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
 * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
 * The first and last ranges must be shorter than a "span" (the longest linear
 * stretch within a tile) and the middle must equal a whole number of spans.
 * Ranges may be empty.  The region copied must land entirely within one tile.
 * 'dst' is the start of the tile and 'src' is the corresponding
 * address to copy from, though copying begins at (x0, y0).
 * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
 * Swizzling flips bit 6 in the copy destination offset, when certain other
 * bits are set in it.
 * 'linear_pitch' is signed, so a negative pitch (reverse row order on the
 * linear side) is representable.  'copy_type' selects the memcpy-like
 * routine used for each piece (see choose_copy_function).
 */
typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                             uint32_t y0, uint32_t y1,
                             char *dst, const char *src,
                             int32_t linear_pitch,
                             uint32_t swizzle_bit,
                             isl_memcpy_type copy_type);
239
/**
 * Copy texture data from linear to X tile layout.
 *
 * \copydoc tile_copy_fn
 *
 * The mem_copy parameters allow the user to specify an alternative mem_copy
 * function that, for instance, may do RGBA -> BGRA swizzling.  The first
 * function must handle any memory alignment while the second function must
 * only handle 16-byte alignment in whichever side (source or destination) is
 * tiled.
 */
static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   /* The ptrdiff_t cast keeps a negative src_pitch from being converted
    * to a huge unsigned offset.
    */
   src += (ptrdiff_t)y0 * src_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      /* Unaligned leading sub-span [x0, x1). */
      mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);

      /* Whole spans [x1, x2); the tiled side is 16-byte aligned here. */
      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
      }

      /* Trailing sub-span [x2, x3).  'xo' equals x2 at this point because
       * [x1, x2) is a whole number of spans (see tile_copy_fn contract).
       */
      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

      src += src_pitch;
   }
}
287
/**
 * Copy texture data from linear to Y tile layout.
 *
 * \copydoc tile_copy_fn
 */
static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t src_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   /* Split the row range [y0, y3) into a head [y0, y1) up to the next
    * 4-row boundary, a 4-row-aligned middle [y1, y2) that is copied four
    * rows per iteration, and a tail [y2, y3).  Head and tail may be empty.
    */
   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset control swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   /* The ptrdiff_t cast keeps a negative src_pitch from being converted
    * to a huge unsigned offset.
    */
   src += (ptrdiff_t)y0 * src_pitch;

   /* Head: one row at a time until the 4-row boundary. */
   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }

   /* Middle: manually unrolled, four rows per iteration. */
   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
      }

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
      }

      src += 4 * src_pitch;
   }

   /* Tail: remaining rows past the last 4-row boundary, one at a time. */
   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);

         src += src_pitch;
      }
   }
}
409
/**
 * Copy texture data from X tile layout to linear.
 *
 * \copydoc tile_copy_fn
 *
 * Mirror of linear_to_xtiled: the tiled side is now the *source*, so the
 * swizzled offset is applied to 'src' and the pitch walks 'dst'.
 */
static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y1,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* The copy destination offset for each range copied is the sum of
    * an X offset 'x0' or 'xo' and a Y offset 'yo.'
    */
   uint32_t xo, yo;

   /* The ptrdiff_t cast keeps a negative dst_pitch from being converted
    * to a huge unsigned offset.
    */
   dst += (ptrdiff_t)y0 * dst_pitch;

   for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
      /* Bits 9 and 10 of the copy destination offset control swizzling.
       * Only 'yo' contributes to those bits in the total offset,
       * so calculate 'swizzle' just once per row.
       * Move bits 9 and 10 three and four places respectively down
       * to bit 6 and xor them.
       */
      uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;

      /* Unaligned leading sub-span [x0, x1). */
      mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);

      /* Whole spans [x1, x2); the tiled (source) side is 16-byte aligned. */
      for (xo = x1; xo < x2; xo += xtile_span) {
         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
      }

      /* Trailing sub-span [x2, x3); 'xo' equals x2 here since [x1, x2) is
       * a whole number of spans.
       */
      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

      dst += dst_pitch;
   }
}
451
/**
 * Copy texture data from Y tile layout to linear.
 *
 * \copydoc tile_copy_fn
 *
 * Mirror of linear_to_ytiled: the tiled side is now the *source*, so the
 * swizzled column offset is applied to 'src' and the pitch walks 'dst'.
 */
static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                 uint32_t y0, uint32_t y3,
                 char *dst, const char *src,
                 int32_t dst_pitch,
                 uint32_t swizzle_bit,
                 isl_mem_copy_fn mem_copy,
                 isl_mem_copy_fn mem_copy_align16)
{
   /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
    * as the tile).  Thus the destination offset for (x,y) is the sum of:
    *   (x % column_width)                    // position within column
    *   (x / column_width) * bytes_per_column // column number * bytes per column
    *   y * column_width
    *
    * The copy destination offset for each range copied is the sum of
    * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
    */
   const uint32_t column_width = ytile_span;
   const uint32_t bytes_per_column = column_width * ytile_height;

   /* Split [y0, y3) into head [y0, y1), 4-row-aligned middle [y1, y2)
    * copied four rows per iteration, and tail [y2, y3).
    */
   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));

   uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
   uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;

   /* Bit 9 of the destination offset control swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
   uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
   uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;

   uint32_t x, yo;

   /* The ptrdiff_t cast keeps a negative dst_pitch from being converted
    * to a huge unsigned offset.
    */
   dst += (ptrdiff_t)y0 * dst_pitch;

   /* Head: one row at a time until the 4-row boundary. */
   if (y0 != y1) {
      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }

   /* Middle: manually unrolled, four rows per iteration. */
   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
      uint32_t xo = xo1;
      uint32_t swizzle = swizzle1;

      if (x0 != x1) {
         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
      }

      /* Step by spans/columns.  As it happens, the swizzle bit flips
       * at each step so we don't need to calculate it explicitly.
       */
      for (x = x1; x < x2; x += ytile_span) {
         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
         xo += bytes_per_column;
         swizzle ^= swizzle_bit;
      }

      if (x2 != x3) {
         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
      }

      dst += 4 * dst_pitch;
   }

   /* Tail: remaining rows past the last 4-row boundary, one at a time. */
   if (y2 != y3) {
      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
         uint32_t xo = xo1;
         uint32_t swizzle = swizzle1;

         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);

         /* Step by spans/columns.  As it happens, the swizzle bit flips
          * at each step so we don't need to calculate it explicitly.
          */
         for (x = x1; x < x2; x += ytile_span) {
            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
            xo += bytes_per_column;
            swizzle ^= swizzle_bit;
         }

         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);

         dst += dst_pitch;
      }
   }
}
573
#if defined(INLINE_SSE41)
/* memcpy variant that reads through SSE4.1 streaming (non-temporal) loads,
 * intended for reading back from uncached/write-combined tiled surfaces.
 * Only the exact 16- and 64-byte cases use streaming loads; MOVNTDQA
 * requires 'src' to be 16-byte aligned in those paths.  Any other size is
 * a sub-span tail and falls back to plain memcpy.
 */
static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
{
   if (count == 16) {
      __m128i val = _mm_stream_load_si128((__m128i *)src);
      _mm_storeu_si128((__m128i *)dest, val);
      return dest;
   } else if (count == 64) {
      __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
      __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
      __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
      __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
      _mm_storeu_si128(((__m128i *)dest) + 0, val0);
      _mm_storeu_si128(((__m128i *)dest) + 1, val1);
      _mm_storeu_si128(((__m128i *)dest) + 2, val2);
      _mm_storeu_si128(((__m128i *)dest) + 3, val3);
      return dest;
   } else {
      assert(count < 64); /* and (count < 16) for ytiled */
      return memcpy(dest, src, count);
   }
}
#endif
598
599static isl_mem_copy_fn
600choose_copy_function(isl_memcpy_type copy_type)
601{
602   switch(copy_type) {
603   case ISL_MEMCPY:
604      return memcpy;
605   case ISL_MEMCPY_BGRA8:
606      return rgba8_copy;
607   case ISL_MEMCPY_STREAMING_LOAD:
608#if defined(INLINE_SSE41)
609      return _memcpy_streaming_load;
610#else
611      unreachable("ISL_MEMCOPY_STREAMING_LOAD requires sse4.1");
612#endif
613   case ISL_MEMCPY_INVALID:
614      unreachable("invalid copy_type");
615   }
616   unreachable("unhandled copy_type");
617   return NULL;
618}
619
620/**
621 * Copy texture data from linear to X tile layout, faster.
622 *
623 * Same as \ref linear_to_xtiled but faster, because it passes constant
624 * parameters for common cases, allowing the compiler to inline code
625 * optimized for those cases.
626 *
627 * \copydoc tile_copy_fn
628 */
629static FLATTEN void
630linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
631                        uint32_t y0, uint32_t y1,
632                        char *dst, const char *src,
633                        int32_t src_pitch,
634                        uint32_t swizzle_bit,
635                        isl_memcpy_type copy_type)
636{
637   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
638
639   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
640      if (mem_copy == memcpy)
641         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
642                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
643      else if (mem_copy == rgba8_copy)
644         return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
645                                 dst, src, src_pitch, swizzle_bit,
646                                 rgba8_copy, rgba8_copy_aligned_dst);
647      else
648         unreachable("not reached");
649   } else {
650      if (mem_copy == memcpy)
651         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
652                                 dst, src, src_pitch, swizzle_bit,
653                                 memcpy, memcpy);
654      else if (mem_copy == rgba8_copy)
655         return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
656                                 dst, src, src_pitch, swizzle_bit,
657                                 rgba8_copy, rgba8_copy_aligned_dst);
658      else
659         unreachable("not reached");
660   }
661   linear_to_xtiled(x0, x1, x2, x3, y0, y1,
662                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
663}
664
665/**
666 * Copy texture data from linear to Y tile layout, faster.
667 *
668 * Same as \ref linear_to_ytiled but faster, because it passes constant
669 * parameters for common cases, allowing the compiler to inline code
670 * optimized for those cases.
671 *
672 * \copydoc tile_copy_fn
673 */
674static FLATTEN void
675linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
676                        uint32_t y0, uint32_t y1,
677                        char *dst, const char *src,
678                        int32_t src_pitch,
679                        uint32_t swizzle_bit,
680                        isl_memcpy_type copy_type)
681{
682   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
683
684   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
685      if (mem_copy == memcpy)
686         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
687                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
688      else if (mem_copy == rgba8_copy)
689         return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
690                                 dst, src, src_pitch, swizzle_bit,
691                                 rgba8_copy, rgba8_copy_aligned_dst);
692      else
693         unreachable("not reached");
694   } else {
695      if (mem_copy == memcpy)
696         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
697                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
698      else if (mem_copy == rgba8_copy)
699         return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
700                                 dst, src, src_pitch, swizzle_bit,
701                                 rgba8_copy, rgba8_copy_aligned_dst);
702      else
703         unreachable("not reached");
704   }
705   linear_to_ytiled(x0, x1, x2, x3, y0, y1,
706                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
707}
708
709/**
710 * Copy texture data from X tile layout to linear, faster.
711 *
712 * Same as \ref xtile_to_linear but faster, because it passes constant
713 * parameters for common cases, allowing the compiler to inline code
714 * optimized for those cases.
715 *
716 * \copydoc tile_copy_fn
717 */
718static FLATTEN void
719xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
720                        uint32_t y0, uint32_t y1,
721                        char *dst, const char *src,
722                        int32_t dst_pitch,
723                        uint32_t swizzle_bit,
724                        isl_memcpy_type copy_type)
725{
726   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
727
728   if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
729      if (mem_copy == memcpy)
730         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
731                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
732      else if (mem_copy == rgba8_copy)
733         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
734                                 dst, src, dst_pitch, swizzle_bit,
735                                 rgba8_copy, rgba8_copy_aligned_src);
736#if defined(INLINE_SSE41)
737      else if (mem_copy == _memcpy_streaming_load)
738         return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
739                                 dst, src, dst_pitch, swizzle_bit,
740                                 memcpy, _memcpy_streaming_load);
741#endif
742      else
743         unreachable("not reached");
744   } else {
745      if (mem_copy == memcpy)
746         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
747                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
748      else if (mem_copy == rgba8_copy)
749         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
750                                 dst, src, dst_pitch, swizzle_bit,
751                                 rgba8_copy, rgba8_copy_aligned_src);
752#if defined(INLINE_SSE41)
753      else if (mem_copy == _memcpy_streaming_load)
754         return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
755                                 dst, src, dst_pitch, swizzle_bit,
756                                 memcpy, _memcpy_streaming_load);
757#endif
758      else
759         unreachable("not reached");
760   }
761   xtiled_to_linear(x0, x1, x2, x3, y0, y1,
762                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
763}
764
765/**
766 * Copy texture data from Y tile layout to linear, faster.
767 *
768 * Same as \ref ytile_to_linear but faster, because it passes constant
769 * parameters for common cases, allowing the compiler to inline code
770 * optimized for those cases.
771 *
772 * \copydoc tile_copy_fn
773 */
774static FLATTEN void
775ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
776                        uint32_t y0, uint32_t y1,
777                        char *dst, const char *src,
778                        int32_t dst_pitch,
779                        uint32_t swizzle_bit,
780                        isl_memcpy_type copy_type)
781{
782   isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
783
784   if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
785      if (mem_copy == memcpy)
786         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
787                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
788      else if (mem_copy == rgba8_copy)
789         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
790                                 dst, src, dst_pitch, swizzle_bit,
791                                 rgba8_copy, rgba8_copy_aligned_src);
792#if defined(INLINE_SSE41)
793      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
794         return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
795                                 dst, src, dst_pitch, swizzle_bit,
796                                 memcpy, _memcpy_streaming_load);
797#endif
798      else
799         unreachable("not reached");
800   } else {
801      if (mem_copy == memcpy)
802         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
803                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
804      else if (mem_copy == rgba8_copy)
805         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
806                                 dst, src, dst_pitch, swizzle_bit,
807                                 rgba8_copy, rgba8_copy_aligned_src);
808#if defined(INLINE_SSE41)
809      else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
810         return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
811                                 dst, src, dst_pitch, swizzle_bit,
812                                 memcpy, _memcpy_streaming_load);
813#endif
814      else
815         unreachable("not reached");
816   }
817   ytiled_to_linear(x0, x1, x2, x3, y0, y1,
818                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
819}
820
821/**
822 * Copy from linear to tiled texture.
823 *
824 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
825 * pieces that do not cross tile boundaries and copy each piece with a tile
826 * copy function (\ref tile_copy_fn).
827 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
828 * The Y range is in pixels (i.e. unitless).
829 * 'dst' is the address of (0, 0) in the destination tiled texture.
830 * 'src' is the address of (xt1, yt1) in the source linear texture.
831 */
832static void
833intel_linear_to_tiled(uint32_t xt1, uint32_t xt2,
834                      uint32_t yt1, uint32_t yt2,
835                      char *dst, const char *src,
836                      uint32_t dst_pitch, int32_t src_pitch,
837                      bool has_swizzling,
838                      enum isl_tiling tiling,
839                      isl_memcpy_type copy_type)
840{
841   tile_copy_fn tile_copy;
842   uint32_t xt0, xt3;
843   uint32_t yt0, yt3;
844   uint32_t xt, yt;
845   uint32_t tw, th, span;
846   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
847
848   if (tiling == ISL_TILING_X) {
849      tw = xtile_width;
850      th = xtile_height;
851      span = xtile_span;
852      tile_copy = linear_to_xtiled_faster;
853   } else if (tiling == ISL_TILING_Y0) {
854      tw = ytile_width;
855      th = ytile_height;
856      span = ytile_span;
857      tile_copy = linear_to_ytiled_faster;
858   } else {
859      unreachable("unsupported tiling");
860   }
861
862   /* Round out to tile boundaries. */
863   xt0 = ALIGN_DOWN(xt1, tw);
864   xt3 = ALIGN_UP  (xt2, tw);
865   yt0 = ALIGN_DOWN(yt1, th);
866   yt3 = ALIGN_UP  (yt2, th);
867
868   /* Loop over all tiles to which we have something to copy.
869    * 'xt' and 'yt' are the origin of the destination tile, whether copying
870    * copying a full or partial tile.
871    * tile_copy() copies one tile or partial tile.
872    * Looping x inside y is the faster memory access pattern.
873    */
874   for (yt = yt0; yt < yt3; yt += th) {
875      for (xt = xt0; xt < xt3; xt += tw) {
876         /* The area to update is [x0,x3) x [y0,y1).
877          * May not want the whole tile, hence the min and max.
878          */
879         uint32_t x0 = MAX2(xt1, xt);
880         uint32_t y0 = MAX2(yt1, yt);
881         uint32_t x3 = MIN2(xt2, xt + tw);
882         uint32_t y1 = MIN2(yt2, yt + th);
883
884         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
885          * the middle interval is the longest span-aligned part.
886          * The sub-ranges could be empty.
887          */
888         uint32_t x1, x2;
889         x1 = ALIGN_UP(x0, span);
890         if (x1 > x3)
891            x1 = x2 = x3;
892         else
893            x2 = ALIGN_DOWN(x3, span);
894
895         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
896         assert(x1 - x0 < span && x3 - x2 < span);
897         assert(x3 - x0 <= tw);
898         assert((x2 - x1) % span == 0);
899
900         /* Translate by (xt,yt) for single-tile copier. */
901         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
902                   y0-yt, y1-yt,
903                   dst + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * dst_pitch,
904                   src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
905                   src_pitch,
906                   swizzle_bit,
907                   copy_type);
908      }
909   }
910}
911
912/**
913 * Copy from tiled to linear texture.
914 *
915 * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
916 * pieces that do not cross tile boundaries and copy each piece with a tile
917 * copy function (\ref tile_copy_fn).
918 * The X range is in bytes, i.e. pixels * bytes-per-pixel.
919 * The Y range is in pixels (i.e. unitless).
920 * 'dst' is the address of (xt1, yt1) in the destination linear texture.
921 * 'src' is the address of (0, 0) in the source tiled texture.
922 */
923static void
924intel_tiled_to_linear(uint32_t xt1, uint32_t xt2,
925                      uint32_t yt1, uint32_t yt2,
926                      char *dst, const char *src,
927                      int32_t dst_pitch, uint32_t src_pitch,
928                      bool has_swizzling,
929                      enum isl_tiling tiling,
930                      isl_memcpy_type copy_type)
931{
932   tile_copy_fn tile_copy;
933   uint32_t xt0, xt3;
934   uint32_t yt0, yt3;
935   uint32_t xt, yt;
936   uint32_t tw, th, span;
937   uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
938
939   if (tiling == ISL_TILING_X) {
940      tw = xtile_width;
941      th = xtile_height;
942      span = xtile_span;
943      tile_copy = xtiled_to_linear_faster;
944   } else if (tiling == ISL_TILING_Y0) {
945      tw = ytile_width;
946      th = ytile_height;
947      span = ytile_span;
948      tile_copy = ytiled_to_linear_faster;
949   } else {
950      unreachable("unsupported tiling");
951   }
952
953#if defined(INLINE_SSE41)
954   if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
955      /* The hidden cacheline sized register used by movntdqa can apparently
956       * give you stale data, so do an mfence to invalidate it.
957       */
958      _mm_mfence();
959   }
960#endif
961
962   /* Round out to tile boundaries. */
963   xt0 = ALIGN_DOWN(xt1, tw);
964   xt3 = ALIGN_UP  (xt2, tw);
965   yt0 = ALIGN_DOWN(yt1, th);
966   yt3 = ALIGN_UP  (yt2, th);
967
968   /* Loop over all tiles to which we have something to copy.
969    * 'xt' and 'yt' are the origin of the destination tile, whether copying
970    * copying a full or partial tile.
971    * tile_copy() copies one tile or partial tile.
972    * Looping x inside y is the faster memory access pattern.
973    */
974   for (yt = yt0; yt < yt3; yt += th) {
975      for (xt = xt0; xt < xt3; xt += tw) {
976         /* The area to update is [x0,x3) x [y0,y1).
977          * May not want the whole tile, hence the min and max.
978          */
979         uint32_t x0 = MAX2(xt1, xt);
980         uint32_t y0 = MAX2(yt1, yt);
981         uint32_t x3 = MIN2(xt2, xt + tw);
982         uint32_t y1 = MIN2(yt2, yt + th);
983
984         /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
985          * the middle interval is the longest span-aligned part.
986          * The sub-ranges could be empty.
987          */
988         uint32_t x1, x2;
989         x1 = ALIGN_UP(x0, span);
990         if (x1 > x3)
991            x1 = x2 = x3;
992         else
993            x2 = ALIGN_DOWN(x3, span);
994
995         assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
996         assert(x1 - x0 < span && x3 - x2 < span);
997         assert(x3 - x0 <= tw);
998         assert((x2 - x1) % span == 0);
999
1000         /* Translate by (xt,yt) for single-tile copier. */
1001         tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
1002                   y0-yt, y1-yt,
1003                   dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
1004                   src + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * src_pitch,
1005                   dst_pitch,
1006                   swizzle_bit,
1007                   copy_type);
1008      }
1009   }
1010}
1011