1/**************************************************************************
2 *
3 * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
4 * Copyright (c) 2008 VMware, Inc.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 **************************************************************************/
25
#include <string.h>

#include "util/format/u_format.h"
#include "util/format/u_format_fxt1.h"
#include "util/format/u_format_pack.h"
#include "util/format_srgb.h"
#include "util/u_math.h"
31
32#define RCOMP 0
33#define GCOMP 1
34#define BCOMP 2
35#define ACOMP 3
36
37#define FXT1_BLOCK_SIZE 16
38
/* Forward declaration of the encoder entry point.  It is static, so the
 * definition lives further down in this file.
 * NOTE(review): parameter meanings (image size, component count, source
 * and destination buffers with row strides) are inferred from the names;
 * confirm against the definition.
 */
static void
fxt1_encode (uint32_t width, uint32_t height, int32_t comps,
             const void *source, int32_t srcRowStride,
             void *dest, int32_t destRowStride);

/* Forward declaration of the decoder helper, defined later in this file.
 * NOTE(review): presumably decodes the single texel at (i, j) into the
 * 4-byte rgba output; confirm against the definition.
 */
static void
fxt1_decode_1 (const void *texture, int32_t stride,
               int32_t i, int32_t j, uint8_t *rgba);
47
48/***************************************************************************\
49 * FXT1 encoder
50 *
51 * The encoder was built by reversing the decoder,
52 * and is vaguely based on Texus2 by 3dfx. Note that this code
53 * is merely a proof of concept, since it is highly UNoptimized;
54 * moreover, it is sub-optimal due to initial conditions passed
55 * to Lloyd's algorithm (the interpolation modes are even worse).
56\***************************************************************************/
57
58
/* Encoder limits and tuning constants. */
#define MAX_COMP 4 /* maximum number of components ever needed in a texel */
#define MAX_VECT 4 /* maximum number of base vectors ever searched for */
#define N_TEXELS 32 /* number of texels in a block (always 32) */
#define LL_N_REP 50 /* maximum number of iterations in lloyd's vq */
#define LL_RMS_D 10 /* fault tolerance (maximum delta between iterations) */
#define LL_RMS_E 255 /* fault tolerance (maximum accumulated error) */
#define ALPHA_TS 2 /* alpha threshold: (255 - ALPHA_TS) deemed opaque */
/* All-zero 32-bit word that ISTBLACK() compares against. */
static const uint32_t zero = 0;
/* Non-zero when the first sizeof(zero) bytes of v are all zero, i.e. when a
 * 4-byte texel is transparent black (R = G = B = A = 0).
 */
#define ISTBLACK(v) (memcmp(&(v), &zero, sizeof(zero)) == 0)
68
69/*
70 * Define a 64-bit unsigned integer type and macros
71 */
#if 1

/* Native path: Fx64 is a plain 64-bit unsigned integer. */
#define FX64_NATIVE 1

typedef uint64_t Fx64;

/* FX64_MOV32: a = b;  FX64_OR32: a |= b;  FX64_SHL: a <<= c.
 * The arguments are not parenthesized, so pass simple expressions only.
 */
#define FX64_MOV32(a, b) a = b
#define FX64_OR32(a, b)  a |= b
#define FX64_SHL(a, c)   a <<= c

#else

/* Fallback path (compiled out by the "#if 1" above): emulate the 64-bit
 * value with two 32-bit halves.
 */
#define FX64_NATIVE 0

typedef struct {
   uint32_t lo, hi;
} Fx64;

/* NOTE(review): FX64_MOV32 only writes .lo and leaves .hi untouched, so
 * .hi stays uninitialized unless the caller cleared it beforehand.
 */
#define FX64_MOV32(a, b) a.lo = b
#define FX64_OR32(a, b)  a.lo |= b

/* Shift left by c bits.  Only valid for 0 < c < 64: with c == 0 the
 * else-branch would evaluate a.lo >> 32, which is undefined.
 */
#define FX64_SHL(a, c)                                 \
   do {                                                \
       if ((c) >= 32) {                                \
          a.hi = a.lo << ((c) - 32);                   \
          a.lo = 0;                                    \
       } else {                                        \
          a.hi = (a.hi << (c)) | (a.lo >> (32 - (c))); \
          a.lo <<= (c);                                \
       }                                               \
   } while (0)

#endif
105
106
/* Per-component weight used by MAKEIVEC; currently 1 for every component.
 * It can be changed to obtain an oblong metric: 0.30 / 0.59 / 0.11.
 */
#define F(i) (float)1 /* can be used to obtain an oblong metric: 0.30 / 0.59 / 0.11 */
/* When non-zero, CALCCDOT clamps its result to the range [0, NV]. */
#define SAFECDOT 1 /* for paranoids */

/* MAKEIVEC(NV, NC, IV, B, V0, V1)
 *
 * Build the interpolation vector IV (NC components) and bias B such that,
 * for a color V, dot(V, IV) + B is the position of V projected onto the
 * V0 -> V1 segment, scaled so that V0 maps to 0.5 and V1 to NV + 0.5.
 * The extra 0.5 makes the truncation in CALCCDOT round to the nearest
 * index.
 *
 * Uses (and clobbers) an integer variable named `i` that must exist in
 * the caller's scope.  If V0 and V1 are equal, d2 is zero and the
 * division yields a non-finite value, so callers must only invoke this
 * for distinct end points.
 */
#define MAKEIVEC(NV, NC, IV, B, V0, V1)  \
   do {                                  \
      /* compute interpolation vector */ \
      float d2 = 0.0F;                   \
      float rd2;                         \
                                         \
      for (i = 0; i < NC; i++) {         \
         IV[i] = (V1[i] - V0[i]) * F(i); \
         d2 += IV[i] * IV[i];            \
      }                                  \
      rd2 = (float)NV / d2;              \
      B = 0;                             \
      for (i = 0; i < NC; i++) {         \
         IV[i] *= F(i);                  \
         B -= IV[i] * V0[i];             \
         IV[i] *= rd2;                   \
      }                                  \
      B = B * rd2 + 0.5f;                \
   } while (0)

/* CALCCDOT(TEXEL, NV, NC, IV, B, V)
 *
 * Store in TEXEL the interpolation index of color V: the truncated value
 * of dot(V, IV) + B, clamped to [0, NV] when SAFECDOT is non-zero.
 * IV and B come from MAKEIVEC.  Uses (and clobbers) the caller's `i`.
 */
#define CALCCDOT(TEXEL, NV, NC, IV, B, V)\
   do {                                  \
      float dot = 0.0F;                  \
      for (i = 0; i < NC; i++) {         \
         dot += V[i] * IV[i];            \
      }                                  \
      TEXEL = (int32_t)(dot + B);        \
      if (SAFECDOT) {                    \
         if (TEXEL < 0) {                \
            TEXEL = 0;                   \
         } else if (TEXEL > NV) {        \
            TEXEL = NV;                  \
         }                               \
      }                                  \
   } while (0)
145
146
147static int32_t
148fxt1_bestcol (float vec[][MAX_COMP], int32_t nv,
149              uint8_t input[MAX_COMP], int32_t nc)
150{
151   int32_t i, j, best = -1;
152   float err = 1e9; /* big enough */
153
154   for (j = 0; j < nv; j++) {
155      float e = 0.0F;
156      for (i = 0; i < nc; i++) {
157         e += (vec[j][i] - input[i]) * (vec[j][i] - input[i]);
158      }
159      if (e < err) {
160         err = e;
161         best = j;
162      }
163   }
164
165   return best;
166}
167
168
169static int32_t
170fxt1_worst (float vec[MAX_COMP],
171            uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
172{
173   int32_t i, k, worst = -1;
174   float err = -1.0F; /* small enough */
175
176   for (k = 0; k < n; k++) {
177      float e = 0.0F;
178      for (i = 0; i < nc; i++) {
179         e += (vec[i] - input[k][i]) * (vec[i] - input[k][i]);
180      }
181      if (e > err) {
182         err = e;
183         worst = k;
184      }
185   }
186
187   return worst;
188}
189
190
191static int32_t
192fxt1_variance (uint8_t input[N_TEXELS / 2][MAX_COMP], int32_t nc)
193{
194   const int n = N_TEXELS / 2;
195   int32_t i, k, best = 0;
196   int32_t sx, sx2;
197   double var, maxvar = -1; /* small enough */
198   double teenth = 1.0 / n;
199
200   for (i = 0; i < nc; i++) {
201      sx = sx2 = 0;
202      for (k = 0; k < n; k++) {
203         int32_t t = input[k][i];
204         sx += t;
205         sx2 += t * t;
206      }
207      var = sx2 * teenth - sx * sx * teenth * teenth;
208      if (maxvar < var) {
209         maxvar = var;
210         best = i;
211      }
212   }
213
214   return best;
215}
216
217
218static int32_t
219fxt1_choose (float vec[][MAX_COMP], int32_t nv,
220             uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
221{
222#if 0
223   /* Choose colors from a grid.
224    */
225   int32_t i, j;
226
227   for (j = 0; j < nv; j++) {
228      int32_t m = j * (n - 1) / (nv - 1);
229      for (i = 0; i < nc; i++) {
230         vec[j][i] = input[m][i];
231      }
232   }
233#else
234   /* Our solution here is to find the darkest and brightest colors in
235    * the 8x4 tile and use those as the two representative colors.
236    * There are probably better algorithms to use (histogram-based).
237    */
238   int32_t i, j, k;
239   int32_t minSum = 2000; /* big enough */
240   int32_t maxSum = -1; /* small enough */
241   int32_t minCol = 0; /* phoudoin: silent compiler! */
242   int32_t maxCol = 0; /* phoudoin: silent compiler! */
243
244   struct {
245      int32_t flag;
246      int32_t key;
247      int32_t freq;
248      int32_t idx;
249   } hist[N_TEXELS];
250   int32_t lenh = 0;
251
252   memset(hist, 0, sizeof(hist));
253
254   for (k = 0; k < n; k++) {
255      int32_t l;
256      int32_t key = 0;
257      int32_t sum = 0;
258      for (i = 0; i < nc; i++) {
259         key <<= 8;
260         key |= input[k][i];
261         sum += input[k][i];
262      }
263      for (l = 0; l < n; l++) {
264         if (!hist[l].flag) {
265            /* alloc new slot */
266            hist[l].flag = !0;
267            hist[l].key = key;
268            hist[l].freq = 1;
269            hist[l].idx = k;
270            lenh = l + 1;
271            break;
272         } else if (hist[l].key == key) {
273            hist[l].freq++;
274            break;
275         }
276      }
277      if (minSum > sum) {
278         minSum = sum;
279         minCol = k;
280      }
281      if (maxSum < sum) {
282         maxSum = sum;
283         maxCol = k;
284      }
285   }
286
287   if (lenh <= nv) {
288      for (j = 0; j < lenh; j++) {
289         for (i = 0; i < nc; i++) {
290            vec[j][i] = (float)input[hist[j].idx][i];
291         }
292      }
293      for (; j < nv; j++) {
294         for (i = 0; i < nc; i++) {
295            vec[j][i] = vec[0][i];
296         }
297      }
298      return 0;
299   }
300
301   for (j = 0; j < nv; j++) {
302      for (i = 0; i < nc; i++) {
303         vec[j][i] = ((nv - 1 - j) * input[minCol][i] + j * input[maxCol][i] + (nv - 1) / 2) / (float)(nv - 1);
304      }
305   }
306#endif
307
308   return !0;
309}
310
311
312static int32_t
313fxt1_lloyd (float vec[][MAX_COMP], int32_t nv,
314            uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
315{
316   /* Use the generalized lloyd's algorithm for VQ:
317    *     find 4 color vectors.
318    *
319    *     for each sample color
320    *         sort to nearest vector.
321    *
322    *     replace each vector with the centroid of its matching colors.
323    *
324    *     repeat until RMS doesn't improve.
325    *
326    *     if a color vector has no samples, or becomes the same as another
327    *     vector, replace it with the color which is farthest from a sample.
328    *
329    * vec[][MAX_COMP]           initial vectors and resulting colors
330    * nv                        number of resulting colors required
331    * input[N_TEXELS][MAX_COMP] input texels
332    * nc                        number of components in input / vec
333    * n                         number of input samples
334    */
335
336   int32_t sum[MAX_VECT][MAX_COMP]; /* used to accumulate closest texels */
337   int32_t cnt[MAX_VECT]; /* how many times a certain vector was chosen */
338   float error, lasterror = 1e9;
339
340   int32_t i, j, k, rep;
341
342   /* the quantizer */
343   for (rep = 0; rep < LL_N_REP; rep++) {
344      /* reset sums & counters */
345      for (j = 0; j < nv; j++) {
346         for (i = 0; i < nc; i++) {
347            sum[j][i] = 0;
348         }
349         cnt[j] = 0;
350      }
351      error = 0;
352
353      /* scan whole block */
354      for (k = 0; k < n; k++) {
355#if 1
356         int32_t best = -1;
357         float err = 1e9; /* big enough */
358         /* determine best vector */
359         for (j = 0; j < nv; j++) {
360            float e = (vec[j][0] - input[k][0]) * (vec[j][0] - input[k][0]) +
361                      (vec[j][1] - input[k][1]) * (vec[j][1] - input[k][1]) +
362                      (vec[j][2] - input[k][2]) * (vec[j][2] - input[k][2]);
363            if (nc == 4) {
364               e += (vec[j][3] - input[k][3]) * (vec[j][3] - input[k][3]);
365            }
366            if (e < err) {
367               err = e;
368               best = j;
369            }
370         }
371#else
372         int32_t best = fxt1_bestcol(vec, nv, input[k], nc, &err);
373#endif
374         assert(best >= 0);
375         /* add in closest color */
376         for (i = 0; i < nc; i++) {
377            sum[best][i] += input[k][i];
378         }
379         /* mark this vector as used */
380         cnt[best]++;
381         /* accumulate error */
382         error += err;
383      }
384
385      /* check RMS */
386      if ((error < LL_RMS_E) ||
387          ((error < lasterror) && ((lasterror - error) < LL_RMS_D))) {
388         return !0; /* good match */
389      }
390      lasterror = error;
391
392      /* move each vector to the barycenter of its closest colors */
393      for (j = 0; j < nv; j++) {
394         if (cnt[j]) {
395            float div = 1.0F / cnt[j];
396            for (i = 0; i < nc; i++) {
397               vec[j][i] = div * sum[j][i];
398            }
399         } else {
400            /* this vec has no samples or is identical with a previous vec */
401            int32_t worst = fxt1_worst(vec[j], input, nc, n);
402            for (i = 0; i < nc; i++) {
403               vec[j][i] = input[worst][i];
404            }
405         }
406      }
407   }
408
409   return 0; /* could not converge fast enough */
410}
411
412
413static void
414fxt1_quantize_CHROMA (uint32_t *cc,
415                      uint8_t input[N_TEXELS][MAX_COMP])
416{
417   const int32_t n_vect = 4; /* 4 base vectors to find */
418   const int32_t n_comp = 3; /* 3 components: R, G, B */
419   float vec[MAX_VECT][MAX_COMP];
420   int32_t i, j, k;
421   Fx64 hi; /* high quadword */
422   uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
423
424   if (fxt1_choose(vec, n_vect, input, n_comp, N_TEXELS) != 0) {
425      fxt1_lloyd(vec, n_vect, input, n_comp, N_TEXELS);
426   }
427
428   FX64_MOV32(hi, 4); /* cc-chroma = "010" + unused bit */
429   for (j = n_vect - 1; j >= 0; j--) {
430      for (i = 0; i < n_comp; i++) {
431         /* add in colors */
432         FX64_SHL(hi, 5);
433         FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
434      }
435   }
436   ((Fx64 *)cc)[1] = hi;
437
438   lohi = lolo = 0;
439   /* right microtile */
440   for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
441      lohi <<= 2;
442      lohi |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
443   }
444   /* left microtile */
445   for (; k >= 0; k--) {
446      lolo <<= 2;
447      lolo |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
448   }
449   cc[1] = lohi;
450   cc[0] = lolo;
451}
452
453
454static void
455fxt1_quantize_ALPHA0 (uint32_t *cc,
456                      uint8_t input[N_TEXELS][MAX_COMP],
457                      uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
458{
459   const int32_t n_vect = 3; /* 3 base vectors to find */
460   const int32_t n_comp = 4; /* 4 components: R, G, B, A */
461   float vec[MAX_VECT][MAX_COMP];
462   int32_t i, j, k;
463   Fx64 hi; /* high quadword */
464   uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
465
466   /* the last vector indicates zero */
467   for (i = 0; i < n_comp; i++) {
468      vec[n_vect][i] = 0;
469   }
470
471   /* the first n texels in reord are guaranteed to be non-zero */
472   if (fxt1_choose(vec, n_vect, reord, n_comp, n) != 0) {
473      fxt1_lloyd(vec, n_vect, reord, n_comp, n);
474   }
475
476   FX64_MOV32(hi, 6); /* alpha = "011" + lerp = 0 */
477   for (j = n_vect - 1; j >= 0; j--) {
478      /* add in alphas */
479      FX64_SHL(hi, 5);
480      FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
481   }
482   for (j = n_vect - 1; j >= 0; j--) {
483      for (i = 0; i < n_comp - 1; i++) {
484         /* add in colors */
485         FX64_SHL(hi, 5);
486         FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
487      }
488   }
489   ((Fx64 *)cc)[1] = hi;
490
491   lohi = lolo = 0;
492   /* right microtile */
493   for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
494      lohi <<= 2;
495      lohi |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
496   }
497   /* left microtile */
498   for (; k >= 0; k--) {
499      lolo <<= 2;
500      lolo |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
501   }
502   cc[1] = lohi;
503   cc[0] = lolo;
504}
505
506
507static void
508fxt1_quantize_ALPHA1 (uint32_t *cc,
509                      uint8_t input[N_TEXELS][MAX_COMP])
510{
511   const int32_t n_vect = 3; /* highest vector number in each microtile */
512   const int32_t n_comp = 4; /* 4 components: R, G, B, A */
513   float vec[1 + 1 + 1][MAX_COMP]; /* 1.5 extrema for each sub-block */
514   float b, iv[MAX_COMP]; /* interpolation vector */
515   int32_t i, j, k;
516   Fx64 hi; /* high quadword */
517   uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
518
519   int32_t minSum;
520   int32_t maxSum;
521   int32_t minColL = 0, maxColL = 0;
522   int32_t minColR = 0, maxColR = 0;
523   int32_t sumL = 0, sumR = 0;
524   int32_t nn_comp;
525   /* Our solution here is to find the darkest and brightest colors in
526    * the 4x4 tile and use those as the two representative colors.
527    * There are probably better algorithms to use (histogram-based).
528    */
529   nn_comp = n_comp;
530   while ((minColL == maxColL) && nn_comp) {
531       minSum = 2000; /* big enough */
532       maxSum = -1; /* small enough */
533       for (k = 0; k < N_TEXELS / 2; k++) {
534           int32_t sum = 0;
535           for (i = 0; i < nn_comp; i++) {
536               sum += input[k][i];
537           }
538           if (minSum > sum) {
539               minSum = sum;
540               minColL = k;
541           }
542           if (maxSum < sum) {
543               maxSum = sum;
544               maxColL = k;
545           }
546           sumL += sum;
547       }
548
549       nn_comp--;
550   }
551
552   nn_comp = n_comp;
553   while ((minColR == maxColR) && nn_comp) {
554       minSum = 2000; /* big enough */
555       maxSum = -1; /* small enough */
556       for (k = N_TEXELS / 2; k < N_TEXELS; k++) {
557           int32_t sum = 0;
558           for (i = 0; i < nn_comp; i++) {
559               sum += input[k][i];
560           }
561           if (minSum > sum) {
562               minSum = sum;
563               minColR = k;
564           }
565           if (maxSum < sum) {
566               maxSum = sum;
567               maxColR = k;
568           }
569           sumR += sum;
570       }
571
572       nn_comp--;
573   }
574
575   /* choose the common vector (yuck!) */
576   {
577      int32_t j1, j2;
578      int32_t v1 = 0, v2 = 0;
579      float err = 1e9; /* big enough */
580      float tv[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
581      for (i = 0; i < n_comp; i++) {
582         tv[0][i] = input[minColL][i];
583         tv[1][i] = input[maxColL][i];
584         tv[2][i] = input[minColR][i];
585         tv[3][i] = input[maxColR][i];
586      }
587      for (j1 = 0; j1 < 2; j1++) {
588         for (j2 = 2; j2 < 4; j2++) {
589            float e = 0.0F;
590            for (i = 0; i < n_comp; i++) {
591               e += (tv[j1][i] - tv[j2][i]) * (tv[j1][i] - tv[j2][i]);
592            }
593            if (e < err) {
594               err = e;
595               v1 = j1;
596               v2 = j2;
597            }
598         }
599      }
600      for (i = 0; i < n_comp; i++) {
601         vec[0][i] = tv[1 - v1][i];
602         vec[1][i] = (tv[v1][i] * sumL + tv[v2][i] * sumR) / (sumL + sumR);
603         vec[2][i] = tv[5 - v2][i];
604      }
605   }
606
607   /* left microtile */
608   cc[0] = 0;
609   if (minColL != maxColL) {
610      /* compute interpolation vector */
611      MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
612
613      /* add in texels */
614      lolo = 0;
615      for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
616         int32_t texel;
617         /* interpolate color */
618         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
619         /* add in texel */
620         lolo <<= 2;
621         lolo |= texel;
622      }
623
624      cc[0] = lolo;
625   }
626
627   /* right microtile */
628   cc[1] = 0;
629   if (minColR != maxColR) {
630      /* compute interpolation vector */
631      MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[1]);
632
633      /* add in texels */
634      lohi = 0;
635      for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
636         int32_t texel;
637         /* interpolate color */
638         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
639         /* add in texel */
640         lohi <<= 2;
641         lohi |= texel;
642      }
643
644      cc[1] = lohi;
645   }
646
647   FX64_MOV32(hi, 7); /* alpha = "011" + lerp = 1 */
648   for (j = n_vect - 1; j >= 0; j--) {
649      /* add in alphas */
650      FX64_SHL(hi, 5);
651      FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
652   }
653   for (j = n_vect - 1; j >= 0; j--) {
654      for (i = 0; i < n_comp - 1; i++) {
655         /* add in colors */
656         FX64_SHL(hi, 5);
657         FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
658      }
659   }
660   ((Fx64 *)cc)[1] = hi;
661}
662
663
664static void
665fxt1_quantize_HI (uint32_t *cc,
666                  uint8_t input[N_TEXELS][MAX_COMP],
667                  uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
668{
669   const int32_t n_vect = 6; /* highest vector number */
670   const int32_t n_comp = 3; /* 3 components: R, G, B */
671   float b = 0.0F;       /* phoudoin: silent compiler! */
672   float iv[MAX_COMP];   /* interpolation vector */
673   int32_t i, k;
674   uint32_t hihi; /* high quadword: hi dword */
675
676   int32_t minSum = 2000; /* big enough */
677   int32_t maxSum = -1; /* small enough */
678   int32_t minCol = 0; /* phoudoin: silent compiler! */
679   int32_t maxCol = 0; /* phoudoin: silent compiler! */
680
681   /* Our solution here is to find the darkest and brightest colors in
682    * the 8x4 tile and use those as the two representative colors.
683    * There are probably better algorithms to use (histogram-based).
684    */
685   for (k = 0; k < n; k++) {
686      int32_t sum = 0;
687      for (i = 0; i < n_comp; i++) {
688         sum += reord[k][i];
689      }
690      if (minSum > sum) {
691         minSum = sum;
692         minCol = k;
693      }
694      if (maxSum < sum) {
695         maxSum = sum;
696         maxCol = k;
697      }
698   }
699
700   hihi = 0; /* cc-hi = "00" */
701   for (i = 0; i < n_comp; i++) {
702      /* add in colors */
703      hihi <<= 5;
704      hihi |= reord[maxCol][i] >> 3;
705   }
706   for (i = 0; i < n_comp; i++) {
707      /* add in colors */
708      hihi <<= 5;
709      hihi |= reord[minCol][i] >> 3;
710   }
711   cc[3] = hihi;
712   cc[0] = cc[1] = cc[2] = 0;
713
714   /* compute interpolation vector */
715   if (minCol != maxCol) {
716      MAKEIVEC(n_vect, n_comp, iv, b, reord[minCol], reord[maxCol]);
717   }
718
719   /* add in texels */
720   for (k = N_TEXELS - 1; k >= 0; k--) {
721      int32_t t = k * 3;
722      uint32_t *kk = (uint32_t *)((char *)cc + t / 8);
723      int32_t texel = n_vect + 1; /* transparent black */
724
725      if (!ISTBLACK(input[k])) {
726         if (minCol != maxCol) {
727            /* interpolate color */
728            CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
729            /* add in texel */
730            kk[0] |= texel << (t & 7);
731         }
732      } else {
733         /* add in texel */
734         kk[0] |= texel << (t & 7);
735      }
736   }
737}
738
739
740static void
741fxt1_quantize_MIXED1 (uint32_t *cc,
742                      uint8_t input[N_TEXELS][MAX_COMP])
743{
744   const int32_t n_vect = 2; /* highest vector number in each microtile */
745   const int32_t n_comp = 3; /* 3 components: R, G, B */
746   uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
747   float b, iv[MAX_COMP]; /* interpolation vector */
748   int32_t i, j, k;
749   Fx64 hi; /* high quadword */
750   uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
751
752   int32_t minSum;
753   int32_t maxSum;
754   int32_t minColL = 0, maxColL = -1;
755   int32_t minColR = 0, maxColR = -1;
756
757   /* Our solution here is to find the darkest and brightest colors in
758    * the 4x4 tile and use those as the two representative colors.
759    * There are probably better algorithms to use (histogram-based).
760    */
761   minSum = 2000; /* big enough */
762   maxSum = -1; /* small enough */
763   for (k = 0; k < N_TEXELS / 2; k++) {
764      if (!ISTBLACK(input[k])) {
765         int32_t sum = 0;
766         for (i = 0; i < n_comp; i++) {
767            sum += input[k][i];
768         }
769         if (minSum > sum) {
770            minSum = sum;
771            minColL = k;
772         }
773         if (maxSum < sum) {
774            maxSum = sum;
775            maxColL = k;
776         }
777      }
778   }
779   minSum = 2000; /* big enough */
780   maxSum = -1; /* small enough */
781   for (; k < N_TEXELS; k++) {
782      if (!ISTBLACK(input[k])) {
783         int32_t sum = 0;
784         for (i = 0; i < n_comp; i++) {
785            sum += input[k][i];
786         }
787         if (minSum > sum) {
788            minSum = sum;
789            minColR = k;
790         }
791         if (maxSum < sum) {
792            maxSum = sum;
793            maxColR = k;
794         }
795      }
796   }
797
798   /* left microtile */
799   if (maxColL == -1) {
800      /* all transparent black */
801      cc[0] = ~0u;
802      for (i = 0; i < n_comp; i++) {
803         vec[0][i] = 0;
804         vec[1][i] = 0;
805      }
806   } else {
807      cc[0] = 0;
808      for (i = 0; i < n_comp; i++) {
809         vec[0][i] = input[minColL][i];
810         vec[1][i] = input[maxColL][i];
811      }
812      if (minColL != maxColL) {
813         /* compute interpolation vector */
814         MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
815
816         /* add in texels */
817         lolo = 0;
818         for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
819            int32_t texel = n_vect + 1; /* transparent black */
820            if (!ISTBLACK(input[k])) {
821               /* interpolate color */
822               CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
823            }
824            /* add in texel */
825            lolo <<= 2;
826            lolo |= texel;
827         }
828         cc[0] = lolo;
829      }
830   }
831
832   /* right microtile */
833   if (maxColR == -1) {
834      /* all transparent black */
835      cc[1] = ~0u;
836      for (i = 0; i < n_comp; i++) {
837         vec[2][i] = 0;
838         vec[3][i] = 0;
839      }
840   } else {
841      cc[1] = 0;
842      for (i = 0; i < n_comp; i++) {
843         vec[2][i] = input[minColR][i];
844         vec[3][i] = input[maxColR][i];
845      }
846      if (minColR != maxColR) {
847         /* compute interpolation vector */
848         MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
849
850         /* add in texels */
851         lohi = 0;
852         for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
853            int32_t texel = n_vect + 1; /* transparent black */
854            if (!ISTBLACK(input[k])) {
855               /* interpolate color */
856               CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
857            }
858            /* add in texel */
859            lohi <<= 2;
860            lohi |= texel;
861         }
862         cc[1] = lohi;
863      }
864   }
865
866   FX64_MOV32(hi, 9 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
867   for (j = 2 * 2 - 1; j >= 0; j--) {
868      for (i = 0; i < n_comp; i++) {
869         /* add in colors */
870         FX64_SHL(hi, 5);
871         FX64_OR32(hi, vec[j][i] >> 3);
872      }
873   }
874   ((Fx64 *)cc)[1] = hi;
875}
876
877
878static void
879fxt1_quantize_MIXED0 (uint32_t *cc,
880                      uint8_t input[N_TEXELS][MAX_COMP])
881{
882   const int32_t n_vect = 3; /* highest vector number in each microtile */
883   const int32_t n_comp = 3; /* 3 components: R, G, B */
884   uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
885   float b, iv[MAX_COMP]; /* interpolation vector */
886   int32_t i, j, k;
887   Fx64 hi; /* high quadword */
888   uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
889
890   int32_t minColL = 0, maxColL = 0;
891   int32_t minColR = 0, maxColR = 0;
892#if 0
893   int32_t minSum;
894   int32_t maxSum;
895
896   /* Our solution here is to find the darkest and brightest colors in
897    * the 4x4 tile and use those as the two representative colors.
898    * There are probably better algorithms to use (histogram-based).
899    */
900   minSum = 2000; /* big enough */
901   maxSum = -1; /* small enough */
902   for (k = 0; k < N_TEXELS / 2; k++) {
903      int32_t sum = 0;
904      for (i = 0; i < n_comp; i++) {
905         sum += input[k][i];
906      }
907      if (minSum > sum) {
908         minSum = sum;
909         minColL = k;
910      }
911      if (maxSum < sum) {
912         maxSum = sum;
913         maxColL = k;
914      }
915   }
916   minSum = 2000; /* big enough */
917   maxSum = -1; /* small enough */
918   for (; k < N_TEXELS; k++) {
919      int32_t sum = 0;
920      for (i = 0; i < n_comp; i++) {
921         sum += input[k][i];
922      }
923      if (minSum > sum) {
924         minSum = sum;
925         minColR = k;
926      }
927      if (maxSum < sum) {
928         maxSum = sum;
929         maxColR = k;
930      }
931   }
932#else
933   int32_t minVal;
934   int32_t maxVal;
935   int32_t maxVarL = fxt1_variance(input, n_comp);
936   int32_t maxVarR = fxt1_variance(&input[N_TEXELS / 2], n_comp);
937
938   /* Scan the channel with max variance for lo & hi
939    * and use those as the two representative colors.
940    */
941   minVal = 2000; /* big enough */
942   maxVal = -1; /* small enough */
943   for (k = 0; k < N_TEXELS / 2; k++) {
944      int32_t t = input[k][maxVarL];
945      if (minVal > t) {
946         minVal = t;
947         minColL = k;
948      }
949      if (maxVal < t) {
950         maxVal = t;
951         maxColL = k;
952      }
953   }
954   minVal = 2000; /* big enough */
955   maxVal = -1; /* small enough */
956   for (; k < N_TEXELS; k++) {
957      int32_t t = input[k][maxVarR];
958      if (minVal > t) {
959         minVal = t;
960         minColR = k;
961      }
962      if (maxVal < t) {
963         maxVal = t;
964         maxColR = k;
965      }
966   }
967#endif
968
969   /* left microtile */
970   cc[0] = 0;
971   for (i = 0; i < n_comp; i++) {
972      vec[0][i] = input[minColL][i];
973      vec[1][i] = input[maxColL][i];
974   }
975   if (minColL != maxColL) {
976      /* compute interpolation vector */
977      MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
978
979      /* add in texels */
980      lolo = 0;
981      for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
982         int32_t texel;
983         /* interpolate color */
984         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
985         /* add in texel */
986         lolo <<= 2;
987         lolo |= texel;
988      }
989
990      /* funky encoding for LSB of green */
991      if ((int32_t)((lolo >> 1) & 1) != (((vec[1][GCOMP] ^ vec[0][GCOMP]) >> 2) & 1)) {
992         for (i = 0; i < n_comp; i++) {
993            vec[1][i] = input[minColL][i];
994            vec[0][i] = input[maxColL][i];
995         }
996         lolo = ~lolo;
997      }
998
999      cc[0] = lolo;
1000   }
1001
1002   /* right microtile */
1003   cc[1] = 0;
1004   for (i = 0; i < n_comp; i++) {
1005      vec[2][i] = input[minColR][i];
1006      vec[3][i] = input[maxColR][i];
1007   }
1008   if (minColR != maxColR) {
1009      /* compute interpolation vector */
1010      MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
1011
1012      /* add in texels */
1013      lohi = 0;
1014      for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
1015         int32_t texel;
1016         /* interpolate color */
1017         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
1018         /* add in texel */
1019         lohi <<= 2;
1020         lohi |= texel;
1021      }
1022
1023      /* funky encoding for LSB of green */
1024      if ((int32_t)((lohi >> 1) & 1) != (((vec[3][GCOMP] ^ vec[2][GCOMP]) >> 2) & 1)) {
1025         for (i = 0; i < n_comp; i++) {
1026            vec[3][i] = input[minColR][i];
1027            vec[2][i] = input[maxColR][i];
1028         }
1029         lohi = ~lohi;
1030      }
1031
1032      cc[1] = lohi;
1033   }
1034
1035   FX64_MOV32(hi, 8 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
1036   for (j = 2 * 2 - 1; j >= 0; j--) {
1037      for (i = 0; i < n_comp; i++) {
1038         /* add in colors */
1039         FX64_SHL(hi, 5);
1040         FX64_OR32(hi, vec[j][i] >> 3);
1041      }
1042   }
1043   ((Fx64 *)cc)[1] = hi;
1044}
1045
1046
1047static void
1048fxt1_quantize (uint32_t *cc, const uint8_t *lines[], int32_t comps)
1049{
1050   int32_t trualpha;
1051   uint8_t reord[N_TEXELS][MAX_COMP];
1052
1053   uint8_t input[N_TEXELS][MAX_COMP];
1054   int32_t i, k, l;
1055
1056   if (comps == 3) {
1057      /* make the whole block opaque */
1058      memset(input, -1, sizeof(input));
1059   }
1060
1061   /* 8 texels each line */
1062   for (l = 0; l < 4; l++) {
1063      for (k = 0; k < 4; k++) {
1064         for (i = 0; i < comps; i++) {
1065            input[k + l * 4][i] = *lines[l]++;
1066         }
1067      }
1068      for (; k < 8; k++) {
1069         for (i = 0; i < comps; i++) {
1070            input[k + l * 4 + 12][i] = *lines[l]++;
1071         }
1072      }
1073   }
1074
1075   /* block layout:
1076    * 00, 01, 02, 03, 08, 09, 0a, 0b
1077    * 10, 11, 12, 13, 18, 19, 1a, 1b
1078    * 04, 05, 06, 07, 0c, 0d, 0e, 0f
1079    * 14, 15, 16, 17, 1c, 1d, 1e, 1f
1080    */
1081
1082   /* [dBorca]
1083    * stupidity flows forth from this
1084    */
1085   l = N_TEXELS;
1086   trualpha = 0;
1087   if (comps == 4) {
1088      /* skip all transparent black texels */
1089      l = 0;
1090      for (k = 0; k < N_TEXELS; k++) {
1091         /* test all components against 0 */
1092         if (!ISTBLACK(input[k])) {
1093            /* texel is not transparent black */
1094            memcpy(reord[l], input[k], 4);
1095            if (reord[l][ACOMP] < (255 - ALPHA_TS)) {
1096               /* non-opaque texel */
1097               trualpha = !0;
1098            }
1099            l++;
1100         }
1101      }
1102   }
1103
1104#if 0
1105   if (trualpha) {
1106      fxt1_quantize_ALPHA0(cc, input, reord, l);
1107   } else if (l == 0) {
1108      cc[0] = cc[1] = cc[2] = -1;
1109      cc[3] = 0;
1110   } else if (l < N_TEXELS) {
1111      fxt1_quantize_HI(cc, input, reord, l);
1112   } else {
1113      fxt1_quantize_CHROMA(cc, input);
1114   }
1115   (void)fxt1_quantize_ALPHA1;
1116   (void)fxt1_quantize_MIXED1;
1117   (void)fxt1_quantize_MIXED0;
1118#else
1119   if (trualpha) {
1120      fxt1_quantize_ALPHA1(cc, input);
1121   } else if (l == 0) {
1122      cc[0] = cc[1] = cc[2] = ~0u;
1123      cc[3] = 0;
1124   } else if (l < N_TEXELS) {
1125      fxt1_quantize_MIXED1(cc, input);
1126   } else {
1127      fxt1_quantize_MIXED0(cc, input);
1128   }
1129   (void)fxt1_quantize_ALPHA0;
1130   (void)fxt1_quantize_HI;
1131   (void)fxt1_quantize_CHROMA;
1132#endif
1133}
1134
1135
1136
/**
 * Enlarge an image by replication (tiling), not by stretching.
 *
 * Output texel (x, y) is a copy of input texel (x % inWidth, y % inHeight).
 * Used when the image is narrower or shorter than the block size and must
 * be padded up to a whole number of blocks.
 *
 * \param inWidth, inHeight    size of the source image in texels
 * \param outWidth, outHeight  size of the destination image in texels
 * \param comps                bytes per texel
 * \param src                  source texels
 * \param srcRowStride         distance in bytes between source rows
 * \param dest                 tightly packed destination
 *                             (outWidth * comps bytes per row)
 *
 * Offsets are computed in wide types so that large images (or a negative
 * source stride) do not overflow 32-bit arithmetic.  An empty source image
 * leaves \p dest untouched.
 */
static void
upscale_teximage2d(int32_t inWidth, int32_t inHeight,
                   int32_t outWidth, int32_t outHeight,
                   int32_t comps, const uint8_t *src, int32_t srcRowStride,
                   uint8_t *dest )
{
   assert(outWidth >= inWidth);
   assert(outHeight >= inHeight);

   /* Nothing to replicate; this also keeps the modulo below well defined. */
   if (inWidth <= 0 || inHeight <= 0 || comps <= 0) {
      return;
   }

   for (int32_t y = 0; y < outHeight; y++) {
      const uint8_t *srcRow = src + (int64_t)(y % inHeight) * srcRowStride;
      uint8_t *dstRow = dest + (size_t)y * (size_t)outWidth * (size_t)comps;

      for (int32_t x = 0; x < outWidth; x++) {
         const uint8_t *srcTexel = srcRow + (size_t)(x % inWidth) * (size_t)comps;
         uint8_t *dstTexel = dstRow + (size_t)x * (size_t)comps;

         for (int32_t c = 0; c < comps; c++) {
            dstTexel[c] = srcTexel[c];
         }
      }
   }
}
1169
1170
/**
 * Compress an image into FXT1 blocks (8x4 texels, 128 bits each).
 *
 * \param width, height   image size in texels
 * \param comps           bytes per source texel, 3 or 4
 * \param source          source texels
 * \param srcRowStride    distance in bytes between source rows
 * \param dest            receives the encoded blocks
 * \param destRowStride   distance in bytes between rows of blocks in \p dest
 *
 * If the width is not a multiple of 8 or the height is not a multiple of 4,
 * the image is first replicated into a temporary buffer of padded size.
 * If that allocation fails the function returns without writing \p dest.
 */
static void
fxt1_encode (uint32_t width, uint32_t height, int32_t comps,
             const void *source, int32_t srcRowStride,
             void *dest, int32_t destRowStride)
{
   const uint8_t *data;
   uint32_t *encoded = (uint32_t *)dest;
   void *newSource = NULL;
   int32_t destRowPad;

   assert(comps == 3 || comps == 4);

   /* Replicate image if width is not M8 or height is not M4 */
   if ((width & 7) | (height & 3)) {
      int32_t newWidth = (width + 7) & ~7;
      int32_t newHeight = (height + 3) & ~3;
      /* multiply in size_t so a large image cannot overflow int32_t */
      newSource = malloc((size_t)comps * (size_t)newWidth * (size_t)newHeight);
      if (!newSource)
         return;
      upscale_teximage2d(width, height, newWidth, newHeight,
                         comps, (const uint8_t *) source,
                         srcRowStride, (uint8_t *) newSource);
      source = newSource;
      width = newWidth;
      height = newHeight;
      srcRowStride = comps * newWidth;
   }

   data = (const uint8_t *) source;

   /* One row of blocks occupies width / 8 * 16 = width * 2 bytes; whatever
    * remains of the destination stride is skipped, counted in 32-bit words.
    * Signed arithmetic keeps a too-small stride from wrapping around.
    */
   destRowPad = (destRowStride - (int32_t)(width * 2)) / 4;

   for (uint32_t y = 0; y < height; y += 4) {
      /* 64-bit offset: survives negative strides and offsets >= 4 GiB */
      const uint8_t *blockRow = data + (int64_t)y * srcRowStride;

      for (uint32_t x = 0; x < width; x += 8) {
         const uint8_t *lines[4];
         lines[0] = blockRow;
         lines[1] = lines[0] + srcRowStride;
         lines[2] = lines[1] + srcRowStride;
         lines[3] = lines[2] + srcRowStride;
         blockRow += 8 * comps;
         fxt1_quantize(encoded, lines, comps);
         /* 128 bits per 8x4 block */
         encoded += 4;
      }
      encoded += destRowPad;
   }

   free(newSource);
}
1219
1220
1221/***************************************************************************\
1222 * FXT1 decoder
1223 *
1224 * The decoder is based on GL_3DFX_texture_compression_FXT1
1225 * specification and serves as a concept for the encoder.
1226\***************************************************************************/
1227
1228
/* Expands a 5-bit color value (index 0..31) to 8 bits (0..255). */
static const uint8_t _rgb_scale_5[32] = {
   /*  0.. 7 */   0,   8,  16,  25,  33,  41,  49,  58,
   /*  8..15 */  66,  74,  82,  90,  99, 107, 115, 123,
   /* 16..23 */ 132, 140, 148, 156, 165, 173, 181, 189,
   /* 24..31 */ 197, 206, 214, 222, 230, 239, 247, 255
};
1236
/* Expands a 6-bit color value (index 0..63) to 8 bits (0..255). */
static const uint8_t _rgb_scale_6[64] = {
   /*  0.. 7 */   0,   4,   8,  12,  16,  20,  24,  28,
   /*  8..15 */  32,  36,  40,  45,  49,  53,  57,  61,
   /* 16..23 */  65,  69,  73,  77,  81,  85,  89,  93,
   /* 24..31 */  97, 101, 105, 109, 113, 117, 121, 125,
   /* 32..39 */ 130, 134, 138, 142, 146, 150, 154, 158,
   /* 40..47 */ 162, 166, 170, 174, 178, 182, 186, 190,
   /* 48..55 */ 194, 198, 202, 206, 210, 215, 219, 223,
   /* 56..63 */ 227, 231, 235, 239, 243, 247, 251, 255
};
1248
1249
/* CC_SEL: view the block as an array of 32-bit words and return the word
 * holding bit `which`, shifted so that bit lands at position 0.  The result
 * is not masked, and a field crossing a word boundary cannot be read this way.
 */
#define CC_SEL(cc, which) (((const uint32_t *)(cc))[(which) / 32] >> ((which) & 31))
/* UP5: low 5 bits of c, expanded to 8 bits */
#define UP5(c) (_rgb_scale_5[(c) & 31])
/* UP6: low 5 bits of c with b as an extra least significant bit, expanded to 8 bits */
#define UP6(c, b) (_rgb_scale_6[(((c) & 31) << 1) | ((b) & 1)])
/* LERP: rounded interpolation, t/n of the way from c0 to c1.  The whole
 * expansion is parenthesized so it can be used inside larger expressions.
 */
#define LERP(n, t, c0, c1) ((((n) - (t)) * (c0) + (t) * (c1) + (n) / 2) / (n))
1254
1255
1256static void
1257fxt1_decode_1HI (const uint8_t *code, int32_t t, uint8_t *rgba)
1258{
1259   const uint32_t *cc;
1260
1261   t *= 3;
1262   cc = (const uint32_t *)(code + t / 8);
1263   t = (cc[0] >> (t & 7)) & 7;
1264
1265   if (t == 7) {
1266      rgba[RCOMP] = rgba[GCOMP] = rgba[BCOMP] = rgba[ACOMP] = 0;
1267   } else {
1268      uint8_t r, g, b;
1269      cc = (const uint32_t *)(code + 12);
1270      if (t == 0) {
1271         b = UP5(CC_SEL(cc, 0));
1272         g = UP5(CC_SEL(cc, 5));
1273         r = UP5(CC_SEL(cc, 10));
1274      } else if (t == 6) {
1275         b = UP5(CC_SEL(cc, 15));
1276         g = UP5(CC_SEL(cc, 20));
1277         r = UP5(CC_SEL(cc, 25));
1278      } else {
1279         b = LERP(6, t, UP5(CC_SEL(cc, 0)), UP5(CC_SEL(cc, 15)));
1280         g = LERP(6, t, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
1281         r = LERP(6, t, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
1282      }
1283      rgba[RCOMP] = r;
1284      rgba[GCOMP] = g;
1285      rgba[BCOMP] = b;
1286      rgba[ACOMP] = 255;
1287   }
1288}
1289
1290
1291static void
1292fxt1_decode_1CHROMA (const uint8_t *code, int32_t t, uint8_t *rgba)
1293{
1294   const uint32_t *cc;
1295   uint32_t kk;
1296
1297   cc = (const uint32_t *)code;
1298   if (t & 16) {
1299      cc++;
1300      t &= 15;
1301   }
1302   t = (cc[0] >> (t * 2)) & 3;
1303
1304   t *= 15;
1305   cc = (const uint32_t *)(code + 8 + t / 8);
1306   kk = cc[0] >> (t & 7);
1307   rgba[BCOMP] = UP5(kk);
1308   rgba[GCOMP] = UP5(kk >> 5);
1309   rgba[RCOMP] = UP5(kk >> 10);
1310   rgba[ACOMP] = 255;
1311}
1312
1313
1314static void
1315fxt1_decode_1MIXED (const uint8_t *code, int32_t t, uint8_t *rgba)
1316{
1317   const uint32_t *cc;
1318   uint32_t col[2][3];
1319   int32_t glsb, selb;
1320
1321   cc = (const uint32_t *)code;
1322   if (t & 16) {
1323      t &= 15;
1324      t = (cc[1] >> (t * 2)) & 3;
1325      /* col 2 */
1326      col[0][BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
1327      col[0][GCOMP] = CC_SEL(cc, 99);
1328      col[0][RCOMP] = CC_SEL(cc, 104);
1329      /* col 3 */
1330      col[1][BCOMP] = CC_SEL(cc, 109);
1331      col[1][GCOMP] = CC_SEL(cc, 114);
1332      col[1][RCOMP] = CC_SEL(cc, 119);
1333      glsb = CC_SEL(cc, 126);
1334      selb = CC_SEL(cc, 33);
1335   } else {
1336      t = (cc[0] >> (t * 2)) & 3;
1337      /* col 0 */
1338      col[0][BCOMP] = CC_SEL(cc, 64);
1339      col[0][GCOMP] = CC_SEL(cc, 69);
1340      col[0][RCOMP] = CC_SEL(cc, 74);
1341      /* col 1 */
1342      col[1][BCOMP] = CC_SEL(cc, 79);
1343      col[1][GCOMP] = CC_SEL(cc, 84);
1344      col[1][RCOMP] = CC_SEL(cc, 89);
1345      glsb = CC_SEL(cc, 125);
1346      selb = CC_SEL(cc, 1);
1347   }
1348
1349   if (CC_SEL(cc, 124) & 1) {
1350      /* alpha[0] == 1 */
1351
1352      if (t == 3) {
1353         /* zero */
1354         rgba[RCOMP] = rgba[BCOMP] = rgba[GCOMP] = rgba[ACOMP] = 0;
1355      } else {
1356         uint8_t r, g, b;
1357         if (t == 0) {
1358            b = UP5(col[0][BCOMP]);
1359            g = UP5(col[0][GCOMP]);
1360            r = UP5(col[0][RCOMP]);
1361         } else if (t == 2) {
1362            b = UP5(col[1][BCOMP]);
1363            g = UP6(col[1][GCOMP], glsb);
1364            r = UP5(col[1][RCOMP]);
1365         } else {
1366            b = (UP5(col[0][BCOMP]) + UP5(col[1][BCOMP])) / 2;
1367            g = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
1368            r = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
1369         }
1370         rgba[RCOMP] = r;
1371         rgba[GCOMP] = g;
1372         rgba[BCOMP] = b;
1373         rgba[ACOMP] = 255;
1374      }
1375   } else {
1376      /* alpha[0] == 0 */
1377      uint8_t r, g, b;
1378      if (t == 0) {
1379         b = UP5(col[0][BCOMP]);
1380         g = UP6(col[0][GCOMP], glsb ^ selb);
1381         r = UP5(col[0][RCOMP]);
1382      } else if (t == 3) {
1383         b = UP5(col[1][BCOMP]);
1384         g = UP6(col[1][GCOMP], glsb);
1385         r = UP5(col[1][RCOMP]);
1386      } else {
1387         b = LERP(3, t, UP5(col[0][BCOMP]), UP5(col[1][BCOMP]));
1388         g = LERP(3, t, UP6(col[0][GCOMP], glsb ^ selb),
1389                        UP6(col[1][GCOMP], glsb));
1390         r = LERP(3, t, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
1391      }
1392      rgba[RCOMP] = r;
1393      rgba[GCOMP] = g;
1394      rgba[BCOMP] = b;
1395      rgba[ACOMP] = 255;
1396   }
1397}
1398
1399
1400static void
1401fxt1_decode_1ALPHA (const uint8_t *code, int32_t t, uint8_t *rgba)
1402{
1403   const uint32_t *cc;
1404   uint8_t r, g, b, a;
1405
1406   cc = (const uint32_t *)code;
1407   if (CC_SEL(cc, 124) & 1) {
1408      /* lerp == 1 */
1409      uint32_t col0[4];
1410
1411      if (t & 16) {
1412         t &= 15;
1413         t = (cc[1] >> (t * 2)) & 3;
1414         /* col 2 */
1415         col0[BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
1416         col0[GCOMP] = CC_SEL(cc, 99);
1417         col0[RCOMP] = CC_SEL(cc, 104);
1418         col0[ACOMP] = CC_SEL(cc, 119);
1419      } else {
1420         t = (cc[0] >> (t * 2)) & 3;
1421         /* col 0 */
1422         col0[BCOMP] = CC_SEL(cc, 64);
1423         col0[GCOMP] = CC_SEL(cc, 69);
1424         col0[RCOMP] = CC_SEL(cc, 74);
1425         col0[ACOMP] = CC_SEL(cc, 109);
1426      }
1427
1428      if (t == 0) {
1429         b = UP5(col0[BCOMP]);
1430         g = UP5(col0[GCOMP]);
1431         r = UP5(col0[RCOMP]);
1432         a = UP5(col0[ACOMP]);
1433      } else if (t == 3) {
1434         b = UP5(CC_SEL(cc, 79));
1435         g = UP5(CC_SEL(cc, 84));
1436         r = UP5(CC_SEL(cc, 89));
1437         a = UP5(CC_SEL(cc, 114));
1438      } else {
1439         b = LERP(3, t, UP5(col0[BCOMP]), UP5(CC_SEL(cc, 79)));
1440         g = LERP(3, t, UP5(col0[GCOMP]), UP5(CC_SEL(cc, 84)));
1441         r = LERP(3, t, UP5(col0[RCOMP]), UP5(CC_SEL(cc, 89)));
1442         a = LERP(3, t, UP5(col0[ACOMP]), UP5(CC_SEL(cc, 114)));
1443      }
1444   } else {
1445      /* lerp == 0 */
1446
1447      if (t & 16) {
1448         cc++;
1449         t &= 15;
1450      }
1451      t = (cc[0] >> (t * 2)) & 3;
1452
1453      if (t == 3) {
1454         /* zero */
1455         r = g = b = a = 0;
1456      } else {
1457         uint32_t kk;
1458         cc = (const uint32_t *)code;
1459         a = UP5(cc[3] >> (t * 5 + 13));
1460         t *= 15;
1461         cc = (const uint32_t *)(code + 8 + t / 8);
1462         kk = cc[0] >> (t & 7);
1463         b = UP5(kk);
1464         g = UP5(kk >> 5);
1465         r = UP5(kk >> 10);
1466      }
1467   }
1468   rgba[RCOMP] = r;
1469   rgba[GCOMP] = g;
1470   rgba[BCOMP] = b;
1471   rgba[ACOMP] = a;
1472}
1473
1474
1475static void
1476fxt1_decode_1 (const void *texture, int32_t stride, /* in pixels */
1477               int32_t i, int32_t j, uint8_t *rgba)
1478{
1479   static void (*decode_1[]) (const uint8_t *, int32_t, uint8_t *) = {
1480      fxt1_decode_1HI,     /* cc-high   = "00?" */
1481      fxt1_decode_1HI,     /* cc-high   = "00?" */
1482      fxt1_decode_1CHROMA, /* cc-chroma = "010" */
1483      fxt1_decode_1ALPHA,  /* alpha     = "011" */
1484      fxt1_decode_1MIXED,  /* mixed     = "1??" */
1485      fxt1_decode_1MIXED,  /* mixed     = "1??" */
1486      fxt1_decode_1MIXED,  /* mixed     = "1??" */
1487      fxt1_decode_1MIXED   /* mixed     = "1??" */
1488   };
1489
1490   const uint8_t *code = (const uint8_t *)texture +
1491                         ((j / 4) * (stride / 8) + (i / 8)) * 16;
1492   int32_t mode = CC_SEL(code, 125);
1493   int32_t t = i & 7;
1494
1495   if (t & 4) {
1496      t += 12;
1497   }
1498   t += (j & 3) * 4;
1499
1500   decode_1[mode](code, t, rgba);
1501}
1502
1503/*
1504 * Pixel fetch within a block.
1505 */
1506
/**
 * Fetch texel (i, j) of a single FXT1 RGB block as 8-bit RGBA.
 *
 * The format carries no alpha, so the result is always opaque; this
 * matches the float fetch and the block unpack path for the RGB format.
 */
void
util_format_fxt1_rgb_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
   dst[3] = 0xff;
}
1512
/**
 * Fetch texel (i, j) of a single FXT1 RGBA block as 8-bit RGBA.
 *
 * The decoded alpha is returned unchanged; this matches the float fetch
 * and the block unpack path for the RGBA format.
 */
void
util_format_fxt1_rgba_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
}
1519
/**
 * Fetch texel (i, j) of a single FXT1 RGB block as four floats.
 * Alpha is always written as 1.0.
 */
void
util_format_fxt1_rgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   uint8_t texel[4];
   float *out = in_dst;

   fxt1_decode_1(src, 0, i, j, texel);

   for (unsigned c = 0; c < 3; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
   out[3] = 1.0f;
}
1531
/**
 * Fetch texel (i, j) of a single FXT1 RGBA block as four floats,
 * including the decoded alpha.
 */
void
util_format_fxt1_rgba_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   uint8_t texel[4];
   float *out = in_dst;

   fxt1_decode_1(src, 0, i, j, texel);

   for (unsigned c = 0; c < 4; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
}
1543
1544/*
1545 * Block decompression.
1546 */
1547
1548static inline void
1549util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
1550                                        const uint8_t *restrict src_row, unsigned src_stride,
1551                                        unsigned width, unsigned height,
1552                                        boolean rgba)
1553{
1554   const unsigned bw = 8, bh = 4, comps = 4;
1555   unsigned x, y, i, j;
1556   for (y = 0; y < height; y += bh) {
1557      const uint8_t *src = src_row;
1558      for (x = 0; x < width; x += bw) {
1559         for (j = 0; j < bh; ++j) {
1560            for (i = 0; i < bw; ++i) {
1561               uint8_t *dst = dst_row + (y + j) * dst_stride / sizeof(*dst_row) + (x + i) * comps;
1562               fxt1_decode_1(src, 0, i, j, dst);
1563               if (!rgba)
1564                  dst[3] = 0xff;
1565            }
1566         }
1567         src += FXT1_BLOCK_SIZE;
1568      }
1569      src_row += src_stride;
1570   }
1571}
1572
1573void
1574util_format_fxt1_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
1575                                        const uint8_t *restrict src_row, unsigned src_stride,
1576                                        unsigned width, unsigned height)
1577{
1578   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride,
1579                                           src_row, src_stride,
1580                                           width, height,
1581                                           false);
1582}
1583
1584void
1585util_format_fxt1_rgba_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
1586                                         const uint8_t *restrict src_row, unsigned src_stride,
1587                                         unsigned width, unsigned height)
1588{
1589   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride,
1590                                           src_row, src_stride,
1591                                           width, height,
1592                                           true);
1593}
1594
1595static inline void
1596util_format_fxtn_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride,
1597                                       const uint8_t *restrict src_row, unsigned src_stride,
1598                                       unsigned width, unsigned height,
1599                                       boolean rgba)
1600{
1601   const unsigned bw = 8, bh = 4, comps = 4;
1602   unsigned x, y, i, j;
1603   for (y = 0; y < height; y += 4) {
1604      const uint8_t *src = src_row;
1605      for (x = 0; x < width; x += 8) {
1606         for (j = 0; j < bh; ++j) {
1607            for (i = 0; i < bw; ++i) {
1608               float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i) * comps;
1609               uint8_t tmp[4];
1610               fxt1_decode_1(src, 0, i, j, tmp);
1611               dst[0] = ubyte_to_float(tmp[0]);
1612               dst[1] = ubyte_to_float(tmp[1]);
1613               dst[2] = ubyte_to_float(tmp[2]);
1614               if (rgba)
1615                  dst[3] = ubyte_to_float(tmp[3]);
1616               else
1617                  dst[3] = 1.0;
1618            }
1619         }
1620         src += FXT1_BLOCK_SIZE;
1621      }
1622      src_row += src_stride;
1623   }
1624}
1625
1626void
1627util_format_fxt1_rgb_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
1628                                       const uint8_t *restrict src_row, unsigned src_stride,
1629                                       unsigned width, unsigned height)
1630{
1631   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride,
1632                                          src_row, src_stride,
1633                                          width, height,
1634                                          false);
1635}
1636
1637void
1638util_format_fxt1_rgba_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
1639                                        const uint8_t *restrict src_row, unsigned src_stride,
1640                                        unsigned width, unsigned height)
1641{
1642   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride,
1643                                          src_row, src_stride,
1644                                          width, height,
1645                                          true);
1646}
1647
1648/*
1649 * Block compression.
1650 */
1651
/**
 * Compress 8-bit RGBA pixels to FXT1 RGB.
 *
 * The encoder is called with 3 components per texel, so the source is
 * first repacked into a temporary tightly packed 24bpp RGB image (the
 * alpha byte of every pixel is dropped).  If the temporary cannot be
 * allocated the function returns without writing the destination.
 */
void
util_format_fxt1_rgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const uint8_t *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   int temp_stride = width * 3;
   uint8_t *temp = malloc(height * temp_stride);
   const uint8_t *in_row = src;
   uint8_t *out_row = temp;

   if (!temp)
      return;

   for (unsigned y = 0; y < height; y++) {
      for (unsigned x = 0; x < width; x++) {
         const uint8_t *in = in_row + x * 4;
         uint8_t *out = out_row + x * 3;

         out[0] = in[0];
         out[1] = in[1];
         out[2] = in[2];
      }
      in_row += src_stride;
      out_row += temp_stride;
   }

   fxt1_encode(width, height, 3, temp, temp_stride, dst_row, dst_stride);

   free(temp);
}
1677
/**
 * Compress 8-bit RGBA pixels to FXT1 RGBA.  The source is passed to the
 * encoder directly, with 4 components per texel.
 */
void
util_format_fxt1_rgba_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   const int32_t comps = 4;

   fxt1_encode(width, height, comps,
               src, src_stride,
               dst_row, dst_stride);
}
1685
/**
 * Compress float RGBA pixels to FXT1 RGB.
 *
 * The pixels are first converted to a temporary 8-bit RGBA image, which is
 * then handed to the 8-bit RGB packer.  If the temporary cannot be
 * allocated the function returns without writing the destination.
 */
void
util_format_fxt1_rgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                     const float *restrict src, unsigned src_stride,
                                     unsigned width, unsigned height)
{
   int rgba8_stride = width * 4;
   uint8_t *rgba8 = malloc(height * rgba8_stride);

   if (rgba8) {
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, rgba8_stride,
                                                 src, src_stride,
                                                 width, height);

      util_format_fxt1_rgb_pack_rgba_8unorm(dst_row, dst_stride,
                                            rgba8, rgba8_stride,
                                            width, height);

      free(rgba8);
   }
}
1706
/**
 * Compress float RGBA pixels to FXT1 RGBA.
 *
 * The pixels are first converted to a temporary 8-bit RGBA image, which is
 * then handed to the 8-bit RGBA packer.  If the temporary cannot be
 * allocated the function returns without writing the destination.
 */
void
util_format_fxt1_rgba_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const float *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   int rgba8_stride = width * 4;
   uint8_t *rgba8 = malloc(height * rgba8_stride);

   if (rgba8) {
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, rgba8_stride,
                                                 src, src_stride,
                                                 width, height);

      util_format_fxt1_rgba_pack_rgba_8unorm(dst_row, dst_stride,
                                             rgba8, rgba8_stride,
                                             width, height);

      free(rgba8);
   }
}
1727