1/*
2 * Copyright 2015 Philip Taylor <philip@zaynar.co.uk>
3 * Copyright 2018 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25/**
26 * \file texcompress_astc.c
27 *
28 * Decompression code for GL_KHR_texture_compression_astc_ldr, which is just
29 * ASTC 2D LDR.
30 *
31 * The ASTC 2D LDR decoder (without the sRGB part) was copied from the OASTC
32 * library written by Philip Taylor. I added sRGB support and adjusted it for
33 * Mesa. - Marek
34 */
35
36#include "texcompress_astc.h"
37#include "macros.h"
38#include "util/half_float.h"
39#include <stdio.h>
40
/* Debug switches. Both default to off; the code in this file only reads them
 * to decide whether to printf() a trace.
 */
static bool VERBOSE_DECODE = false; /* trace block decoding */
static bool VERBOSE_WRITE = false;  /* trace OutputBitVector writes */
43
44static inline uint8_t
45uint16_div_64k_to_half_to_unorm8(uint16_t v)
46{
47   return _mesa_half_to_unorm8(_mesa_uint16_div_64k_to_half(v));
48}
49
/**
 * Scope holder for the decoder's status codes; use as decode_error::type
 * and decode_error::ok etc.
 */
class decode_error
{
public:
   enum type {
      /* Success. */
      ok,

      /* Void-extent block problems. */
      unsupported_hdr_void_extent,

      /* Block-mode problems. */
      reserved_block_mode_1,
      reserved_block_mode_2,
      dual_plane_and_too_many_partitions,

      invalid_range_in_void_extent,

      /* Weight grid / colour endpoint sizing problems. */
      weight_grid_exceeds_block_size,
      invalid_colour_endpoints_size,
      invalid_colour_endpoints_count,
      invalid_weight_bits,
      invalid_num_weights,
   };
};
67
68
/**
 * One quantisation range for colour endpoint values: the largest
 * representable value, and how many trits, quints and plain bits encode it.
 */
struct cem_range {
   uint8_t max;
   uint8_t t;
   uint8_t q;
   uint8_t b;
};

/* Based on the Color Unquantization Parameters table,
 * plus the bit-only representations, sorted by increasing size.
 *
 * Columns: max value, trits, quints, bits.
 */
static cem_range cem_ranges[] = {
   {   5, 1, 0, 1 },
   {   7, 0, 0, 3 },
   {   9, 0, 1, 1 },
   {  11, 1, 0, 2 },
   {  15, 0, 0, 4 },
   {  19, 0, 1, 2 },
   {  23, 1, 0, 3 },
   {  31, 0, 0, 5 },
   {  39, 0, 1, 3 },
   {  47, 1, 0, 4 },
   {  63, 0, 0, 6 },
   {  79, 0, 1, 4 },
   {  95, 1, 0, 5 },
   { 127, 0, 0, 7 },
   { 159, 0, 1, 5 },
   { 191, 1, 0, 6 },
   { 255, 0, 0, 8 },
};
96
/* Concatenate single-bit values into one integer; the first argument ends up
 * as the most significant bit. Each wider macro appends one more bit to the
 * next narrower one. Arguments are evaluated exactly once.
 */
#define CAT_BITS_2(a, b)          ( ((a) << 1) | (b) )
#define CAT_BITS_3(a, b, c)       ( (CAT_BITS_2(a, b) << 1) | (c) )
#define CAT_BITS_4(a, b, c, d)    ( (CAT_BITS_3(a, b, c) << 1) | (d) )
#define CAT_BITS_5(a, b, c, d, e) ( (CAT_BITS_4(a, b, c, d) << 1) | (e) )
101
102/**
103 * Unpack 5n+8 bits from 'in' into 5 output values.
104 * If n <= 4 then T should be uint32_t, else it must be uint64_t.
105 */
106template <typename T>
107static void unpack_trit_block(int n, T in, uint8_t *out)
108{
109   assert(n <= 6); /* else output will overflow uint8_t */
110
111   uint8_t T0 = (in >> (n)) & 0x1;
112   uint8_t T1 = (in >> (n+1)) & 0x1;
113   uint8_t T2 = (in >> (2*n+2)) & 0x1;
114   uint8_t T3 = (in >> (2*n+3)) & 0x1;
115   uint8_t T4 = (in >> (3*n+4)) & 0x1;
116   uint8_t T5 = (in >> (4*n+5)) & 0x1;
117   uint8_t T6 = (in >> (4*n+6)) & 0x1;
118   uint8_t T7 = (in >> (5*n+7)) & 0x1;
119   uint8_t mmask = (1 << n) - 1;
120   uint8_t m0 = (in >> (0)) & mmask;
121   uint8_t m1 = (in >> (n+2)) & mmask;
122   uint8_t m2 = (in >> (2*n+4)) & mmask;
123   uint8_t m3 = (in >> (3*n+5)) & mmask;
124   uint8_t m4 = (in >> (4*n+7)) & mmask;
125
126   uint8_t C;
127   uint8_t t4, t3, t2, t1, t0;
128   if (CAT_BITS_3(T4, T3, T2) == 0x7) {
129      C = CAT_BITS_5(T7, T6, T5, T1, T0);
130      t4 = t3 = 2;
131   } else {
132      C = CAT_BITS_5(T4, T3, T2, T1, T0);
133      if (CAT_BITS_2(T6, T5) == 0x3) {
134         t4 = 2;
135         t3 = T7;
136      } else {
137         t4 = T7;
138         t3 = CAT_BITS_2(T6, T5);
139      }
140   }
141
142   if ((C & 0x3) == 0x3) {
143      t2 = 2;
144      t1 = (C >> 4) & 0x1;
145      uint8_t C3 = (C >> 3) & 0x1;
146      uint8_t C2 = (C >> 2) & 0x1;
147      t0 = (C3 << 1) | (C2 & ~C3);
148   } else if (((C >> 2) & 0x3) == 0x3) {
149      t2 = 2;
150      t1 = 2;
151      t0 = C & 0x3;
152   } else {
153      t2 = (C >> 4) & 0x1;
154      t1 = (C >> 2) & 0x3;
155      uint8_t C1 = (C >> 1) & 0x1;
156      uint8_t C0 = (C >> 0) & 0x1;
157      t0 = (C1 << 1) | (C0 & ~C1);
158   }
159
160   out[0] = (t0 << n) | m0;
161   out[1] = (t1 << n) | m1;
162   out[2] = (t2 << n) | m2;
163   out[3] = (t3 << n) | m3;
164   out[4] = (t4 << n) | m4;
165}
166
167/**
168 * Unpack 3n+7 bits from 'in' into 3 output values
169 */
170static void unpack_quint_block(int n, uint32_t in, uint8_t *out)
171{
172   assert(n <= 5); /* else output will overflow uint8_t */
173
174   uint8_t Q0 = (in >> (n)) & 0x1;
175   uint8_t Q1 = (in >> (n+1)) & 0x1;
176   uint8_t Q2 = (in >> (n+2)) & 0x1;
177   uint8_t Q3 = (in >> (2*n+3)) & 0x1;
178   uint8_t Q4 = (in >> (2*n+4)) & 0x1;
179   uint8_t Q5 = (in >> (3*n+5)) & 0x1;
180   uint8_t Q6 = (in >> (3*n+6)) & 0x1;
181   uint8_t mmask = (1 << n) - 1;
182   uint8_t m0 = (in >> (0)) & mmask;
183   uint8_t m1 = (in >> (n+3)) & mmask;
184   uint8_t m2 = (in >> (2*n+5)) & mmask;
185
186   uint8_t C;
187   uint8_t q2, q1, q0;
188   if (CAT_BITS_4(Q6, Q5, Q2, Q1) == 0x3) {
189      q2 = CAT_BITS_3(Q0, Q4 & ~Q0, Q3 & ~Q0);
190      q1 = 4;
191      q0 = 4;
192   } else {
193      if (CAT_BITS_2(Q2, Q1) == 0x3) {
194         q2 = 4;
195         C = CAT_BITS_5(Q4, Q3, 0x1 & ~Q6, 0x1 & ~Q5, Q0);
196      } else {
197         q2 = CAT_BITS_2(Q6, Q5);
198         C = CAT_BITS_5(Q4, Q3, Q2, Q1, Q0);
199      }
200      if ((C & 0x7) == 0x5) {
201         q1 = 4;
202         q0 = (C >> 3) & 0x3;
203      } else {
204         q1 = (C >> 3) & 0x3;
205         q0 = C & 0x7;
206      }
207   }
208   out[0] = (q0 << n) | m0;
209   out[1] = (q1 << n) | m1;
210   out[2] = (q2 << n) | m2;
211}
212
213
214struct uint8x4_t
215{
216   uint8_t v[4];
217
218   uint8x4_t() { }
219
220   uint8x4_t(int a, int b, int c, int d)
221   {
222      assert(0 <= a && a <= 255);
223      assert(0 <= b && b <= 255);
224      assert(0 <= c && c <= 255);
225      assert(0 <= d && d <= 255);
226      v[0] = a;
227      v[1] = b;
228      v[2] = c;
229      v[3] = d;
230   }
231
232   static uint8x4_t clamped(int a, int b, int c, int d)
233   {
234      uint8x4_t r;
235      r.v[0] = MAX2(0, MIN2(255, a));
236      r.v[1] = MAX2(0, MIN2(255, b));
237      r.v[2] = MAX2(0, MIN2(255, c));
238      r.v[3] = MAX2(0, MIN2(255, d));
239      return r;
240   }
241};
242
243static uint8x4_t blue_contract(int r, int g, int b, int a)
244{
245   return uint8x4_t((r+b) >> 1, (g+b) >> 1, b, a);
246}
247
248static uint8x4_t blue_contract_clamped(int r, int g, int b, int a)
249{
250   return uint8x4_t::clamped((r+b) >> 1, (g+b) >> 1, b, a);
251}
252
/* Move bit 7 of 'a' into bit 7 of 'b' (after halving 'b'), then turn the
 * remaining bits 1..6 of 'a' into a signed 6-bit value in [-32, 31].
 */
static void bit_transfer_signed(int &a, int &b)
{
   b = (b >> 1) | (a & 0x80);

   a = (a >> 1) & 0x3f;
   if (a & 0x20)
      a -= 0x40; /* sign-extend from 6 bits */
}
262
/* 32-bit integer mixing function used to derive the partition seeds.
 * All arithmetic wraps modulo 2^32.
 */
static uint32_t hash52(uint32_t p)
{
   p = p ^ (p >> 15);
   p = p - (p << 17);
   p = p + (p << 7);
   p = p + (p << 4);
   p = p ^ (p >> 5);
   p = p + (p << 16);
   p = p ^ (p >> 7);
   p = p ^ (p >> 3);
   p = p ^ (p << 6);
   p = p ^ (p >> 17);
   return p;
}
277
278static int select_partition(int seed, int x, int y, int z, int partitioncount,
279                            int small_block)
280{
281   if (small_block) {
282      x <<= 1;
283      y <<= 1;
284      z <<= 1;
285   }
286   seed += (partitioncount - 1) * 1024;
287   uint32_t rnum = hash52(seed);
288   uint8_t seed1 = rnum & 0xF;
289   uint8_t seed2 = (rnum >> 4) & 0xF;
290   uint8_t seed3 = (rnum >> 8) & 0xF;
291   uint8_t seed4 = (rnum >> 12) & 0xF;
292   uint8_t seed5 = (rnum >> 16) & 0xF;
293   uint8_t seed6 = (rnum >> 20) & 0xF;
294   uint8_t seed7 = (rnum >> 24) & 0xF;
295   uint8_t seed8 = (rnum >> 28) & 0xF;
296   uint8_t seed9 = (rnum >> 18) & 0xF;
297   uint8_t seed10 = (rnum >> 22) & 0xF;
298   uint8_t seed11 = (rnum >> 26) & 0xF;
299   uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
300
301   seed1 *= seed1;
302   seed2 *= seed2;
303   seed3 *= seed3;
304   seed4 *= seed4;
305   seed5 *= seed5;
306   seed6 *= seed6;
307   seed7 *= seed7;
308   seed8 *= seed8;
309   seed9 *= seed9;
310   seed10 *= seed10;
311   seed11 *= seed11;
312   seed12 *= seed12;
313
314   int sh1, sh2, sh3;
315   if (seed & 1) {
316      sh1 = (seed & 2 ? 4 : 5);
317      sh2 = (partitioncount == 3 ? 6 : 5);
318   } else {
319      sh1 = (partitioncount == 3 ? 6 : 5);
320      sh2 = (seed & 2 ? 4 : 5);
321   }
322   sh3 = (seed & 0x10) ? sh1 : sh2;
323
324   seed1 >>= sh1;
325   seed2 >>= sh2;
326   seed3 >>= sh1;
327   seed4 >>= sh2;
328   seed5 >>= sh1;
329   seed6 >>= sh2;
330   seed7 >>= sh1;
331   seed8 >>= sh2;
332   seed9 >>= sh3;
333   seed10 >>= sh3;
334   seed11 >>= sh3;
335   seed12 >>= sh3;
336
337   int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
338   int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
339   int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
340   int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
341
342   a &= 0x3F;
343   b &= 0x3F;
344   c &= 0x3F;
345   d &= 0x3F;
346
347   if (partitioncount < 4)
348      d = 0;
349   if (partitioncount < 3)
350      c = 0;
351
352   if (a >= b && a >= c && a >= d)
353      return 0;
354   else if (b >= c && b >= d)
355      return 1;
356   else if (c >= d)
357      return 2;
358   else
359      return 3;
360}
361
362
/**
 * A 128-bit block viewed as a little-endian bit string: bit 0 is the least
 * significant bit of data[0], bit 127 the most significant bit of data[3].
 */
struct InputBitVector
{
   uint32_t data[4];

   /**
    * Debug helper: print the 128 bits (bit 127 first), showing '.' for every
    * bit outside [offset, offset + count), followed by the printf-style
    * message described by fmt.
    *
    * Bit positions outside [0, 127] are skipped instead of being written
    * outside the local buffer.
    */
   void printf_bits(int offset, int count, const char *fmt = "", ...)
   {
      char out[129];
      memset(out, '.', 128);
      out[128] = '\0';
      for (int i = 0; i < count; ++i) {
         const int idx = offset + i;
         if (idx < 0 || idx > 127)
            continue;
         out[127 - idx] = ((data[idx >> 5] >> (idx & 31)) & 1) ? '1' : '0';
      }
      printf("%s ", out);
      va_list ap;
      va_start(ap, fmt);
      vprintf(fmt, ap);
      va_end(ap);
      printf("\n");
   }

   /**
    * Return 'count' bits (0..31) starting at bit 'offset', in the low bits
    * of the result.
    */
   uint32_t get_bits(int offset, int count)
   {
      assert(count >= 0 && count < 32);

      /* Each pair of tests below contributes the word that holds the start
       * of the field and, when the field straddles a boundary, the low part
       * of the following word.
       */
      uint32_t out = 0;
      if (offset < 32)
         out |= data[0] >> offset;

      if (0 < offset && offset <= 32)
         out |= data[1] << (32 - offset);
      if (32 < offset && offset < 64)
         out |= data[1] >> (offset - 32);

      if (32 < offset && offset <= 64)
         out |= data[2] << (64 - offset);
      if (64 < offset && offset < 96)
         out |= data[2] >> (offset - 64);

      if (64 < offset && offset <= 96)
         out |= data[3] << (96 - offset);
      if (96 < offset && offset < 128)
         out |= data[3] >> (offset - 96);

      /* Unsigned mask: (1 << 31) - 1 would overflow a signed int. */
      out &= ((uint32_t)1 << count) - 1;
      return out;
   }

   /**
    * 64-bit variant of get_bits(): return 'count' bits (0..63) starting at
    * bit 'offset'.
    */
   uint64_t get_bits64(int offset, int count)
   {
      assert(count >= 0 && count < 64);

      uint64_t out = 0;
      if (offset < 32)
         out |= data[0] >> offset;

      if (offset <= 32)
         out |= (uint64_t)data[1] << (32 - offset);
      if (32 < offset && offset < 64)
         out |= data[1] >> (offset - 32);

      if (0 < offset && offset <= 64)
         out |= (uint64_t)data[2] << (64 - offset);
      if (64 < offset && offset < 96)
         out |= data[2] >> (offset - 64);

      if (32 < offset && offset <= 96)
         out |= (uint64_t)data[3] << (96 - offset);
      if (96 < offset && offset < 128)
         out |= data[3] >> (offset - 96);

      out &= ((uint64_t)1 << count) - 1;
      return out;
   }

   /**
    * Return the 'count' bits that end just below bit 'offset' (i.e. bits
    * offset-count .. offset-1), with their order reversed so that bit
    * offset-1 becomes bit 0 of the result.
    */
   uint32_t get_bits_rev(int offset, int count)
   {
      assert(offset >= count);
      uint32_t tmp = get_bits(offset - count, count);
      uint32_t out = 0;
      for (int i = 0; i < count; ++i)
         out |= ((tmp >> i) & 1) << (count - 1 - i);
      return out;
   }
};
449
450struct OutputBitVector
451{
452   uint32_t data[4];
453   int offset;
454
455   OutputBitVector()
456      : offset(0)
457   {
458      memset(data, 0, sizeof(data));
459   }
460
461   void append(uint32_t value, int size)
462   {
463      if (VERBOSE_WRITE)
464         printf("append offset=%d size=%d values=0x%x\n", offset, size, value);
465
466      assert(offset + size <= 128);
467
468      assert(size <= 32);
469      if (size < 32)
470         assert((value >> size) == 0);
471
472      while (size) {
473         int c = MIN2(size, 32 - (offset & 31));
474         data[offset >> 5] |= (value << (offset & 31));
475         offset += c;
476         size -= c;
477         value >>= c;
478      }
479   }
480
481   void append64(uint64_t value, int size)
482   {
483      if (VERBOSE_WRITE)
484         printf("append offset=%d size=%d values=0x%llx\n", offset, size, (unsigned long long)value);
485
486      assert(offset + size <= 128);
487
488      assert(size <= 64);
489      if (size < 64)
490         assert((value >> size) == 0);
491
492      while (size) {
493         int c = MIN2(size, 32 - (offset & 31));
494         data[offset >> 5] |= (value << (offset & 31));
495         offset += c;
496         size -= c;
497         value >>= c;
498      }
499   }
500
501   void append(OutputBitVector &v, int size)
502   {
503      if (VERBOSE_WRITE)
504         printf("append vector offset=%d size=%d\n", offset, size);
505
506      assert(offset + size <= 128);
507      int i = 0;
508      while (size >= 32) {
509         append(v.data[i++], 32);
510         size -= 32;
511      }
512      if (size > 0)
513         append(v.data[i] & ((1 << size) - 1), size);
514   }
515
516   void append_end(OutputBitVector &v, int size)
517   {
518      for (int i = 0; i < size; ++i)
519         data[(127 - i) >> 5] |= ((v.data[i >> 5] >> (i & 31)) & 1) << ((127 - i) & 31);
520   }
521
522   /* Insert the given number of '1' bits. (We could use 0s instead, but 1s are
523    * more likely to flush out bugs where we accidentally read undefined bits.)
524    */
525   void skip(int size)
526   {
527      if (VERBOSE_WRITE)
528         printf("skip offset=%d size=%d\n", offset, size);
529
530      assert(offset + size <= 128);
531      while (size >= 32) {
532         append(0xffffffff, 32);
533         size -= 32;
534      }
535      if (size > 0)
536         append(0xffffffff >> (32 - size), size);
537   }
538};
539
540
541class Decoder
542{
543public:
544   Decoder(int block_w, int block_h, int block_d, bool srgb, bool output_unorm8)
545      : block_w(block_w), block_h(block_h), block_d(block_d), srgb(srgb),
546        output_unorm8(output_unorm8) {}
547
548   decode_error::type decode(const uint8_t *in, uint16_t *output) const;
549
550   int block_w, block_h, block_d;
551   bool srgb, output_unorm8;
552};
553
554struct Block
555{
556   bool is_error;
557   bool bogus_colour_endpoints;
558   bool bogus_weights;
559
560   int high_prec;
561   int dual_plane;
562   int colour_component_selector;
563   int wt_range;
564   int wt_w, wt_h, wt_d;
565   int num_parts;
566   int partition_index;
567
568   bool is_void_extent;
569   int void_extent_d;
570   int void_extent_min_s;
571   int void_extent_max_s;
572   int void_extent_min_t;
573   int void_extent_max_t;
574   uint16_t void_extent_colour_r;
575   uint16_t void_extent_colour_g;
576   uint16_t void_extent_colour_b;
577   uint16_t void_extent_colour_a;
578
579   bool is_multi_cem;
580   int num_extra_cem_bits;
581   int colour_endpoint_data_offset;
582   int extra_cem_bits;
583   int cem_base_class;
584   int cems[4];
585
586   int num_cem_values;
587
588   /* Calculated by unpack_weights(): */
589   uint8_t weights_quant[64 + 4]; /* max 64 values, plus padding for overflows in trit parsing */
590
591   /* Calculated by unquantise_weights(): */
592   uint8_t weights[64 + 18]; /* max 64 values, plus padding for the infill interpolation */
593
594   /* Calculated by unpack_colour_endpoints(): */
595   uint8_t colour_endpoints_quant[18 + 4]; /* max 18 values, plus padding for overflows in trit parsing */
596
597   /* Calculated by unquantise_colour_endpoints(): */
598   uint8_t colour_endpoints[18];
599
600   /* Calculated by calculate_from_weights(): */
601   int wt_trits;
602   int wt_quints;
603   int wt_bits;
604   int wt_max;
605   int num_weights;
606   int weight_bits;
607
608   /* Calculated by calculate_remaining_bits(): */
609   int remaining_bits;
610
611   /* Calculated by calculate_colour_endpoints_size(): */
612   int colour_endpoint_bits;
613   int ce_max;
614   int ce_trits;
615   int ce_quints;
616   int ce_bits;
617
618   /* Calculated by compute_infill_weights(); */
619   uint8_t infill_weights[2][216]; /* large enough for 6x6x6 */
620
621   /* Calculated by decode_colour_endpoints(); */
622   uint8x4_t endpoints_decoded[2][4];
623
624   void calculate_from_weights();
625   void calculate_remaining_bits();
626   decode_error::type calculate_colour_endpoints_size();
627
628   void unquantise_weights();
629   void unquantise_colour_endpoints();
630
631   decode_error::type decode(const Decoder &decoder, InputBitVector in);
632
633   decode_error::type decode_block_mode(InputBitVector in);
634   decode_error::type decode_void_extent(InputBitVector in);
635   void decode_cem(InputBitVector in);
636   void unpack_colour_endpoints(InputBitVector in);
637   void decode_colour_endpoints();
638   void unpack_weights(InputBitVector in);
639   void compute_infill_weights(int block_w, int block_h, int block_d);
640
641   void write_decoded(const Decoder &decoder, uint16_t *output);
642};
643
644
645decode_error::type Decoder::decode(const uint8_t *in, uint16_t *output) const
646{
647   Block blk;
648   InputBitVector in_vec;
649   memcpy(&in_vec.data, in, 16);
650   decode_error::type err = blk.decode(*this, in_vec);
651   if (err == decode_error::ok) {
652      blk.write_decoded(*this, output);
653   } else {
654      /* Fill output with the error colour */
655      for (int i = 0; i < block_w * block_h * block_d; ++i) {
656         if (output_unorm8) {
657            output[i*4+0] = 0xff;
658            output[i*4+1] = 0;
659            output[i*4+2] = 0xff;
660            output[i*4+3] = 0xff;
661         } else {
662            assert(!srgb); /* srgb must use unorm8 */
663
664            output[i*4+0] = FP16_ONE;
665            output[i*4+1] = FP16_ZERO;
666            output[i*4+2] = FP16_ONE;
667            output[i*4+3] = FP16_ONE;
668         }
669      }
670   }
671   return err;
672}
673
674
675decode_error::type Block::decode_void_extent(InputBitVector block)
676{
677   /* TODO: 3D */
678
679   is_void_extent = true;
680   void_extent_d = block.get_bits(9, 1);
681   void_extent_min_s = block.get_bits(12, 13);
682   void_extent_max_s = block.get_bits(25, 13);
683   void_extent_min_t = block.get_bits(38, 13);
684   void_extent_max_t = block.get_bits(51, 13);
685   void_extent_colour_r = block.get_bits(64, 16);
686   void_extent_colour_g = block.get_bits(80, 16);
687   void_extent_colour_b = block.get_bits(96, 16);
688   void_extent_colour_a = block.get_bits(112, 16);
689
690   /* TODO: maybe we should do something useful with the extent coordinates? */
691
692   if (void_extent_d) {
693      return decode_error::unsupported_hdr_void_extent;
694   }
695
696   if (void_extent_min_s == 0x1fff && void_extent_max_s == 0x1fff
697       && void_extent_min_t == 0x1fff && void_extent_max_t == 0x1fff) {
698
699      /* No extents */
700
701   } else {
702
703      /* Check for illegal encoding */
704      if (void_extent_min_s >= void_extent_max_s || void_extent_min_t >= void_extent_max_t) {
705         return decode_error::invalid_range_in_void_extent;
706      }
707   }
708
709   return decode_error::ok;
710}
711
712decode_error::type Block::decode_block_mode(InputBitVector in)
713{
714   dual_plane = in.get_bits(10, 1);
715   high_prec = in.get_bits(9, 1);
716
717   if (in.get_bits(0, 2) != 0x0) {
718      wt_range = (in.get_bits(0, 2) << 1) | in.get_bits(4, 1);
719      int a = in.get_bits(5, 2);
720      int b = in.get_bits(7, 2);
721      switch (in.get_bits(2, 2)) {
722      case 0x0:
723         if (VERBOSE_DECODE)
724            in.printf_bits(0, 11, "DHBBAAR00RR");
725         wt_w = b + 4;
726         wt_h = a + 2;
727         break;
728      case 0x1:
729         if (VERBOSE_DECODE)
730            in.printf_bits(0, 11, "DHBBAAR01RR");
731         wt_w = b + 8;
732         wt_h = a + 2;
733         break;
734      case 0x2:
735         if (VERBOSE_DECODE)
736            in.printf_bits(0, 11, "DHBBAAR10RR");
737         wt_w = a + 2;
738         wt_h = b + 8;
739         break;
740      case 0x3:
741         if ((b & 0x2) == 0) {
742            if (VERBOSE_DECODE)
743               in.printf_bits(0, 11, "DH0BAAR11RR");
744            wt_w = a + 2;
745            wt_h = b + 6;
746         } else {
747            if (VERBOSE_DECODE)
748               in.printf_bits(0, 11, "DH1BAAR11RR");
749            wt_w = (b & 0x1) + 2;
750            wt_h = a + 2;
751         }
752         break;
753      }
754   } else {
755      if (in.get_bits(6, 3) == 0x7) {
756         if (in.get_bits(0, 9) == 0x1fc) {
757            if (VERBOSE_DECODE)
758               in.printf_bits(0, 11, "xx111111100 (void extent)");
759            return decode_void_extent(in);
760         } else {
761            if (VERBOSE_DECODE)
762               in.printf_bits(0, 11, "xx111xxxx00");
763            return decode_error::reserved_block_mode_1;
764         }
765      }
766      if (in.get_bits(0, 4) == 0x0) {
767         if (VERBOSE_DECODE)
768            in.printf_bits(0, 11, "xxxxxxx0000");
769         return decode_error::reserved_block_mode_2;
770      }
771
772      wt_range = in.get_bits(1, 3) | in.get_bits(4, 1);
773      int a = in.get_bits(5, 2);
774      int b;
775
776      switch (in.get_bits(7, 2)) {
777      case 0x0:
778         if (VERBOSE_DECODE)
779            in.printf_bits(0, 11, "DH00AARRR00");
780         wt_w = 12;
781         wt_h = a + 2;
782         break;
783      case 0x1:
784         if (VERBOSE_DECODE)
785            in.printf_bits(0, 11, "DH01AARRR00");
786         wt_w = a + 2;
787         wt_h = 12;
788         break;
789      case 0x3:
790         if (in.get_bits(5, 1) == 0) {
791            if (VERBOSE_DECODE)
792               in.printf_bits(0, 11, "DH1100RRR00");
793            wt_w = 6;
794            wt_h = 10;
795         } else {
796            if (VERBOSE_DECODE)
797               in.printf_bits(0, 11, "DH1101RRR00");
798            wt_w = 10;
799            wt_h = 6;
800         }
801         break;
802      case 0x2:
803         if (VERBOSE_DECODE)
804            in.printf_bits(0, 11, "BB10AARRR00");
805         b = in.get_bits(9, 2);
806         wt_w = a + 6;
807         wt_h = b + 6;
808         dual_plane = 0;
809         high_prec = 0;
810         break;
811      }
812   }
813   return decode_error::ok;
814}
815
816void Block::decode_cem(InputBitVector in)
817{
818   cems[0] = cems[1] = cems[2] = cems[3] = -1;
819
820   num_extra_cem_bits = 0;
821   extra_cem_bits = 0;
822
823   if (num_parts > 1) {
824
825      partition_index = in.get_bits(13, 10);
826      if (VERBOSE_DECODE)
827         in.printf_bits(13, 10, "partition ID (%d)", partition_index);
828
829      uint32_t cem = in.get_bits(23, 6);
830
831      if ((cem & 0x3) == 0x0) {
832         cem >>= 2;
833         cem_base_class = cem >> 2;
834         is_multi_cem = false;
835
836         for (int i = 0; i < num_parts; ++i)
837            cems[i] = cem;
838
839         if (VERBOSE_DECODE)
840            in.printf_bits(23, 6, "CEM (single, %d)", cem);
841      } else {
842
843         cem_base_class = (cem & 0x3) - 1;
844         is_multi_cem = true;
845
846         if (VERBOSE_DECODE)
847            in.printf_bits(23, 6, "CEM (multi, base class %d)", cem_base_class);
848
849         int offset = 128 - weight_bits;
850
851         if (num_parts == 2) {
852            if (VERBOSE_DECODE) {
853               in.printf_bits(25, 4, "M0M0 C1 C0");
854               in.printf_bits(offset - 2, 2, "M1M1");
855            }
856
857            uint32_t c0 = in.get_bits(25, 1);
858            uint32_t c1 = in.get_bits(26, 1);
859
860            extra_cem_bits = c0 + c1;
861
862            num_extra_cem_bits = 2;
863
864            uint32_t m0 = in.get_bits(27, 2);
865            uint32_t m1 = in.get_bits(offset - 2, 2);
866
867            cems[0] = ((cem_base_class + c0) << 2) | m0;
868            cems[1] = ((cem_base_class + c1) << 2) | m1;
869
870         } else if (num_parts == 3) {
871            if (VERBOSE_DECODE) {
872               in.printf_bits(25, 4, "M0 C2 C1 C0");
873               in.printf_bits(offset - 5, 5, "M2M2 M1M1 M0");
874            }
875
876            uint32_t c0 = in.get_bits(25, 1);
877            uint32_t c1 = in.get_bits(26, 1);
878            uint32_t c2 = in.get_bits(27, 1);
879
880            extra_cem_bits = c0 + c1 + c2;
881
882            num_extra_cem_bits = 5;
883
884            uint32_t m0 = in.get_bits(28, 1) | (in.get_bits(128 - weight_bits - 5, 1) << 1);
885            uint32_t m1 = in.get_bits(offset - 4, 2);
886            uint32_t m2 = in.get_bits(offset - 2, 2);
887
888            cems[0] = ((cem_base_class + c0) << 2) | m0;
889            cems[1] = ((cem_base_class + c1) << 2) | m1;
890            cems[2] = ((cem_base_class + c2) << 2) | m2;
891
892         } else if (num_parts == 4) {
893            if (VERBOSE_DECODE) {
894               in.printf_bits(25, 4, "C3 C2 C1 C0");
895               in.printf_bits(offset - 8, 8, "M3M3 M2M2 M1M1 M0M0");
896            }
897
898            uint32_t c0 = in.get_bits(25, 1);
899            uint32_t c1 = in.get_bits(26, 1);
900            uint32_t c2 = in.get_bits(27, 1);
901            uint32_t c3 = in.get_bits(28, 1);
902
903            extra_cem_bits = c0 + c1 + c2 + c3;
904
905            num_extra_cem_bits = 8;
906
907            uint32_t m0 = in.get_bits(offset - 8, 2);
908            uint32_t m1 = in.get_bits(offset - 6, 2);
909            uint32_t m2 = in.get_bits(offset - 4, 2);
910            uint32_t m3 = in.get_bits(offset - 2, 2);
911
912            cems[0] = ((cem_base_class + c0) << 2) | m0;
913            cems[1] = ((cem_base_class + c1) << 2) | m1;
914            cems[2] = ((cem_base_class + c2) << 2) | m2;
915            cems[3] = ((cem_base_class + c3) << 2) | m3;
916         } else {
917            unreachable("");
918         }
919      }
920
921      colour_endpoint_data_offset = 29;
922
923   } else {
924      uint32_t cem = in.get_bits(13, 4);
925
926      cem_base_class = cem >> 2;
927      is_multi_cem = false;
928
929      cems[0] = cem;
930
931      partition_index = -1;
932
933      if (VERBOSE_DECODE)
934         in.printf_bits(13, 4, "CEM = %d (class %d)", cem, cem_base_class);
935
936      colour_endpoint_data_offset = 17;
937   }
938}
939
940void Block::unpack_colour_endpoints(InputBitVector in)
941{
942   if (ce_trits) {
943      int offset = colour_endpoint_data_offset;
944      int bits_left = colour_endpoint_bits;
945      for (int i = 0; i < num_cem_values; i += 5) {
946         int bits_to_read = MIN2(bits_left, 8 + ce_bits * 5);
947         /* If ce_trits then ce_bits <= 6, so bits_to_read <= 38 and we have to use uint64_t */
948         uint64_t raw = in.get_bits64(offset, bits_to_read);
949         unpack_trit_block(ce_bits, raw, &colour_endpoints_quant[i]);
950
951         if (VERBOSE_DECODE)
952            in.printf_bits(offset, bits_to_read,
953                           "trits [%d,%d,%d,%d,%d]",
954                           colour_endpoints_quant[i+0], colour_endpoints_quant[i+1],
955                  colour_endpoints_quant[i+2], colour_endpoints_quant[i+3],
956                  colour_endpoints_quant[i+4]);
957
958         offset += 8 + ce_bits * 5;
959         bits_left -= 8 + ce_bits * 5;
960      }
961   } else if (ce_quints) {
962      int offset = colour_endpoint_data_offset;
963      int bits_left = colour_endpoint_bits;
964      for (int i = 0; i < num_cem_values; i += 3) {
965         int bits_to_read = MIN2(bits_left, 7 + ce_bits * 3);
966         /* If ce_quints then ce_bits <= 5, so bits_to_read <= 22 and we can use uint32_t */
967         uint32_t raw = in.get_bits(offset, bits_to_read);
968         unpack_quint_block(ce_bits, raw, &colour_endpoints_quant[i]);
969
970         if (VERBOSE_DECODE)
971            in.printf_bits(offset, bits_to_read,
972                           "quints [%d,%d,%d]",
973                           colour_endpoints_quant[i], colour_endpoints_quant[i+1], colour_endpoints_quant[i+2]);
974
975         offset += 7 + ce_bits * 3;
976         bits_left -= 7 + ce_bits * 3;
977      }
978   } else {
979      assert((colour_endpoint_bits % ce_bits) == 0);
980      int offset = colour_endpoint_data_offset;
981      for (int i = 0; i < num_cem_values; i++) {
982         colour_endpoints_quant[i] = in.get_bits(offset, ce_bits);
983
984         if (VERBOSE_DECODE)
985            in.printf_bits(offset, ce_bits, "bits [%d]", colour_endpoints_quant[i]);
986
987         offset += ce_bits;
988      }
989   }
990}
991
992void Block::decode_colour_endpoints()
993{
994   int cem_values_idx = 0;
995   for (int part = 0; part < num_parts; ++part) {
996      uint8_t *v = &colour_endpoints[cem_values_idx];
997      int v0 = v[0];
998      int v1 = v[1];
999      int v2 = v[2];
1000      int v3 = v[3];
1001      int v4 = v[4];
1002      int v5 = v[5];
1003      int v6 = v[6];
1004      int v7 = v[7];
1005      cem_values_idx += ((cems[part] >> 2) + 1) * 2;
1006
1007      uint8x4_t e0, e1;
1008      int s0, s1, L0, L1;
1009
1010      switch (cems[part])
1011      {
1012      case 0:
1013         e0 = uint8x4_t(v0, v0, v0, 0xff);
1014         e1 = uint8x4_t(v1, v1, v1, 0xff);
1015         break;
1016      case 1:
1017         L0 = (v0 >> 2) | (v1 & 0xc0);
1018         L1 = L0 + (v1 & 0x3f);
1019         if (L1 > 0xff)
1020            L1 = 0xff;
1021         e0 = uint8x4_t(L0, L0, L0, 0xff);
1022         e1 = uint8x4_t(L1, L1, L1, 0xff);
1023         break;
1024      case 4:
1025         e0 = uint8x4_t(v0, v0, v0, v2);
1026         e1 = uint8x4_t(v1, v1, v1, v3);
1027         break;
1028      case 5:
1029         bit_transfer_signed(v1, v0);
1030         bit_transfer_signed(v3, v2);
1031         e0 = uint8x4_t(v0, v0, v0, v2);
1032         e1 = uint8x4_t::clamped(v0+v1, v0+v1, v0+v1, v2+v3);
1033         break;
1034      case 6:
1035         e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, 0xff);
1036         e1 = uint8x4_t(v0, v1, v2, 0xff);
1037         break;
1038      case 8:
1039         s0 = v0 + v2 + v4;
1040         s1 = v1 + v3 + v5;
1041         if (s1 >= s0) {
1042            e0 = uint8x4_t(v0, v2, v4, 0xff);
1043            e1 = uint8x4_t(v1, v3, v5, 0xff);
1044         } else {
1045            e0 = blue_contract(v1, v3, v5, 0xff);
1046            e1 = blue_contract(v0, v2, v4, 0xff);
1047         }
1048         break;
1049      case 9:
1050         bit_transfer_signed(v1, v0);
1051         bit_transfer_signed(v3, v2);
1052         bit_transfer_signed(v5, v4);
1053         if (v1 + v3 + v5 >= 0) {
1054            e0 = uint8x4_t(v0, v2, v4, 0xff);
1055            e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, 0xff);
1056         } else {
1057            e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, 0xff);
1058            e1 = blue_contract(v0, v2, v4, 0xff);
1059         }
1060         break;
1061      case 10:
1062         e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, v4);
1063         e1 = uint8x4_t(v0, v1, v2, v5);
1064         break;
1065      case 12:
1066         s0 = v0 + v2 + v4;
1067         s1 = v1 + v3 + v5;
1068         if (s1 >= s0) {
1069            e0 = uint8x4_t(v0, v2, v4, v6);
1070            e1 = uint8x4_t(v1, v3, v5, v7);
1071         } else {
1072            e0 = blue_contract(v1, v3, v5, v7);
1073            e1 = blue_contract(v0, v2, v4, v6);
1074         }
1075         break;
1076      case 13:
1077         bit_transfer_signed(v1, v0);
1078         bit_transfer_signed(v3, v2);
1079         bit_transfer_signed(v5, v4);
1080         bit_transfer_signed(v7, v6);
1081         if (v1 + v3 + v5 >= 0) {
1082            e0 = uint8x4_t(v0, v2, v4, v6);
1083            e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, v6+v7);
1084         } else {
1085            e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, v6+v7);
1086            e1 = blue_contract(v0, v2, v4, v6);
1087         }
1088         break;
1089      default:
1090         /* HDR endpoints not supported; return error colour */
1091         e0 = uint8x4_t(255, 0, 255, 255);
1092         e1 = uint8x4_t(255, 0, 255, 255);
1093         break;
1094      }
1095
1096      endpoints_decoded[0][part] = e0;
1097      endpoints_decoded[1][part] = e1;
1098
1099      if (VERBOSE_DECODE) {
1100         printf("cems[%d]=%d v=[", part, cems[part]);
1101         for (int i = 0; i < (cems[part] >> 2) + 1; ++i) {
1102            if (i)
1103               printf(", ");
1104            printf("%3d", v[i]);
1105         }
1106         printf("] e0=[%3d,%4d,%4d,%4d] e1=[%3d,%4d,%4d,%4d]\n",
1107                e0.v[0], e0.v[1], e0.v[2], e0.v[3],
1108               e1.v[0], e1.v[1], e1.v[2], e1.v[3]);
1109      }
1110   }
1111}
1112
1113void Block::unpack_weights(InputBitVector in)
1114{
1115   if (wt_trits) {
1116      int offset = 128;
1117      int bits_left = weight_bits;
1118      for (int i = 0; i < num_weights; i += 5) {
1119         int bits_to_read = MIN2(bits_left, 8 + 5*wt_bits);
1120         /* If wt_trits then wt_bits <= 3, so bits_to_read <= 23 and we can use uint32_t */
1121         uint32_t raw = in.get_bits_rev(offset, bits_to_read);
1122         unpack_trit_block(wt_bits, raw, &weights_quant[i]);
1123
1124         if (VERBOSE_DECODE)
1125            in.printf_bits(offset - bits_to_read, bits_to_read, "weight trits [%d,%d,%d,%d,%d]",
1126                           weights_quant[i+0], weights_quant[i+1],
1127                  weights_quant[i+2], weights_quant[i+3],
1128                  weights_quant[i+4]);
1129
1130         offset -= 8 + wt_bits * 5;
1131         bits_left -= 8 + wt_bits * 5;
1132      }
1133
1134   } else if (wt_quints) {
1135
1136      int offset = 128;
1137      int bits_left = weight_bits;
1138      for (int i = 0; i < num_weights; i += 3) {
1139         int bits_to_read = MIN2(bits_left, 7 + 3*wt_bits);
1140         /* If wt_quints then wt_bits <= 2, so bits_to_read <= 13 and we can use uint32_t */
1141         uint32_t raw = in.get_bits_rev(offset, bits_to_read);
1142         unpack_quint_block(wt_bits, raw, &weights_quant[i]);
1143
1144         if (VERBOSE_DECODE)
1145            in.printf_bits(offset - bits_to_read, bits_to_read, "weight quints [%d,%d,%d]",
1146                           weights_quant[i], weights_quant[i+1], weights_quant[i+2]);
1147
1148         offset -= 7 + wt_bits * 3;
1149         bits_left -= 7 + wt_bits * 3;
1150      }
1151
1152   } else {
1153      int offset = 128;
1154      assert((weight_bits % wt_bits) == 0);
1155      for (int i = 0; i < num_weights; ++i) {
1156         weights_quant[i] = in.get_bits_rev(offset, wt_bits);
1157
1158         if (VERBOSE_DECODE)
1159            in.printf_bits(offset - wt_bits, wt_bits, "weight bits [%d]", weights_quant[i]);
1160
1161         offset -= wt_bits;
1162      }
1163   }
1164}
1165
1166void Block::unquantise_weights()
1167{
1168   assert(num_weights <= (int)ARRAY_SIZE(weights_quant));
1169   assert(num_weights <= (int)ARRAY_SIZE(weights));
1170
1171   memset(weights, 0, sizeof(weights));
1172
1173   for (int i = 0; i < num_weights; ++i) {
1174
1175      uint8_t v = weights_quant[i];
1176      uint8_t w;
1177
1178      if (wt_trits) {
1179
1180         if (wt_bits == 0) {
1181            w = v * 32;
1182         } else {
1183            uint8_t A, B, C, D;
1184            A = (v & 0x1) ? 0x7F : 0x00;
1185            switch (wt_bits) {
1186            case 1:
1187               B = 0;
1188               C = 50;
1189               D = v >> 1;
1190               break;
1191            case 2:
1192               B = (v & 0x2) ? 0x45 : 0x00;
1193               C = 23;
1194               D = v >> 2;
1195               break;
1196            case 3:
1197               B = ((v & 0x6) >> 1) | ((v & 0x6) << 4);
1198               C = 11;
1199               D = v >> 3;
1200               break;
1201            default:
1202               unreachable("");
1203            }
1204            uint16_t T = D * C + B;
1205            T = T ^ A;
1206            T = (A & 0x20) | (T >> 2);
1207            assert(T < 64);
1208            if (T > 32)
1209               T++;
1210            w = T;
1211         }
1212
1213      } else if (wt_quints) {
1214
1215         if (wt_bits == 0) {
1216            w = v * 16;
1217         } else {
1218            uint8_t A, B, C, D;
1219            A = (v & 0x1) ? 0x7F : 0x00;
1220            switch (wt_bits) {
1221            case 1:
1222               B = 0;
1223               C = 28;
1224               D = v >> 1;
1225               break;
1226            case 2:
1227               B = (v & 0x2) ? 0x42 : 0x00;
1228               C = 13;
1229               D = v >> 2;
1230               break;
1231            default:
1232               unreachable("");
1233            }
1234            uint16_t T = D * C + B;
1235            T = T ^ A;
1236            T = (A & 0x20) | (T >> 2);
1237            assert(T < 64);
1238            if (T > 32)
1239               T++;
1240            w = T;
1241         }
1242         weights[i] = w;
1243
1244      } else {
1245
1246         switch (wt_bits) {
1247         case 1: w = v ? 0x3F : 0x00; break;
1248         case 2: w = v | (v << 2) | (v << 4); break;
1249         case 3: w = v | (v << 3); break;
1250         case 4: w = (v >> 2) | (v << 2); break;
1251         case 5: w = (v >> 4) | (v << 1); break;
1252         default: unreachable("");
1253         }
1254         assert(w < 64);
1255         if (w > 32)
1256            w++;
1257      }
1258      weights[i] = w;
1259   }
1260}
1261
1262void Block::compute_infill_weights(int block_w, int block_h, int block_d)
1263{
1264   int Ds = block_w <= 1 ? 0 : (1024 + block_w / 2) / (block_w - 1);
1265   int Dt = block_h <= 1 ? 0 : (1024 + block_h / 2) / (block_h - 1);
1266   int Dr = block_d <= 1 ? 0 : (1024 + block_d / 2) / (block_d - 1);
1267   for (int r = 0; r < block_d; ++r) {
1268      for (int t = 0; t < block_h; ++t) {
1269         for (int s = 0; s < block_w; ++s) {
1270            int cs = Ds * s;
1271            int ct = Dt * t;
1272            int cr = Dr * r;
1273            int gs = (cs * (wt_w - 1) + 32) >> 6;
1274            int gt = (ct * (wt_h - 1) + 32) >> 6;
1275            int gr = (cr * (wt_d - 1) + 32) >> 6;
1276            assert(gs >= 0 && gs <= 176);
1277            assert(gt >= 0 && gt <= 176);
1278            assert(gr >= 0 && gr <= 176);
1279            int js = gs >> 4;
1280            int fs = gs & 0xf;
1281            int jt = gt >> 4;
1282            int ft = gt & 0xf;
1283            int jr = gr >> 4;
1284            int fr = gr & 0xf;
1285
1286            /* TODO: 3D */
1287            (void)jr;
1288            (void)fr;
1289
1290            int w11 = (fs * ft + 8) >> 4;
1291            int w10 = ft - w11;
1292            int w01 = fs - w11;
1293            int w00 = 16 - fs - ft + w11;
1294
1295            if (dual_plane) {
1296               int p00, p01, p10, p11, i0, i1;
1297               int v0 = js + jt * wt_w;
1298               p00 = weights[(v0) * 2];
1299               p01 = weights[(v0 + 1) * 2];
1300               p10 = weights[(v0 + wt_w) * 2];
1301               p11 = weights[(v0 + wt_w + 1) * 2];
1302               i0 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1303               p00 = weights[(v0) * 2 + 1];
1304               p01 = weights[(v0 + 1) * 2 + 1];
1305               p10 = weights[(v0 + wt_w) * 2 + 1];
1306               p11 = weights[(v0 + wt_w + 1) * 2 + 1];
1307               assert((v0 + wt_w + 1) * 2 + 1 < (int)ARRAY_SIZE(weights));
1308               i1 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1309               assert(0 <= i0 && i0 <= 64);
1310               infill_weights[0][s + t*block_w + r*block_w*block_h] = i0;
1311               infill_weights[1][s + t*block_w + r*block_w*block_h] = i1;
1312            } else {
1313               int p00, p01, p10, p11, i;
1314               int v0 = js + jt * wt_w;
1315               p00 = weights[v0];
1316               p01 = weights[v0 + 1];
1317               p10 = weights[v0 + wt_w];
1318               p11 = weights[v0 + wt_w + 1];
1319               assert(v0 + wt_w + 1 < (int)ARRAY_SIZE(weights));
1320               i = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
1321               assert(0 <= i && i <= 64);
1322               infill_weights[0][s + t*block_w + r*block_w*block_h] = i;
1323            }
1324         }
1325      }
1326   }
1327}
1328
1329void Block::unquantise_colour_endpoints()
1330{
1331   assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints_quant));
1332   assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints));
1333
1334   for (int i = 0; i < num_cem_values; ++i) {
1335      uint8_t v = colour_endpoints_quant[i];
1336
1337      if (ce_trits) {
1338         uint16_t A, B, C, D;
1339         uint16_t t;
1340         A = (v & 0x1) ? 0x1FF : 0x000;
1341         switch (ce_bits) {
1342         case 1:
1343            B = 0;
1344            C = 204;
1345            D = v >> 1;
1346            break;
1347         case 2:
1348            B = (v & 0x2) ? 0x116 : 0x000;
1349            C = 93;
1350            D = v >> 2;
1351            break;
1352         case 3:
1353            t = ((v >> 1) & 0x3);
1354            B = t | (t << 2) | (t << 7);
1355            C = 44;
1356            D = v >> 3;
1357            break;
1358         case 4:
1359            t = ((v >> 1) & 0x7);
1360            B = t | (t << 6);
1361            C = 22;
1362            D = v >> 4;
1363            break;
1364         case 5:
1365            t = ((v >> 1) & 0xF);
1366            B = (t >> 2) | (t << 5);
1367            C = 11;
1368            D = v >> 5;
1369            break;
1370         case 6:
1371            B = ((v & 0x3E) << 3) | ((v >> 5) & 0x1);
1372            C = 5;
1373            D = v >> 6;
1374            break;
1375         default:
1376            unreachable("");
1377         }
1378         uint16_t T = D * C + B;
1379         T = T ^ A;
1380         T = (A & 0x80) | (T >> 2);
1381         assert(T < 256);
1382         colour_endpoints[i] = T;
1383      } else if (ce_quints) {
1384         uint16_t A, B, C, D;
1385         uint16_t t;
1386         A = (v & 0x1) ? 0x1FF : 0x000;
1387         switch (ce_bits) {
1388         case 1:
1389            B = 0;
1390            C = 113;
1391            D = v >> 1;
1392            break;
1393         case 2:
1394            B = (v & 0x2) ? 0x10C : 0x000;
1395            C = 54;
1396            D = v >> 2;
1397            break;
1398         case 3:
1399            t = ((v >> 1) & 0x3);
1400            B = (t >> 1) | (t << 1) | (t << 7);
1401            C = 26;
1402            D = v >> 3;
1403            break;
1404         case 4:
1405            t = ((v >> 1) & 0x7);
1406            B = (t >> 1) | (t << 6);
1407            C = 13;
1408            D = v >> 4;
1409            break;
1410         case 5:
1411            t = ((v >> 1) & 0xF);
1412            B = (t >> 4) | (t << 5);
1413            C = 6;
1414            D = v >> 5;
1415            break;
1416         default:
1417            unreachable("");
1418         }
1419         uint16_t T = D * C + B;
1420         T = T ^ A;
1421         T = (A & 0x80) | (T >> 2);
1422         assert(T < 256);
1423         colour_endpoints[i] = T;
1424      } else {
1425         switch (ce_bits) {
1426         case 1: v = v ? 0xFF : 0x00; break;
1427         case 2: v = (v << 6) | (v << 4) | (v << 2) | v; break;
1428         case 3: v = (v << 5) | (v << 2) | (v >> 1); break;
1429         case 4: v = (v << 4) | v; break;
1430         case 5: v = (v << 3) | (v >> 2); break;
1431         case 6: v = (v << 2) | (v >> 4); break;
1432         case 7: v = (v << 1) | (v >> 6); break;
1433         case 8: break;
1434         default: unreachable("");
1435         }
1436         colour_endpoints[i] = v;
1437      }
1438   }
1439}
1440
1441decode_error::type Block::decode(const Decoder &decoder, InputBitVector in)
1442{
1443   decode_error::type err;
1444
1445   is_error = false;
1446   bogus_colour_endpoints = false;
1447   bogus_weights = false;
1448   is_void_extent = false;
1449
1450   wt_d = 1;
1451   /* TODO: 3D */
1452
1453   /* TODO: test for all the illegal encodings */
1454
1455   if (VERBOSE_DECODE)
1456      in.printf_bits(0, 128);
1457
1458   err = decode_block_mode(in);
1459   if (err != decode_error::ok)
1460      return err;
1461
1462   if (is_void_extent)
1463      return decode_error::ok;
1464
1465   /* TODO: 3D */
1466
1467   calculate_from_weights();
1468
1469   if (VERBOSE_DECODE)
1470      printf("weights_grid=%dx%dx%d dual_plane=%d num_weights=%d high_prec=%d r=%d range=0..%d (%dt %dq %db) weight_bits=%d\n",
1471             wt_w, wt_h, wt_d, dual_plane, num_weights, high_prec, wt_range, wt_max, wt_trits, wt_quints, wt_bits, weight_bits);
1472
1473   if (wt_w > decoder.block_w || wt_h > decoder.block_h || wt_d > decoder.block_d)
1474      return decode_error::weight_grid_exceeds_block_size;
1475
1476   num_parts = in.get_bits(11, 2) + 1;
1477
1478   if (VERBOSE_DECODE)
1479      in.printf_bits(11, 2, "partitions = %d", num_parts);
1480
1481   if (dual_plane && num_parts > 3)
1482      return decode_error::dual_plane_and_too_many_partitions;
1483
1484   decode_cem(in);
1485
1486   if (VERBOSE_DECODE)
1487      printf("cem=[%d,%d,%d,%d] base_cem_class=%d\n", cems[0], cems[1], cems[2], cems[3], cem_base_class);
1488
1489   int num_cem_pairs = (cem_base_class + 1) * num_parts + extra_cem_bits;
1490   num_cem_values = num_cem_pairs * 2;
1491
1492   calculate_remaining_bits();
1493   err = calculate_colour_endpoints_size();
1494   if (err != decode_error::ok)
1495      return err;
1496
1497   if (VERBOSE_DECODE)
1498      in.printf_bits(colour_endpoint_data_offset, colour_endpoint_bits,
1499                     "endpoint data (%d bits, %d vals, %dt %dq %db)",
1500                     colour_endpoint_bits, num_cem_values, ce_trits, ce_quints, ce_bits);
1501
1502   unpack_colour_endpoints(in);
1503
1504   if (VERBOSE_DECODE) {
1505      printf("cem values raw =[");
1506      for (int i = 0; i < num_cem_values; i++) {
1507         if (i)
1508            printf(", ");
1509         printf("%3d", colour_endpoints_quant[i]);
1510      }
1511      printf("]\n");
1512   }
1513
1514   if (num_cem_values > 18)
1515      return decode_error::invalid_colour_endpoints_count;
1516
1517   unquantise_colour_endpoints();
1518
1519   if (VERBOSE_DECODE) {
1520      printf("cem values norm=[");
1521      for (int i = 0; i < num_cem_values; i++) {
1522         if (i)
1523            printf(", ");
1524         printf("%3d", colour_endpoints[i]);
1525      }
1526      printf("]\n");
1527   }
1528
1529   decode_colour_endpoints();
1530
1531   if (dual_plane) {
1532      int ccs_offset = 128 - weight_bits - num_extra_cem_bits - 2;
1533      colour_component_selector = in.get_bits(ccs_offset, 2);
1534
1535      if (VERBOSE_DECODE)
1536         in.printf_bits(ccs_offset, 2, "colour component selector = %d", colour_component_selector);
1537   } else {
1538      colour_component_selector = 0;
1539   }
1540
1541
1542   if (VERBOSE_DECODE)
1543      in.printf_bits(128 - weight_bits, weight_bits, "weights (%d bits)", weight_bits);
1544
1545   if (num_weights > 64)
1546      return decode_error::invalid_num_weights;
1547
1548   if (weight_bits < 24 || weight_bits > 96)
1549      return decode_error::invalid_weight_bits;
1550
1551   unpack_weights(in);
1552
1553   unquantise_weights();
1554
1555   if (VERBOSE_DECODE) {
1556      printf("weights=[");
1557      for (int i = 0; i < num_weights; ++i) {
1558         if (i)
1559            printf(", ");
1560         printf("%d", weights[i]);
1561      }
1562      printf("]\n");
1563
1564      for (int plane = 0; plane <= dual_plane; ++plane) {
1565         printf("weights (plane %d):\n", plane);
1566         int i = 0;
1567         (void)i;
1568
1569         for (int r = 0; r < wt_d; ++r) {
1570            for (int t = 0; t < wt_h; ++t) {
1571               for (int s = 0; s < wt_w; ++s) {
1572                  printf("%3d", weights[i++ * (1 + dual_plane) + plane]);
1573               }
1574               printf("\n");
1575            }
1576            if (r < wt_d - 1)
1577               printf("\n");
1578         }
1579      }
1580   }
1581
1582   compute_infill_weights(decoder.block_w, decoder.block_h, decoder.block_d);
1583
1584   if (VERBOSE_DECODE) {
1585      for (int plane = 0; plane <= dual_plane; ++plane) {
1586         printf("infilled weights (plane %d):\n", plane);
1587         int i = 0;
1588         (void)i;
1589
1590         for (int r = 0; r < decoder.block_d; ++r) {
1591            for (int t = 0; t < decoder.block_h; ++t) {
1592               for (int s = 0; s < decoder.block_w; ++s) {
1593                  printf("%3d", infill_weights[plane][i++]);
1594               }
1595               printf("\n");
1596            }
1597            if (r < decoder.block_d - 1)
1598               printf("\n");
1599         }
1600      }
1601   }
1602   if (VERBOSE_DECODE)
1603      printf("\n");
1604
1605   return decode_error::ok;
1606}
1607
1608void Block::write_decoded(const Decoder &decoder, uint16_t *output)
1609{
1610   /* sRGB can only be stored as unorm8. */
1611   assert(!decoder.srgb || decoder.output_unorm8);
1612
1613   if (is_void_extent) {
1614      for (int idx = 0; idx < decoder.block_w*decoder.block_h*decoder.block_d; ++idx) {
1615         if (decoder.output_unorm8) {
1616            if (decoder.srgb) {
1617               output[idx*4+0] = void_extent_colour_r >> 8;
1618               output[idx*4+1] = void_extent_colour_g >> 8;
1619               output[idx*4+2] = void_extent_colour_b >> 8;
1620            } else {
1621               output[idx*4+0] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_r);
1622               output[idx*4+1] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_g);
1623               output[idx*4+2] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_b);
1624            }
1625            output[idx*4+3] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_a);
1626         } else {
1627            /* Store the color as FP16. */
1628            output[idx*4+0] = _mesa_uint16_div_64k_to_half(void_extent_colour_r);
1629            output[idx*4+1] = _mesa_uint16_div_64k_to_half(void_extent_colour_g);
1630            output[idx*4+2] = _mesa_uint16_div_64k_to_half(void_extent_colour_b);
1631            output[idx*4+3] = _mesa_uint16_div_64k_to_half(void_extent_colour_a);
1632         }
1633      }
1634      return;
1635   }
1636
1637   int small_block = (decoder.block_w * decoder.block_h * decoder.block_d) < 31;
1638
1639   int idx = 0;
1640   for (int z = 0; z < decoder.block_d; ++z) {
1641      for (int y = 0; y < decoder.block_h; ++y) {
1642         for (int x = 0; x < decoder.block_w; ++x) {
1643
1644            int partition;
1645            if (num_parts > 1) {
1646               partition = select_partition(partition_index, x, y, z, num_parts, small_block);
1647               assert(partition < num_parts);
1648            } else {
1649               partition = 0;
1650            }
1651
1652            /* TODO: HDR */
1653
1654            uint8x4_t e0 = endpoints_decoded[0][partition];
1655            uint8x4_t e1 = endpoints_decoded[1][partition];
1656            uint16_t c0[4], c1[4];
1657
1658            /* Expand to 16 bits. */
1659            if (decoder.srgb) {
1660               c0[0] = (uint16_t)((e0.v[0] << 8) | 0x80);
1661               c0[1] = (uint16_t)((e0.v[1] << 8) | 0x80);
1662               c0[2] = (uint16_t)((e0.v[2] << 8) | 0x80);
1663               c0[3] = (uint16_t)((e0.v[3] << 8) | 0x80);
1664
1665               c1[0] = (uint16_t)((e1.v[0] << 8) | 0x80);
1666               c1[1] = (uint16_t)((e1.v[1] << 8) | 0x80);
1667               c1[2] = (uint16_t)((e1.v[2] << 8) | 0x80);
1668               c1[3] = (uint16_t)((e1.v[3] << 8) | 0x80);
1669            } else {
1670               c0[0] = (uint16_t)((e0.v[0] << 8) | e0.v[0]);
1671               c0[1] = (uint16_t)((e0.v[1] << 8) | e0.v[1]);
1672               c0[2] = (uint16_t)((e0.v[2] << 8) | e0.v[2]);
1673               c0[3] = (uint16_t)((e0.v[3] << 8) | e0.v[3]);
1674
1675               c1[0] = (uint16_t)((e1.v[0] << 8) | e1.v[0]);
1676               c1[1] = (uint16_t)((e1.v[1] << 8) | e1.v[1]);
1677               c1[2] = (uint16_t)((e1.v[2] << 8) | e1.v[2]);
1678               c1[3] = (uint16_t)((e1.v[3] << 8) | e1.v[3]);
1679            }
1680
1681            int w[4];
1682            if (dual_plane) {
1683               int w0 = infill_weights[0][idx];
1684               int w1 = infill_weights[1][idx];
1685               w[0] = w[1] = w[2] = w[3] = w0;
1686               w[colour_component_selector] = w1;
1687            } else {
1688               int w0 = infill_weights[0][idx];
1689               w[0] = w[1] = w[2] = w[3] = w0;
1690            }
1691
1692            /* Interpolate to produce UNORM16, applying weights. */
1693            uint16_t c[4] = {
1694               (uint16_t)((c0[0] * (64 - w[0]) + c1[0] * w[0] + 32) >> 6),
1695               (uint16_t)((c0[1] * (64 - w[1]) + c1[1] * w[1] + 32) >> 6),
1696               (uint16_t)((c0[2] * (64 - w[2]) + c1[2] * w[2] + 32) >> 6),
1697               (uint16_t)((c0[3] * (64 - w[3]) + c1[3] * w[3] + 32) >> 6),
1698            };
1699
1700            if (decoder.output_unorm8) {
1701               if (decoder.srgb) {
1702                  output[idx*4+0] = c[0] >> 8;
1703                  output[idx*4+1] = c[1] >> 8;
1704                  output[idx*4+2] = c[2] >> 8;
1705               } else {
1706                  output[idx*4+0] = c[0] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[0]);
1707                  output[idx*4+1] = c[1] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[1]);
1708                  output[idx*4+2] = c[2] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[2]);
1709               }
1710               output[idx*4+3] = c[3] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[3]);
1711            } else {
1712               /* Store the color as FP16. */
1713               output[idx*4+0] = c[0] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[0]);
1714               output[idx*4+1] = c[1] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[1]);
1715               output[idx*4+2] = c[2] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[2]);
1716               output[idx*4+3] = c[3] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[3]);
1717            }
1718
1719            idx++;
1720         }
1721      }
1722   }
1723}
1724
1725void Block::calculate_from_weights()
1726{
1727   wt_trits = 0;
1728   wt_quints = 0;
1729   wt_bits = 0;
1730   switch (high_prec) {
1731   case 0:
1732      switch (wt_range) {
1733      case 0x2: wt_max = 1; wt_bits = 1; break;
1734      case 0x3: wt_max = 2; wt_trits = 1; break;
1735      case 0x4: wt_max = 3; wt_bits = 2; break;
1736      case 0x5: wt_max = 4; wt_quints = 1; break;
1737      case 0x6: wt_max = 5; wt_trits = 1; wt_bits = 1; break;
1738      case 0x7: wt_max = 7; wt_bits = 3; break;
1739      default: abort();
1740      }
1741      break;
1742   case 1:
1743      switch (wt_range) {
1744      case 0x2: wt_max = 9; wt_quints = 1; wt_bits = 1; break;
1745      case 0x3: wt_max = 11; wt_trits = 1; wt_bits = 2; break;
1746      case 0x4: wt_max = 15; wt_bits = 4; break;
1747      case 0x5: wt_max = 19; wt_quints = 1; wt_bits = 2; break;
1748      case 0x6: wt_max = 23; wt_trits = 1; wt_bits = 3; break;
1749      case 0x7: wt_max = 31; wt_bits = 5; break;
1750      default: abort();
1751      }
1752      break;
1753   }
1754
1755   assert(wt_trits || wt_quints || wt_bits);
1756
1757   num_weights = wt_w * wt_h * wt_d;
1758
1759   if (dual_plane)
1760      num_weights *= 2;
1761
1762   weight_bits =
1763         (num_weights * 8 * wt_trits + 4) / 5
1764         + (num_weights * 7 * wt_quints + 2) / 3
1765         +  num_weights * wt_bits;
1766}
1767
1768void Block::calculate_remaining_bits()
1769{
1770   int config_bits;
1771   if (num_parts > 1) {
1772      if (!is_multi_cem)
1773         config_bits = 29;
1774      else
1775         config_bits = 25 + 3 * num_parts;
1776   } else {
1777      config_bits = 17;
1778   }
1779
1780   if (dual_plane)
1781      config_bits += 2;
1782
1783   remaining_bits = 128 - config_bits - weight_bits;
1784}
1785
1786decode_error::type Block::calculate_colour_endpoints_size()
1787{
1788   /* Specified as illegal */
1789   if (remaining_bits < (13 * num_cem_values + 4) / 5) {
1790      colour_endpoint_bits = ce_max = ce_trits = ce_quints = ce_bits = 0;
1791      return decode_error::invalid_colour_endpoints_size;
1792   }
1793
1794   /* Find the largest cem_ranges that fits within remaining_bits */
1795   for (int i = ARRAY_SIZE(cem_ranges)-1; i >= 0; --i) {
1796      int cem_bits;
1797      cem_bits = (num_cem_values * 8 * cem_ranges[i].t + 4) / 5
1798                 + (num_cem_values * 7 * cem_ranges[i].q + 2) / 3
1799                 +  num_cem_values * cem_ranges[i].b;
1800
1801      if (cem_bits <= remaining_bits)
1802      {
1803         colour_endpoint_bits = cem_bits;
1804         ce_max = cem_ranges[i].max;
1805         ce_trits = cem_ranges[i].t;
1806         ce_quints = cem_ranges[i].q;
1807         ce_bits = cem_ranges[i].b;
1808         return decode_error::ok;
1809      }
1810   }
1811
1812   assert(0);
1813   return decode_error::invalid_colour_endpoints_size;
1814}
1815
1816/**
1817 * Decode ASTC 2D LDR texture data.
1818 *
1819 * \param src_width in pixels
1820 * \param src_height in pixels
1821 * \param dst_stride in bytes
1822 */
1823extern "C" void
1824_mesa_unpack_astc_2d_ldr(uint8_t *dst_row,
1825                         unsigned dst_stride,
1826                         const uint8_t *src_row,
1827                         unsigned src_stride,
1828                         unsigned src_width,
1829                         unsigned src_height,
1830                         mesa_format format)
1831{
1832   assert(_mesa_is_format_astc_2d(format));
1833   bool srgb = _mesa_get_format_color_encoding(format) == GL_SRGB;
1834
1835   unsigned blk_w, blk_h;
1836   _mesa_get_format_block_size(format, &blk_w, &blk_h);
1837
1838   const unsigned block_size = 16;
1839   unsigned x_blocks = (src_width + blk_w - 1) / blk_w;
1840   unsigned y_blocks = (src_height + blk_h - 1) / blk_h;
1841
1842   Decoder dec(blk_w, blk_h, 1, srgb, true);
1843
1844   for (unsigned y = 0; y < y_blocks; ++y) {
1845      for (unsigned x = 0; x < x_blocks; ++x) {
1846         /* Same size as the largest block. */
1847         uint16_t block_out[12 * 12 * 4];
1848
1849         dec.decode(src_row + x * block_size, block_out);
1850
1851         /* This can be smaller with NPOT dimensions. */
1852         unsigned dst_blk_w = MIN2(blk_w, src_width  - x*blk_w);
1853         unsigned dst_blk_h = MIN2(blk_h, src_height - y*blk_h);
1854
1855         for (unsigned sub_y = 0; sub_y < dst_blk_h; ++sub_y) {
1856            for (unsigned sub_x = 0; sub_x < dst_blk_w; ++sub_x) {
1857               uint8_t *dst = dst_row + sub_y * dst_stride +
1858                              (x * blk_w + sub_x) * 4;
1859               const uint16_t *src = &block_out[(sub_y * blk_w + sub_x) * 4];
1860
1861               dst[0] = src[0];
1862               dst[1] = src[1];
1863               dst[2] = src[2];
1864               dst[3] = src[3];
1865            }
1866         }
1867      }
1868      src_row += src_stride;
1869      dst_row += dst_stride * blk_h;
1870   }
1871}
1872