101e04c3fSmrg/*
201e04c3fSmrg * Copyright 2015 Philip Taylor <philip@zaynar.co.uk>
301e04c3fSmrg * Copyright 2018 Advanced Micro Devices, Inc.
401e04c3fSmrg *
501e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
601e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
701e04c3fSmrg * to deal in the Software without restriction, including without limitation
801e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
901e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
1001e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
1101e04c3fSmrg *
1201e04c3fSmrg * The above copyright notice and this permission notice (including the next
1301e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1401e04c3fSmrg * Software.
1501e04c3fSmrg *
1601e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1701e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1801e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1901e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
2001e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2101e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
2201e04c3fSmrg * DEALINGS IN THE SOFTWARE.
2301e04c3fSmrg */
2401e04c3fSmrg
2501e04c3fSmrg/**
2601e04c3fSmrg * \file texcompress_astc.c
2701e04c3fSmrg *
2801e04c3fSmrg * Decompression code for GL_KHR_texture_compression_astc_ldr, which is just
2901e04c3fSmrg * ASTC 2D LDR.
3001e04c3fSmrg *
3101e04c3fSmrg * The ASTC 2D LDR decoder (without the sRGB part) was copied from the OASTC
3201e04c3fSmrg * library written by Philip Taylor. I added sRGB support and adjusted it for
3301e04c3fSmrg * Mesa. - Marek
3401e04c3fSmrg */
3501e04c3fSmrg
3601e04c3fSmrg#include "texcompress_astc.h"
3701e04c3fSmrg#include "macros.h"
3801e04c3fSmrg#include "util/half_float.h"
3901e04c3fSmrg#include <stdio.h>
407ec681f3Smrg#include <cstdlib>  // for abort() on windows
4101e04c3fSmrg
/* Debug switch: when true, the block decoder prints the bit fields it parses
 * (via InputBitVector::printf_bits()) to stdout.
 */
static bool VERBOSE_DECODE = false;
/* Debug switch: when true, OutputBitVector logs every append/skip to stdout. */
static bool VERBOSE_WRITE = false;
4401e04c3fSmrg
/**
 * Chain the two half-float helpers: feed \p v through
 * _mesa_uint16_div_64k_to_half() and convert that result with
 * _mesa_half_to_unorm8().
 *
 * NOTE(review): going by the helper names, this maps v / 65536 to a half
 * float and then to an 8-bit UNORM value -- confirm in util/half_float.h.
 */
static inline uint8_t
uint16_div_64k_to_half_to_unorm8(uint16_t v)
{
   return _mesa_half_to_unorm8(_mesa_uint16_div_64k_to_half(v));
}
5001e04c3fSmrg
/**
 * Scope for the status codes returned by the block decoding functions.
 * The class has no data members; it only namespaces the enum.
 */
class decode_error
{
public:
   enum type {
      /* Decoding succeeded; any other value makes Decoder::decode() emit the
       * error colour instead of decoded texels.
       */
      ok,
      /* Void-extent block with bit 9 set. */
      unsupported_hdr_void_extent,
      /* Bits 0-1 are 00 and bits 6-8 are 111, but bits 0-8 are not 0x1fc. */
      reserved_block_mode_1,
      /* Bits 0-3 are all zero. */
      reserved_block_mode_2,
      dual_plane_and_too_many_partitions,
      /* Void-extent block whose min coordinate is >= its max coordinate. */
      invalid_range_in_void_extent,
      /* NOTE(review): the codes without a comment are not produced by the
       * functions reviewed alongside this enum; their meaning is only
       * suggested by their names -- confirm at the return sites.
       */
      weight_grid_exceeds_block_size,
      invalid_colour_endpoints_size,
      invalid_colour_endpoints_count,
      invalid_weight_bits,
      invalid_num_weights,
   };
};
6801e04c3fSmrg
6901e04c3fSmrg
/**
 * One row of the cem_ranges table: a quantisation range for colour endpoint
 * values.
 *
 * NOTE(review): t/q/b are presumably the number of trits, quints and plain
 * bits used to encode one value; every row of cem_ranges satisfies
 * max + 1 == 3^t * 5^q * 2^b, which is consistent with that reading.
 */
struct cem_range {
   uint8_t max;
   uint8_t t, q, b;
};
7401e04c3fSmrg
/* Based on the Color Unquantization Parameters table,
 * plus the bit-only representations, sorted by increasing size.
 *
 * Columns are { max, t, q, b }; in every row max + 1 == 3^t * 5^q * 2^b.
 */
static cem_range cem_ranges[] = {
   { 5, 1, 0, 1 },
   { 7, 0, 0, 3 },
   { 9, 0, 1, 1 },
   { 11, 1, 0, 2 },
   { 15, 0, 0, 4 },
   { 19, 0, 1, 2 },
   { 23, 1, 0, 3 },
   { 31, 0, 0, 5 },
   { 39, 0, 1, 3 },
   { 47, 1, 0, 4 },
   { 63, 0, 0, 6 },
   { 79, 0, 1, 4 },
   { 95, 1, 0, 5 },
   { 127, 0, 0, 7 },
   { 159, 0, 1, 5 },
   { 191, 1, 0, 6 },
   { 255, 0, 0, 8 },
};
9701e04c3fSmrg
/* Concatenate 2..5 single-bit values into one integer, first argument in the
 * most significant position. Arguments are expected to be 0 or 1; each one is
 * evaluated exactly once.
 */
#define CAT_BITS_2(a, b)          ( ((a) << 1) | (b) )
#define CAT_BITS_3(a, b, c)       ( ((a) << 2) | ((b) << 1) | (c) )
#define CAT_BITS_4(a, b, c, d)    ( ((a) << 3) | ((b) << 2) | ((c) << 1) | (d) )
#define CAT_BITS_5(a, b, c, d, e) ( ((a) << 4) | ((b) << 3) | ((c) << 2) | ((d) << 1) | (e) )
10201e04c3fSmrg
10301e04c3fSmrg/**
10401e04c3fSmrg * Unpack 5n+8 bits from 'in' into 5 output values.
10501e04c3fSmrg * If n <= 4 then T should be uint32_t, else it must be uint64_t.
10601e04c3fSmrg */
10701e04c3fSmrgtemplate <typename T>
10801e04c3fSmrgstatic void unpack_trit_block(int n, T in, uint8_t *out)
10901e04c3fSmrg{
11001e04c3fSmrg   assert(n <= 6); /* else output will overflow uint8_t */
11101e04c3fSmrg
11201e04c3fSmrg   uint8_t T0 = (in >> (n)) & 0x1;
11301e04c3fSmrg   uint8_t T1 = (in >> (n+1)) & 0x1;
11401e04c3fSmrg   uint8_t T2 = (in >> (2*n+2)) & 0x1;
11501e04c3fSmrg   uint8_t T3 = (in >> (2*n+3)) & 0x1;
11601e04c3fSmrg   uint8_t T4 = (in >> (3*n+4)) & 0x1;
11701e04c3fSmrg   uint8_t T5 = (in >> (4*n+5)) & 0x1;
11801e04c3fSmrg   uint8_t T6 = (in >> (4*n+6)) & 0x1;
11901e04c3fSmrg   uint8_t T7 = (in >> (5*n+7)) & 0x1;
12001e04c3fSmrg   uint8_t mmask = (1 << n) - 1;
12101e04c3fSmrg   uint8_t m0 = (in >> (0)) & mmask;
12201e04c3fSmrg   uint8_t m1 = (in >> (n+2)) & mmask;
12301e04c3fSmrg   uint8_t m2 = (in >> (2*n+4)) & mmask;
12401e04c3fSmrg   uint8_t m3 = (in >> (3*n+5)) & mmask;
12501e04c3fSmrg   uint8_t m4 = (in >> (4*n+7)) & mmask;
12601e04c3fSmrg
12701e04c3fSmrg   uint8_t C;
12801e04c3fSmrg   uint8_t t4, t3, t2, t1, t0;
12901e04c3fSmrg   if (CAT_BITS_3(T4, T3, T2) == 0x7) {
13001e04c3fSmrg      C = CAT_BITS_5(T7, T6, T5, T1, T0);
13101e04c3fSmrg      t4 = t3 = 2;
13201e04c3fSmrg   } else {
13301e04c3fSmrg      C = CAT_BITS_5(T4, T3, T2, T1, T0);
13401e04c3fSmrg      if (CAT_BITS_2(T6, T5) == 0x3) {
13501e04c3fSmrg         t4 = 2;
13601e04c3fSmrg         t3 = T7;
13701e04c3fSmrg      } else {
13801e04c3fSmrg         t4 = T7;
13901e04c3fSmrg         t3 = CAT_BITS_2(T6, T5);
14001e04c3fSmrg      }
14101e04c3fSmrg   }
14201e04c3fSmrg
14301e04c3fSmrg   if ((C & 0x3) == 0x3) {
14401e04c3fSmrg      t2 = 2;
14501e04c3fSmrg      t1 = (C >> 4) & 0x1;
14601e04c3fSmrg      uint8_t C3 = (C >> 3) & 0x1;
14701e04c3fSmrg      uint8_t C2 = (C >> 2) & 0x1;
14801e04c3fSmrg      t0 = (C3 << 1) | (C2 & ~C3);
14901e04c3fSmrg   } else if (((C >> 2) & 0x3) == 0x3) {
15001e04c3fSmrg      t2 = 2;
15101e04c3fSmrg      t1 = 2;
15201e04c3fSmrg      t0 = C & 0x3;
15301e04c3fSmrg   } else {
15401e04c3fSmrg      t2 = (C >> 4) & 0x1;
15501e04c3fSmrg      t1 = (C >> 2) & 0x3;
15601e04c3fSmrg      uint8_t C1 = (C >> 1) & 0x1;
15701e04c3fSmrg      uint8_t C0 = (C >> 0) & 0x1;
15801e04c3fSmrg      t0 = (C1 << 1) | (C0 & ~C1);
15901e04c3fSmrg   }
16001e04c3fSmrg
16101e04c3fSmrg   out[0] = (t0 << n) | m0;
16201e04c3fSmrg   out[1] = (t1 << n) | m1;
16301e04c3fSmrg   out[2] = (t2 << n) | m2;
16401e04c3fSmrg   out[3] = (t3 << n) | m3;
16501e04c3fSmrg   out[4] = (t4 << n) | m4;
16601e04c3fSmrg}
16701e04c3fSmrg
16801e04c3fSmrg/**
16901e04c3fSmrg * Unpack 3n+7 bits from 'in' into 3 output values
17001e04c3fSmrg */
17101e04c3fSmrgstatic void unpack_quint_block(int n, uint32_t in, uint8_t *out)
17201e04c3fSmrg{
17301e04c3fSmrg   assert(n <= 5); /* else output will overflow uint8_t */
17401e04c3fSmrg
17501e04c3fSmrg   uint8_t Q0 = (in >> (n)) & 0x1;
17601e04c3fSmrg   uint8_t Q1 = (in >> (n+1)) & 0x1;
17701e04c3fSmrg   uint8_t Q2 = (in >> (n+2)) & 0x1;
17801e04c3fSmrg   uint8_t Q3 = (in >> (2*n+3)) & 0x1;
17901e04c3fSmrg   uint8_t Q4 = (in >> (2*n+4)) & 0x1;
18001e04c3fSmrg   uint8_t Q5 = (in >> (3*n+5)) & 0x1;
18101e04c3fSmrg   uint8_t Q6 = (in >> (3*n+6)) & 0x1;
18201e04c3fSmrg   uint8_t mmask = (1 << n) - 1;
18301e04c3fSmrg   uint8_t m0 = (in >> (0)) & mmask;
18401e04c3fSmrg   uint8_t m1 = (in >> (n+3)) & mmask;
18501e04c3fSmrg   uint8_t m2 = (in >> (2*n+5)) & mmask;
18601e04c3fSmrg
18701e04c3fSmrg   uint8_t C;
18801e04c3fSmrg   uint8_t q2, q1, q0;
18901e04c3fSmrg   if (CAT_BITS_4(Q6, Q5, Q2, Q1) == 0x3) {
19001e04c3fSmrg      q2 = CAT_BITS_3(Q0, Q4 & ~Q0, Q3 & ~Q0);
19101e04c3fSmrg      q1 = 4;
19201e04c3fSmrg      q0 = 4;
19301e04c3fSmrg   } else {
19401e04c3fSmrg      if (CAT_BITS_2(Q2, Q1) == 0x3) {
19501e04c3fSmrg         q2 = 4;
19601e04c3fSmrg         C = CAT_BITS_5(Q4, Q3, 0x1 & ~Q6, 0x1 & ~Q5, Q0);
19701e04c3fSmrg      } else {
19801e04c3fSmrg         q2 = CAT_BITS_2(Q6, Q5);
19901e04c3fSmrg         C = CAT_BITS_5(Q4, Q3, Q2, Q1, Q0);
20001e04c3fSmrg      }
20101e04c3fSmrg      if ((C & 0x7) == 0x5) {
20201e04c3fSmrg         q1 = 4;
20301e04c3fSmrg         q0 = (C >> 3) & 0x3;
20401e04c3fSmrg      } else {
20501e04c3fSmrg         q1 = (C >> 3) & 0x3;
20601e04c3fSmrg         q0 = C & 0x7;
20701e04c3fSmrg      }
20801e04c3fSmrg   }
20901e04c3fSmrg   out[0] = (q0 << n) | m0;
21001e04c3fSmrg   out[1] = (q1 << n) | m1;
21101e04c3fSmrg   out[2] = (q2 << n) | m2;
21201e04c3fSmrg}
21301e04c3fSmrg
21401e04c3fSmrg
21501e04c3fSmrgstruct uint8x4_t
21601e04c3fSmrg{
21701e04c3fSmrg   uint8_t v[4];
21801e04c3fSmrg
21901e04c3fSmrg   uint8x4_t() { }
22001e04c3fSmrg
22101e04c3fSmrg   uint8x4_t(int a, int b, int c, int d)
22201e04c3fSmrg   {
22301e04c3fSmrg      assert(0 <= a && a <= 255);
22401e04c3fSmrg      assert(0 <= b && b <= 255);
22501e04c3fSmrg      assert(0 <= c && c <= 255);
22601e04c3fSmrg      assert(0 <= d && d <= 255);
22701e04c3fSmrg      v[0] = a;
22801e04c3fSmrg      v[1] = b;
22901e04c3fSmrg      v[2] = c;
23001e04c3fSmrg      v[3] = d;
23101e04c3fSmrg   }
23201e04c3fSmrg
23301e04c3fSmrg   static uint8x4_t clamped(int a, int b, int c, int d)
23401e04c3fSmrg   {
23501e04c3fSmrg      uint8x4_t r;
23601e04c3fSmrg      r.v[0] = MAX2(0, MIN2(255, a));
23701e04c3fSmrg      r.v[1] = MAX2(0, MIN2(255, b));
23801e04c3fSmrg      r.v[2] = MAX2(0, MIN2(255, c));
23901e04c3fSmrg      r.v[3] = MAX2(0, MIN2(255, d));
24001e04c3fSmrg      return r;
24101e04c3fSmrg   }
24201e04c3fSmrg};
24301e04c3fSmrg
24401e04c3fSmrgstatic uint8x4_t blue_contract(int r, int g, int b, int a)
24501e04c3fSmrg{
24601e04c3fSmrg   return uint8x4_t((r+b) >> 1, (g+b) >> 1, b, a);
24701e04c3fSmrg}
24801e04c3fSmrg
24901e04c3fSmrgstatic uint8x4_t blue_contract_clamped(int r, int g, int b, int a)
25001e04c3fSmrg{
25101e04c3fSmrg   return uint8x4_t::clamped((r+b) >> 1, (g+b) >> 1, b, a);
25201e04c3fSmrg}
25301e04c3fSmrg
/**
 * Move bit 7 of \p a into bit 7 of \p b and turn the rest of \p a into a
 * signed 6-bit quantity.
 *
 * On return, b holds (b >> 1) with bit 7 of the original a OR-ed in, and a
 * holds bits 1..6 of its original value, sign-extended from bit 5
 * (range -32..31).
 */
static void bit_transfer_signed(int &a, int &b)
{
   b >>= 1;
   b |= a & 0x80;

   const int six_bits = (a >> 1) & 0x3f;
   a = (six_bits & 0x20) ? six_bits - 0x40 : six_bits;
}
26301e04c3fSmrg
/**
 * 32-bit integer mixing function used to derive the pseudo-random seeds for
 * partition selection. All arithmetic wraps modulo 2^32; the order of the
 * steps matters and must not be changed.
 */
static uint32_t hash52(uint32_t p)
{
   p = p ^ (p >> 15);
   p = p - (p << 17);
   p = p + (p << 7);
   p = p + (p << 4);
   p = p ^ (p >> 5);
   p = p + (p << 16);
   p = p ^ (p >> 7);
   p = p ^ (p >> 3);
   p = p ^ (p << 6);
   p = p ^ (p >> 17);
   return p;
}
27801e04c3fSmrg
27901e04c3fSmrgstatic int select_partition(int seed, int x, int y, int z, int partitioncount,
28001e04c3fSmrg                            int small_block)
28101e04c3fSmrg{
28201e04c3fSmrg   if (small_block) {
28301e04c3fSmrg      x <<= 1;
28401e04c3fSmrg      y <<= 1;
28501e04c3fSmrg      z <<= 1;
28601e04c3fSmrg   }
28701e04c3fSmrg   seed += (partitioncount - 1) * 1024;
28801e04c3fSmrg   uint32_t rnum = hash52(seed);
28901e04c3fSmrg   uint8_t seed1 = rnum & 0xF;
29001e04c3fSmrg   uint8_t seed2 = (rnum >> 4) & 0xF;
29101e04c3fSmrg   uint8_t seed3 = (rnum >> 8) & 0xF;
29201e04c3fSmrg   uint8_t seed4 = (rnum >> 12) & 0xF;
29301e04c3fSmrg   uint8_t seed5 = (rnum >> 16) & 0xF;
29401e04c3fSmrg   uint8_t seed6 = (rnum >> 20) & 0xF;
29501e04c3fSmrg   uint8_t seed7 = (rnum >> 24) & 0xF;
29601e04c3fSmrg   uint8_t seed8 = (rnum >> 28) & 0xF;
29701e04c3fSmrg   uint8_t seed9 = (rnum >> 18) & 0xF;
29801e04c3fSmrg   uint8_t seed10 = (rnum >> 22) & 0xF;
29901e04c3fSmrg   uint8_t seed11 = (rnum >> 26) & 0xF;
30001e04c3fSmrg   uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF;
30101e04c3fSmrg
30201e04c3fSmrg   seed1 *= seed1;
30301e04c3fSmrg   seed2 *= seed2;
30401e04c3fSmrg   seed3 *= seed3;
30501e04c3fSmrg   seed4 *= seed4;
30601e04c3fSmrg   seed5 *= seed5;
30701e04c3fSmrg   seed6 *= seed6;
30801e04c3fSmrg   seed7 *= seed7;
30901e04c3fSmrg   seed8 *= seed8;
31001e04c3fSmrg   seed9 *= seed9;
31101e04c3fSmrg   seed10 *= seed10;
31201e04c3fSmrg   seed11 *= seed11;
31301e04c3fSmrg   seed12 *= seed12;
31401e04c3fSmrg
31501e04c3fSmrg   int sh1, sh2, sh3;
31601e04c3fSmrg   if (seed & 1) {
31701e04c3fSmrg      sh1 = (seed & 2 ? 4 : 5);
31801e04c3fSmrg      sh2 = (partitioncount == 3 ? 6 : 5);
31901e04c3fSmrg   } else {
32001e04c3fSmrg      sh1 = (partitioncount == 3 ? 6 : 5);
32101e04c3fSmrg      sh2 = (seed & 2 ? 4 : 5);
32201e04c3fSmrg   }
32301e04c3fSmrg   sh3 = (seed & 0x10) ? sh1 : sh2;
32401e04c3fSmrg
32501e04c3fSmrg   seed1 >>= sh1;
32601e04c3fSmrg   seed2 >>= sh2;
32701e04c3fSmrg   seed3 >>= sh1;
32801e04c3fSmrg   seed4 >>= sh2;
32901e04c3fSmrg   seed5 >>= sh1;
33001e04c3fSmrg   seed6 >>= sh2;
33101e04c3fSmrg   seed7 >>= sh1;
33201e04c3fSmrg   seed8 >>= sh2;
33301e04c3fSmrg   seed9 >>= sh3;
33401e04c3fSmrg   seed10 >>= sh3;
33501e04c3fSmrg   seed11 >>= sh3;
33601e04c3fSmrg   seed12 >>= sh3;
33701e04c3fSmrg
33801e04c3fSmrg   int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
33901e04c3fSmrg   int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
34001e04c3fSmrg   int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
34101e04c3fSmrg   int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
34201e04c3fSmrg
34301e04c3fSmrg   a &= 0x3F;
34401e04c3fSmrg   b &= 0x3F;
34501e04c3fSmrg   c &= 0x3F;
34601e04c3fSmrg   d &= 0x3F;
34701e04c3fSmrg
34801e04c3fSmrg   if (partitioncount < 4)
34901e04c3fSmrg      d = 0;
35001e04c3fSmrg   if (partitioncount < 3)
35101e04c3fSmrg      c = 0;
35201e04c3fSmrg
35301e04c3fSmrg   if (a >= b && a >= c && a >= d)
35401e04c3fSmrg      return 0;
35501e04c3fSmrg   else if (b >= c && b >= d)
35601e04c3fSmrg      return 1;
35701e04c3fSmrg   else if (c >= d)
35801e04c3fSmrg      return 2;
35901e04c3fSmrg   else
36001e04c3fSmrg      return 3;
36101e04c3fSmrg}
36201e04c3fSmrg
36301e04c3fSmrg
/**
 * Read-only view of one 128-bit block stored as four 32-bit words.
 * Bit 0 is the least significant bit of data[0]; bit 127 is the most
 * significant bit of data[3].
 */
struct InputBitVector
{
   uint32_t data[4];

   /**
    * Debug helper: print a 128-character line (bit 127 first) in which the
    * 'count' bits starting at 'offset' are shown as '0'/'1' and all other
    * positions as '.', followed by a printf-style message and a newline.
    */
   void printf_bits(int offset, int count, const char *fmt = "", ...)
   {
      /* The window must lie inside the block, otherwise the loop below would
       * index outside both 'data' and 'out'.
       */
      assert(offset >= 0 && count >= 0 && offset + count <= 128);

      char out[129];
      memset(out, '.', 128);
      out[128] = '\0';
      int idx = offset;
      for (int i = 0; i < count; ++i) {
         out[127 - idx] = ((data[idx >> 5] >> (idx & 31)) & 1) ? '1' : '0';
         ++idx;
      }
      printf("%s ", out);
      va_list ap;
      va_start(ap, fmt);
      vprintf(fmt, ap);
      va_end(ap);
      printf("\n");
   }

   /**
    * Return 'count' bits (0..31) starting at bit 'offset', with the bit at
    * 'offset' in the least significant position. Bits at or beyond position
    * 128 read as zero.
    */
   uint32_t get_bits(int offset, int count)
   {
      assert(offset >= 0); /* a negative offset would give negative shifts */
      assert(count >= 0 && count < 32);

      /* Every shift below is guarded so that its amount stays in 0..31. */
      uint32_t out = 0;
      if (offset < 32)
         out |= data[0] >> offset;

      if (0 < offset && offset <= 32)
         out |= data[1] << (32 - offset);
      if (32 < offset && offset < 64)
         out |= data[1] >> (offset - 32);

      if (32 < offset && offset <= 64)
         out |= data[2] << (64 - offset);
      if (64 < offset && offset < 96)
         out |= data[2] >> (offset - 64);

      if (64 < offset && offset <= 96)
         out |= data[3] << (96 - offset);
      if (96 < offset && offset < 128)
         out |= data[3] >> (offset - 96);

      /* Build the mask in unsigned arithmetic: with a plain int,
       * (1 << 31) - 1 overflows for count == 31.
       */
      out &= ((uint32_t)1 << count) - 1;
      return out;
   }

   /**
    * 64-bit variant of get_bits(): return 'count' bits (0..63) starting at
    * bit 'offset'. Bits at or beyond position 128 read as zero.
    */
   uint64_t get_bits64(int offset, int count)
   {
      assert(offset >= 0); /* a negative offset would give negative shifts */
      assert(count >= 0 && count < 64);

      /* Left shifts are done on 64-bit values, so amounts up to 63 are fine. */
      uint64_t out = 0;
      if (offset < 32)
         out |= data[0] >> offset;

      if (offset <= 32)
         out |= (uint64_t)data[1] << (32 - offset);
      if (32 < offset && offset < 64)
         out |= data[1] >> (offset - 32);

      if (0 < offset && offset <= 64)
         out |= (uint64_t)data[2] << (64 - offset);
      if (64 < offset && offset < 96)
         out |= data[2] >> (offset - 64);

      if (32 < offset && offset <= 96)
         out |= (uint64_t)data[3] << (96 - offset);
      if (96 < offset && offset < 128)
         out |= data[3] >> (offset - 96);

      out &= ((uint64_t)1 << count) - 1;
      return out;
   }

   /**
    * Return the 'count' bits that end just below bit 'offset' (that is, bits
    * offset-count .. offset-1) in reversed order: bit offset-1 ends up in the
    * least significant position of the result.
    */
   uint32_t get_bits_rev(int offset, int count)
   {
      assert(offset >= count);
      uint32_t tmp = get_bits(offset - count, count);
      uint32_t out = 0;
      for (int i = 0; i < count; ++i)
         out |= ((tmp >> i) & 1) << (count - 1 - i);
      return out;
   }
};
45001e04c3fSmrg
45101e04c3fSmrgstruct OutputBitVector
45201e04c3fSmrg{
45301e04c3fSmrg   uint32_t data[4];
45401e04c3fSmrg   int offset;
45501e04c3fSmrg
45601e04c3fSmrg   OutputBitVector()
45701e04c3fSmrg      : offset(0)
45801e04c3fSmrg   {
45901e04c3fSmrg      memset(data, 0, sizeof(data));
46001e04c3fSmrg   }
46101e04c3fSmrg
46201e04c3fSmrg   void append(uint32_t value, int size)
46301e04c3fSmrg   {
46401e04c3fSmrg      if (VERBOSE_WRITE)
46501e04c3fSmrg         printf("append offset=%d size=%d values=0x%x\n", offset, size, value);
46601e04c3fSmrg
46701e04c3fSmrg      assert(offset + size <= 128);
46801e04c3fSmrg
46901e04c3fSmrg      assert(size <= 32);
47001e04c3fSmrg      if (size < 32)
47101e04c3fSmrg         assert((value >> size) == 0);
47201e04c3fSmrg
47301e04c3fSmrg      while (size) {
47401e04c3fSmrg         int c = MIN2(size, 32 - (offset & 31));
47501e04c3fSmrg         data[offset >> 5] |= (value << (offset & 31));
47601e04c3fSmrg         offset += c;
47701e04c3fSmrg         size -= c;
47801e04c3fSmrg         value >>= c;
47901e04c3fSmrg      }
48001e04c3fSmrg   }
48101e04c3fSmrg
48201e04c3fSmrg   void append64(uint64_t value, int size)
48301e04c3fSmrg   {
48401e04c3fSmrg      if (VERBOSE_WRITE)
48501e04c3fSmrg         printf("append offset=%d size=%d values=0x%llx\n", offset, size, (unsigned long long)value);
48601e04c3fSmrg
48701e04c3fSmrg      assert(offset + size <= 128);
48801e04c3fSmrg
48901e04c3fSmrg      assert(size <= 64);
49001e04c3fSmrg      if (size < 64)
49101e04c3fSmrg         assert((value >> size) == 0);
49201e04c3fSmrg
49301e04c3fSmrg      while (size) {
49401e04c3fSmrg         int c = MIN2(size, 32 - (offset & 31));
49501e04c3fSmrg         data[offset >> 5] |= (value << (offset & 31));
49601e04c3fSmrg         offset += c;
49701e04c3fSmrg         size -= c;
49801e04c3fSmrg         value >>= c;
49901e04c3fSmrg      }
50001e04c3fSmrg   }
50101e04c3fSmrg
50201e04c3fSmrg   void append(OutputBitVector &v, int size)
50301e04c3fSmrg   {
50401e04c3fSmrg      if (VERBOSE_WRITE)
50501e04c3fSmrg         printf("append vector offset=%d size=%d\n", offset, size);
50601e04c3fSmrg
50701e04c3fSmrg      assert(offset + size <= 128);
50801e04c3fSmrg      int i = 0;
50901e04c3fSmrg      while (size >= 32) {
51001e04c3fSmrg         append(v.data[i++], 32);
51101e04c3fSmrg         size -= 32;
51201e04c3fSmrg      }
51301e04c3fSmrg      if (size > 0)
51401e04c3fSmrg         append(v.data[i] & ((1 << size) - 1), size);
51501e04c3fSmrg   }
51601e04c3fSmrg
51701e04c3fSmrg   void append_end(OutputBitVector &v, int size)
51801e04c3fSmrg   {
51901e04c3fSmrg      for (int i = 0; i < size; ++i)
52001e04c3fSmrg         data[(127 - i) >> 5] |= ((v.data[i >> 5] >> (i & 31)) & 1) << ((127 - i) & 31);
52101e04c3fSmrg   }
52201e04c3fSmrg
52301e04c3fSmrg   /* Insert the given number of '1' bits. (We could use 0s instead, but 1s are
52401e04c3fSmrg    * more likely to flush out bugs where we accidentally read undefined bits.)
52501e04c3fSmrg    */
52601e04c3fSmrg   void skip(int size)
52701e04c3fSmrg   {
52801e04c3fSmrg      if (VERBOSE_WRITE)
52901e04c3fSmrg         printf("skip offset=%d size=%d\n", offset, size);
53001e04c3fSmrg
53101e04c3fSmrg      assert(offset + size <= 128);
53201e04c3fSmrg      while (size >= 32) {
53301e04c3fSmrg         append(0xffffffff, 32);
53401e04c3fSmrg         size -= 32;
53501e04c3fSmrg      }
53601e04c3fSmrg      if (size > 0)
53701e04c3fSmrg         append(0xffffffff >> (32 - size), size);
53801e04c3fSmrg   }
53901e04c3fSmrg};
54001e04c3fSmrg
54101e04c3fSmrg
/**
 * Per-format decoder configuration: the block footprint plus the output
 * options, and the entry point that decodes one block.
 */
class Decoder
{
public:
   Decoder(int block_w, int block_h, int block_d, bool srgb, bool output_unorm8)
      : block_w(block_w), block_h(block_h), block_d(block_d), srgb(srgb),
        output_unorm8(output_unorm8) {}

   /* Decode the 16-byte block at 'in' into 'output', which receives four
    * uint16_t values per texel for block_w * block_h * block_d texels.
    * Returns decode_error::ok on success; on any other result the output is
    * filled with the error colour.
    */
   decode_error::type decode(const uint8_t *in, uint16_t *output) const;

   /* Block footprint in texels. */
   int block_w, block_h, block_d;
   /* output_unorm8 selects 8-bit values over FP16 bit patterns in 'output';
    * srgb is only valid together with output_unorm8 (asserted in decode()).
    */
   bool srgb, output_unorm8;
};
55401e04c3fSmrg
/**
 * Working state for decoding a single 128-bit block. The fields are filled
 * in stage by stage by the member functions declared at the bottom; the
 * comments name the function that computes each group.
 */
struct Block
{
   bool is_error;
   bool bogus_colour_endpoints;
   bool bogus_weights;

   /* Set by decode_block_mode(): high_prec is bit 9 and dual_plane is bit 10
    * of the block, except for the "BB10AARRR00" layout, which forces both to
    * 0. wt_range is assembled from the layout's R bits; wt_w and wt_h come
    * from its A and B fields.
    */
   int high_prec;
   int dual_plane;
   int colour_component_selector;
   int wt_range;
   int wt_w, wt_h, wt_d;
   int num_parts;
   /* Bits 13..22 of the block; read by decode_cem() when num_parts > 1. */
   int partition_index;

   /* Set by decode_void_extent(): void_extent_d is bit 9, the four extent
    * coordinates are 13-bit fields starting at bit 12, and the four colour
    * components are 16-bit fields starting at bit 64.
    */
   bool is_void_extent;
   int void_extent_d;
   int void_extent_min_s;
   int void_extent_max_s;
   int void_extent_min_t;
   int void_extent_max_t;
   uint16_t void_extent_colour_r;
   uint16_t void_extent_colour_g;
   uint16_t void_extent_colour_b;
   uint16_t void_extent_colour_a;

   /* Set by decode_cem(), which starts by resetting cems[] to -1 and both
    * extra-bit fields to 0.
    */
   bool is_multi_cem;
   int num_extra_cem_bits;
   int colour_endpoint_data_offset;
   int extra_cem_bits;
   int cem_base_class;
   int cems[4];

   int num_cem_values;

   /* Calculated by unpack_weights(): */
   uint8_t weights_quant[64 + 4]; /* max 64 values, plus padding for overflows in trit parsing */

   /* Calculated by unquantise_weights(): */
   uint8_t weights[64 + 18]; /* max 64 values, plus padding for the infill interpolation */

   /* Calculated by unpack_colour_endpoints(): */
   uint8_t colour_endpoints_quant[18 + 4]; /* max 18 values, plus padding for overflows in trit parsing */

   /* Calculated by unquantise_colour_endpoints(): */
   uint8_t colour_endpoints[18];

   /* Calculated by calculate_from_weights(): */
   int wt_trits;
   int wt_quints;
   int wt_bits;
   int wt_max;
   int num_weights;
   int weight_bits;

   /* Calculated by calculate_remaining_bits(): */
   int remaining_bits;

   /* Calculated by calculate_colour_endpoints_size(): */
   int colour_endpoint_bits;
   int ce_max;
   int ce_trits;
   int ce_quints;
   int ce_bits;

   /* Calculated by compute_infill_weights(): */
   uint8_t infill_weights[2][216]; /* large enough for 6x6x6 */

   /* Calculated by decode_colour_endpoints(): */
   uint8x4_t endpoints_decoded[2][4];

   void calculate_from_weights();
   void calculate_remaining_bits();
   decode_error::type calculate_colour_endpoints_size();

   void unquantise_weights();
   void unquantise_colour_endpoints();

   /* Top-level entry point, called by Decoder::decode(). */
   decode_error::type decode(const Decoder &decoder, InputBitVector in);

   decode_error::type decode_block_mode(InputBitVector in);
   decode_error::type decode_void_extent(InputBitVector in);
   void decode_cem(InputBitVector in);
   void unpack_colour_endpoints(InputBitVector in);
   void decode_colour_endpoints();
   void unpack_weights(InputBitVector in);
   void compute_infill_weights(int block_w, int block_h, int block_d);

   /* Called by Decoder::decode() after a successful decode(). */
   void write_decoded(const Decoder &decoder, uint16_t *output);
};
64401e04c3fSmrg
64501e04c3fSmrg
64601e04c3fSmrgdecode_error::type Decoder::decode(const uint8_t *in, uint16_t *output) const
64701e04c3fSmrg{
64801e04c3fSmrg   Block blk;
64901e04c3fSmrg   InputBitVector in_vec;
65001e04c3fSmrg   memcpy(&in_vec.data, in, 16);
65101e04c3fSmrg   decode_error::type err = blk.decode(*this, in_vec);
65201e04c3fSmrg   if (err == decode_error::ok) {
65301e04c3fSmrg      blk.write_decoded(*this, output);
65401e04c3fSmrg   } else {
65501e04c3fSmrg      /* Fill output with the error colour */
65601e04c3fSmrg      for (int i = 0; i < block_w * block_h * block_d; ++i) {
65701e04c3fSmrg         if (output_unorm8) {
65801e04c3fSmrg            output[i*4+0] = 0xff;
65901e04c3fSmrg            output[i*4+1] = 0;
66001e04c3fSmrg            output[i*4+2] = 0xff;
66101e04c3fSmrg            output[i*4+3] = 0xff;
66201e04c3fSmrg         } else {
66301e04c3fSmrg            assert(!srgb); /* srgb must use unorm8 */
66401e04c3fSmrg
66501e04c3fSmrg            output[i*4+0] = FP16_ONE;
66601e04c3fSmrg            output[i*4+1] = FP16_ZERO;
66701e04c3fSmrg            output[i*4+2] = FP16_ONE;
66801e04c3fSmrg            output[i*4+3] = FP16_ONE;
66901e04c3fSmrg         }
67001e04c3fSmrg      }
67101e04c3fSmrg   }
67201e04c3fSmrg   return err;
67301e04c3fSmrg}
67401e04c3fSmrg
67501e04c3fSmrg
67601e04c3fSmrgdecode_error::type Block::decode_void_extent(InputBitVector block)
67701e04c3fSmrg{
67801e04c3fSmrg   /* TODO: 3D */
67901e04c3fSmrg
68001e04c3fSmrg   is_void_extent = true;
68101e04c3fSmrg   void_extent_d = block.get_bits(9, 1);
68201e04c3fSmrg   void_extent_min_s = block.get_bits(12, 13);
68301e04c3fSmrg   void_extent_max_s = block.get_bits(25, 13);
68401e04c3fSmrg   void_extent_min_t = block.get_bits(38, 13);
68501e04c3fSmrg   void_extent_max_t = block.get_bits(51, 13);
68601e04c3fSmrg   void_extent_colour_r = block.get_bits(64, 16);
68701e04c3fSmrg   void_extent_colour_g = block.get_bits(80, 16);
68801e04c3fSmrg   void_extent_colour_b = block.get_bits(96, 16);
68901e04c3fSmrg   void_extent_colour_a = block.get_bits(112, 16);
69001e04c3fSmrg
69101e04c3fSmrg   /* TODO: maybe we should do something useful with the extent coordinates? */
69201e04c3fSmrg
69301e04c3fSmrg   if (void_extent_d) {
69401e04c3fSmrg      return decode_error::unsupported_hdr_void_extent;
69501e04c3fSmrg   }
69601e04c3fSmrg
69701e04c3fSmrg   if (void_extent_min_s == 0x1fff && void_extent_max_s == 0x1fff
69801e04c3fSmrg       && void_extent_min_t == 0x1fff && void_extent_max_t == 0x1fff) {
69901e04c3fSmrg
70001e04c3fSmrg      /* No extents */
70101e04c3fSmrg
70201e04c3fSmrg   } else {
70301e04c3fSmrg
70401e04c3fSmrg      /* Check for illegal encoding */
70501e04c3fSmrg      if (void_extent_min_s >= void_extent_max_s || void_extent_min_t >= void_extent_max_t) {
70601e04c3fSmrg         return decode_error::invalid_range_in_void_extent;
70701e04c3fSmrg      }
70801e04c3fSmrg   }
70901e04c3fSmrg
71001e04c3fSmrg   return decode_error::ok;
71101e04c3fSmrg}
71201e04c3fSmrg
71301e04c3fSmrgdecode_error::type Block::decode_block_mode(InputBitVector in)
71401e04c3fSmrg{
71501e04c3fSmrg   dual_plane = in.get_bits(10, 1);
71601e04c3fSmrg   high_prec = in.get_bits(9, 1);
71701e04c3fSmrg
71801e04c3fSmrg   if (in.get_bits(0, 2) != 0x0) {
71901e04c3fSmrg      wt_range = (in.get_bits(0, 2) << 1) | in.get_bits(4, 1);
72001e04c3fSmrg      int a = in.get_bits(5, 2);
72101e04c3fSmrg      int b = in.get_bits(7, 2);
72201e04c3fSmrg      switch (in.get_bits(2, 2)) {
72301e04c3fSmrg      case 0x0:
72401e04c3fSmrg         if (VERBOSE_DECODE)
72501e04c3fSmrg            in.printf_bits(0, 11, "DHBBAAR00RR");
72601e04c3fSmrg         wt_w = b + 4;
72701e04c3fSmrg         wt_h = a + 2;
72801e04c3fSmrg         break;
72901e04c3fSmrg      case 0x1:
73001e04c3fSmrg         if (VERBOSE_DECODE)
73101e04c3fSmrg            in.printf_bits(0, 11, "DHBBAAR01RR");
73201e04c3fSmrg         wt_w = b + 8;
73301e04c3fSmrg         wt_h = a + 2;
73401e04c3fSmrg         break;
73501e04c3fSmrg      case 0x2:
73601e04c3fSmrg         if (VERBOSE_DECODE)
73701e04c3fSmrg            in.printf_bits(0, 11, "DHBBAAR10RR");
73801e04c3fSmrg         wt_w = a + 2;
73901e04c3fSmrg         wt_h = b + 8;
74001e04c3fSmrg         break;
74101e04c3fSmrg      case 0x3:
74201e04c3fSmrg         if ((b & 0x2) == 0) {
74301e04c3fSmrg            if (VERBOSE_DECODE)
74401e04c3fSmrg               in.printf_bits(0, 11, "DH0BAAR11RR");
74501e04c3fSmrg            wt_w = a + 2;
74601e04c3fSmrg            wt_h = b + 6;
74701e04c3fSmrg         } else {
74801e04c3fSmrg            if (VERBOSE_DECODE)
74901e04c3fSmrg               in.printf_bits(0, 11, "DH1BAAR11RR");
75001e04c3fSmrg            wt_w = (b & 0x1) + 2;
75101e04c3fSmrg            wt_h = a + 2;
75201e04c3fSmrg         }
75301e04c3fSmrg         break;
75401e04c3fSmrg      }
75501e04c3fSmrg   } else {
75601e04c3fSmrg      if (in.get_bits(6, 3) == 0x7) {
75701e04c3fSmrg         if (in.get_bits(0, 9) == 0x1fc) {
75801e04c3fSmrg            if (VERBOSE_DECODE)
75901e04c3fSmrg               in.printf_bits(0, 11, "xx111111100 (void extent)");
76001e04c3fSmrg            return decode_void_extent(in);
76101e04c3fSmrg         } else {
76201e04c3fSmrg            if (VERBOSE_DECODE)
76301e04c3fSmrg               in.printf_bits(0, 11, "xx111xxxx00");
76401e04c3fSmrg            return decode_error::reserved_block_mode_1;
76501e04c3fSmrg         }
76601e04c3fSmrg      }
76701e04c3fSmrg      if (in.get_bits(0, 4) == 0x0) {
76801e04c3fSmrg         if (VERBOSE_DECODE)
76901e04c3fSmrg            in.printf_bits(0, 11, "xxxxxxx0000");
77001e04c3fSmrg         return decode_error::reserved_block_mode_2;
77101e04c3fSmrg      }
77201e04c3fSmrg
77301e04c3fSmrg      wt_range = in.get_bits(1, 3) | in.get_bits(4, 1);
77401e04c3fSmrg      int a = in.get_bits(5, 2);
77501e04c3fSmrg      int b;
77601e04c3fSmrg
77701e04c3fSmrg      switch (in.get_bits(7, 2)) {
77801e04c3fSmrg      case 0x0:
77901e04c3fSmrg         if (VERBOSE_DECODE)
78001e04c3fSmrg            in.printf_bits(0, 11, "DH00AARRR00");
78101e04c3fSmrg         wt_w = 12;
78201e04c3fSmrg         wt_h = a + 2;
78301e04c3fSmrg         break;
78401e04c3fSmrg      case 0x1:
78501e04c3fSmrg         if (VERBOSE_DECODE)
78601e04c3fSmrg            in.printf_bits(0, 11, "DH01AARRR00");
78701e04c3fSmrg         wt_w = a + 2;
78801e04c3fSmrg         wt_h = 12;
78901e04c3fSmrg         break;
79001e04c3fSmrg      case 0x3:
79101e04c3fSmrg         if (in.get_bits(5, 1) == 0) {
79201e04c3fSmrg            if (VERBOSE_DECODE)
79301e04c3fSmrg               in.printf_bits(0, 11, "DH1100RRR00");
79401e04c3fSmrg            wt_w = 6;
79501e04c3fSmrg            wt_h = 10;
79601e04c3fSmrg         } else {
79701e04c3fSmrg            if (VERBOSE_DECODE)
79801e04c3fSmrg               in.printf_bits(0, 11, "DH1101RRR00");
79901e04c3fSmrg            wt_w = 10;
80001e04c3fSmrg            wt_h = 6;
80101e04c3fSmrg         }
80201e04c3fSmrg         break;
80301e04c3fSmrg      case 0x2:
80401e04c3fSmrg         if (VERBOSE_DECODE)
80501e04c3fSmrg            in.printf_bits(0, 11, "BB10AARRR00");
80601e04c3fSmrg         b = in.get_bits(9, 2);
80701e04c3fSmrg         wt_w = a + 6;
80801e04c3fSmrg         wt_h = b + 6;
80901e04c3fSmrg         dual_plane = 0;
81001e04c3fSmrg         high_prec = 0;
81101e04c3fSmrg         break;
81201e04c3fSmrg      }
81301e04c3fSmrg   }
81401e04c3fSmrg   return decode_error::ok;
81501e04c3fSmrg}
81601e04c3fSmrg
81701e04c3fSmrgvoid Block::decode_cem(InputBitVector in)
81801e04c3fSmrg{
81901e04c3fSmrg   cems[0] = cems[1] = cems[2] = cems[3] = -1;
82001e04c3fSmrg
82101e04c3fSmrg   num_extra_cem_bits = 0;
82201e04c3fSmrg   extra_cem_bits = 0;
82301e04c3fSmrg
82401e04c3fSmrg   if (num_parts > 1) {
82501e04c3fSmrg
82601e04c3fSmrg      partition_index = in.get_bits(13, 10);
82701e04c3fSmrg      if (VERBOSE_DECODE)
82801e04c3fSmrg         in.printf_bits(13, 10, "partition ID (%d)", partition_index);
82901e04c3fSmrg
83001e04c3fSmrg      uint32_t cem = in.get_bits(23, 6);
83101e04c3fSmrg
83201e04c3fSmrg      if ((cem & 0x3) == 0x0) {
83301e04c3fSmrg         cem >>= 2;
83401e04c3fSmrg         cem_base_class = cem >> 2;
83501e04c3fSmrg         is_multi_cem = false;
83601e04c3fSmrg
83701e04c3fSmrg         for (int i = 0; i < num_parts; ++i)
83801e04c3fSmrg            cems[i] = cem;
83901e04c3fSmrg
84001e04c3fSmrg         if (VERBOSE_DECODE)
84101e04c3fSmrg            in.printf_bits(23, 6, "CEM (single, %d)", cem);
84201e04c3fSmrg      } else {
84301e04c3fSmrg
84401e04c3fSmrg         cem_base_class = (cem & 0x3) - 1;
84501e04c3fSmrg         is_multi_cem = true;
84601e04c3fSmrg
84701e04c3fSmrg         if (VERBOSE_DECODE)
84801e04c3fSmrg            in.printf_bits(23, 6, "CEM (multi, base class %d)", cem_base_class);
84901e04c3fSmrg
85001e04c3fSmrg         int offset = 128 - weight_bits;
85101e04c3fSmrg
85201e04c3fSmrg         if (num_parts == 2) {
85301e04c3fSmrg            if (VERBOSE_DECODE) {
85401e04c3fSmrg               in.printf_bits(25, 4, "M0M0 C1 C0");
85501e04c3fSmrg               in.printf_bits(offset - 2, 2, "M1M1");
85601e04c3fSmrg            }
85701e04c3fSmrg
85801e04c3fSmrg            uint32_t c0 = in.get_bits(25, 1);
85901e04c3fSmrg            uint32_t c1 = in.get_bits(26, 1);
86001e04c3fSmrg
86101e04c3fSmrg            extra_cem_bits = c0 + c1;
86201e04c3fSmrg
86301e04c3fSmrg            num_extra_cem_bits = 2;
86401e04c3fSmrg
86501e04c3fSmrg            uint32_t m0 = in.get_bits(27, 2);
86601e04c3fSmrg            uint32_t m1 = in.get_bits(offset - 2, 2);
86701e04c3fSmrg
86801e04c3fSmrg            cems[0] = ((cem_base_class + c0) << 2) | m0;
86901e04c3fSmrg            cems[1] = ((cem_base_class + c1) << 2) | m1;
87001e04c3fSmrg
87101e04c3fSmrg         } else if (num_parts == 3) {
87201e04c3fSmrg            if (VERBOSE_DECODE) {
87301e04c3fSmrg               in.printf_bits(25, 4, "M0 C2 C1 C0");
87401e04c3fSmrg               in.printf_bits(offset - 5, 5, "M2M2 M1M1 M0");
87501e04c3fSmrg            }
87601e04c3fSmrg
87701e04c3fSmrg            uint32_t c0 = in.get_bits(25, 1);
87801e04c3fSmrg            uint32_t c1 = in.get_bits(26, 1);
87901e04c3fSmrg            uint32_t c2 = in.get_bits(27, 1);
88001e04c3fSmrg
88101e04c3fSmrg            extra_cem_bits = c0 + c1 + c2;
88201e04c3fSmrg
88301e04c3fSmrg            num_extra_cem_bits = 5;
88401e04c3fSmrg
88501e04c3fSmrg            uint32_t m0 = in.get_bits(28, 1) | (in.get_bits(128 - weight_bits - 5, 1) << 1);
88601e04c3fSmrg            uint32_t m1 = in.get_bits(offset - 4, 2);
88701e04c3fSmrg            uint32_t m2 = in.get_bits(offset - 2, 2);
88801e04c3fSmrg
88901e04c3fSmrg            cems[0] = ((cem_base_class + c0) << 2) | m0;
89001e04c3fSmrg            cems[1] = ((cem_base_class + c1) << 2) | m1;
89101e04c3fSmrg            cems[2] = ((cem_base_class + c2) << 2) | m2;
89201e04c3fSmrg
89301e04c3fSmrg         } else if (num_parts == 4) {
89401e04c3fSmrg            if (VERBOSE_DECODE) {
89501e04c3fSmrg               in.printf_bits(25, 4, "C3 C2 C1 C0");
89601e04c3fSmrg               in.printf_bits(offset - 8, 8, "M3M3 M2M2 M1M1 M0M0");
89701e04c3fSmrg            }
89801e04c3fSmrg
89901e04c3fSmrg            uint32_t c0 = in.get_bits(25, 1);
90001e04c3fSmrg            uint32_t c1 = in.get_bits(26, 1);
90101e04c3fSmrg            uint32_t c2 = in.get_bits(27, 1);
90201e04c3fSmrg            uint32_t c3 = in.get_bits(28, 1);
90301e04c3fSmrg
90401e04c3fSmrg            extra_cem_bits = c0 + c1 + c2 + c3;
90501e04c3fSmrg
90601e04c3fSmrg            num_extra_cem_bits = 8;
90701e04c3fSmrg
90801e04c3fSmrg            uint32_t m0 = in.get_bits(offset - 8, 2);
90901e04c3fSmrg            uint32_t m1 = in.get_bits(offset - 6, 2);
91001e04c3fSmrg            uint32_t m2 = in.get_bits(offset - 4, 2);
91101e04c3fSmrg            uint32_t m3 = in.get_bits(offset - 2, 2);
91201e04c3fSmrg
91301e04c3fSmrg            cems[0] = ((cem_base_class + c0) << 2) | m0;
91401e04c3fSmrg            cems[1] = ((cem_base_class + c1) << 2) | m1;
91501e04c3fSmrg            cems[2] = ((cem_base_class + c2) << 2) | m2;
91601e04c3fSmrg            cems[3] = ((cem_base_class + c3) << 2) | m3;
91701e04c3fSmrg         } else {
91801e04c3fSmrg            unreachable("");
91901e04c3fSmrg         }
92001e04c3fSmrg      }
92101e04c3fSmrg
92201e04c3fSmrg      colour_endpoint_data_offset = 29;
92301e04c3fSmrg
92401e04c3fSmrg   } else {
92501e04c3fSmrg      uint32_t cem = in.get_bits(13, 4);
92601e04c3fSmrg
92701e04c3fSmrg      cem_base_class = cem >> 2;
92801e04c3fSmrg      is_multi_cem = false;
92901e04c3fSmrg
93001e04c3fSmrg      cems[0] = cem;
93101e04c3fSmrg
93201e04c3fSmrg      partition_index = -1;
93301e04c3fSmrg
93401e04c3fSmrg      if (VERBOSE_DECODE)
93501e04c3fSmrg         in.printf_bits(13, 4, "CEM = %d (class %d)", cem, cem_base_class);
93601e04c3fSmrg
93701e04c3fSmrg      colour_endpoint_data_offset = 17;
93801e04c3fSmrg   }
93901e04c3fSmrg}
94001e04c3fSmrg
94101e04c3fSmrgvoid Block::unpack_colour_endpoints(InputBitVector in)
94201e04c3fSmrg{
94301e04c3fSmrg   if (ce_trits) {
94401e04c3fSmrg      int offset = colour_endpoint_data_offset;
94501e04c3fSmrg      int bits_left = colour_endpoint_bits;
94601e04c3fSmrg      for (int i = 0; i < num_cem_values; i += 5) {
94701e04c3fSmrg         int bits_to_read = MIN2(bits_left, 8 + ce_bits * 5);
94801e04c3fSmrg         /* If ce_trits then ce_bits <= 6, so bits_to_read <= 38 and we have to use uint64_t */
94901e04c3fSmrg         uint64_t raw = in.get_bits64(offset, bits_to_read);
95001e04c3fSmrg         unpack_trit_block(ce_bits, raw, &colour_endpoints_quant[i]);
95101e04c3fSmrg
95201e04c3fSmrg         if (VERBOSE_DECODE)
95301e04c3fSmrg            in.printf_bits(offset, bits_to_read,
95401e04c3fSmrg                           "trits [%d,%d,%d,%d,%d]",
95501e04c3fSmrg                           colour_endpoints_quant[i+0], colour_endpoints_quant[i+1],
95601e04c3fSmrg                  colour_endpoints_quant[i+2], colour_endpoints_quant[i+3],
95701e04c3fSmrg                  colour_endpoints_quant[i+4]);
95801e04c3fSmrg
95901e04c3fSmrg         offset += 8 + ce_bits * 5;
96001e04c3fSmrg         bits_left -= 8 + ce_bits * 5;
96101e04c3fSmrg      }
96201e04c3fSmrg   } else if (ce_quints) {
96301e04c3fSmrg      int offset = colour_endpoint_data_offset;
96401e04c3fSmrg      int bits_left = colour_endpoint_bits;
96501e04c3fSmrg      for (int i = 0; i < num_cem_values; i += 3) {
96601e04c3fSmrg         int bits_to_read = MIN2(bits_left, 7 + ce_bits * 3);
96701e04c3fSmrg         /* If ce_quints then ce_bits <= 5, so bits_to_read <= 22 and we can use uint32_t */
96801e04c3fSmrg         uint32_t raw = in.get_bits(offset, bits_to_read);
96901e04c3fSmrg         unpack_quint_block(ce_bits, raw, &colour_endpoints_quant[i]);
97001e04c3fSmrg
97101e04c3fSmrg         if (VERBOSE_DECODE)
97201e04c3fSmrg            in.printf_bits(offset, bits_to_read,
97301e04c3fSmrg                           "quints [%d,%d,%d]",
97401e04c3fSmrg                           colour_endpoints_quant[i], colour_endpoints_quant[i+1], colour_endpoints_quant[i+2]);
97501e04c3fSmrg
97601e04c3fSmrg         offset += 7 + ce_bits * 3;
97701e04c3fSmrg         bits_left -= 7 + ce_bits * 3;
97801e04c3fSmrg      }
97901e04c3fSmrg   } else {
98001e04c3fSmrg      assert((colour_endpoint_bits % ce_bits) == 0);
98101e04c3fSmrg      int offset = colour_endpoint_data_offset;
98201e04c3fSmrg      for (int i = 0; i < num_cem_values; i++) {
98301e04c3fSmrg         colour_endpoints_quant[i] = in.get_bits(offset, ce_bits);
98401e04c3fSmrg
98501e04c3fSmrg         if (VERBOSE_DECODE)
98601e04c3fSmrg            in.printf_bits(offset, ce_bits, "bits [%d]", colour_endpoints_quant[i]);
98701e04c3fSmrg
98801e04c3fSmrg         offset += ce_bits;
98901e04c3fSmrg      }
99001e04c3fSmrg   }
99101e04c3fSmrg}
99201e04c3fSmrg
99301e04c3fSmrgvoid Block::decode_colour_endpoints()
99401e04c3fSmrg{
99501e04c3fSmrg   int cem_values_idx = 0;
99601e04c3fSmrg   for (int part = 0; part < num_parts; ++part) {
99701e04c3fSmrg      uint8_t *v = &colour_endpoints[cem_values_idx];
99801e04c3fSmrg      int v0 = v[0];
99901e04c3fSmrg      int v1 = v[1];
100001e04c3fSmrg      int v2 = v[2];
100101e04c3fSmrg      int v3 = v[3];
100201e04c3fSmrg      int v4 = v[4];
100301e04c3fSmrg      int v5 = v[5];
100401e04c3fSmrg      int v6 = v[6];
100501e04c3fSmrg      int v7 = v[7];
100601e04c3fSmrg      cem_values_idx += ((cems[part] >> 2) + 1) * 2;
100701e04c3fSmrg
100801e04c3fSmrg      uint8x4_t e0, e1;
100901e04c3fSmrg      int s0, s1, L0, L1;
101001e04c3fSmrg
101101e04c3fSmrg      switch (cems[part])
101201e04c3fSmrg      {
101301e04c3fSmrg      case 0:
101401e04c3fSmrg         e0 = uint8x4_t(v0, v0, v0, 0xff);
101501e04c3fSmrg         e1 = uint8x4_t(v1, v1, v1, 0xff);
101601e04c3fSmrg         break;
101701e04c3fSmrg      case 1:
101801e04c3fSmrg         L0 = (v0 >> 2) | (v1 & 0xc0);
101901e04c3fSmrg         L1 = L0 + (v1 & 0x3f);
102001e04c3fSmrg         if (L1 > 0xff)
102101e04c3fSmrg            L1 = 0xff;
102201e04c3fSmrg         e0 = uint8x4_t(L0, L0, L0, 0xff);
102301e04c3fSmrg         e1 = uint8x4_t(L1, L1, L1, 0xff);
102401e04c3fSmrg         break;
102501e04c3fSmrg      case 4:
102601e04c3fSmrg         e0 = uint8x4_t(v0, v0, v0, v2);
102701e04c3fSmrg         e1 = uint8x4_t(v1, v1, v1, v3);
102801e04c3fSmrg         break;
102901e04c3fSmrg      case 5:
103001e04c3fSmrg         bit_transfer_signed(v1, v0);
103101e04c3fSmrg         bit_transfer_signed(v3, v2);
103201e04c3fSmrg         e0 = uint8x4_t(v0, v0, v0, v2);
103301e04c3fSmrg         e1 = uint8x4_t::clamped(v0+v1, v0+v1, v0+v1, v2+v3);
103401e04c3fSmrg         break;
103501e04c3fSmrg      case 6:
103601e04c3fSmrg         e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, 0xff);
103701e04c3fSmrg         e1 = uint8x4_t(v0, v1, v2, 0xff);
103801e04c3fSmrg         break;
103901e04c3fSmrg      case 8:
104001e04c3fSmrg         s0 = v0 + v2 + v4;
104101e04c3fSmrg         s1 = v1 + v3 + v5;
104201e04c3fSmrg         if (s1 >= s0) {
104301e04c3fSmrg            e0 = uint8x4_t(v0, v2, v4, 0xff);
104401e04c3fSmrg            e1 = uint8x4_t(v1, v3, v5, 0xff);
104501e04c3fSmrg         } else {
104601e04c3fSmrg            e0 = blue_contract(v1, v3, v5, 0xff);
104701e04c3fSmrg            e1 = blue_contract(v0, v2, v4, 0xff);
104801e04c3fSmrg         }
104901e04c3fSmrg         break;
105001e04c3fSmrg      case 9:
105101e04c3fSmrg         bit_transfer_signed(v1, v0);
105201e04c3fSmrg         bit_transfer_signed(v3, v2);
105301e04c3fSmrg         bit_transfer_signed(v5, v4);
105401e04c3fSmrg         if (v1 + v3 + v5 >= 0) {
105501e04c3fSmrg            e0 = uint8x4_t(v0, v2, v4, 0xff);
105601e04c3fSmrg            e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, 0xff);
105701e04c3fSmrg         } else {
105801e04c3fSmrg            e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, 0xff);
105901e04c3fSmrg            e1 = blue_contract(v0, v2, v4, 0xff);
106001e04c3fSmrg         }
106101e04c3fSmrg         break;
106201e04c3fSmrg      case 10:
106301e04c3fSmrg         e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, v4);
106401e04c3fSmrg         e1 = uint8x4_t(v0, v1, v2, v5);
106501e04c3fSmrg         break;
106601e04c3fSmrg      case 12:
106701e04c3fSmrg         s0 = v0 + v2 + v4;
106801e04c3fSmrg         s1 = v1 + v3 + v5;
106901e04c3fSmrg         if (s1 >= s0) {
107001e04c3fSmrg            e0 = uint8x4_t(v0, v2, v4, v6);
107101e04c3fSmrg            e1 = uint8x4_t(v1, v3, v5, v7);
107201e04c3fSmrg         } else {
107301e04c3fSmrg            e0 = blue_contract(v1, v3, v5, v7);
107401e04c3fSmrg            e1 = blue_contract(v0, v2, v4, v6);
107501e04c3fSmrg         }
107601e04c3fSmrg         break;
107701e04c3fSmrg      case 13:
107801e04c3fSmrg         bit_transfer_signed(v1, v0);
107901e04c3fSmrg         bit_transfer_signed(v3, v2);
108001e04c3fSmrg         bit_transfer_signed(v5, v4);
108101e04c3fSmrg         bit_transfer_signed(v7, v6);
108201e04c3fSmrg         if (v1 + v3 + v5 >= 0) {
108301e04c3fSmrg            e0 = uint8x4_t(v0, v2, v4, v6);
108401e04c3fSmrg            e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, v6+v7);
108501e04c3fSmrg         } else {
108601e04c3fSmrg            e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, v6+v7);
108701e04c3fSmrg            e1 = blue_contract(v0, v2, v4, v6);
108801e04c3fSmrg         }
108901e04c3fSmrg         break;
109001e04c3fSmrg      default:
109101e04c3fSmrg         /* HDR endpoints not supported; return error colour */
109201e04c3fSmrg         e0 = uint8x4_t(255, 0, 255, 255);
109301e04c3fSmrg         e1 = uint8x4_t(255, 0, 255, 255);
109401e04c3fSmrg         break;
109501e04c3fSmrg      }
109601e04c3fSmrg
109701e04c3fSmrg      endpoints_decoded[0][part] = e0;
109801e04c3fSmrg      endpoints_decoded[1][part] = e1;
109901e04c3fSmrg
110001e04c3fSmrg      if (VERBOSE_DECODE) {
110101e04c3fSmrg         printf("cems[%d]=%d v=[", part, cems[part]);
110201e04c3fSmrg         for (int i = 0; i < (cems[part] >> 2) + 1; ++i) {
110301e04c3fSmrg            if (i)
110401e04c3fSmrg               printf(", ");
110501e04c3fSmrg            printf("%3d", v[i]);
110601e04c3fSmrg         }
110701e04c3fSmrg         printf("] e0=[%3d,%4d,%4d,%4d] e1=[%3d,%4d,%4d,%4d]\n",
110801e04c3fSmrg                e0.v[0], e0.v[1], e0.v[2], e0.v[3],
110901e04c3fSmrg               e1.v[0], e1.v[1], e1.v[2], e1.v[3]);
111001e04c3fSmrg      }
111101e04c3fSmrg   }
111201e04c3fSmrg}
111301e04c3fSmrg
111401e04c3fSmrgvoid Block::unpack_weights(InputBitVector in)
111501e04c3fSmrg{
111601e04c3fSmrg   if (wt_trits) {
111701e04c3fSmrg      int offset = 128;
111801e04c3fSmrg      int bits_left = weight_bits;
111901e04c3fSmrg      for (int i = 0; i < num_weights; i += 5) {
112001e04c3fSmrg         int bits_to_read = MIN2(bits_left, 8 + 5*wt_bits);
112101e04c3fSmrg         /* If wt_trits then wt_bits <= 3, so bits_to_read <= 23 and we can use uint32_t */
112201e04c3fSmrg         uint32_t raw = in.get_bits_rev(offset, bits_to_read);
112301e04c3fSmrg         unpack_trit_block(wt_bits, raw, &weights_quant[i]);
112401e04c3fSmrg
112501e04c3fSmrg         if (VERBOSE_DECODE)
112601e04c3fSmrg            in.printf_bits(offset - bits_to_read, bits_to_read, "weight trits [%d,%d,%d,%d,%d]",
112701e04c3fSmrg                           weights_quant[i+0], weights_quant[i+1],
112801e04c3fSmrg                  weights_quant[i+2], weights_quant[i+3],
112901e04c3fSmrg                  weights_quant[i+4]);
113001e04c3fSmrg
113101e04c3fSmrg         offset -= 8 + wt_bits * 5;
113201e04c3fSmrg         bits_left -= 8 + wt_bits * 5;
113301e04c3fSmrg      }
113401e04c3fSmrg
113501e04c3fSmrg   } else if (wt_quints) {
113601e04c3fSmrg
113701e04c3fSmrg      int offset = 128;
113801e04c3fSmrg      int bits_left = weight_bits;
113901e04c3fSmrg      for (int i = 0; i < num_weights; i += 3) {
114001e04c3fSmrg         int bits_to_read = MIN2(bits_left, 7 + 3*wt_bits);
114101e04c3fSmrg         /* If wt_quints then wt_bits <= 2, so bits_to_read <= 13 and we can use uint32_t */
114201e04c3fSmrg         uint32_t raw = in.get_bits_rev(offset, bits_to_read);
114301e04c3fSmrg         unpack_quint_block(wt_bits, raw, &weights_quant[i]);
114401e04c3fSmrg
114501e04c3fSmrg         if (VERBOSE_DECODE)
114601e04c3fSmrg            in.printf_bits(offset - bits_to_read, bits_to_read, "weight quints [%d,%d,%d]",
114701e04c3fSmrg                           weights_quant[i], weights_quant[i+1], weights_quant[i+2]);
114801e04c3fSmrg
114901e04c3fSmrg         offset -= 7 + wt_bits * 3;
115001e04c3fSmrg         bits_left -= 7 + wt_bits * 3;
115101e04c3fSmrg      }
115201e04c3fSmrg
115301e04c3fSmrg   } else {
115401e04c3fSmrg      int offset = 128;
115501e04c3fSmrg      assert((weight_bits % wt_bits) == 0);
115601e04c3fSmrg      for (int i = 0; i < num_weights; ++i) {
115701e04c3fSmrg         weights_quant[i] = in.get_bits_rev(offset, wt_bits);
115801e04c3fSmrg
115901e04c3fSmrg         if (VERBOSE_DECODE)
116001e04c3fSmrg            in.printf_bits(offset - wt_bits, wt_bits, "weight bits [%d]", weights_quant[i]);
116101e04c3fSmrg
116201e04c3fSmrg         offset -= wt_bits;
116301e04c3fSmrg      }
116401e04c3fSmrg   }
116501e04c3fSmrg}
116601e04c3fSmrg
116701e04c3fSmrgvoid Block::unquantise_weights()
116801e04c3fSmrg{
116901e04c3fSmrg   assert(num_weights <= (int)ARRAY_SIZE(weights_quant));
117001e04c3fSmrg   assert(num_weights <= (int)ARRAY_SIZE(weights));
117101e04c3fSmrg
117201e04c3fSmrg   memset(weights, 0, sizeof(weights));
117301e04c3fSmrg
117401e04c3fSmrg   for (int i = 0; i < num_weights; ++i) {
117501e04c3fSmrg
117601e04c3fSmrg      uint8_t v = weights_quant[i];
117701e04c3fSmrg      uint8_t w;
117801e04c3fSmrg
117901e04c3fSmrg      if (wt_trits) {
118001e04c3fSmrg
118101e04c3fSmrg         if (wt_bits == 0) {
118201e04c3fSmrg            w = v * 32;
118301e04c3fSmrg         } else {
118401e04c3fSmrg            uint8_t A, B, C, D;
118501e04c3fSmrg            A = (v & 0x1) ? 0x7F : 0x00;
118601e04c3fSmrg            switch (wt_bits) {
118701e04c3fSmrg            case 1:
118801e04c3fSmrg               B = 0;
118901e04c3fSmrg               C = 50;
119001e04c3fSmrg               D = v >> 1;
119101e04c3fSmrg               break;
119201e04c3fSmrg            case 2:
119301e04c3fSmrg               B = (v & 0x2) ? 0x45 : 0x00;
119401e04c3fSmrg               C = 23;
119501e04c3fSmrg               D = v >> 2;
119601e04c3fSmrg               break;
119701e04c3fSmrg            case 3:
119801e04c3fSmrg               B = ((v & 0x6) >> 1) | ((v & 0x6) << 4);
119901e04c3fSmrg               C = 11;
120001e04c3fSmrg               D = v >> 3;
120101e04c3fSmrg               break;
120201e04c3fSmrg            default:
120301e04c3fSmrg               unreachable("");
120401e04c3fSmrg            }
120501e04c3fSmrg            uint16_t T = D * C + B;
120601e04c3fSmrg            T = T ^ A;
120701e04c3fSmrg            T = (A & 0x20) | (T >> 2);
120801e04c3fSmrg            assert(T < 64);
120901e04c3fSmrg            if (T > 32)
121001e04c3fSmrg               T++;
121101e04c3fSmrg            w = T;
121201e04c3fSmrg         }
121301e04c3fSmrg
121401e04c3fSmrg      } else if (wt_quints) {
121501e04c3fSmrg
121601e04c3fSmrg         if (wt_bits == 0) {
121701e04c3fSmrg            w = v * 16;
121801e04c3fSmrg         } else {
121901e04c3fSmrg            uint8_t A, B, C, D;
122001e04c3fSmrg            A = (v & 0x1) ? 0x7F : 0x00;
122101e04c3fSmrg            switch (wt_bits) {
122201e04c3fSmrg            case 1:
122301e04c3fSmrg               B = 0;
122401e04c3fSmrg               C = 28;
122501e04c3fSmrg               D = v >> 1;
122601e04c3fSmrg               break;
122701e04c3fSmrg            case 2:
122801e04c3fSmrg               B = (v & 0x2) ? 0x42 : 0x00;
122901e04c3fSmrg               C = 13;
123001e04c3fSmrg               D = v >> 2;
123101e04c3fSmrg               break;
123201e04c3fSmrg            default:
123301e04c3fSmrg               unreachable("");
123401e04c3fSmrg            }
123501e04c3fSmrg            uint16_t T = D * C + B;
123601e04c3fSmrg            T = T ^ A;
123701e04c3fSmrg            T = (A & 0x20) | (T >> 2);
123801e04c3fSmrg            assert(T < 64);
123901e04c3fSmrg            if (T > 32)
124001e04c3fSmrg               T++;
124101e04c3fSmrg            w = T;
124201e04c3fSmrg         }
124301e04c3fSmrg         weights[i] = w;
124401e04c3fSmrg
124501e04c3fSmrg      } else {
124601e04c3fSmrg
124701e04c3fSmrg         switch (wt_bits) {
124801e04c3fSmrg         case 1: w = v ? 0x3F : 0x00; break;
124901e04c3fSmrg         case 2: w = v | (v << 2) | (v << 4); break;
125001e04c3fSmrg         case 3: w = v | (v << 3); break;
125101e04c3fSmrg         case 4: w = (v >> 2) | (v << 2); break;
125201e04c3fSmrg         case 5: w = (v >> 4) | (v << 1); break;
125301e04c3fSmrg         default: unreachable("");
125401e04c3fSmrg         }
125501e04c3fSmrg         assert(w < 64);
125601e04c3fSmrg         if (w > 32)
125701e04c3fSmrg            w++;
125801e04c3fSmrg      }
125901e04c3fSmrg      weights[i] = w;
126001e04c3fSmrg   }
126101e04c3fSmrg}
126201e04c3fSmrg
126301e04c3fSmrgvoid Block::compute_infill_weights(int block_w, int block_h, int block_d)
126401e04c3fSmrg{
126501e04c3fSmrg   int Ds = block_w <= 1 ? 0 : (1024 + block_w / 2) / (block_w - 1);
126601e04c3fSmrg   int Dt = block_h <= 1 ? 0 : (1024 + block_h / 2) / (block_h - 1);
126701e04c3fSmrg   int Dr = block_d <= 1 ? 0 : (1024 + block_d / 2) / (block_d - 1);
126801e04c3fSmrg   for (int r = 0; r < block_d; ++r) {
126901e04c3fSmrg      for (int t = 0; t < block_h; ++t) {
127001e04c3fSmrg         for (int s = 0; s < block_w; ++s) {
127101e04c3fSmrg            int cs = Ds * s;
127201e04c3fSmrg            int ct = Dt * t;
127301e04c3fSmrg            int cr = Dr * r;
127401e04c3fSmrg            int gs = (cs * (wt_w - 1) + 32) >> 6;
127501e04c3fSmrg            int gt = (ct * (wt_h - 1) + 32) >> 6;
127601e04c3fSmrg            int gr = (cr * (wt_d - 1) + 32) >> 6;
127701e04c3fSmrg            assert(gs >= 0 && gs <= 176);
127801e04c3fSmrg            assert(gt >= 0 && gt <= 176);
127901e04c3fSmrg            assert(gr >= 0 && gr <= 176);
128001e04c3fSmrg            int js = gs >> 4;
128101e04c3fSmrg            int fs = gs & 0xf;
128201e04c3fSmrg            int jt = gt >> 4;
128301e04c3fSmrg            int ft = gt & 0xf;
128401e04c3fSmrg            int jr = gr >> 4;
128501e04c3fSmrg            int fr = gr & 0xf;
128601e04c3fSmrg
128701e04c3fSmrg            /* TODO: 3D */
128801e04c3fSmrg            (void)jr;
128901e04c3fSmrg            (void)fr;
129001e04c3fSmrg
129101e04c3fSmrg            int w11 = (fs * ft + 8) >> 4;
129201e04c3fSmrg            int w10 = ft - w11;
129301e04c3fSmrg            int w01 = fs - w11;
129401e04c3fSmrg            int w00 = 16 - fs - ft + w11;
129501e04c3fSmrg
129601e04c3fSmrg            if (dual_plane) {
129701e04c3fSmrg               int p00, p01, p10, p11, i0, i1;
129801e04c3fSmrg               int v0 = js + jt * wt_w;
129901e04c3fSmrg               p00 = weights[(v0) * 2];
130001e04c3fSmrg               p01 = weights[(v0 + 1) * 2];
130101e04c3fSmrg               p10 = weights[(v0 + wt_w) * 2];
130201e04c3fSmrg               p11 = weights[(v0 + wt_w + 1) * 2];
130301e04c3fSmrg               i0 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
130401e04c3fSmrg               p00 = weights[(v0) * 2 + 1];
130501e04c3fSmrg               p01 = weights[(v0 + 1) * 2 + 1];
130601e04c3fSmrg               p10 = weights[(v0 + wt_w) * 2 + 1];
130701e04c3fSmrg               p11 = weights[(v0 + wt_w + 1) * 2 + 1];
130801e04c3fSmrg               assert((v0 + wt_w + 1) * 2 + 1 < (int)ARRAY_SIZE(weights));
130901e04c3fSmrg               i1 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
131001e04c3fSmrg               assert(0 <= i0 && i0 <= 64);
131101e04c3fSmrg               infill_weights[0][s + t*block_w + r*block_w*block_h] = i0;
131201e04c3fSmrg               infill_weights[1][s + t*block_w + r*block_w*block_h] = i1;
131301e04c3fSmrg            } else {
131401e04c3fSmrg               int p00, p01, p10, p11, i;
131501e04c3fSmrg               int v0 = js + jt * wt_w;
131601e04c3fSmrg               p00 = weights[v0];
131701e04c3fSmrg               p01 = weights[v0 + 1];
131801e04c3fSmrg               p10 = weights[v0 + wt_w];
131901e04c3fSmrg               p11 = weights[v0 + wt_w + 1];
132001e04c3fSmrg               assert(v0 + wt_w + 1 < (int)ARRAY_SIZE(weights));
132101e04c3fSmrg               i = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
132201e04c3fSmrg               assert(0 <= i && i <= 64);
132301e04c3fSmrg               infill_weights[0][s + t*block_w + r*block_w*block_h] = i;
132401e04c3fSmrg            }
132501e04c3fSmrg         }
132601e04c3fSmrg      }
132701e04c3fSmrg   }
132801e04c3fSmrg}
132901e04c3fSmrg
133001e04c3fSmrgvoid Block::unquantise_colour_endpoints()
133101e04c3fSmrg{
133201e04c3fSmrg   assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints_quant));
133301e04c3fSmrg   assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints));
133401e04c3fSmrg
133501e04c3fSmrg   for (int i = 0; i < num_cem_values; ++i) {
133601e04c3fSmrg      uint8_t v = colour_endpoints_quant[i];
133701e04c3fSmrg
133801e04c3fSmrg      if (ce_trits) {
133901e04c3fSmrg         uint16_t A, B, C, D;
134001e04c3fSmrg         uint16_t t;
134101e04c3fSmrg         A = (v & 0x1) ? 0x1FF : 0x000;
134201e04c3fSmrg         switch (ce_bits) {
134301e04c3fSmrg         case 1:
134401e04c3fSmrg            B = 0;
134501e04c3fSmrg            C = 204;
134601e04c3fSmrg            D = v >> 1;
134701e04c3fSmrg            break;
134801e04c3fSmrg         case 2:
134901e04c3fSmrg            B = (v & 0x2) ? 0x116 : 0x000;
135001e04c3fSmrg            C = 93;
135101e04c3fSmrg            D = v >> 2;
135201e04c3fSmrg            break;
135301e04c3fSmrg         case 3:
135401e04c3fSmrg            t = ((v >> 1) & 0x3);
135501e04c3fSmrg            B = t | (t << 2) | (t << 7);
135601e04c3fSmrg            C = 44;
135701e04c3fSmrg            D = v >> 3;
135801e04c3fSmrg            break;
135901e04c3fSmrg         case 4:
136001e04c3fSmrg            t = ((v >> 1) & 0x7);
136101e04c3fSmrg            B = t | (t << 6);
136201e04c3fSmrg            C = 22;
136301e04c3fSmrg            D = v >> 4;
136401e04c3fSmrg            break;
136501e04c3fSmrg         case 5:
136601e04c3fSmrg            t = ((v >> 1) & 0xF);
136701e04c3fSmrg            B = (t >> 2) | (t << 5);
136801e04c3fSmrg            C = 11;
136901e04c3fSmrg            D = v >> 5;
137001e04c3fSmrg            break;
137101e04c3fSmrg         case 6:
137201e04c3fSmrg            B = ((v & 0x3E) << 3) | ((v >> 5) & 0x1);
137301e04c3fSmrg            C = 5;
137401e04c3fSmrg            D = v >> 6;
137501e04c3fSmrg            break;
137601e04c3fSmrg         default:
137701e04c3fSmrg            unreachable("");
137801e04c3fSmrg         }
137901e04c3fSmrg         uint16_t T = D * C + B;
138001e04c3fSmrg         T = T ^ A;
138101e04c3fSmrg         T = (A & 0x80) | (T >> 2);
138201e04c3fSmrg         assert(T < 256);
138301e04c3fSmrg         colour_endpoints[i] = T;
138401e04c3fSmrg      } else if (ce_quints) {
138501e04c3fSmrg         uint16_t A, B, C, D;
138601e04c3fSmrg         uint16_t t;
138701e04c3fSmrg         A = (v & 0x1) ? 0x1FF : 0x000;
138801e04c3fSmrg         switch (ce_bits) {
138901e04c3fSmrg         case 1:
139001e04c3fSmrg            B = 0;
139101e04c3fSmrg            C = 113;
139201e04c3fSmrg            D = v >> 1;
139301e04c3fSmrg            break;
139401e04c3fSmrg         case 2:
139501e04c3fSmrg            B = (v & 0x2) ? 0x10C : 0x000;
139601e04c3fSmrg            C = 54;
139701e04c3fSmrg            D = v >> 2;
139801e04c3fSmrg            break;
139901e04c3fSmrg         case 3:
140001e04c3fSmrg            t = ((v >> 1) & 0x3);
140101e04c3fSmrg            B = (t >> 1) | (t << 1) | (t << 7);
140201e04c3fSmrg            C = 26;
140301e04c3fSmrg            D = v >> 3;
140401e04c3fSmrg            break;
140501e04c3fSmrg         case 4:
140601e04c3fSmrg            t = ((v >> 1) & 0x7);
140701e04c3fSmrg            B = (t >> 1) | (t << 6);
140801e04c3fSmrg            C = 13;
140901e04c3fSmrg            D = v >> 4;
141001e04c3fSmrg            break;
141101e04c3fSmrg         case 5:
141201e04c3fSmrg            t = ((v >> 1) & 0xF);
141301e04c3fSmrg            B = (t >> 4) | (t << 5);
141401e04c3fSmrg            C = 6;
141501e04c3fSmrg            D = v >> 5;
141601e04c3fSmrg            break;
141701e04c3fSmrg         default:
141801e04c3fSmrg            unreachable("");
141901e04c3fSmrg         }
142001e04c3fSmrg         uint16_t T = D * C + B;
142101e04c3fSmrg         T = T ^ A;
142201e04c3fSmrg         T = (A & 0x80) | (T >> 2);
142301e04c3fSmrg         assert(T < 256);
142401e04c3fSmrg         colour_endpoints[i] = T;
142501e04c3fSmrg      } else {
142601e04c3fSmrg         switch (ce_bits) {
142701e04c3fSmrg         case 1: v = v ? 0xFF : 0x00; break;
142801e04c3fSmrg         case 2: v = (v << 6) | (v << 4) | (v << 2) | v; break;
142901e04c3fSmrg         case 3: v = (v << 5) | (v << 2) | (v >> 1); break;
143001e04c3fSmrg         case 4: v = (v << 4) | v; break;
143101e04c3fSmrg         case 5: v = (v << 3) | (v >> 2); break;
143201e04c3fSmrg         case 6: v = (v << 2) | (v >> 4); break;
143301e04c3fSmrg         case 7: v = (v << 1) | (v >> 6); break;
143401e04c3fSmrg         case 8: break;
143501e04c3fSmrg         default: unreachable("");
143601e04c3fSmrg         }
143701e04c3fSmrg         colour_endpoints[i] = v;
143801e04c3fSmrg      }
143901e04c3fSmrg   }
144001e04c3fSmrg}
144101e04c3fSmrg
144201e04c3fSmrgdecode_error::type Block::decode(const Decoder &decoder, InputBitVector in)
144301e04c3fSmrg{
144401e04c3fSmrg   decode_error::type err;
144501e04c3fSmrg
144601e04c3fSmrg   is_error = false;
144701e04c3fSmrg   bogus_colour_endpoints = false;
144801e04c3fSmrg   bogus_weights = false;
144901e04c3fSmrg   is_void_extent = false;
145001e04c3fSmrg
145101e04c3fSmrg   wt_d = 1;
145201e04c3fSmrg   /* TODO: 3D */
145301e04c3fSmrg
145401e04c3fSmrg   /* TODO: test for all the illegal encodings */
145501e04c3fSmrg
145601e04c3fSmrg   if (VERBOSE_DECODE)
145701e04c3fSmrg      in.printf_bits(0, 128);
145801e04c3fSmrg
145901e04c3fSmrg   err = decode_block_mode(in);
146001e04c3fSmrg   if (err != decode_error::ok)
146101e04c3fSmrg      return err;
146201e04c3fSmrg
146301e04c3fSmrg   if (is_void_extent)
146401e04c3fSmrg      return decode_error::ok;
146501e04c3fSmrg
146601e04c3fSmrg   /* TODO: 3D */
146701e04c3fSmrg
146801e04c3fSmrg   calculate_from_weights();
146901e04c3fSmrg
147001e04c3fSmrg   if (VERBOSE_DECODE)
147101e04c3fSmrg      printf("weights_grid=%dx%dx%d dual_plane=%d num_weights=%d high_prec=%d r=%d range=0..%d (%dt %dq %db) weight_bits=%d\n",
147201e04c3fSmrg             wt_w, wt_h, wt_d, dual_plane, num_weights, high_prec, wt_range, wt_max, wt_trits, wt_quints, wt_bits, weight_bits);
147301e04c3fSmrg
147401e04c3fSmrg   if (wt_w > decoder.block_w || wt_h > decoder.block_h || wt_d > decoder.block_d)
147501e04c3fSmrg      return decode_error::weight_grid_exceeds_block_size;
147601e04c3fSmrg
147701e04c3fSmrg   num_parts = in.get_bits(11, 2) + 1;
147801e04c3fSmrg
147901e04c3fSmrg   if (VERBOSE_DECODE)
148001e04c3fSmrg      in.printf_bits(11, 2, "partitions = %d", num_parts);
148101e04c3fSmrg
148201e04c3fSmrg   if (dual_plane && num_parts > 3)
148301e04c3fSmrg      return decode_error::dual_plane_and_too_many_partitions;
148401e04c3fSmrg
148501e04c3fSmrg   decode_cem(in);
148601e04c3fSmrg
148701e04c3fSmrg   if (VERBOSE_DECODE)
148801e04c3fSmrg      printf("cem=[%d,%d,%d,%d] base_cem_class=%d\n", cems[0], cems[1], cems[2], cems[3], cem_base_class);
148901e04c3fSmrg
149001e04c3fSmrg   int num_cem_pairs = (cem_base_class + 1) * num_parts + extra_cem_bits;
149101e04c3fSmrg   num_cem_values = num_cem_pairs * 2;
149201e04c3fSmrg
149301e04c3fSmrg   calculate_remaining_bits();
149401e04c3fSmrg   err = calculate_colour_endpoints_size();
149501e04c3fSmrg   if (err != decode_error::ok)
149601e04c3fSmrg      return err;
149701e04c3fSmrg
149801e04c3fSmrg   if (VERBOSE_DECODE)
149901e04c3fSmrg      in.printf_bits(colour_endpoint_data_offset, colour_endpoint_bits,
150001e04c3fSmrg                     "endpoint data (%d bits, %d vals, %dt %dq %db)",
150101e04c3fSmrg                     colour_endpoint_bits, num_cem_values, ce_trits, ce_quints, ce_bits);
150201e04c3fSmrg
150301e04c3fSmrg   unpack_colour_endpoints(in);
150401e04c3fSmrg
150501e04c3fSmrg   if (VERBOSE_DECODE) {
150601e04c3fSmrg      printf("cem values raw =[");
150701e04c3fSmrg      for (int i = 0; i < num_cem_values; i++) {
150801e04c3fSmrg         if (i)
150901e04c3fSmrg            printf(", ");
151001e04c3fSmrg         printf("%3d", colour_endpoints_quant[i]);
151101e04c3fSmrg      }
151201e04c3fSmrg      printf("]\n");
151301e04c3fSmrg   }
151401e04c3fSmrg
151501e04c3fSmrg   if (num_cem_values > 18)
151601e04c3fSmrg      return decode_error::invalid_colour_endpoints_count;
151701e04c3fSmrg
151801e04c3fSmrg   unquantise_colour_endpoints();
151901e04c3fSmrg
152001e04c3fSmrg   if (VERBOSE_DECODE) {
152101e04c3fSmrg      printf("cem values norm=[");
152201e04c3fSmrg      for (int i = 0; i < num_cem_values; i++) {
152301e04c3fSmrg         if (i)
152401e04c3fSmrg            printf(", ");
152501e04c3fSmrg         printf("%3d", colour_endpoints[i]);
152601e04c3fSmrg      }
152701e04c3fSmrg      printf("]\n");
152801e04c3fSmrg   }
152901e04c3fSmrg
153001e04c3fSmrg   decode_colour_endpoints();
153101e04c3fSmrg
153201e04c3fSmrg   if (dual_plane) {
153301e04c3fSmrg      int ccs_offset = 128 - weight_bits - num_extra_cem_bits - 2;
153401e04c3fSmrg      colour_component_selector = in.get_bits(ccs_offset, 2);
153501e04c3fSmrg
153601e04c3fSmrg      if (VERBOSE_DECODE)
153701e04c3fSmrg         in.printf_bits(ccs_offset, 2, "colour component selector = %d", colour_component_selector);
153801e04c3fSmrg   } else {
153901e04c3fSmrg      colour_component_selector = 0;
154001e04c3fSmrg   }
154101e04c3fSmrg
154201e04c3fSmrg
154301e04c3fSmrg   if (VERBOSE_DECODE)
154401e04c3fSmrg      in.printf_bits(128 - weight_bits, weight_bits, "weights (%d bits)", weight_bits);
154501e04c3fSmrg
154601e04c3fSmrg   if (num_weights > 64)
154701e04c3fSmrg      return decode_error::invalid_num_weights;
154801e04c3fSmrg
154901e04c3fSmrg   if (weight_bits < 24 || weight_bits > 96)
155001e04c3fSmrg      return decode_error::invalid_weight_bits;
155101e04c3fSmrg
155201e04c3fSmrg   unpack_weights(in);
155301e04c3fSmrg
155401e04c3fSmrg   unquantise_weights();
155501e04c3fSmrg
155601e04c3fSmrg   if (VERBOSE_DECODE) {
155701e04c3fSmrg      printf("weights=[");
155801e04c3fSmrg      for (int i = 0; i < num_weights; ++i) {
155901e04c3fSmrg         if (i)
156001e04c3fSmrg            printf(", ");
156101e04c3fSmrg         printf("%d", weights[i]);
156201e04c3fSmrg      }
156301e04c3fSmrg      printf("]\n");
156401e04c3fSmrg
156501e04c3fSmrg      for (int plane = 0; plane <= dual_plane; ++plane) {
156601e04c3fSmrg         printf("weights (plane %d):\n", plane);
156701e04c3fSmrg         int i = 0;
156801e04c3fSmrg         (void)i;
156901e04c3fSmrg
157001e04c3fSmrg         for (int r = 0; r < wt_d; ++r) {
157101e04c3fSmrg            for (int t = 0; t < wt_h; ++t) {
157201e04c3fSmrg               for (int s = 0; s < wt_w; ++s) {
157301e04c3fSmrg                  printf("%3d", weights[i++ * (1 + dual_plane) + plane]);
157401e04c3fSmrg               }
157501e04c3fSmrg               printf("\n");
157601e04c3fSmrg            }
157701e04c3fSmrg            if (r < wt_d - 1)
157801e04c3fSmrg               printf("\n");
157901e04c3fSmrg         }
158001e04c3fSmrg      }
158101e04c3fSmrg   }
158201e04c3fSmrg
158301e04c3fSmrg   compute_infill_weights(decoder.block_w, decoder.block_h, decoder.block_d);
158401e04c3fSmrg
158501e04c3fSmrg   if (VERBOSE_DECODE) {
158601e04c3fSmrg      for (int plane = 0; plane <= dual_plane; ++plane) {
158701e04c3fSmrg         printf("infilled weights (plane %d):\n", plane);
158801e04c3fSmrg         int i = 0;
158901e04c3fSmrg         (void)i;
159001e04c3fSmrg
159101e04c3fSmrg         for (int r = 0; r < decoder.block_d; ++r) {
159201e04c3fSmrg            for (int t = 0; t < decoder.block_h; ++t) {
159301e04c3fSmrg               for (int s = 0; s < decoder.block_w; ++s) {
159401e04c3fSmrg                  printf("%3d", infill_weights[plane][i++]);
159501e04c3fSmrg               }
159601e04c3fSmrg               printf("\n");
159701e04c3fSmrg            }
159801e04c3fSmrg            if (r < decoder.block_d - 1)
159901e04c3fSmrg               printf("\n");
160001e04c3fSmrg         }
160101e04c3fSmrg      }
160201e04c3fSmrg   }
160301e04c3fSmrg   if (VERBOSE_DECODE)
160401e04c3fSmrg      printf("\n");
160501e04c3fSmrg
160601e04c3fSmrg   return decode_error::ok;
160701e04c3fSmrg}
160801e04c3fSmrg
160901e04c3fSmrgvoid Block::write_decoded(const Decoder &decoder, uint16_t *output)
161001e04c3fSmrg{
161101e04c3fSmrg   /* sRGB can only be stored as unorm8. */
161201e04c3fSmrg   assert(!decoder.srgb || decoder.output_unorm8);
161301e04c3fSmrg
161401e04c3fSmrg   if (is_void_extent) {
161501e04c3fSmrg      for (int idx = 0; idx < decoder.block_w*decoder.block_h*decoder.block_d; ++idx) {
161601e04c3fSmrg         if (decoder.output_unorm8) {
161701e04c3fSmrg            if (decoder.srgb) {
161801e04c3fSmrg               output[idx*4+0] = void_extent_colour_r >> 8;
161901e04c3fSmrg               output[idx*4+1] = void_extent_colour_g >> 8;
162001e04c3fSmrg               output[idx*4+2] = void_extent_colour_b >> 8;
162101e04c3fSmrg            } else {
162201e04c3fSmrg               output[idx*4+0] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_r);
162301e04c3fSmrg               output[idx*4+1] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_g);
162401e04c3fSmrg               output[idx*4+2] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_b);
162501e04c3fSmrg            }
162601e04c3fSmrg            output[idx*4+3] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_a);
162701e04c3fSmrg         } else {
162801e04c3fSmrg            /* Store the color as FP16. */
162901e04c3fSmrg            output[idx*4+0] = _mesa_uint16_div_64k_to_half(void_extent_colour_r);
163001e04c3fSmrg            output[idx*4+1] = _mesa_uint16_div_64k_to_half(void_extent_colour_g);
163101e04c3fSmrg            output[idx*4+2] = _mesa_uint16_div_64k_to_half(void_extent_colour_b);
163201e04c3fSmrg            output[idx*4+3] = _mesa_uint16_div_64k_to_half(void_extent_colour_a);
163301e04c3fSmrg         }
163401e04c3fSmrg      }
163501e04c3fSmrg      return;
163601e04c3fSmrg   }
163701e04c3fSmrg
163801e04c3fSmrg   int small_block = (decoder.block_w * decoder.block_h * decoder.block_d) < 31;
163901e04c3fSmrg
164001e04c3fSmrg   int idx = 0;
164101e04c3fSmrg   for (int z = 0; z < decoder.block_d; ++z) {
164201e04c3fSmrg      for (int y = 0; y < decoder.block_h; ++y) {
164301e04c3fSmrg         for (int x = 0; x < decoder.block_w; ++x) {
164401e04c3fSmrg
164501e04c3fSmrg            int partition;
164601e04c3fSmrg            if (num_parts > 1) {
164701e04c3fSmrg               partition = select_partition(partition_index, x, y, z, num_parts, small_block);
164801e04c3fSmrg               assert(partition < num_parts);
164901e04c3fSmrg            } else {
165001e04c3fSmrg               partition = 0;
165101e04c3fSmrg            }
165201e04c3fSmrg
165301e04c3fSmrg            /* TODO: HDR */
165401e04c3fSmrg
165501e04c3fSmrg            uint8x4_t e0 = endpoints_decoded[0][partition];
165601e04c3fSmrg            uint8x4_t e1 = endpoints_decoded[1][partition];
165701e04c3fSmrg            uint16_t c0[4], c1[4];
165801e04c3fSmrg
165901e04c3fSmrg            /* Expand to 16 bits. */
166001e04c3fSmrg            if (decoder.srgb) {
166101e04c3fSmrg               c0[0] = (uint16_t)((e0.v[0] << 8) | 0x80);
166201e04c3fSmrg               c0[1] = (uint16_t)((e0.v[1] << 8) | 0x80);
166301e04c3fSmrg               c0[2] = (uint16_t)((e0.v[2] << 8) | 0x80);
166401e04c3fSmrg               c0[3] = (uint16_t)((e0.v[3] << 8) | 0x80);
166501e04c3fSmrg
166601e04c3fSmrg               c1[0] = (uint16_t)((e1.v[0] << 8) | 0x80);
166701e04c3fSmrg               c1[1] = (uint16_t)((e1.v[1] << 8) | 0x80);
166801e04c3fSmrg               c1[2] = (uint16_t)((e1.v[2] << 8) | 0x80);
166901e04c3fSmrg               c1[3] = (uint16_t)((e1.v[3] << 8) | 0x80);
167001e04c3fSmrg            } else {
167101e04c3fSmrg               c0[0] = (uint16_t)((e0.v[0] << 8) | e0.v[0]);
167201e04c3fSmrg               c0[1] = (uint16_t)((e0.v[1] << 8) | e0.v[1]);
167301e04c3fSmrg               c0[2] = (uint16_t)((e0.v[2] << 8) | e0.v[2]);
167401e04c3fSmrg               c0[3] = (uint16_t)((e0.v[3] << 8) | e0.v[3]);
167501e04c3fSmrg
167601e04c3fSmrg               c1[0] = (uint16_t)((e1.v[0] << 8) | e1.v[0]);
167701e04c3fSmrg               c1[1] = (uint16_t)((e1.v[1] << 8) | e1.v[1]);
167801e04c3fSmrg               c1[2] = (uint16_t)((e1.v[2] << 8) | e1.v[2]);
167901e04c3fSmrg               c1[3] = (uint16_t)((e1.v[3] << 8) | e1.v[3]);
168001e04c3fSmrg            }
168101e04c3fSmrg
168201e04c3fSmrg            int w[4];
168301e04c3fSmrg            if (dual_plane) {
168401e04c3fSmrg               int w0 = infill_weights[0][idx];
168501e04c3fSmrg               int w1 = infill_weights[1][idx];
168601e04c3fSmrg               w[0] = w[1] = w[2] = w[3] = w0;
168701e04c3fSmrg               w[colour_component_selector] = w1;
168801e04c3fSmrg            } else {
168901e04c3fSmrg               int w0 = infill_weights[0][idx];
169001e04c3fSmrg               w[0] = w[1] = w[2] = w[3] = w0;
169101e04c3fSmrg            }
169201e04c3fSmrg
169301e04c3fSmrg            /* Interpolate to produce UNORM16, applying weights. */
169401e04c3fSmrg            uint16_t c[4] = {
169501e04c3fSmrg               (uint16_t)((c0[0] * (64 - w[0]) + c1[0] * w[0] + 32) >> 6),
169601e04c3fSmrg               (uint16_t)((c0[1] * (64 - w[1]) + c1[1] * w[1] + 32) >> 6),
169701e04c3fSmrg               (uint16_t)((c0[2] * (64 - w[2]) + c1[2] * w[2] + 32) >> 6),
169801e04c3fSmrg               (uint16_t)((c0[3] * (64 - w[3]) + c1[3] * w[3] + 32) >> 6),
169901e04c3fSmrg            };
170001e04c3fSmrg
170101e04c3fSmrg            if (decoder.output_unorm8) {
170201e04c3fSmrg               if (decoder.srgb) {
170301e04c3fSmrg                  output[idx*4+0] = c[0] >> 8;
170401e04c3fSmrg                  output[idx*4+1] = c[1] >> 8;
170501e04c3fSmrg                  output[idx*4+2] = c[2] >> 8;
170601e04c3fSmrg               } else {
170701e04c3fSmrg                  output[idx*4+0] = c[0] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[0]);
170801e04c3fSmrg                  output[idx*4+1] = c[1] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[1]);
170901e04c3fSmrg                  output[idx*4+2] = c[2] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[2]);
171001e04c3fSmrg               }
171101e04c3fSmrg               output[idx*4+3] = c[3] == 65535 ? 0xff : uint16_div_64k_to_half_to_unorm8(c[3]);
171201e04c3fSmrg            } else {
171301e04c3fSmrg               /* Store the color as FP16. */
171401e04c3fSmrg               output[idx*4+0] = c[0] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[0]);
171501e04c3fSmrg               output[idx*4+1] = c[1] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[1]);
171601e04c3fSmrg               output[idx*4+2] = c[2] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[2]);
171701e04c3fSmrg               output[idx*4+3] = c[3] == 65535 ? FP16_ONE : _mesa_uint16_div_64k_to_half(c[3]);
171801e04c3fSmrg            }
171901e04c3fSmrg
172001e04c3fSmrg            idx++;
172101e04c3fSmrg         }
172201e04c3fSmrg      }
172301e04c3fSmrg   }
172401e04c3fSmrg}
172501e04c3fSmrg
172601e04c3fSmrgvoid Block::calculate_from_weights()
172701e04c3fSmrg{
172801e04c3fSmrg   wt_trits = 0;
172901e04c3fSmrg   wt_quints = 0;
173001e04c3fSmrg   wt_bits = 0;
173101e04c3fSmrg   switch (high_prec) {
173201e04c3fSmrg   case 0:
173301e04c3fSmrg      switch (wt_range) {
173401e04c3fSmrg      case 0x2: wt_max = 1; wt_bits = 1; break;
173501e04c3fSmrg      case 0x3: wt_max = 2; wt_trits = 1; break;
173601e04c3fSmrg      case 0x4: wt_max = 3; wt_bits = 2; break;
173701e04c3fSmrg      case 0x5: wt_max = 4; wt_quints = 1; break;
173801e04c3fSmrg      case 0x6: wt_max = 5; wt_trits = 1; wt_bits = 1; break;
173901e04c3fSmrg      case 0x7: wt_max = 7; wt_bits = 3; break;
174001e04c3fSmrg      default: abort();
174101e04c3fSmrg      }
174201e04c3fSmrg      break;
174301e04c3fSmrg   case 1:
174401e04c3fSmrg      switch (wt_range) {
174501e04c3fSmrg      case 0x2: wt_max = 9; wt_quints = 1; wt_bits = 1; break;
174601e04c3fSmrg      case 0x3: wt_max = 11; wt_trits = 1; wt_bits = 2; break;
174701e04c3fSmrg      case 0x4: wt_max = 15; wt_bits = 4; break;
174801e04c3fSmrg      case 0x5: wt_max = 19; wt_quints = 1; wt_bits = 2; break;
174901e04c3fSmrg      case 0x6: wt_max = 23; wt_trits = 1; wt_bits = 3; break;
175001e04c3fSmrg      case 0x7: wt_max = 31; wt_bits = 5; break;
175101e04c3fSmrg      default: abort();
175201e04c3fSmrg      }
175301e04c3fSmrg      break;
175401e04c3fSmrg   }
175501e04c3fSmrg
175601e04c3fSmrg   assert(wt_trits || wt_quints || wt_bits);
175701e04c3fSmrg
175801e04c3fSmrg   num_weights = wt_w * wt_h * wt_d;
175901e04c3fSmrg
176001e04c3fSmrg   if (dual_plane)
176101e04c3fSmrg      num_weights *= 2;
176201e04c3fSmrg
176301e04c3fSmrg   weight_bits =
176401e04c3fSmrg         (num_weights * 8 * wt_trits + 4) / 5
176501e04c3fSmrg         + (num_weights * 7 * wt_quints + 2) / 3
176601e04c3fSmrg         +  num_weights * wt_bits;
176701e04c3fSmrg}
176801e04c3fSmrg
176901e04c3fSmrgvoid Block::calculate_remaining_bits()
177001e04c3fSmrg{
177101e04c3fSmrg   int config_bits;
177201e04c3fSmrg   if (num_parts > 1) {
177301e04c3fSmrg      if (!is_multi_cem)
177401e04c3fSmrg         config_bits = 29;
177501e04c3fSmrg      else
177601e04c3fSmrg         config_bits = 25 + 3 * num_parts;
177701e04c3fSmrg   } else {
177801e04c3fSmrg      config_bits = 17;
177901e04c3fSmrg   }
178001e04c3fSmrg
178101e04c3fSmrg   if (dual_plane)
178201e04c3fSmrg      config_bits += 2;
178301e04c3fSmrg
178401e04c3fSmrg   remaining_bits = 128 - config_bits - weight_bits;
178501e04c3fSmrg}
178601e04c3fSmrg
178701e04c3fSmrgdecode_error::type Block::calculate_colour_endpoints_size()
178801e04c3fSmrg{
178901e04c3fSmrg   /* Specified as illegal */
179001e04c3fSmrg   if (remaining_bits < (13 * num_cem_values + 4) / 5) {
179101e04c3fSmrg      colour_endpoint_bits = ce_max = ce_trits = ce_quints = ce_bits = 0;
179201e04c3fSmrg      return decode_error::invalid_colour_endpoints_size;
179301e04c3fSmrg   }
179401e04c3fSmrg
179501e04c3fSmrg   /* Find the largest cem_ranges that fits within remaining_bits */
179601e04c3fSmrg   for (int i = ARRAY_SIZE(cem_ranges)-1; i >= 0; --i) {
179701e04c3fSmrg      int cem_bits;
179801e04c3fSmrg      cem_bits = (num_cem_values * 8 * cem_ranges[i].t + 4) / 5
179901e04c3fSmrg                 + (num_cem_values * 7 * cem_ranges[i].q + 2) / 3
180001e04c3fSmrg                 +  num_cem_values * cem_ranges[i].b;
180101e04c3fSmrg
180201e04c3fSmrg      if (cem_bits <= remaining_bits)
180301e04c3fSmrg      {
180401e04c3fSmrg         colour_endpoint_bits = cem_bits;
180501e04c3fSmrg         ce_max = cem_ranges[i].max;
180601e04c3fSmrg         ce_trits = cem_ranges[i].t;
180701e04c3fSmrg         ce_quints = cem_ranges[i].q;
180801e04c3fSmrg         ce_bits = cem_ranges[i].b;
180901e04c3fSmrg         return decode_error::ok;
181001e04c3fSmrg      }
181101e04c3fSmrg   }
181201e04c3fSmrg
181301e04c3fSmrg   assert(0);
181401e04c3fSmrg   return decode_error::invalid_colour_endpoints_size;
181501e04c3fSmrg}
181601e04c3fSmrg
181701e04c3fSmrg/**
181801e04c3fSmrg * Decode ASTC 2D LDR texture data.
181901e04c3fSmrg *
182001e04c3fSmrg * \param src_width in pixels
182101e04c3fSmrg * \param src_height in pixels
182201e04c3fSmrg * \param dst_stride in bytes
182301e04c3fSmrg */
182401e04c3fSmrgextern "C" void
182501e04c3fSmrg_mesa_unpack_astc_2d_ldr(uint8_t *dst_row,
182601e04c3fSmrg                         unsigned dst_stride,
182701e04c3fSmrg                         const uint8_t *src_row,
182801e04c3fSmrg                         unsigned src_stride,
182901e04c3fSmrg                         unsigned src_width,
183001e04c3fSmrg                         unsigned src_height,
183101e04c3fSmrg                         mesa_format format)
183201e04c3fSmrg{
183301e04c3fSmrg   assert(_mesa_is_format_astc_2d(format));
18347ec681f3Smrg   bool srgb = _mesa_is_format_srgb(format);
183501e04c3fSmrg
183601e04c3fSmrg   unsigned blk_w, blk_h;
183701e04c3fSmrg   _mesa_get_format_block_size(format, &blk_w, &blk_h);
183801e04c3fSmrg
183901e04c3fSmrg   const unsigned block_size = 16;
184001e04c3fSmrg   unsigned x_blocks = (src_width + blk_w - 1) / blk_w;
184101e04c3fSmrg   unsigned y_blocks = (src_height + blk_h - 1) / blk_h;
184201e04c3fSmrg
184301e04c3fSmrg   Decoder dec(blk_w, blk_h, 1, srgb, true);
184401e04c3fSmrg
184501e04c3fSmrg   for (unsigned y = 0; y < y_blocks; ++y) {
184601e04c3fSmrg      for (unsigned x = 0; x < x_blocks; ++x) {
184701e04c3fSmrg         /* Same size as the largest block. */
184801e04c3fSmrg         uint16_t block_out[12 * 12 * 4];
184901e04c3fSmrg
185001e04c3fSmrg         dec.decode(src_row + x * block_size, block_out);
185101e04c3fSmrg
185201e04c3fSmrg         /* This can be smaller with NPOT dimensions. */
185301e04c3fSmrg         unsigned dst_blk_w = MIN2(blk_w, src_width  - x*blk_w);
185401e04c3fSmrg         unsigned dst_blk_h = MIN2(blk_h, src_height - y*blk_h);
185501e04c3fSmrg
185601e04c3fSmrg         for (unsigned sub_y = 0; sub_y < dst_blk_h; ++sub_y) {
185701e04c3fSmrg            for (unsigned sub_x = 0; sub_x < dst_blk_w; ++sub_x) {
185801e04c3fSmrg               uint8_t *dst = dst_row + sub_y * dst_stride +
185901e04c3fSmrg                              (x * blk_w + sub_x) * 4;
186001e04c3fSmrg               const uint16_t *src = &block_out[(sub_y * blk_w + sub_x) * 4];
186101e04c3fSmrg
186201e04c3fSmrg               dst[0] = src[0];
186301e04c3fSmrg               dst[1] = src[1];
186401e04c3fSmrg               dst[2] = src[2];
186501e04c3fSmrg               dst[3] = src[3];
186601e04c3fSmrg            }
186701e04c3fSmrg         }
186801e04c3fSmrg      }
186901e04c3fSmrg      src_row += src_stride;
187001e04c3fSmrg      dst_row += dst_stride * blk_h;
187101e04c3fSmrg   }
187201e04c3fSmrg}
1873