1/*
2 * Copyright (C) 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24/*
25 * Included by texcompress_bptc and gallium to define BPTC decoding routines.
26 */
27
28#ifndef TEXCOMPRESS_BPTC_TMP_H
29#define TEXCOMPRESS_BPTC_TMP_H
30
31#include "util/format_srgb.h"
32#include "util/half_float.h"
33#include "macros.h"
34
35#define BLOCK_SIZE 4
36#define N_PARTITIONS 64
37#define BLOCK_BYTES 16
38
/* Describes the bit layout of one BPTC UNORM block mode. The entries of
 * bptc_unorm_modes are written as positional initializers, so the member
 * order here must not change.
 */
struct bptc_unorm_mode {
   /* Number of subsets in the block; the decoder handles 1, 2 or 3 */
   int n_subsets;
   /* Width in bits of the partition number field (0 if there is none) */
   int n_partition_bits;
   /* Whether a 2-bit rotation field follows the partition number */
   bool has_rotation_bits;
   /* Whether a 1-bit index selection field is present */
   bool has_index_selection_bit;
   /* Stored bits per color component of each endpoint, excluding p-bits */
   int n_color_bits;
   /* Stored bits per alpha endpoint; 0 means alpha is fixed at 255 */
   int n_alpha_bits;
   /* Whether every endpoint has its own p-bit */
   bool has_endpoint_pbits;
   /* Whether each subset has one p-bit shared by both of its endpoints */
   bool has_shared_pbits;
   /* Bits per texel in the primary index table */
   int n_index_bits;
   /* Bits per texel in the secondary index table; 0 if there is none */
   int n_secondary_index_bits;
};
51
/* Describes one run of consecutive bits in a BPTC float block and where
 * those bits land in the decoded endpoint values.
 */
struct bptc_float_bitfield {
   /* Index of the endpoint receiving the bits; -1 terminates a list */
   int8_t endpoint;
   /* Index of the color component within that endpoint (0 to 2) */
   uint8_t component;
   /* Bit position within the component where the field is placed */
   uint8_t offset;
   /* Number of bits read from the block for this field */
   uint8_t n_bits;
   /* If true the bits are stored in reverse order within the field */
   bool reverse;
};
59
/* Describes the layout of one BPTC float block mode. The entries of
 * bptc_float_modes are written as positional initializers, so the member
 * order here must not change.
 */
struct bptc_float_mode {
   /* Set for table slots that do not describe a usable mode */
   bool reserved;
   /* If true, endpoints after the first are signed deltas from endpoint 0 */
   bool transformed_endpoints;
   /* Width of the partition field; non-zero means four endpoints, else two */
   int n_partition_bits;
   /* Precision in bits of the endpoint values */
   int n_endpoint_bits;
   /* Bits per texel index (NOTE(review): not read in the visible section) */
   int n_index_bits;
   /* Width of the delta values for each of the three color components */
   int n_delta_bits[3];
   /* Bitfields in block order, terminated by an entry with endpoint == -1 */
   struct bptc_float_bitfield bitfields[24];
};
69
/* State for writing a bit stream to memory. Nothing in the visible part of
 * this file uses it, so the member descriptions below are assumptions to be
 * confirmed against the code that does.
 */
struct bit_writer {
   /* presumably the partially filled byte not yet stored -- TODO confirm */
   uint8_t buf;
   /* presumably the number of bits currently held in buf -- TODO confirm */
   int pos;
   /* presumably the next output byte to store to -- TODO confirm */
   uint8_t *dst;
};
75
/* Layout of each UNORM mode, indexed by mode number. The decoder derives the
 * mode number from the position of the lowest set bit of the first block
 * byte (ffs(block[0]) - 1).
 *
 * Columns, in struct bptc_unorm_mode order: n_subsets, n_partition_bits,
 * has_rotation_bits, has_index_selection_bit, n_color_bits, n_alpha_bits,
 * has_endpoint_pbits, has_shared_pbits, n_index_bits,
 * n_secondary_index_bits.
 */
static const struct bptc_unorm_mode
bptc_unorm_modes[] = {
   /* 0 */ { 3, 4, false, false, 4, 0, true,  false, 3, 0 },
   /* 1 */ { 2, 6, false, false, 6, 0, false, true,  3, 0 },
   /* 2 */ { 3, 6, false, false, 5, 0, false, false, 2, 0 },
   /* 3 */ { 2, 6, false, false, 7, 0, true,  false, 2, 0 },
   /* 4 */ { 1, 0, true,  true,  5, 6, false, false, 2, 3 },
   /* 5 */ { 1, 0, true,  false, 7, 8, false, false, 2, 2 },
   /* 6 */ { 1, 0, false, false, 7, 7, true,  false, 4, 0 },
   /* 7 */ { 2, 6, false, false, 5, 5, true,  false, 2, 0 }
};
87
/* Layout of each float mode. Every entry lists, in struct bptc_float_mode
 * order: reserved, transformed_endpoints, n_partition_bits, n_endpoint_bits,
 * n_index_bits, n_delta_bits[3], and then the bitfields in the order they
 * are read from the block. Each bitfield is
 * { endpoint, component, offset, n_bits, reverse } and the list ends with an
 * entry whose endpoint is -1. Entries containing only "true" are reserved
 * slots.
 *
 * NOTE(review): the comment before each entry looks like the mode's bit
 * pattern in the block; how the table index is derived from it is outside
 * this section -- confirm against the lookup code.
 */
static const struct bptc_float_mode
bptc_float_modes[] = {
   /* 00 */
   { false, true, 5, 10, 3, { 5, 5, 5 },
     { { 2, 1, 4, 1, false }, { 2, 2, 4, 1, false }, { 3, 2, 4, 1, false },
       { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
       { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
       { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
       { 3, 2, 3, 1, false },
       { -1 } }
   },
   /* 01 */
   { false, true, 5, 7, 3, { 6, 6, 6 },
     { { 2, 1, 5, 1, false }, { 3, 1, 4, 1, false }, { 3, 1, 5, 1, false },
       { 0, 0, 0, 7, false }, { 3, 2, 0, 1, false }, { 3, 2, 1, 1, false },
       { 2, 2, 4, 1, false }, { 0, 1, 0, 7, false }, { 2, 2, 5, 1, false },
       { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false }, { 0, 2, 0, 7, false },
       { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
       { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
       { 2, 0, 0, 6, false },
       { 3, 0, 0, 6, false },
       { -1 } }
   },
   /* 00010 */
   { false, true, 5, 11, 3, { 5, 4, 4 },
     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
       { 1, 0, 0, 5, false }, { 0, 0, 10, 1, false }, { 2, 1, 0, 4, false },
       { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false }, { 3, 2, 0, 1, false },
       { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
       { -1 } }
   },
   /* 00011 */
   { false, false, 0, 10, 4, { 10, 10, 10 },
     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
       { 1, 0, 0, 10, false }, { 1, 1, 0, 10, false }, { 1, 2, 0, 10, false },
       { -1 } }
   },
   /* 00110 */
   { false, true, 5, 11, 3, { 4, 5, 4 },
     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
       { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 3, 1, 4, 1, false },
       { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false }, { 0, 1, 10, 1, false },
       { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
       { 3, 2, 0, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
       { 2, 1, 4, 1, false }, { 3, 2, 3, 1, false },
       { -1 } }
   },
   /* 00111 */
   { false, true, 0, 11, 4, { 9, 9, 9 },
     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
       { 1, 0, 0, 9, false }, { 0, 0, 10, 1, false }, { 1, 1, 0, 9, false },
       { 0, 1, 10, 1, false }, { 1, 2, 0, 9, false }, { 0, 2, 10, 1, false },
       { -1 } }
   },
   /* 01010 */
   { false, true, 5, 11, 3, { 4, 4, 5 },
     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
       { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 2, 2, 4, 1, false },
       { 2, 1, 0, 4, false }, { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false },
       { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
       { 0, 2, 10, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
       { 3, 2, 1, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
       { 3, 2, 4, 1, false }, { 3, 2, 3, 1, false },
       { -1 } }
   },
   /* 01011 */
   { false, true, 0, 12, 4, { 8, 8, 8 },
     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
       { 1, 0, 0, 8, false }, { 0, 0, 10, 2, true }, { 1, 1, 0, 8, false },
       { 0, 1, 10, 2, true }, { 1, 2, 0, 8, false }, { 0, 2, 10, 2, true },
       { -1 } }
   },
   /* 01110 */
   { false, true, 5, 9, 3, { 5, 5, 5 },
     { { 0, 0, 0, 9, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 9, false },
       { 2, 1, 4, 1, false }, { 0, 2, 0, 9, false }, { 3, 2, 4, 1, false },
       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
       { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
       { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
       { 3, 2, 3, 1, false },
       { -1 } }
   },
   /* 01111 */
   { false, true, 0, 16, 4, { 4, 4, 4 },
     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
       { 1, 0, 0, 4, false }, { 0, 0, 10, 6, true }, { 1, 1, 0, 4, false },
       { 0, 1, 10, 6, true }, { 1, 2, 0, 4, false }, { 0, 2, 10, 6, true },
       { -1 } }
   },
   /* 10010 */
   { false, true, 5, 8, 3, { 6, 5, 5 },
     { { 0, 0, 0, 8, false }, { 3, 1, 4, 1, false }, { 2, 2, 4, 1, false },
       { 0, 1, 0, 8, false }, { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false },
       { 0, 2, 0, 8, false }, { 3, 2, 3, 1, false }, { 3, 2, 4, 1, false },
       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false },
       { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 6, false },
       { 3, 0, 0, 6, false },
       { -1 } }
   },
   /* 10011 */
   { true /* reserved */ },
   /* 10110 */
   { false, true, 5, 8, 3, { 5, 6, 5 },
     { { 0, 0, 0, 8, false }, { 3, 2, 0, 1, false }, { 2, 2, 4, 1, false },
       { 0, 1, 0, 8, false }, { 2, 1, 5, 1, false }, { 2, 1, 4, 1, false },
       { 0, 2, 0, 8, false }, { 3, 1, 5, 1, false }, { 3, 2, 4, 1, false },
       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
       { 1, 1, 0, 6, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
       { -1 } }
   },
   /* 10111 */
   { true /* reserved */ },
   /* 11010 */
   { false, true, 5, 8, 3, { 5, 5, 6 },
     { { 0, 0, 0, 8, false }, { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false },
       { 0, 1, 0, 8, false }, { 2, 2, 5, 1, false }, { 2, 1, 4, 1, false },
       { 0, 2, 0, 8, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
       { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
       { -1 } }
   },
   /* 11011 */
   { true /* reserved */ },
   /* 11110 */
   { false, false, 5, 6, 3, { 6, 6, 6 },
     { { 0, 0, 0, 6, false }, { 3, 1, 4, 1, false }, { 3, 2, 0, 1, false },
       { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 6, false },
       { 2, 1, 5, 1, false }, { 2, 2, 5, 1, false }, { 3, 2, 2, 1, false },
       { 2, 1, 4, 1, false }, { 0, 2, 0, 6, false }, { 3, 1, 5, 1, false },
       { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
       { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
       { 2, 0, 0, 6, false }, { 3, 0, 0, 6, false },
       { -1 } }
   },
   /* 11111 */
   { true /* reserved */ },
};
238
/* This partition table is used when the mode has two subsets. It is indexed
 * by the partition number read from the block. Each partition is
 * represented by a 32-bit value which gives 2 bits per texel within the
 * block; the decoder reads the entry for texel t as (value >> (t * 2)) & 3.
 * The value of the two bits represents which subset to use (0 or 1).
 */
static const uint32_t
partition_table1[N_PARTITIONS] = {
   0x50505050U, 0x40404040U, 0x54545454U, 0x54505040U,
   0x50404000U, 0x55545450U, 0x55545040U, 0x54504000U,
   0x50400000U, 0x55555450U, 0x55544000U, 0x54400000U,
   0x55555440U, 0x55550000U, 0x55555500U, 0x55000000U,
   0x55150100U, 0x00004054U, 0x15010000U, 0x00405054U,
   0x00004050U, 0x15050100U, 0x05010000U, 0x40505054U,
   0x00404050U, 0x05010100U, 0x14141414U, 0x05141450U,
   0x01155440U, 0x00555500U, 0x15014054U, 0x05414150U,
   0x44444444U, 0x55005500U, 0x11441144U, 0x05055050U,
   0x05500550U, 0x11114444U, 0x41144114U, 0x44111144U,
   0x15055054U, 0x01055040U, 0x05041050U, 0x05455150U,
   0x14414114U, 0x50050550U, 0x41411414U, 0x00141400U,
   0x00041504U, 0x00105410U, 0x10541000U, 0x04150400U,
   0x50410514U, 0x41051450U, 0x05415014U, 0x14054150U,
   0x41050514U, 0x41505014U, 0x40011554U, 0x54150140U,
   0x50505500U, 0x00555050U, 0x15151010U, 0x54540404U,
};
263
/* This partition table is used when the mode has three subsets. It is
 * indexed by the partition number read from the block and holds 2 bits per
 * texel, read by the decoder as (value >> (t * 2)) & 3 for texel t. In this
 * case the values can be 0, 1 or 2.
 */
static const uint32_t
partition_table2[N_PARTITIONS] = {
   0xaa685050U, 0x6a5a5040U, 0x5a5a4200U, 0x5450a0a8U,
   0xa5a50000U, 0xa0a05050U, 0x5555a0a0U, 0x5a5a5050U,
   0xaa550000U, 0xaa555500U, 0xaaaa5500U, 0x90909090U,
   0x94949494U, 0xa4a4a4a4U, 0xa9a59450U, 0x2a0a4250U,
   0xa5945040U, 0x0a425054U, 0xa5a5a500U, 0x55a0a0a0U,
   0xa8a85454U, 0x6a6a4040U, 0xa4a45000U, 0x1a1a0500U,
   0x0050a4a4U, 0xaaa59090U, 0x14696914U, 0x69691400U,
   0xa08585a0U, 0xaa821414U, 0x50a4a450U, 0x6a5a0200U,
   0xa9a58000U, 0x5090a0a8U, 0xa8a09050U, 0x24242424U,
   0x00aa5500U, 0x24924924U, 0x24499224U, 0x50a50a50U,
   0x500aa550U, 0xaaaa4444U, 0x66660000U, 0xa5a0a5a0U,
   0x50a050a0U, 0x69286928U, 0x44aaaa44U, 0x66666600U,
   0xaa444444U, 0x54a854a8U, 0x95809580U, 0x96969600U,
   0xa85454a8U, 0x80959580U, 0xaa141414U, 0x96960000U,
   0xaaaa1414U, 0xa05050a0U, 0xa0a5a5a0U, 0x96000000U,
   0x40804080U, 0xa9a8a9a8U, 0xaaaaaa44U, 0x2a4a5254U
};
286
/* Texel numbers of the anchor texels, indexed first by table row and then by
 * partition number. Row 0 is consulted for two-subset modes and rows 1 and 2
 * for three-subset modes. Texel 0 is always treated as an anchor by the
 * decoder and is therefore not stored here. An anchor texel's index is read
 * with one bit fewer than the mode's index width.
 */
static const uint8_t
anchor_indices[][N_PARTITIONS] = {
   /* Anchor index values for the second subset of two-subset partitioning */
   {
      0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,
      0xf,0x2,0x8,0x2,0x2,0x8,0x8,0xf,0x2,0x8,0x2,0x2,0x8,0x8,0x2,0x2,
      0xf,0xf,0x6,0x8,0x2,0x8,0xf,0xf,0x2,0x8,0x2,0x2,0x2,0xf,0xf,0x6,
      0x6,0x2,0x6,0x8,0xf,0xf,0x2,0x2,0xf,0xf,0xf,0xf,0xf,0x2,0x2,0xf
   },

   /* Anchor index values for the second subset of three-subset partitioning */
   {
      0x3,0x3,0xf,0xf,0x8,0x3,0xf,0xf,0x8,0x8,0x6,0x6,0x6,0x5,0x3,0x3,
      0x3,0x3,0x8,0xf,0x3,0x3,0x6,0xa,0x5,0x8,0x8,0x6,0x8,0x5,0xf,0xf,
      0x8,0xf,0x3,0x5,0x6,0xa,0x8,0xf,0xf,0x3,0xf,0x5,0xf,0xf,0xf,0xf,
      0x3,0xf,0x5,0x5,0x5,0x8,0x5,0xa,0x5,0xa,0x8,0xd,0xf,0xc,0x3,0x3
   },

   /* Anchor index values for the third subset of three-subset
    * partitioning
    */
   {
      0xf,0x8,0x8,0x3,0xf,0xf,0x3,0x8,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x8,
      0xf,0x8,0xf,0x3,0xf,0x8,0xf,0x8,0x3,0xf,0x6,0xa,0xf,0xf,0xa,0x8,
      0xf,0x3,0xf,0xa,0xa,0x8,0x9,0xa,0x6,0xf,0x8,0xf,0x3,0x6,0x6,0x8,
      0xf,0x3,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x3,0xf,0xf,0x8
   }
};
315
/* Reads an n_bits wide little-endian bit field starting at bit position
 * offset of block. Bit 0 of the result is the bit at offset; bits are taken
 * from the least-significant end of each byte and may span several bytes.
 *
 * Returns 0 without touching block when n_bits is zero or negative, so a
 * zero-width field at the very end of a block never reads past it.
 */
static int
extract_bits(const uint8_t *block,
             int offset,
             int n_bits)
{
   int byte_index = offset / 8;
   int bit_index = offset % 8;
   int result = 0;
   int shift = 0;

   while (n_bits > 0) {
      /* Only the first byte can start part-way through */
      int available = 8 - bit_index;
      int take = n_bits < available ? n_bits : available;

      result |= ((block[byte_index] >> bit_index) &
                 ((1 << take) - 1)) << shift;

      n_bits -= take;
      shift += take;
      byte_index++;
      bit_index = 0;
   }

   return result;
}
342
/* Widens an n_bits wide value held in the low bits of byte to a full 8 bits.
 * The value is moved to the top of the byte and its own most-significant
 * bits are replicated into the vacated low bits.
 */
static uint8_t
expand_component(uint8_t byte,
                 int n_bits)
{
   const int pad_bits = 8 - n_bits;
   const int shifted_up = byte << pad_bits;
   /* n_bits - pad_bits is the same amount as 2 * n_bits - 8 */
   const int replicated = byte >> (n_bits - pad_bits);

   return shifted_up | replicated;
}
352
353static int
354extract_unorm_endpoints(const struct bptc_unorm_mode *mode,
355                        const uint8_t *block,
356                        int bit_offset,
357                        uint8_t endpoints[][4])
358{
359   int component;
360   int subset;
361   int endpoint;
362   int pbit;
363   int n_components;
364
365   /* Extract each color component */
366   for (component = 0; component < 3; component++) {
367      for (subset = 0; subset < mode->n_subsets; subset++) {
368         for (endpoint = 0; endpoint < 2; endpoint++) {
369            endpoints[subset * 2 + endpoint][component] =
370               extract_bits(block, bit_offset, mode->n_color_bits);
371            bit_offset += mode->n_color_bits;
372         }
373      }
374   }
375
376   /* Extract the alpha values */
377   if (mode->n_alpha_bits > 0) {
378      for (subset = 0; subset < mode->n_subsets; subset++) {
379         for (endpoint = 0; endpoint < 2; endpoint++) {
380            endpoints[subset * 2 + endpoint][3] =
381               extract_bits(block, bit_offset, mode->n_alpha_bits);
382            bit_offset += mode->n_alpha_bits;
383         }
384      }
385
386      n_components = 4;
387   } else {
388      for (subset = 0; subset < mode->n_subsets; subset++)
389         for (endpoint = 0; endpoint < 2; endpoint++)
390            endpoints[subset * 2 + endpoint][3] = 255;
391
392      n_components = 3;
393   }
394
395   /* Add in the p-bits */
396   if (mode->has_endpoint_pbits) {
397      for (subset = 0; subset < mode->n_subsets; subset++) {
398         for (endpoint = 0; endpoint < 2; endpoint++) {
399            pbit = extract_bits(block, bit_offset, 1);
400            bit_offset += 1;
401
402            for (component = 0; component < n_components; component++) {
403               endpoints[subset * 2 + endpoint][component] <<= 1;
404               endpoints[subset * 2 + endpoint][component] |= pbit;
405            }
406         }
407      }
408   } else if (mode->has_shared_pbits) {
409      for (subset = 0; subset < mode->n_subsets; subset++) {
410         pbit = extract_bits(block, bit_offset, 1);
411         bit_offset += 1;
412
413         for (endpoint = 0; endpoint < 2; endpoint++) {
414            for (component = 0; component < n_components; component++) {
415               endpoints[subset * 2 + endpoint][component] <<= 1;
416               endpoints[subset * 2 + endpoint][component] |= pbit;
417            }
418         }
419      }
420   }
421
422   /* Expand the n-bit values to a byte */
423   for (subset = 0; subset < mode->n_subsets; subset++) {
424      for (endpoint = 0; endpoint < 2; endpoint++) {
425         for (component = 0; component < 3; component++) {
426            endpoints[subset * 2 + endpoint][component] =
427               expand_component(endpoints[subset * 2 + endpoint][component],
428                                mode->n_color_bits +
429                                mode->has_endpoint_pbits +
430                                mode->has_shared_pbits);
431         }
432
433         if (mode->n_alpha_bits > 0) {
434            endpoints[subset * 2 + endpoint][3] =
435               expand_component(endpoints[subset * 2 + endpoint][3],
436                                mode->n_alpha_bits +
437                                mode->has_endpoint_pbits +
438                                mode->has_shared_pbits);
439         }
440      }
441   }
442
443   return bit_offset;
444}
445
446static bool
447is_anchor(int n_subsets,
448          int partition_num,
449          int texel)
450{
451   if (texel == 0)
452      return true;
453
454   switch (n_subsets) {
455   case 1:
456      return false;
457   case 2:
458      return anchor_indices[0][partition_num] == texel;
459   case 3:
460      return (anchor_indices[1][partition_num] == texel ||
461              anchor_indices[2][partition_num] == texel);
462   default:
463      assert(false);
464      return false;
465   }
466}
467
468static int
469count_anchors_before_texel(int n_subsets,
470                           int partition_num,
471                           int texel)
472{
473   int count = 1;
474
475   if (texel == 0)
476      return 0;
477
478   switch (n_subsets) {
479   case 1:
480      break;
481   case 2:
482      if (texel > anchor_indices[0][partition_num])
483         count++;
484      break;
485   case 3:
486      if (texel > anchor_indices[1][partition_num])
487         count++;
488      if (texel > anchor_indices[2][partition_num])
489         count++;
490      break;
491   default:
492      assert(false);
493      return 0;
494   }
495
496   return count;
497}
498
/* Blends the endpoint values a and b using the weight selected by index from
 * the weight table for index_bits wide indices (2, 3 or 4). Weights are in
 * 64ths: index 0 yields a and the highest index yields b. The result is
 * rounded by adding 32 before the shift by 6.
 */
static int32_t
interpolate(int32_t a, int32_t b,
            int index,
            int index_bits)
{
   static const uint8_t two_bit_weights[] = { 0, 21, 43, 64 };
   static const uint8_t three_bit_weights[] = {
      0, 9, 18, 27, 37, 46, 55, 64
   };
   static const uint8_t four_bit_weights[] = {
      0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
   };
   /* Indexed by index width; widths 0 and 1 have no table */
   static const uint8_t *tables[] = {
      NULL, NULL, two_bit_weights, three_bit_weights, four_bit_weights
   };
   const int weight_b = tables[index_bits][index];
   const int weight_a = 64 - weight_b;

   return (weight_a * a + weight_b * b + 32) >> 6;
}
517
/* Applies the block's rotation to a decoded RGBA texel in place. A rotation
 * of 0 leaves the texel unchanged; a rotation of n > 0 exchanges component
 * n - 1 with the alpha component (result[3]).
 */
static void
apply_rotation(int rotation,
               uint8_t *result)
{
   if (rotation != 0) {
      uint8_t *swapped = &result[rotation - 1];
      const uint8_t alpha = result[3];

      result[3] = *swapped;
      *swapped = alpha;
   }
}
533
534static void
535fetch_rgba_unorm_from_block(const uint8_t *block,
536                            uint8_t *result,
537                            int texel)
538{
539   int mode_num = ffs(block[0]);
540   const struct bptc_unorm_mode *mode;
541   int bit_offset, secondary_bit_offset;
542   int partition_num;
543   int subset_num;
544   int rotation;
545   int index_selection;
546   int index_bits;
547   int indices[2];
548   int index;
549   int anchors_before_texel;
550   bool anchor;
551   uint8_t endpoints[3 * 2][4];
552   uint32_t subsets;
553   int component;
554
555   if (mode_num == 0) {
556      /* According to the spec this mode is reserved and shouldn't be used. */
557      memset(result, 0, 3);
558      result[3] = 0xff;
559      return;
560   }
561
562   mode = bptc_unorm_modes + mode_num - 1;
563   bit_offset = mode_num;
564
565   partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
566   bit_offset += mode->n_partition_bits;
567
568   switch (mode->n_subsets) {
569   case 1:
570      subsets = 0;
571      break;
572   case 2:
573      subsets = partition_table1[partition_num];
574      break;
575   case 3:
576      subsets = partition_table2[partition_num];
577      break;
578   default:
579      assert(false);
580      return;
581   }
582
583   if (mode->has_rotation_bits) {
584      rotation = extract_bits(block, bit_offset, 2);
585      bit_offset += 2;
586   } else {
587      rotation = 0;
588   }
589
590   if (mode->has_index_selection_bit) {
591      index_selection = extract_bits(block, bit_offset, 1);
592      bit_offset++;
593   } else {
594      index_selection = 0;
595   }
596
597   bit_offset = extract_unorm_endpoints(mode, block, bit_offset, endpoints);
598
599   anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
600                                                     partition_num, texel);
601
602   /* Calculate the offset to the secondary index */
603   secondary_bit_offset = (bit_offset +
604                           BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
605                           mode->n_subsets +
606                           mode->n_secondary_index_bits * texel -
607                           anchors_before_texel);
608
609   /* Calculate the offset to the primary index for this texel */
610   bit_offset += mode->n_index_bits * texel - anchors_before_texel;
611
612   subset_num = (subsets >> (texel * 2)) & 3;
613
614   anchor = is_anchor(mode->n_subsets, partition_num, texel);
615
616   index_bits = mode->n_index_bits;
617   if (anchor)
618      index_bits--;
619   indices[0] = extract_bits(block, bit_offset, index_bits);
620
621   if (mode->n_secondary_index_bits) {
622      index_bits = mode->n_secondary_index_bits;
623      if (anchor)
624         index_bits--;
625      indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
626   }
627
628   index = indices[index_selection];
629   index_bits = (index_selection ?
630                 mode->n_secondary_index_bits :
631                 mode->n_index_bits);
632
633   for (component = 0; component < 3; component++)
634      result[component] = interpolate(endpoints[subset_num * 2][component],
635                                      endpoints[subset_num * 2 + 1][component],
636                                      index,
637                                      index_bits);
638
639   /* Alpha uses the opposite index from the color components */
640   if (mode->n_secondary_index_bits && !index_selection) {
641      index = indices[1];
642      index_bits = mode->n_secondary_index_bits;
643   } else {
644      index = indices[0];
645      index_bits = mode->n_index_bits;
646   }
647
648   result[3] = interpolate(endpoints[subset_num * 2][3],
649                           endpoints[subset_num * 2 + 1][3],
650                           index,
651                           index_bits);
652
653   apply_rotation(rotation, result);
654}
655
656#ifdef BPTC_BLOCK_DECODE
657static void
658decompress_rgba_unorm_block(int src_width, int src_height,
659                            const uint8_t *block,
660                            uint8_t *dst_row, int dst_rowstride)
661{
662   int mode_num = ffs(block[0]);
663   const struct bptc_unorm_mode *mode;
664   int bit_offset, secondary_bit_offset;
665   int partition_num;
666   int subset_num;
667   int rotation;
668   int index_selection;
669   int index_bits;
670   int indices[2];
671   int index;
672   int anchors_before_texel;
673   bool anchor;
674   uint8_t endpoints[3 * 2][4];
675   uint32_t subsets;
676   int component;
677   unsigned x, y;
678
679   if (mode_num == 0) {
680      /* According to the spec this mode is reserved and shouldn't be used. */
681      for(y = 0; y < src_height; y += 1) {
682         uint8_t *result = dst_row;
683         memset(result, 0, 4 * src_width);
684         for(x = 0; x < src_width; x += 1) {
685            result[3] = 0xff;
686            result += 4;
687         }
688         dst_row += dst_rowstride;
689      }
690      return;
691   }
692
693   mode = bptc_unorm_modes + mode_num - 1;
694   bit_offset = mode_num;
695
696   partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
697   bit_offset += mode->n_partition_bits;
698
699   switch (mode->n_subsets) {
700   case 1:
701      subsets = 0;
702      break;
703   case 2:
704      subsets = partition_table1[partition_num];
705      break;
706   case 3:
707      subsets = partition_table2[partition_num];
708      break;
709   default:
710      assert(false);
711      return;
712   }
713
714   if (mode->has_rotation_bits) {
715      rotation = extract_bits(block, bit_offset, 2);
716      bit_offset += 2;
717   } else {
718      rotation = 0;
719   }
720
721   if (mode->has_index_selection_bit) {
722      index_selection = extract_bits(block, bit_offset, 1);
723      bit_offset++;
724   } else {
725      index_selection = 0;
726   }
727
728   bit_offset = extract_unorm_endpoints(mode, block, bit_offset, endpoints);
729
730   for(y = 0; y < src_height; y += 1) {
731      uint8_t *result = dst_row;
732      for(x = 0; x < src_width; x += 1) {
733         int texel;
734         texel = x + y * 4;
735
736         anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
737                                                           partition_num,
738                                                           texel);
739
740         /* Calculate the offset to the secondary index */
741         secondary_bit_offset = (bit_offset +
742                                 BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
743                                 mode->n_subsets +
744                                 mode->n_secondary_index_bits * texel -
745                                 anchors_before_texel);
746
747         /* Calculate the offset to the primary index for this texel */
748         bit_offset += mode->n_index_bits * texel - anchors_before_texel;
749
750         subset_num = (subsets >> (texel * 2)) & 3;
751
752         anchor = is_anchor(mode->n_subsets, partition_num, texel);
753
754         index_bits = mode->n_index_bits;
755         if (anchor)
756            index_bits--;
757         indices[0] = extract_bits(block, bit_offset, index_bits);
758
759         if (mode->n_secondary_index_bits) {
760            index_bits = mode->n_secondary_index_bits;
761            if (anchor)
762               index_bits--;
763            indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
764         }
765
766         index = indices[index_selection];
767         index_bits = (index_selection ?
768                       mode->n_secondary_index_bits :
769                       mode->n_index_bits);
770
771         for (component = 0; component < 3; component++)
772            result[component] = interpolate(endpoints[subset_num * 2][component],
773                                            endpoints[subset_num * 2 + 1][component],
774                                            index,
775                                            index_bits);
776
777         /* Alpha uses the opposite index from the color components */
778         if (mode->n_secondary_index_bits && !index_selection) {
779            index = indices[1];
780            index_bits = mode->n_secondary_index_bits;
781         } else {
782            index = indices[0];
783            index_bits = mode->n_index_bits;
784         }
785
786         result[3] = interpolate(endpoints[subset_num * 2][3],
787                                 endpoints[subset_num * 2 + 1][3],
788                                 index,
789                                 index_bits);
790
791         apply_rotation(rotation, result);
792         result += 4;
793      }
794      dst_row += dst_rowstride;
795   }
796}
797
798static void
799decompress_rgba_unorm(int width, int height,
800                      const uint8_t *src, int src_rowstride,
801                      uint8_t *dst, int dst_rowstride)
802{
803   int src_row_diff;
804   int y, x;
805
806   if (src_rowstride >= width * 4)
807      src_row_diff = src_rowstride - ((width + 3) & ~3) * 4;
808   else
809      src_row_diff = 0;
810
811   for (y = 0; y < height; y += BLOCK_SIZE) {
812      for (x = 0; x < width; x += BLOCK_SIZE) {
813         decompress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
814                                     MIN2(height - y, BLOCK_SIZE),
815                                     src,
816                                     dst + x * 4 + y * dst_rowstride,
817                                     dst_rowstride);
818         src += BLOCK_BYTES;
819      }
820      src += src_row_diff;
821   }
822}
823#endif // BPTC_BLOCK_DECODE
824
/* Sign-extends the n_bits wide two's complement number held in the low bits
 * of value to a full int32_t. If bit n_bits - 1 is set, all higher bits are
 * set as well; otherwise value is returned unchanged.
 *
 * The mask is built with unsigned arithmetic because left-shifting a
 * negative signed value is undefined behaviour in C. Widths outside 1..31
 * cannot be extended and return value unchanged.
 */
static int32_t
sign_extend(int32_t value,
            int n_bits)
{
   uint32_t bits = (uint32_t) value;

   if (n_bits <= 0 || n_bits >= 32)
      return value;

   if (bits & ((uint32_t) 1 << (n_bits - 1)))
      bits |= ~(uint32_t) 0 << n_bits;

   return (int32_t) bits;
}
835
/* Expands a signed endpoint value of n_endpoint_bits precision to the
 * signed 16-bit range. Values of 16 or more bits, and zero, are returned
 * unchanged. A magnitude of at least (1 << (n_endpoint_bits - 1)) - 1 maps to
 * 0x7fff; any other magnitude is scaled as
 * ((m << 15) + 0x4000) >> (n_endpoint_bits - 1). The sign of the input is
 * preserved.
 */
static int
signed_unquantize(int value, int n_endpoint_bits)
{
   int magnitude;
   int scaled;

   if (n_endpoint_bits >= 16 || value == 0)
      return value;

   magnitude = value < 0 ? -value : value;

   if (magnitude >= (1 << (n_endpoint_bits - 1)) - 1)
      scaled = 0x7fff;
   else
      scaled = ((magnitude << 15) + 0x4000) >> (n_endpoint_bits - 1);

   return value < 0 ? -scaled : scaled;
}
864
/* Expands an unsigned endpoint value of n_endpoint_bits precision to the
 * unsigned 16-bit range. Values of 15 or more bits are returned unchanged,
 * zero stays zero and the all-ones value maps to 0xffff. Anything else is
 * scaled as ((value << 15) + 0x4000) >> (n_endpoint_bits - 1).
 */
static int
unsigned_unquantize(int value, int n_endpoint_bits)
{
   int all_ones;

   if (n_endpoint_bits >= 15)
      return value;

   if (value == 0)
      return 0;

   all_ones = (1 << n_endpoint_bits) - 1;

   return value == all_ones
      ? 0xffff
      : ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);
}
879
880static int
881extract_float_endpoints(const struct bptc_float_mode *mode,
882                        const uint8_t *block,
883                        int bit_offset,
884                        int32_t endpoints[][3],
885                        bool is_signed)
886{
887   const struct bptc_float_bitfield *bitfield;
888   int endpoint, component;
889   int n_endpoints;
890   int value;
891   int i;
892
893   if (mode->n_partition_bits)
894      n_endpoints = 4;
895   else
896      n_endpoints = 2;
897
898   memset(endpoints, 0, sizeof endpoints[0][0] * n_endpoints * 3);
899
900   for (bitfield = mode->bitfields; bitfield->endpoint != -1; bitfield++) {
901      value = extract_bits(block, bit_offset, bitfield->n_bits);
902      bit_offset += bitfield->n_bits;
903
904      if (bitfield->reverse) {
905         for (i = 0; i < bitfield->n_bits; i++) {
906            if (value & (1 << i))
907               endpoints[bitfield->endpoint][bitfield->component] |=
908                  1 << ((bitfield->n_bits - 1 - i) + bitfield->offset);
909         }
910      } else {
911         endpoints[bitfield->endpoint][bitfield->component] |=
912            value << bitfield->offset;
913      }
914   }
915
916   if (mode->transformed_endpoints) {
917      /* The endpoints are specified as signed offsets from e0 */
918      for (endpoint = 1; endpoint < n_endpoints; endpoint++) {
919         for (component = 0; component < 3; component++) {
920            value = sign_extend(endpoints[endpoint][component],
921                                mode->n_delta_bits[component]);
922            endpoints[endpoint][component] =
923               ((endpoints[0][component] + value) &
924                ((1 << mode->n_endpoint_bits) - 1));
925         }
926      }
927   }
928
929   if (is_signed) {
930      for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
931         for (component = 0; component < 3; component++) {
932            value = sign_extend(endpoints[endpoint][component],
933                                mode->n_endpoint_bits);
934            endpoints[endpoint][component] =
935               signed_unquantize(value, mode->n_endpoint_bits);
936         }
937      }
938   } else {
939      for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
940         for (component = 0; component < 3; component++) {
941            endpoints[endpoint][component] =
942               unsigned_unquantize(endpoints[endpoint][component],
943                                   mode->n_endpoint_bits);
944         }
945      }
946   }
947
948   return bit_offset;
949}
950
/*
 * Final scaling step for an interpolated unsigned value: multiplies by 31/64
 * using integer arithmetic (truncating division).
 */
static int32_t
finish_unsigned_unquantize(int32_t value)
{
   const int32_t scaled = value * 31;

   return scaled / 64;
}
956
/*
 * Final scaling step for an interpolated signed value: the magnitude is
 * multiplied by 31/32 (truncating) and, for negative inputs, bit 15 is set
 * in the result to carry the sign.
 */
static int32_t
finish_signed_unquantize(int32_t value)
{
   int32_t sign_bit = 0;

   if (value < 0) {
      value = -value;
      sign_bit = 0x8000;
   }

   return (value * 31 / 32) | sign_bit;
}
965
966static void
967fetch_rgb_float_from_block(const uint8_t *block,
968                           float *result,
969                           int texel,
970                           bool is_signed)
971{
972   int mode_num;
973   const struct bptc_float_mode *mode;
974   int bit_offset;
975   int partition_num;
976   int subset_num;
977   int index_bits;
978   int index;
979   int anchors_before_texel;
980   int32_t endpoints[2 * 2][3];
981   uint32_t subsets;
982   int n_subsets;
983   int component;
984   int32_t value;
985
986   if (block[0] & 0x2) {
987      mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
988      bit_offset = 5;
989   } else {
990      mode_num = block[0] & 3;
991      bit_offset = 2;
992   }
993
994   mode = bptc_float_modes + mode_num;
995
996   if (mode->reserved) {
997      memset(result, 0, sizeof result[0] * 3);
998      result[3] = 1.0f;
999      return;
1000   }
1001
1002   bit_offset = extract_float_endpoints(mode, block, bit_offset,
1003                                        endpoints, is_signed);
1004
1005   if (mode->n_partition_bits) {
1006      partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
1007      bit_offset += mode->n_partition_bits;
1008
1009      subsets = partition_table1[partition_num];
1010      n_subsets = 2;
1011   } else {
1012      partition_num = 0;
1013      subsets = 0;
1014      n_subsets = 1;
1015   }
1016
1017   anchors_before_texel =
1018      count_anchors_before_texel(n_subsets, partition_num, texel);
1019
1020   /* Calculate the offset to the primary index for this texel */
1021   bit_offset += mode->n_index_bits * texel - anchors_before_texel;
1022
1023   subset_num = (subsets >> (texel * 2)) & 3;
1024
1025   index_bits = mode->n_index_bits;
1026   if (is_anchor(n_subsets, partition_num, texel))
1027      index_bits--;
1028   index = extract_bits(block, bit_offset, index_bits);
1029
1030   for (component = 0; component < 3; component++) {
1031      value = interpolate(endpoints[subset_num * 2][component],
1032                          endpoints[subset_num * 2 + 1][component],
1033                          index,
1034                          mode->n_index_bits);
1035
1036      if (is_signed)
1037         value = finish_signed_unquantize(value);
1038      else
1039         value = finish_unsigned_unquantize(value);
1040
1041      result[component] = _mesa_half_to_float(value);
1042   }
1043
1044   result[3] = 1.0f;
1045}
1046
1047#ifdef BPTC_BLOCK_DECODE
1048static void
1049decompress_rgb_float_block(unsigned src_width, unsigned src_height,
1050                           const uint8_t *block,
1051                           float *dst_row, unsigned dst_rowstride,
1052                           bool is_signed)
1053{
1054   int mode_num;
1055   const struct bptc_float_mode *mode;
1056   int bit_offset;
1057   int partition_num;
1058   int subset_num;
1059   int index_bits;
1060   int index;
1061   int anchors_before_texel;
1062   int32_t endpoints[2 * 2][3];
1063   uint32_t subsets;
1064   int n_subsets;
1065   int component;
1066   int32_t value;
1067   unsigned x, y;
1068
1069   if (block[0] & 0x2) {
1070      mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
1071      bit_offset = 5;
1072   } else {
1073      mode_num = block[0] & 3;
1074      bit_offset = 2;
1075   }
1076
1077   mode = bptc_float_modes + mode_num;
1078
1079   if (mode->reserved) {
1080      for(y = 0; y < src_height; y += 1) {
1081         float *result = dst_row;
1082         memset(result, 0, sizeof result[0] * 4 * src_width);
1083         for(x = 0; x < src_width; x += 1) {
1084            result[3] = 1.0f;
1085            result += 4;
1086         }
1087         dst_row += dst_rowstride / sizeof dst_row[0];
1088      }
1089      return;
1090   }
1091
1092   bit_offset = extract_float_endpoints(mode, block, bit_offset,
1093                                        endpoints, is_signed);
1094
1095   if (mode->n_partition_bits) {
1096      partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
1097      bit_offset += mode->n_partition_bits;
1098
1099      subsets = partition_table1[partition_num];
1100      n_subsets = 2;
1101   } else {
1102      partition_num = 0;
1103      subsets = 0;
1104      n_subsets = 1;
1105   }
1106
1107   for(y = 0; y < src_height; y += 1) {
1108      float *result = dst_row;
1109      for(x = 0; x < src_width; x += 1) {
1110         int texel;
1111
1112         texel = x + y * 4;
1113
1114         anchors_before_texel =
1115            count_anchors_before_texel(n_subsets, partition_num, texel);
1116
1117         /* Calculate the offset to the primary index for this texel */
1118         bit_offset += mode->n_index_bits * texel - anchors_before_texel;
1119
1120         subset_num = (subsets >> (texel * 2)) & 3;
1121
1122         index_bits = mode->n_index_bits;
1123         if (is_anchor(n_subsets, partition_num, texel))
1124            index_bits--;
1125         index = extract_bits(block, bit_offset, index_bits);
1126
1127         for (component = 0; component < 3; component++) {
1128            value = interpolate(endpoints[subset_num * 2][component],
1129                                endpoints[subset_num * 2 + 1][component],
1130                                index,
1131                                mode->n_index_bits);
1132
1133            if (is_signed)
1134               value = finish_signed_unquantize(value);
1135            else
1136               value = finish_unsigned_unquantize(value);
1137
1138            result[component] = _mesa_half_to_float(value);
1139         }
1140
1141         result[3] = 1.0f;
1142         result += 4;
1143      }
1144      dst_row += dst_rowstride / sizeof dst_row[0];
1145   }
1146}
1147
1148static void
1149decompress_rgb_float(int width, int height,
1150                      const uint8_t *src, int src_rowstride,
1151                      float *dst, int dst_rowstride, bool is_signed)
1152{
1153   int src_row_diff;
1154   int y, x;
1155
1156   if (src_rowstride >= width * 4)
1157      src_row_diff = src_rowstride - ((width + 3) & ~3) * 4;
1158   else
1159      src_row_diff = 0;
1160
1161   for (y = 0; y < height; y += BLOCK_SIZE) {
1162      for (x = 0; x < width; x += BLOCK_SIZE) {
1163         decompress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
1164                                    MIN2(height - y, BLOCK_SIZE),
1165                                    src,
1166                                    (dst + x * 4 +
1167                                     (y * dst_rowstride / sizeof dst[0])),
1168                                    dst_rowstride, is_signed);
1169         src += BLOCK_BYTES;
1170      }
1171      src += src_row_diff;
1172   }
1173}
1174#endif // BPTC_BLOCK_DECODE
1175
1176static void
1177write_bits(struct bit_writer *writer, int n_bits, int value)
1178{
1179   do {
1180      if (n_bits + writer->pos >= 8) {
1181         *(writer->dst++) = writer->buf | (value << writer->pos);
1182         writer->buf = 0;
1183         value >>= (8 - writer->pos);
1184         n_bits -= (8 - writer->pos);
1185         writer->pos = 0;
1186      } else {
1187         writer->buf |= value << writer->pos;
1188         writer->pos += n_bits;
1189         break;
1190      }
1191   } while (n_bits > 0);
1192}
1193
/*
 * Computes the average luminance and average alpha of a block of 4-byte
 * RGBA texels.
 *
 * Luminance is taken as the plain sum R + G + B of a texel. src_rowstride is
 * the distance between rows in bytes. Both averages use truncating integer
 * division by the number of texels.
 */
static void
get_average_luminance_alpha_unorm(int width, int height,
                                  const uint8_t *src, int src_rowstride,
                                  int *average_luminance, int *average_alpha)
{
   const int n_texels = width * height;
   const int row_padding = src_rowstride - width * 4;
   int rgb_total = 0;
   int alpha_total = 0;
   int row, col;

   for (row = 0; row < height; row++, src += row_padding) {
      for (col = 0; col < width; col++, src += 4) {
         rgb_total += src[0] + src[1] + src[2];
         alpha_total += src[3];
      }
   }

   *average_luminance = rgb_total / n_texels;
   *average_alpha = alpha_total / n_texels;
}
1214
/*
 * Picks two RGBA endpoints for a block of 4-byte RGBA texels.
 *
 * The colour endpoints are the per-component averages of the texels whose
 * luminance (R + G + B) is below average_luminance and of the remaining
 * texels. The alpha endpoints are chosen the same way by comparing each
 * texel's alpha against average_alpha. If one side of a split is empty both
 * endpoints are set to the average over the whole block.
 *
 * src_rowstride is the distance between rows in bytes. endpoints[0] and
 * endpoints[1] receive the result. Colour and alpha endpoints are swapped
 * independently so that the first texel lies on the endpoints[0] side of the
 * midpoint.
 */
static void
get_rgba_endpoints_unorm(int width, int height,
                         const uint8_t *src, int src_rowstride,
                         int average_luminance, int average_alpha,
                         uint8_t endpoints[][4])
{
   const int n_texels = width * height;
   int endpoint_luminances[2];
   int midpoint;
   int sums[2][4];
   int endpoint;
   int luminance;
   uint8_t temp[3];
   const uint8_t *p = src;
   int rgb_left_endpoint_count = 0;
   int alpha_left_endpoint_count = 0;
   int y, x, i;

   memset(sums, 0, sizeof sums);

   for (y = 0; y < height; y++) {
      for (x = 0; x < width; x++) {
         luminance = p[0] + p[1] + p[2];
         if (luminance < average_luminance) {
            endpoint = 0;
            rgb_left_endpoint_count++;
         } else {
            endpoint = 1;
         }
         for (i = 0; i < 3; i++)
            sums[endpoint][i] += p[i];

         /* The alpha split must be based on the alpha channel (p[3]), not
          * on one of the colour channels */
         if (p[3] < average_alpha) {
            endpoint = 0;
            alpha_left_endpoint_count++;
         } else {
            endpoint = 1;
         }
         sums[endpoint][3] += p[3];

         p += 4;
      }

      p += src_rowstride - width * 4;
   }

   if (rgb_left_endpoint_count == 0 ||
       rgb_left_endpoint_count == n_texels) {
      for (i = 0; i < 3; i++)
         endpoints[0][i] = endpoints[1][i] =
            (sums[0][i] + sums[1][i]) / n_texels;
   } else {
      for (i = 0; i < 3; i++) {
         endpoints[0][i] = sums[0][i] / rgb_left_endpoint_count;
         endpoints[1][i] = (sums[1][i] /
                            (n_texels - rgb_left_endpoint_count));
      }
   }

   if (alpha_left_endpoint_count == 0 ||
       alpha_left_endpoint_count == n_texels) {
      endpoints[0][3] = endpoints[1][3] =
         (sums[0][3] + sums[1][3]) / n_texels;
   } else {
      endpoints[0][3] = sums[0][3] / alpha_left_endpoint_count;
      endpoints[1][3] = (sums[1][3] /
                         (n_texels - alpha_left_endpoint_count));
   }

   /* We may need to swap the endpoints to ensure the most-significant bit of
    * the first index is zero */

   for (endpoint = 0; endpoint < 2; endpoint++) {
      endpoint_luminances[endpoint] =
         endpoints[endpoint][0] +
         endpoints[endpoint][1] +
         endpoints[endpoint][2];
   }
   midpoint = (endpoint_luminances[0] + endpoint_luminances[1]) / 2;

   if ((src[0] + src[1] + src[2] <= midpoint) !=
       (endpoint_luminances[0] <= midpoint)) {
      memcpy(temp, endpoints[0], 3);
      memcpy(endpoints[0], endpoints[1], 3);
      memcpy(endpoints[1], temp, 3);
   }

   /* Same for the alpha endpoints */

   midpoint = (endpoints[0][3] + endpoints[1][3]) / 2;

   if ((src[3] <= midpoint) != (endpoints[0][3] <= midpoint)) {
      temp[0] = endpoints[0][3];
      endpoints[0][3] = endpoints[1][3];
      endpoints[1][3] = temp[0];
   }
}
1311
1312static void
1313write_rgb_indices_unorm(struct bit_writer *writer,
1314                        int src_width, int src_height,
1315                        const uint8_t *src, int src_rowstride,
1316                        uint8_t endpoints[][4])
1317{
1318   int luminance;
1319   int endpoint_luminances[2];
1320   int endpoint;
1321   int index;
1322   int y, x;
1323
1324   for (endpoint = 0; endpoint < 2; endpoint++) {
1325      endpoint_luminances[endpoint] =
1326         endpoints[endpoint][0] +
1327         endpoints[endpoint][1] +
1328         endpoints[endpoint][2];
1329   }
1330
1331   /* If the endpoints have the same luminance then we'll just use index 0 for
1332    * all of the texels */
1333   if (endpoint_luminances[0] == endpoint_luminances[1]) {
1334      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 2 - 1, 0);
1335      return;
1336   }
1337
1338   for (y = 0; y < src_height; y++) {
1339      for (x = 0; x < src_width; x++) {
1340         luminance = src[0] + src[1] + src[2];
1341
1342         index = ((luminance - endpoint_luminances[0]) * 3 /
1343                  (endpoint_luminances[1] - endpoint_luminances[0]));
1344         if (index < 0)
1345            index = 0;
1346         else if (index > 3)
1347            index = 3;
1348
1349         assert(x != 0 || y != 0 || index < 2);
1350
1351         write_bits(writer, (x == 0 && y == 0) ? 1 : 2, index);
1352
1353         src += 4;
1354      }
1355
1356      /* Pad the indices out to the block size */
1357      if (src_width < BLOCK_SIZE)
1358         write_bits(writer, 2 * (BLOCK_SIZE - src_width), 0);
1359
1360      src += src_rowstride - src_width * 4;
1361   }
1362
1363   /* Pad the indices out to the block size */
1364   if (src_height < BLOCK_SIZE)
1365      write_bits(writer, 2 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1366}
1367
1368static void
1369write_alpha_indices_unorm(struct bit_writer *writer,
1370                          int src_width, int src_height,
1371                          const uint8_t *src, int src_rowstride,
1372                          uint8_t endpoints[][4])
1373{
1374   int index;
1375   int y, x;
1376
1377   /* If the endpoints have the same alpha then we'll just use index 0 for
1378    * all of the texels */
1379   if (endpoints[0][3] == endpoints[1][3]) {
1380      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 3 - 1, 0);
1381      return;
1382   }
1383
1384   for (y = 0; y < src_height; y++) {
1385      for (x = 0; x < src_width; x++) {
1386         index = (((int) src[3] - (int) endpoints[0][3]) * 7 /
1387                  ((int) endpoints[1][3] - endpoints[0][3]));
1388         if (index < 0)
1389            index = 0;
1390         else if (index > 7)
1391            index = 7;
1392
1393         assert(x != 0 || y != 0 || index < 4);
1394
1395         /* The first index has one less bit */
1396         write_bits(writer, (x == 0 && y == 0) ? 2 : 3, index);
1397
1398         src += 4;
1399      }
1400
1401      /* Pad the indices out to the block size */
1402      if (src_width < BLOCK_SIZE)
1403         write_bits(writer, 3 * (BLOCK_SIZE - src_width), 0);
1404
1405      src += src_rowstride - src_width * 4;
1406   }
1407
1408   /* Pad the indices out to the block size */
1409   if (src_height < BLOCK_SIZE)
1410      write_bits(writer, 3 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1411}
1412
1413static void
1414compress_rgba_unorm_block(int src_width, int src_height,
1415                          const uint8_t *src, int src_rowstride,
1416                          uint8_t *dst)
1417{
1418   int average_luminance, average_alpha;
1419   uint8_t endpoints[2][4];
1420   struct bit_writer writer;
1421   int component, endpoint;
1422
1423   get_average_luminance_alpha_unorm(src_width, src_height, src, src_rowstride,
1424                                     &average_luminance, &average_alpha);
1425   get_rgba_endpoints_unorm(src_width, src_height, src, src_rowstride,
1426                            average_luminance, average_alpha,
1427                            endpoints);
1428
1429   writer.dst = dst;
1430   writer.pos = 0;
1431   writer.buf = 0;
1432
1433   write_bits(&writer, 5, 0x10); /* mode 4 */
1434   write_bits(&writer, 2, 0); /* rotation 0 */
1435   write_bits(&writer, 1, 0); /* index selection bit */
1436
1437   /* Write the color endpoints */
1438   for (component = 0; component < 3; component++)
1439      for (endpoint = 0; endpoint < 2; endpoint++)
1440         write_bits(&writer, 5, endpoints[endpoint][component] >> 3);
1441
1442   /* Write the alpha endpoints */
1443   for (endpoint = 0; endpoint < 2; endpoint++)
1444      write_bits(&writer, 6, endpoints[endpoint][3] >> 2);
1445
1446   write_rgb_indices_unorm(&writer,
1447                           src_width, src_height,
1448                           src, src_rowstride,
1449                           endpoints);
1450   write_alpha_indices_unorm(&writer,
1451                             src_width, src_height,
1452                             src, src_rowstride,
1453                             endpoints);
1454}
1455
1456static void
1457compress_rgba_unorm(int width, int height,
1458                    const uint8_t *src, int src_rowstride,
1459                    uint8_t *dst, int dst_rowstride)
1460{
1461   int dst_row_diff;
1462   int y, x;
1463
1464   if (dst_rowstride >= width * 4)
1465      dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
1466   else
1467      dst_row_diff = 0;
1468
1469   for (y = 0; y < height; y += BLOCK_SIZE) {
1470      for (x = 0; x < width; x += BLOCK_SIZE) {
1471         compress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
1472                                   MIN2(height - y, BLOCK_SIZE),
1473                                   src + x * 4 + y * src_rowstride,
1474                                   src_rowstride,
1475                                   dst);
1476         dst += BLOCK_BYTES;
1477      }
1478      dst += dst_row_diff;
1479   }
1480}
1481
/*
 * Returns the average luminance of a block of RGB float texels (three
 * floats per texel), where luminance is the plain sum R + G + B.
 *
 * src_rowstride is the distance between rows in bytes.
 */
static float
get_average_luminance_float(int width, int height,
                            const float *src, int src_rowstride)
{
   const int n_texels = width * height;
   float total = 0;
   int row, col;

   for (row = 0; row < height; row++) {
      for (col = 0; col < width; col++, src += 3)
         total += src[0] + src[1] + src[2];

      /* Skip whatever lies between the end of this row and the next one */
      src += (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
   }

   return total / n_texels;
}
1499
/*
 * Clamps value to [-65504, 65504] when is_signed is set and to [0, 65504]
 * otherwise.
 */
static float
clamp_value(float value, bool is_signed)
{
   const float upper = 65504.0f;
   const float lower = is_signed ? -65504.0f : 0.0f;

   if (value > upper)
      return upper;

   if (value < lower)
      return lower;

   return value;
}
1518
/*
 * Picks two RGB endpoints for a block of RGB float texels (three floats per
 * texel).
 *
 * The endpoints are the per-component averages of the texels whose luminance
 * (R + G + B) is below average_luminance and of the remaining texels. If one
 * side of the split is empty both endpoints are set to the average over the
 * whole block. Each component is then passed through clamp_value(), and the
 * endpoints are swapped if needed so that the first texel lies on the
 * endpoints[0] side of the midpoint.
 *
 * src_rowstride is the distance between rows in bytes.
 */
static void
get_endpoints_float(int width, int height,
                    const float *src, int src_rowstride,
                    float average_luminance, float endpoints[][3],
                    bool is_signed)
{
   const int n_texels = width * height;
   const float *p = src;
   float endpoint_luminances[2];
   float midpoint;
   float sums[2][3];
   float luminance;
   float temp[3];
   int left_endpoint_count = 0;
   int endpoint, component;
   int y, x;

   memset(sums, 0, sizeof sums);

   for (y = 0; y < height; y++) {
      for (x = 0; x < width; x++, p += 3) {
         luminance = p[0] + p[1] + p[2];

         endpoint = 1;
         if (luminance < average_luminance) {
            endpoint = 0;
            left_endpoint_count++;
         }

         for (component = 0; component < 3; component++)
            sums[endpoint][component] += p[component];
      }

      p += (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
   }

   for (component = 0; component < 3; component++) {
      if (left_endpoint_count == 0 || left_endpoint_count == n_texels) {
         endpoints[0][component] = endpoints[1][component] =
            (sums[0][component] + sums[1][component]) / n_texels;
      } else {
         endpoints[0][component] = sums[0][component] / left_endpoint_count;
         endpoints[1][component] =
            sums[1][component] / (n_texels - left_endpoint_count);
      }
   }

   /* Clamp the endpoints to the range of a half float and strip out
    * infinities */
   for (endpoint = 0; endpoint < 2; endpoint++) {
      for (component = 0; component < 3; component++) {
         endpoints[endpoint][component] =
            clamp_value(endpoints[endpoint][component], is_signed);
      }
   }

   /* We may need to swap the endpoints to ensure the most-significant bit of
    * the first index is zero */

   endpoint_luminances[0] =
      endpoints[0][0] + endpoints[0][1] + endpoints[0][2];
   endpoint_luminances[1] =
      endpoints[1][0] + endpoints[1][1] + endpoints[1][2];
   midpoint = (endpoint_luminances[0] + endpoint_luminances[1]) / 2.0f;

   if ((src[0] + src[1] + src[2] <= midpoint) !=
       (endpoint_luminances[0] <= midpoint)) {
      memcpy(temp, endpoints[0], sizeof temp);
      memcpy(endpoints[0], endpoints[1], sizeof temp);
      memcpy(endpoints[1], temp, sizeof temp);
   }
}
1594
1595static void
1596write_rgb_indices_float(struct bit_writer *writer,
1597                        int src_width, int src_height,
1598                        const float *src, int src_rowstride,
1599                        float endpoints[][3])
1600{
1601   float luminance;
1602   float endpoint_luminances[2];
1603   int endpoint;
1604   int index;
1605   int y, x;
1606
1607   for (endpoint = 0; endpoint < 2; endpoint++) {
1608      endpoint_luminances[endpoint] =
1609         endpoints[endpoint][0] +
1610         endpoints[endpoint][1] +
1611         endpoints[endpoint][2];
1612   }
1613
1614   /* If the endpoints have the same luminance then we'll just use index 0 for
1615    * all of the texels */
1616   if (endpoint_luminances[0] == endpoint_luminances[1]) {
1617      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 4 - 1, 0);
1618      return;
1619   }
1620
1621   for (y = 0; y < src_height; y++) {
1622      for (x = 0; x < src_width; x++) {
1623         luminance = src[0] + src[1] + src[2];
1624
1625         index = ((luminance - endpoint_luminances[0]) * 15 /
1626                  (endpoint_luminances[1] - endpoint_luminances[0]));
1627         if (index < 0)
1628            index = 0;
1629         else if (index > 15)
1630            index = 15;
1631
1632         assert(x != 0 || y != 0 || index < 8);
1633
1634         write_bits(writer, (x == 0 && y == 0) ? 3 : 4, index);
1635
1636         src += 3;
1637      }
1638
1639      /* Pad the indices out to the block size */
1640      if (src_width < BLOCK_SIZE)
1641         write_bits(writer, 4 * (BLOCK_SIZE - src_width), 0);
1642
1643      src += (src_rowstride - src_width * 3 * sizeof (float)) / sizeof (float);
1644   }
1645
1646   /* Pad the indices out to the block size */
1647   if (src_height < BLOCK_SIZE)
1648      write_bits(writer, 4 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1649}
1650
/*
 * Converts a float endpoint component into the 10-bit value stored in the
 * block.
 *
 * The value is converted to a half float, rescaled and reduced by six bits.
 * In the signed case the sign bit of the half is stripped first and a
 * negative result is stored as the negated magnitude masked to 10 bits. In
 * the unsigned case values less than or equal to zero encode as 0.
 */
static int
get_endpoint_value(float value, bool is_signed)
{
   bool negative;
   int half;

   if (!is_signed) {
      if (value <= 0.0f)
         return 0;

      half = _mesa_float_to_half(value);

      return (64 * half / 31) >> 6;
   }

   half = _mesa_float_to_half(value);

   negative = (half & 0x8000) != 0;
   if (negative)
      half &= 0x7fff;

   half = (32 * half / 31) >> 6;

   return negative ? (-half & ((1 << 10) - 1)) : half;
}
1680
1681static void
1682compress_rgb_float_block(int src_width, int src_height,
1683                         const float *src, int src_rowstride,
1684                         uint8_t *dst,
1685                         bool is_signed)
1686{
1687   float average_luminance;
1688   float endpoints[2][3];
1689   struct bit_writer writer;
1690   int component, endpoint;
1691   int endpoint_value;
1692
1693   average_luminance =
1694      get_average_luminance_float(src_width, src_height, src, src_rowstride);
1695   get_endpoints_float(src_width, src_height, src, src_rowstride,
1696                       average_luminance, endpoints, is_signed);
1697
1698   writer.dst = dst;
1699   writer.pos = 0;
1700   writer.buf = 0;
1701
1702   write_bits(&writer, 5, 3); /* mode 3 */
1703
1704   /* Write the endpoints */
1705   for (endpoint = 0; endpoint < 2; endpoint++) {
1706      for (component = 0; component < 3; component++) {
1707         endpoint_value =
1708            get_endpoint_value(endpoints[endpoint][component], is_signed);
1709         write_bits(&writer, 10, endpoint_value);
1710      }
1711   }
1712
1713   write_rgb_indices_float(&writer,
1714                           src_width, src_height,
1715                           src, src_rowstride,
1716                           endpoints);
1717}
1718
1719static void
1720compress_rgb_float(int width, int height,
1721                   const float *src, int src_rowstride,
1722                   uint8_t *dst, int dst_rowstride,
1723                   bool is_signed)
1724{
1725   int dst_row_diff;
1726   int y, x;
1727
1728   if (dst_rowstride >= width * 4)
1729      dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
1730   else
1731      dst_row_diff = 0;
1732
1733   for (y = 0; y < height; y += BLOCK_SIZE) {
1734      for (x = 0; x < width; x += BLOCK_SIZE) {
1735         compress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
1736                                  MIN2(height - y, BLOCK_SIZE),
1737                                  src + x * 3 +
1738                                  y * src_rowstride / sizeof (float),
1739                                  src_rowstride,
1740                                  dst,
1741                                  is_signed);
1742         dst += BLOCK_BYTES;
1743      }
1744      dst += dst_row_diff;
1745   }
1746}
1747
1748#endif
1749