1b8e80941Smrg/*
2b8e80941Smrg * Copyright (c) 2012-2013 Luc Verhaegen <libv@skynet.be>
3b8e80941Smrg * Copyright (c) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io>
4b8e80941Smrg *
5b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
6b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
7b8e80941Smrg * to deal in the Software without restriction, including without limitation
8b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sub license,
9b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
10b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
11b8e80941Smrg *
12b8e80941Smrg * The above copyright notice and this permission notice (including the
13b8e80941Smrg * next paragraph) shall be included in all copies or substantial portions
14b8e80941Smrg * of the Software.
15b8e80941Smrg *
16b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22b8e80941Smrg * DEALINGS IN THE SOFTWARE.
23b8e80941Smrg */
24b8e80941Smrg
25b8e80941Smrg#include <stdio.h>
26b8e80941Smrg#include "pan_swizzle.h"
27b8e80941Smrg#include "pan_allocate.h"
28b8e80941Smrg
/* Spread the low four bits of i apart so that bit n of the input lands on
 * bit 2n of the result, leaving a zero bit between neighbours. For example
 * 0x7 (0b111) becomes 0x15 (0b10101). Bits above the low four are ignored. */

static inline int
space_bits_4(int i)
{
        int spread = 0;

        /* Moving bit n up by n places puts it at position 2n */
        for (int bit = 0; bit < 4; ++bit)
                spread |= (i & (1 << bit)) << bit;

        return spread;
}
40b8e80941Smrg
/* Lookup tables for the space filler curve within one 16x16 tile.
 *
 * space_filler[y][x] is the index (0..255) inside the tile at which the pixel
 * at in-tile coordinate (x, y) is stored. The index interleaves the bits of
 * (y ^ x) on the even bit positions with the bits of y on the odd bit
 * positions, so this is a 1:1 mapping, just with bits twiddled around.
 *
 * space_filler_packed4[y][q] holds the four indices for x = 4q .. 4q + 3
 * packed one per byte, with the lowest x in the least significant byte. */

uint32_t space_filler[16][16];
uint32_t space_filler_packed4[16][4];

/* Fill in both lookup tables above. The swizzle routines in this file read
 * the tables, so they must have been generated before any of them runs. */

void
panfrost_generate_space_filler_indices(void)
{
        for (int y = 0; y < 16; ++y) {
                for (int x = 0; x < 16; ++x) {
                        space_filler[y][x] =
                                space_bits_4(y ^ x) | (space_bits_4(y) << 1);
                }

                /* Every index fits in 8 bits, so four of them pack into one
                 * 32-bit word without overlapping */
                for (int q = 0; q < 4; ++q) {
                        const uint32_t *group = &space_filler[y][q * 4];

                        space_filler_packed4[y][q] =
                                (group[0] << 0) |
                                (group[1] << 8) |
                                (group[2] << 16) |
                                (group[3] << 24);
                }
        }
}
65b8e80941Smrg
66b8e80941Smrgstatic void
67b8e80941Smrgswizzle_bpp1_align16(int width, int height, int source_stride, int block_pitch,
68b8e80941Smrg                     const uint8_t *pixels,
69b8e80941Smrg                     uint8_t *ldest)
70b8e80941Smrg{
71b8e80941Smrg        for (int y = 0; y < height; ++y) {
72b8e80941Smrg                {
73b8e80941Smrg                        int block_y = y & ~(0x0f);
74b8e80941Smrg                        int rem_y = y & 0x0f;
75b8e80941Smrg                        uint8_t *block_start_s = ldest + (block_y * block_pitch);
76b8e80941Smrg                        const uint8_t *source_start = pixels + (y * source_stride);
77b8e80941Smrg                        const uint8_t *source_end = source_start + width;
78b8e80941Smrg
79b8e80941Smrg                        /* Operate on blocks of 16 pixels to minimise bookkeeping */
80b8e80941Smrg
81b8e80941Smrg                        for (; source_start < source_end; block_start_s += 16 * 16, source_start += 16) {
82b8e80941Smrg                                const uint32_t *src_32 = (const uint32_t *) source_start;
83b8e80941Smrg
84b8e80941Smrg                                for (int q = 0; q < 4; ++q) {
85b8e80941Smrg                                        uint32_t src = src_32[q];
86b8e80941Smrg                                        uint32_t spaced = space_filler_packed4[rem_y][q];
87b8e80941Smrg                                        uint16_t *bs = (uint16_t *) block_start_s;
88b8e80941Smrg
89b8e80941Smrg                                        int spacedA = (spaced >> 0) & 0xFF;
90b8e80941Smrg                                        int spacedB = (spaced >> 16) & 0xFF;
91b8e80941Smrg
92b8e80941Smrg                                        bs[spacedA >> 1] = (src >> 0) & 0xFFFF;
93b8e80941Smrg                                        bs[spacedB >> 1] = (src >> 16) & 0xFFFF;
94b8e80941Smrg                                }
95b8e80941Smrg                        }
96b8e80941Smrg                }
97b8e80941Smrg
98b8e80941Smrg                ++y;
99b8e80941Smrg
100b8e80941Smrg                if (y >= height)
101b8e80941Smrg                        break;
102b8e80941Smrg
103b8e80941Smrg                {
104b8e80941Smrg                        int block_y = y & ~(0x0f);
105b8e80941Smrg                        int rem_y = y & 0x0f;
106b8e80941Smrg                        uint8_t *block_start_s = ldest + (block_y * block_pitch);
107b8e80941Smrg                        const uint8_t *source_start = pixels + (y * source_stride);
108b8e80941Smrg                        const uint8_t *source_end = source_start + width;
109b8e80941Smrg
110b8e80941Smrg                        /* Operate on blocks of 16 pixels to minimise bookkeeping */
111b8e80941Smrg
112b8e80941Smrg                        for (; source_start < source_end; block_start_s += 16 * 16, source_start += 16) {
113b8e80941Smrg                                const uint32_t *src_32 = (const uint32_t *) source_start;
114b8e80941Smrg
115b8e80941Smrg                                for (int q = 0; q < 4; ++q) {
116b8e80941Smrg                                        uint32_t src = src_32[q];
117b8e80941Smrg                                        uint32_t spaced = space_filler_packed4[rem_y][q];
118b8e80941Smrg
119b8e80941Smrg                                        block_start_s[(spaced >> 0) & 0xFF] = (src >> 0) & 0xFF;
120b8e80941Smrg                                        block_start_s[(spaced >> 8) & 0xFF] = (src >> 8) & 0xFF;
121b8e80941Smrg
122b8e80941Smrg                                        block_start_s[(spaced >> 16) & 0xFF] = (src >> 16) & 0xFF;
123b8e80941Smrg                                        block_start_s[(spaced >> 24) & 0xFF] = (src >> 24) & 0xFF;
124b8e80941Smrg                                }
125b8e80941Smrg                        }
126b8e80941Smrg                }
127b8e80941Smrg
128b8e80941Smrg        }
129b8e80941Smrg}
130b8e80941Smrg
131b8e80941Smrgstatic void
132b8e80941Smrgswizzle_bpp4_align16(int width, int height, int source_stride, int block_pitch,
133b8e80941Smrg                     const uint32_t *pixels,
134b8e80941Smrg                     uint32_t *ldest)
135b8e80941Smrg{
136b8e80941Smrg        for (int y = 0; y < height; ++y) {
137b8e80941Smrg                int block_y = y & ~(0x0f);
138b8e80941Smrg                int rem_y = y & 0x0f;
139b8e80941Smrg                uint32_t *block_start_s = ldest + (block_y * block_pitch);
140b8e80941Smrg                const uint32_t *source_start = pixels + (y * source_stride);
141b8e80941Smrg                const uint32_t *source_end = source_start + width;
142b8e80941Smrg
143b8e80941Smrg                /* Operate on blocks of 16 pixels to minimise bookkeeping */
144b8e80941Smrg
145b8e80941Smrg                for (; source_start < source_end; block_start_s += 16 * 16, source_start += 16) {
146b8e80941Smrg                        for (int j = 0; j < 16; ++j)
147b8e80941Smrg                                block_start_s[space_filler[rem_y][j]] = source_start[j];
148b8e80941Smrg                }
149b8e80941Smrg        }
150b8e80941Smrg}
151b8e80941Smrg
152b8e80941Smrgvoid
153b8e80941Smrgpanfrost_texture_swizzle(unsigned off_x,
154b8e80941Smrg                         unsigned off_y,
155b8e80941Smrg                         int width, int height, int bytes_per_pixel, int dest_width,
156b8e80941Smrg                         const uint8_t *pixels,
157b8e80941Smrg                         uint8_t *ldest)
158b8e80941Smrg{
159b8e80941Smrg        /* Calculate maximum size, overestimating a bit */
160b8e80941Smrg        int block_pitch = ALIGN(dest_width, 16) >> 4;
161b8e80941Smrg
162b8e80941Smrg        /* Strides must be tight, since we're only ever called indirectly */
163b8e80941Smrg        int source_stride = width * bytes_per_pixel;
164b8e80941Smrg
165b8e80941Smrg        /* Use fast path if available */
166b8e80941Smrg        if (!(off_x || off_y) && (width == dest_width)) {
167b8e80941Smrg                if (bytes_per_pixel == 4 && (ALIGN(width, 16) == width)) {
168b8e80941Smrg                        swizzle_bpp4_align16(width, height, source_stride >> 2, (block_pitch * 256 >> 4), (const uint32_t *) pixels, (uint32_t *) ldest);
169b8e80941Smrg                        return;
170b8e80941Smrg                } else if (bytes_per_pixel == 1 && (ALIGN(width, 16) == width)) {
171b8e80941Smrg                        swizzle_bpp1_align16(width, height, source_stride, (block_pitch * 256 >> 4), pixels, (uint8_t *) ldest);
172b8e80941Smrg                        return;
173b8e80941Smrg                }
174b8e80941Smrg        }
175b8e80941Smrg
176b8e80941Smrg        /* Otherwise, default back on generic path */
177b8e80941Smrg
178b8e80941Smrg        for (int y = 0; y < height; ++y) {
179b8e80941Smrg                int block_y = (y + off_y) >> 4;
180b8e80941Smrg                int rem_y = (y + off_y) & 0x0F;
181b8e80941Smrg                int block_start_s = block_y * block_pitch * 256;
182b8e80941Smrg                int source_start = y * source_stride;
183b8e80941Smrg
184b8e80941Smrg                for (int x = 0; x < width; ++x) {
185b8e80941Smrg                        int block_x_s = ((x + off_x) >> 4) * 256;
186b8e80941Smrg                        int rem_x = (x + off_x) & 0x0F;
187b8e80941Smrg
188b8e80941Smrg                        int index = space_filler[rem_y][rem_x];
189b8e80941Smrg                        const uint8_t *source = &pixels[source_start + bytes_per_pixel * x];
190b8e80941Smrg                        uint8_t *dest = ldest + bytes_per_pixel * (block_start_s + block_x_s + index);
191b8e80941Smrg
192b8e80941Smrg                        for (int b = 0; b < bytes_per_pixel; ++b)
193b8e80941Smrg                                dest[b] = source[b];
194b8e80941Smrg                }
195b8e80941Smrg        }
196b8e80941Smrg}
197