1b8e80941Smrg/*
2b8e80941Smrg * Copyright © 2017 Broadcom
3b8e80941Smrg *
4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
5b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
6b8e80941Smrg * to deal in the Software without restriction, including without limitation
7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
9b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
10b8e80941Smrg *
11b8e80941Smrg * The above copyright notice and this permission notice (including the next
12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
13b8e80941Smrg * Software.
14b8e80941Smrg *
15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21b8e80941Smrg * IN THE SOFTWARE.
22b8e80941Smrg */
23b8e80941Smrg
24b8e80941Smrg/** @file v3d_cpu_tiling.h
25b8e80941Smrg *
26b8e80941Smrg * Contains load/store functions common to both v3d and vc4.  The utile layout
27b8e80941Smrg * stayed the same, though the way utiles get laid out has changed.
28b8e80941Smrg */
29b8e80941Smrg
/**
 * Loads a single 64-byte utile from GPU (tiled) memory into CPU (raster)
 * memory.
 *
 * \param cpu         destination; one utile line is written every
 *                    \p cpu_stride bytes.
 * \param cpu_stride  byte distance between successive raster lines at the
 *                    destination.
 * \param gpu         source utile: 64 contiguous bytes.
 * \param gpu_stride  length in bytes of one utile line (8 or 16 on the
 *                    NEON fast paths; any divisor of 64 on the fallback).
 */
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu,
                         * neither of which is described by the operands.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu,
                         * neither of which is described by the operands.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu,
                         * neither of which is described by the operands.
                         */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 can only store one D lane per
                         * address register at a time).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu,
                         * neither of which is described by the operands.
                         */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy the utile one line at a time. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
139b8e80941Smrg
/**
 * Stores a single 64-byte utile from CPU (raster) memory into GPU (tiled)
 * memory.
 *
 * \param gpu         destination utile: 64 contiguous bytes.
 * \param gpu_stride  length in bytes of one utile line (8 or 16 on the
 *                    NEON fast paths; any divisor of 64 on the fallback).
 * \param cpu         source; one utile line is read every \p cpu_stride
 *                    bytes.
 * \param cpu_stride  byte distance between successive raster lines at the
 *                    source.
 */
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu,
                         * neither of which is described by the operands.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register at a
                         * time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu,
                         * neither of which is described by the operands.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu,
                         * neither of which is described by the operands.
                         */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 can only load one D lane per address
                         * register at a time).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu,
                         * neither of which is described by the operands.
                         */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy the utile one line at a time. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
243