/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The utile layout
 * stayed the same, though the way utiles get laid out has changed.
 */

/**
 * Copies one 64-byte utile from tiled GPU memory into linear CPU memory.
 *
 * \param cpu         Destination raster buffer; one utile row is written
 *                    every \p cpu_stride bytes.
 * \param cpu_stride  Byte stride between successive rows in \p cpu.
 * \param gpu         Source utile: 64 contiguous bytes.
 * \param gpu_stride  Width in bytes of one utile row.  8 and 16 take the
 *                    NEON fast paths; the generic loop handles any stride
 *                    that evenly divides 64.
 */
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 can only store one d-register
                         * at a time).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Generic fallback: copy one utile row per iteration. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
139ed98bd31Smaya
/**
 * Copies one 64-byte utile from linear CPU memory into tiled GPU memory.
 *
 * \param gpu         Destination utile: 64 contiguous bytes.
 * \param gpu_stride  Width in bytes of one utile row.  8 and 16 take the
 *                    NEON fast paths; the generic loop handles any stride
 *                    that evenly divides 64.
 * \param cpu         Source raster buffer; one utile row is read every
 *                    \p cpu_stride bytes.
 * \param cpu_stride  Byte stride between successive rows in \p cpu.
 */
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 can only load one d-register
                         * at a time).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Generic fallback: copy one utile row per iteration. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}