/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The utile layout
 * stayed the same, though the way utiles get laid out has changed.
 */

/**
 * Loads one 64-byte utile from tiled GPU memory into raster-order CPU
 * memory.
 *
 * \param cpu         destination; one utile row is written per raster line
 * \param cpu_stride  byte distance between raster lines in the destination
 * \param gpu         source pointer to a contiguous 64-byte utile
 * \param gpu_stride  byte length of one utile row (the asm fast paths cover
 *                    the 8- and 16-byte cases; other values use the fallback)
 */
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu
                         * through pointer-value operands, so the compiler
                         * must not cache or reorder accesses to those
                         * buffers around this statement.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second write cursor, 8 bytes into each line. */
                void *cpu2 = (char *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": asm reads *gpu and writes *cpu/*cpu2. */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": asm reads *gpu and writes *cpu. */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second write cursor, 8 bytes into each line. */
                void *cpu2 = (char *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination (one 8-byte lane per st1).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": asm reads *gpu and writes *cpu/*cpu2. */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy one utile row per raster line.  Pointer
         * arithmetic is done on char * because arithmetic on void * is a
         * GNU extension.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, (char *)gpu + gpu_offset, gpu_stride);
                cpu = (char *)cpu + cpu_stride;
        }
}
/**
 * Stores one 64-byte utile to tiled GPU memory from raster-order CPU
 * memory.  Inverse of v3d_load_utile().
 *
 * \param gpu         destination pointer to a contiguous 64-byte utile
 * \param gpu_stride  byte length of one utile row (the asm fast paths cover
 *                    the 8- and 16-byte cases; other values use the fallback)
 * \param cpu         source; one raster line is read per utile row
 * \param cpu_stride  byte distance between raster lines in the source
 */
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu
                         * through pointer-value operands, so the compiler
                         * must not cache or reorder accesses to those
                         * buffers around this statement.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second read cursor, 8 bytes into each line. */
                void *cpu2 = (char *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the
                         * cpu-side source.  (vld1 can only load one
                         * d-register at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": asm reads *cpu/*cpu2 and writes *gpu. */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": asm reads *cpu and writes *gpu. */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                /* Second read cursor, 8 bytes into each line. */
                void *cpu2 = (char *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the
                         * cpu-side source (one 8-byte lane per ld1).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": asm reads *cpu/*cpu2 and writes *gpu. */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy one raster line per utile row.  Pointer
         * arithmetic is done on char * because arithmetic on void * is a
         * GNU extension.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy((char *)gpu + gpu_offset, cpu, gpu_stride);
                cpu = (char *)cpu + cpu_stride;
        }
}