/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The utile layout
 * stayed the same, though the way utiles get laid out has changed.
 */

/**
 * Copies one 64-byte utile from GPU (tiled) memory into CPU (linear) memory.
 *
 * The utile is gpu_stride bytes wide and 64 / gpu_stride rows tall; each row
 * is written to the CPU side at successive cpu_stride offsets.
 *
 * @param cpu        Destination (linear) pointer.
 * @param cpu_stride Byte stride between rows on the CPU side.
 * @param gpu        Source utile (64 contiguous bytes).
 * @param gpu_stride Utile row size in bytes (8 or 16, depending on cpp).
 */
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        /* "memory" keeps the compiler from caching or
                         * reordering accesses to the buffers around the asm,
                         * since no operand describes the memory we touch.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = (char *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        /* "memory": see the note in the ARM path above. */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = (char *)cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (st1 stores one 8-byte lane per
                         * instruction here, so each half advances its own
                         * pointer).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy the utile one row at a time. */
        char *cpu_row = cpu;
        const char *gpu_bytes = gpu;
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu_row, gpu_bytes + gpu_offset, gpu_stride);
                cpu_row += cpu_stride;
        }
}

/**
 * Copies one 64-byte utile from CPU (linear) memory into GPU (tiled) memory.
 *
 * Inverse of v3d_load_utile(): each gpu_stride-sized row is read from the
 * CPU side at successive cpu_stride offsets and packed into the utile.
 *
 * @param gpu        Destination utile (64 contiguous bytes).
 * @param gpu_stride Utile row size in bytes (8 or 16, depending on cpp).
 * @param cpu        Source (linear) pointer.
 * @param cpu_stride Byte stride between rows on the CPU side.
 */
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        /* "memory" keeps the compiler from caching or
                         * reordering accesses to the buffers around the asm,
                         * since no operand describes the memory we touch.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = (char *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register at a
                         * time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        /* "memory": see the note in the ARM path above. */
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = (char *)cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (ld1 loads one 8-byte lane per instruction
                         * here, so each half advances its own pointer).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy the utile one row at a time. */
        const char *cpu_row = cpu;
        char *gpu_bytes = gpu;
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu_bytes + gpu_offset, cpu_row, gpu_stride);
                cpu_row += cpu_stride;
        }
}