/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef V3D_CPU_TILING_H
#define V3D_CPU_TILING_H

#include <stdint.h>
#include <string.h>

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The layout
 * within a 64-byte utile stayed the same between the two, though the way
 * utiles are laid out within an image has changed.
 */
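/* Usage sketch (not part of this file): detiling a 32bpp image with
 * v3d_load_utile().  This assumes the 4x4-pixel, 16-byte-per-row utile
 * shape used for cpp == 4, and a hypothetical utile_offset() helper for
 * the chip-specific position of a utile within the tiled buffer; real
 * callers live in the per-chip tiling code.
 *
 *     static void
 *     detile_32bpp(void *linear, uint32_t linear_stride,
 *                  void *tiled, uint32_t width, uint32_t height)
 *     {
 *             for (uint32_t y = 0; y < height; y += 4) {
 *                     for (uint32_t x = 0; x < width; x += 4) {
 *                             v3d_load_utile(linear + y * linear_stride + x * 4,
 *                                            linear_stride,
 *                                            tiled + utile_offset(x, y),
 *                                            16);
 *                     }
 *             }
 *     }
 */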
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to the cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined(PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to the cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (Each st1 here stores a single
                         * 8-byte lane).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Scalar fallback: a utile is always 64 bytes, so copy one
         * gpu_stride-wide line at a time.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from the cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register
                         * at a time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3");
                return;
        }
#elif defined(PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from the cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (Each ld1 here loads a single 8-byte
                         * lane).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu] "+r"(cpu),
                          [cpu2] "+r"(cpu2)
                        : [gpu] "r"(gpu),
                          [cpu_stride] "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3");
                return;
        }
#endif

        /* Scalar fallback: a utile is always 64 bytes, so copy one
         * gpu_stride-wide line at a time.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}
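/* The store direction mirrors the detiling sketch near the top of this
 * file: to tile a linear upload, one would walk the image in utile-sized
 * steps and call v3d_store_utile(tiled + utile_offset(x, y), 16,
 * linear + y * linear_stride + x * 4, linear_stride) for each utile
 * (again assuming the hypothetical utile_offset() helper).
 */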
#endif /* V3D_CPU_TILING_H */