/*
 * Copyright © 2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file v3d_cpu_tiling.h
 *
 * Contains load/store functions common to both v3d and vc4.  The utile layout
 * stayed the same, though the way utiles get laid out has changed.
 */

/**
 * Loads one 64-byte utile from tiled (GPU) memory into raster (CPU) memory.
 *
 * @param cpu         Destination; one line of gpu_stride bytes is written
 *                    every cpu_stride bytes.
 * @param cpu_stride  Byte stride between successive lines on the CPU side.
 * @param gpu         Source; 64 contiguous bytes of tiled data.
 * @param gpu_stride  Width in bytes of one utile line (8 or 16 take the
 *                    NEON fast paths; any divisor of 64 works in the
 *                    portable fallback).
 */
static inline void
v3d_load_utile(void *cpu, uint32_t cpu_stride,
               void *gpu, uint32_t gpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d7, [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *gpu and writes *cpu,
                         * which are not visible to the compiler as operands.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * d0-d7.
                         */
                        "vldm %[gpu], {q0, q1, q2, q3}\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (vst1 can only store one d-register
                         * at a time).
                         */
                        "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vst1.8 d6, [%[cpu]]\n"
                        "vst1.8 d7, [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 8-byte line to cpu-side destination,
                         * incrementing it by the stride each time.
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v3.D}[1], [%[cpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load from the GPU in one shot, no interleave, to
                         * v0-v3.
                         */
                        "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        /* Store each 16-byte line in 2 parts to the cpu-side
                         * destination.  (A lane st1 can only store 8 bytes
                         * at a time).
                         */
                        "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "st1 {v3.D}[0], [%[cpu]]\n"
                        "st1 {v3.D}[1], [%[cpu2]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy the utile one line at a time. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}

/**
 * Stores one 64-byte utile from raster (CPU) memory into tiled (GPU) memory.
 *
 * @param gpu         Destination; 64 contiguous bytes of tiled data.
 * @param gpu_stride  Width in bytes of one utile line (8 or 16 take the
 *                    NEON fast paths; any divisor of 64 works in the
 *                    portable fallback).
 * @param cpu         Source; one line of gpu_stride bytes is read every
 *                    cpu_stride bytes.
 * @param cpu_stride  Byte stride between successive lines on the CPU side.
 */
static inline void
v3d_store_utile(void *gpu, uint32_t gpu_stride,
                void *cpu, uint32_t cpu_stride)
{
#if defined(V3D_BUILD_NEON) && defined(PIPE_ARCH_ARM)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d7, [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        /* "memory": the asm reads *cpu and writes *gpu,
                         * which are not visible to the compiler as operands.
                         */
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (vld1 can only load one d-register at a
                         * time).
                         */
                        "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
                        "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                        "vld1.8 d6, [%[cpu]]\n"
                        "vld1.8 d7, [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "vstm %[gpu], {q0, q1, q2, q3}\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "q0", "q1", "q2", "q3", "memory");
                return;
        }
#elif defined (PIPE_ARCH_AARCH64)
        if (gpu_stride == 8) {
                __asm__ volatile (
                        /* Load each 8-byte line from cpu-side source,
                         * incrementing it by the stride each time.
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v3.D}[1], [%[cpu]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        } else if (gpu_stride == 16) {
                void *cpu2 = cpu + 8;
                __asm__ volatile (
                        /* Load each 16-byte line in 2 parts from the cpu-side
                         * source.  (A lane ld1 can only load 8 bytes at a
                         * time).
                         */
                        "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
                        "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                        "ld1 {v3.D}[0], [%[cpu]]\n"
                        "ld1 {v3.D}[1], [%[cpu2]]\n"
                        /* Store to the GPU in one shot, no interleave. */
                        "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
                        : [cpu]         "+r"(cpu),
                          [cpu2]        "+r"(cpu2)
                        : [gpu]         "r"(gpu),
                          [cpu_stride]  "r"(cpu_stride)
                        : "v0", "v1", "v2", "v3", "memory");
                return;
        }
#endif

        /* Portable fallback: copy the utile one line at a time. */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(gpu + gpu_offset, cpu, gpu_stride);
                cpu += cpu_stride;
        }
}