dnl  ARM v6 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C Code structure:
C
C
C         m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
C            |               |               |               |
C            |               |               |               |
C            |               |               |               |
C           \|/             \|/             \|/             \|/
C              ____________                    ____________
C             /            \                  /            \
C           \|/             \               \|/             \
C        am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
C            \              /|\              \              /|\
C             \____________/                  \____________/
C                     \                             /
C                      \                           /
C                       \                         /
C                       cor3                   cor2
C                           \                 /
C                            \               /
C                             sqr_diag_addlsh1

C TODO
C  * Align more labels.
C  * Further tweak counter and updates in outer loops.  (This could save
C    perhaps 5n cycles).
C  * Avoid sub-with-lsl in outer loops.  We could keep n up-shifted, then
C    initialise loop counter i with a right shift.
C  * Try to use fewer registers.  Perhaps coalesce r10 branch target and n_saved.
C    (This could save 2-3 cycles for n > 4.)
C  * Optimise sqr_diag_addlsh1 loop.  The current code uses old-style carry
C    propagation.
C  * Stop loops earlier suppressing writes of upper-most rp[] values.
C  * The addmul_2 loops here run well on all cores, but mul_2 runs poorly
C    particularly on Cortex-A8.
C mpn_sqr_basecase -- square the n-limb operand at up[] into rp[].
C
C The straight-line cases at the end of this function write 2n limbs for
C n = 1..4.  Larger n goes through one mul_2 pass, a chain of addmul_2 passes,
C corner code (cor2 or cor3) and finally sqr_diag_addlsh1, as sketched in the
C code structure comment near the top of the file.
C
C Register roles (m4 aliases defined below):
C   rp      r0   destination pointer
C   up      r1   source pointer
C   n       r2   limb count, reduced as the passes proceed
C   v0, v1  r3, r6   the two multiplier limbs of the current pass
C   u0, u1  r7, r9   multiplicand limbs, used alternately
C   cya     r11  carry limb of the v0 product chain
C   cyb     r12  carry limb of the v1 product chain
C   i       r8   inner loop counter
C   n_saved r14  copy of the incoming n, read again by sqr_diag_addlsh1
C Not aliased: r4/r5 are the alternating accumulators and r10 holds the
C address of the addmul_2 label that follows the mul_2 pass.
C
C NOTE(review): r0-r2 are presumably the incoming arguments per the ARM
C procedure call standard; the C prototype is not visible in this file.

define(`rp', r0)
define(`up', r1)
define(`n', r2)

define(`v0', r3)
define(`v1', r6)
define(`i', r8)
define(`n_saved', r14)
define(`cya', r11)
define(`cyb', r12)
define(`u0', r7)
define(`u1', r9)

ASM_START()
PROLOGUE(mpn_sqr_basecase)
C Computed branch into the table below.  The index is n mod 4, plus 4 when
C n > 4.  In ARM state pc reads as the address of the add plus 8, which is the
C first table entry after the nop, so the nop and the order of the eight
C branches must not be changed.
C NOTE(review): n = 0 would be dispatched to L(4); callers presumably
C guarantee n >= 1 -- confirm.
	and	r12, n, #3
	cmp	n, #4
	addgt	r12, r12, #4
	add	pc, pc, r12, lsl #2
	nop
	b	L(4)
	b	L(1)
	b	L(2)
	b	L(3)
	b	L(0m4)
	b	L(1m4)
	b	L(2m4)
	b	L(3m4)


C First pass (mul_2), one entry point per value of n mod 4.  Each entry saves
C r4-r11 and lr (lr is reused as n_saved), sets i = n-4 and n = n-2, loads r10
C with the address of the addmul_2 label to continue at (pc reads as .+8,
C hence the -.-8), stores the low limb of v1*v0 through rp, and biases up/rp
C so that the 4-way unrolled loop can be entered at L(ko0), L(ko2), L(ko1) or
C L(top) respectively.
L(1m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_2m4)-.-8
	ldm	up, {v0,v1,u0}
	sub	up, up, #4
	mov	cyb, #0
	mov	r5, #0
	umull	r4, cya, v1, v0
	str	r4, [rp], #-12
	mov	r4, #0
	b	L(ko0)

L(3m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_0m4)-.-8
	ldm	up, {v0,v1,u0}
	add	up, up, #4
	mov	cyb, #0
	mov	r5, #0
	umull	r4, cya, v1, v0
	str	r4, [rp], #-4
	mov	r4, #0
	b	L(ko2)

L(2m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_3m4)-.-8
	ldm	up, {v0,v1,u1}
	mov	cyb, #0
	mov	r4, #0
	umull	r5, cya, v1, v0
	str	r5, [rp], #-8
	mov	r5, #0
	b	L(ko1)

L(0m4):	push	{r4-r11, r14}
	mov	n_saved, n
	sub	i, n, #4
	sub	n, n, #2
	add	r10, pc, #L(am2_1m4)-.-8
	ldm	up, {v0,v1,u1}
	mov	cyb, #0
	mov	r4, #0
	add	up, up, #8
	umull	r5, cya, v1, v0
	str	r5, [rp, #0]
	mov	r5, #0

C mul_2 inner loop, 4-way unrolled.  Each umaal computes hi:lo = u*v + lo + hi
C with the carry limb (cya for v0 products, cyb for v1 products) as hi.  r4/r5
C are cleared after every store, so this pass writes rp[] instead of adding
C to it.
L(top):	ldr	u0, [up, #4]
	umaal	r4, cya, u1, v0
	str	r4, [rp, #4]
	mov	r4, #0
	umaal	r5, cyb, u1, v1
L(ko2):	ldr	u1, [up, #8]
	umaal	r5, cya, u0, v0
	str	r5, [rp, #8]
	mov	r5, #0
	umaal	r4, cyb, u0, v1
L(ko1):	ldr	u0, [up, #12]
	umaal	r4, cya, u1, v0
	str	r4, [rp, #12]
	mov	r4, #0
	umaal	r5, cyb, u1, v1
L(ko0):	ldr	u1, [up, #16]!
	umaal	r5, cya, u0, v0
	str	r5, [rp, #16]!
	mov	r5, #0
	umaal	r4, cyb, u0, v1
	subs	i, i, #4
	bhi	L(top)

C Wind down the mul_2 pass: two more source limbs, then both carry limbs are
C stored (and stay live in cya/cyb).
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #4]
	umaal	r5, cya, u0, v0
	umaal	cya, cyb, u0, v1
	str	r5, [rp, #8]
	str	cya, [rp, #12]
	str	cyb, [rp, #16]

C Adjust the pointers and n, then continue at the label held in r10.
	add	up, up, #4
	sub	n, n, #1
	add	rp, rp, #8
	bx	r10

C Outer loop, even half.  i = n-6; when that is negative the remaining work
C is done by L(cor2).  Otherwise run an addmul_2 pass: same umaal chains as
C mul_2 above, but r4/r5 are loaded from rp[] so the products are added to
C what is already stored there.
L(evnloop):
	subs	i, n, #6
	sub	n, n, #2
	blt	L(cor2)
	ldm	up, {v0,v1,u1}
	add	up, up, #8
	mov	cya, #0
	mov	cyb, #0
	ldr	r4, [rp, #-4]
	umaal	r4, cya, v1, v0
	str	r4, [rp, #-4]
	ldr	r4, [rp, #0]

	ALIGN(16)
L(ua2):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua2)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]
	str	cyb, [rp, #8]
C Reached by fall-through from the pass above and via bx r10 from the mul_2
C pass entered at L(3m4).  Rewind rp and up by n limbs, advance rp, and set
C up the next addmul_2 pass, which enters its loop at L(lo0).
L(am2_0m4):
	sub	rp, rp, n, lsl #2
	sub	up, up, n, lsl #2
	add	rp, rp, #8

	sub	i, n, #4
	sub	n, n, #2
	ldm	up, {v0,v1,u1}
	mov	cya, #0
	mov	cyb, #0
	ldr	r4, [rp, #4]
	umaal	r4, cya, v1, v0
	str	r4, [rp, #4]
	ldr	r4, [rp, #8]
	b	L(lo0)

	ALIGN(16)
L(ua0):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
L(lo0):	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua0)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]
	str	cyb, [rp, #8]
C Reached by fall-through and via bx r10 from the mul_2 pass entered at
C L(1m4).  Rewind, advance rp, and go round the even loop again.  cya/cyb are
C not touched between here and L(cor2), which reads them.
L(am2_2m4):
	sub	rp, rp, n, lsl #2
	sub	up, up, n, lsl #2
	add	rp, rp, #16
	b	L(evnloop)


C Outer loop, odd half.  Same addmul_2 scheme as the even half; this pass
C enters its unrolled loop at L(lo1).
L(oddloop):
	sub	i, n, #5
	sub	n, n, #2
	ldm	up, {v0,v1,u0}
	mov	cya, #0
	mov	cyb, #0
	ldr	r5, [rp, #0]
	umaal	r5, cya, v1, v0
	str	r5, [rp, #0]
	ldr	r5, [rp, #4]
	add	up, up, #4
	b	L(lo1)

	ALIGN(16)
L(ua1):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
L(lo1):	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua1)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]
	str	cyb, [rp, #8]
C Reached by fall-through and via bx r10 from the mul_2 pass entered at
C L(2m4).  Rewind and advance rp; when n has come down to exactly 3 the rest
C is done by L(cor3), otherwise run another pass entering at L(lo3).
L(am2_3m4):
	sub	rp, rp, n, lsl #2
	sub	up, up, n, lsl #2
	add	rp, rp, #4

	subs	i, n, #3
	beq	L(cor3)
	sub	n, n, #2
	ldm	up, {v0,v1,u0}
	mov	cya, #0
	mov	cyb, #0
	ldr	r5, [rp, #8]
	sub	up, up, #4
	umaal	r5, cya, v1, v0
	str	r5, [rp, #8]
	ldr	r5, [rp, #12]
	b	L(lo3)

	ALIGN(16)
L(ua3):	ldr	r5, [rp, #4]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #4]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #0]
	ldr	r4, [rp, #8]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #8]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #4]
	ldr	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	ldr	u0, [up, #12]
	umaal	r5, cyb, u1, v1
	str	r4, [rp, #8]
L(lo3):	ldr	r4, [rp, #16]!
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #16]!
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #-4]
	subs	i, i, #4
	bhs	L(ua3)

	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #0]
	str	cya, [rp, #4]
	str	cyb, [rp, #8]
C Reached by fall-through and via bx r10 from the mul_2 pass entered at
C L(0m4).  Rewind, advance rp, and go round the odd loop again.
L(am2_1m4):
	sub	rp, rp, n, lsl #2
	sub	up, up, n, lsl #2
	add	rp, rp, #12
	b	L(oddloop)


C Corner code for the odd half: one last addmul_2 step over the limbs at
C up[0..3], then the single product u1*u0 is formed in cyb:cya on top of the
C previous high carry limb.  sqr_diag_addlsh1 stores cya/cyb.  The adds on rp
C also leaves the carry flag clear for the loop there.
L(cor3):ldm	up, {v0,v1,u0}
	ldr	r5, [rp, #8]
	mov	cya, #0
	mov	cyb, #0
	umaal	r5, cya, v1, v0
	str	r5, [rp, #8]
	ldr	r5, [rp, #12]
	ldr	r4, [rp, #16]
	umaal	r5, cya, u0, v0
	ldr	u1, [up, #12]
	umaal	r4, cyb, u0, v1
	str	r5, [rp, #12]
	umaal	r4, cya, u1, v0
	umaal	cya, cyb, u1, v1
	str	r4, [rp, #16]
	str	cya, [rp, #20]
	str	cyb, [rp, #24]
	add	up, up, #16
	mov	cya, cyb
	adds	rp, rp, #36		C clear cy
	mov	cyb, #0
	umaal	cya, cyb, u1, u0
	b	L(sqr_diag_addlsh1)

C Corner code for the even half.  r4/r5 start from cya/cyb, which are still
C live from the preceding pass, rather than being reloaded from rp[].  Falls
C through into sqr_diag_addlsh1 with the last two limbs in cya/cyb.
L(cor2):
	ldm	up!, {v0,v1,u0}
	mov	r4, cya
	mov	r5, cyb
	mov	cya, #0
	umaal	r4, cya, v1, v0
	mov	cyb, #0
	umaal	r5, cya, u0, v0
	strd	r4, r5, [rp, #-4]
	umaal	cya, cyb, u0, v1
	add	rp, rp, #16
C	b	L(sqr_diag_addlsh1)


C w0-w2 and rbx reuse r6-r9, the registers of v1, u0, i and u1.  None of them
C is read with its old meaning from here on.
define(`w0', r6)
define(`w1', r7)
define(`w2', r8)
define(`rbx', r9)

C Final step.  Store the two limbs left in cya/cyb, rewind up by n_saved
C limbs and rp by 2*n_saved limbs, then walk both once for n_saved-1
C iterations.  Each iteration doubles two limbs of rp[] (adcs w,w,w) and adds
C the 64-bit square of one source limb.  w2 carries the bit shifted out of
C the doubling into the next iteration (r10 is a zero for adc w2, r10, r10).
C The loop is entered at L(lm) and expects the carry flag clear on entry, as
C the commented-out cmn notes.
L(sqr_diag_addlsh1):
	str	cya, [rp, #-12]
	str	cyb, [rp, #-8]
	sub	n, n_saved, #1
	sub	up, up, n_saved, lsl #2
	sub	rp, rp, n_saved, lsl #3
	ldr	r3, [up], #4
	umull	w1, r5, r3, r3
	mov	w2, #0
	mov	r10, #0
C	cmn	r0, #0			C clear cy (already clear)
	b	L(lm)

L(tsd):	adds	w0, w0, rbx
	adcs	w1, w1, r4
	str	w0, [rp, #0]
L(lm):	ldr	w0, [rp, #4]
	str	w1, [rp, #4]
	ldr	w1, [rp, #8]!
	add	rbx, r5, w2
	adcs	w0, w0, w0
	ldr	r3, [up], #4
	adcs	w1, w1, w1
	adc	w2, r10, r10
	umull	r4, r5, r3, r3
	subs	n, n, #1
	bne	L(tsd)

	adds	w0, w0, rbx
	adcs	w1, w1, r4
	adc	w2, r5, w2
	stm	rp, {w0,w1,w2}

C Restore r4-r11 and return by popping the saved lr into pc.
	pop	{r4-r11, pc}


C Straight line code for n <= 4

C n = 1: rp[0..1] = u0^2.  Uses r1-r3 only, nothing is saved.
L(1):	ldr	r3, [up, #0]
	umull	r1, r2, r3, r3
	stm	rp, {r1,r2}
	bx	r14

C n = 2: the squares u0^2 (r2:r1) and u1^2 (r4:r3) plus the doubled cross
C product u0*u1 (r12:r5) added in one limb up.
L(2):	push	{r4-r5}
	ldm	up, {r5,r12}
	umull	r1, r2, r5, r5
	umull	r3, r4, r12, r12
	umull	r5, r12, r5, r12
	adds	r5, r5, r5
	adcs	r12, r12, r12
	adc	r4, r4, #0
	adds	r2, r2, r5
	adcs	r3, r3, r12
	adc	r4, r4, #0
	stm	rp, {r1,r2,r3,r4}
	pop	{r4-r5}
	bx	r14

C n = 3: three squares in r1-r6, the sum of the three cross products in
C r10,r11,r12,r7, which is doubled and added in one limb up.
L(3):	push	{r4-r11}
	ldm	up, {r7,r8,r9}
	umull	r1, r2, r7, r7
	umull	r3, r4, r8, r8
	umull	r5, r6, r9, r9
	umull	r10, r11, r7, r8
	mov	r12, #0
	umlal	r11, r12, r7, r9
	mov	r7, #0
	umlal	r12, r7, r8, r9
	adds	r10, r10, r10
	adcs	r11, r11, r11
	adcs	r12, r12, r12
	adcs	r7, r7, r7
	adc	r6, r6, #0
	adds	r2, r2, r10
	adcs	r3, r3, r11
	adcs	r4, r4, r12
	adcs	r5, r5, r7
	adc	r6, r6, #0
	stm	rp, {r1,r2,r3,r4,r5,r6}
	pop	{r4-r11}
	bx	r14

C n = 4: the four squares are stored first (rp[0..6]; the top limb stays in
C r8).  The six cross products are then summed in r1-r6, doubled, and added
C to rp[1..6] reloaded from memory.  r14 is used as a scratch register for
C that reload, which is why lr is pushed here and the return is a pop into
C pc.
L(4):	push	{r4-r11, r14}
	ldm	up, {r9,r10,r11,r12}
	umull	r1, r2, r9, r9
	umull	r3, r4, r10, r10
	umull	r5, r6, r11, r11
	umull	r7, r8, r12, r12
	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
	umull	r1, r2, r9, r10
	mov	r3, #0
	umlal	r2, r3, r9, r11
	mov	r4, #0
	umlal	r3, r4, r9, r12
	mov	r5, #0
	umlal	r3, r5, r10, r11
	umaal	r4, r5, r10, r12
	mov	r6, #0
	umlal	r5, r6, r11, r12
	adds	r1, r1, r1
	adcs	r2, r2, r2
	adcs	r3, r3, r3
	adcs	r4, r4, r4
	adcs	r5, r5, r5
	adcs	r6, r6, r6
	add	rp, rp, #4
	adc	r7, r8, #0
	ldm	rp, {r8,r9,r10,r11,r12,r14}
	adds	r1, r1, r8
	adcs	r2, r2, r9
	adcs	r3, r3, r10
	adcs	r4, r4, r11
	adcs	r5, r5, r12
	adcs	r6, r6, r14
	adc	r7, r7, #0
	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
	pop	{r4-r11, pc}
EPILOGUE()