; inffasx64.asm is a hand-tuned assembler version of inffast.c - fast decoding
; version for AMD64 on Windows using the Microsoft C compiler
;
; inffasx64.asm was automatically converted from the AMD64 portion of inffas86.c
; inffasx64.asm is called by inffas8664.c, which contains more info.
;
; Syntax: MASM (Intel operand order).  Target ABI: Microsoft x64.
;
; To compile this file, I use the options
;   ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
; with Microsoft Macro Assembler (x64) for AMD64.
;
; This file compiles with Microsoft Macro Assembler (x64) for AMD64.
;
;   ml64.exe is shipped with Visual Studio 2005/2008/2010 and the Windows WDK
;
;   (you can get the Windows WDK with ml64 for AMD64 from
;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;

;-----------------------------------------------------------------------
; Byte offsets into the parameter block ("ar") whose address arrives in rcx.
; The names follow the per-field comments of the original code.
; Offsets 0 and 8 were used by earlier revisions of this routine to stash
; rsp and rbp; this revision neither reads nor writes them.
; NOTE(review): confirm the C caller does not read those two slots.
;-----------------------------------------------------------------------
AR_IN           EQU     16      ; in     - next input byte
AR_LAST         EQU     24      ; last   - loop runs while in is below last
AR_OUT          EQU     32      ; out    - next output byte
AR_BEG          EQU     40      ; beg    - nbytes = out - beg
AR_END          EQU     48      ; end    - loop runs while out is below end
AR_WINDOW       EQU     56      ; window - base of the sliding window
AR_LCODE        EQU     64      ; lcode  - literal/length table
AR_DCODE        EQU     72      ; dcode  - distance table
AR_HOLD         EQU     80      ; hold   - bit accumulator (64 bits)
AR_BITS         EQU     88      ; bits   - number of valid bits in hold
AR_WSIZE        EQU     92      ; wsize  - window size
AR_WRITE        EQU     96      ; write  - window write index
AR_LMASK        EQU     100     ; lmask  - index mask for lcode
AR_DMASK        EQU     104     ; dmask  - index mask for dcode
AR_STATUS       EQU     116     ; status - result code, see below

.code

;-----------------------------------------------------------------------
; inffas8664fnc
;
; In:     rcx = address of the ar parameter block (Microsoft x64 ABI,
;               first integer argument).
; Out:    ar.in, ar.out, ar.bits and ar.hold are written back, and
;         ar.status is set to
;             0  loop ended because out reached end or in reached last
;             1  end-of-block code seen
;             2  invalid literal/length code
;             3  invalid distance code
;             4  invalid distance, too far back
;         No meaningful value is left in rax.
; Clobb:  rax, rcx, rdx, r8, r9, r10, r11, flags (all volatile).
; Stack:  64 bytes for the eight saved nonvolatile registers; rsp is never
;         used as a data pointer.
;
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
;
; Windows x64 has no red zone, so the nonvolatile registers are saved with
; real pushes (and described to the unwinder through PROC FRAME) instead of
; being stored below rsp.
;
; Register roles after the prologue:
;   r9   = ar (parameter block)    r10  = end
;   rsi  = in                      rdi  = out
;   rbp  = lcode                   r11  = dcode
;   rdx  = hold                    bl   = bits
;   r12d = lmask                   r13d = dmask
;   r14d = len                     r15d = dist
;   r8   = scratch: table index, and saved `in` while rsi is a copy source
;   rax, rcx = scratch
;   `last` stays in memory at [r9+AR_LAST] so that r9 can hold ar.
;-----------------------------------------------------------------------
inffas8664fnc PROC FRAME

        push    rbx
        .pushreg rbx
        push    rbp
        .pushreg rbp
        push    rsi
        .pushreg rsi
        push    rdi
        .pushreg rdi
        push    r12
        .pushreg r12
        push    r13
        .pushreg r13
        push    r14
        .pushreg r14
        push    r15
        .pushreg r15
        .endprolog

        mov     r9, rcx                         ; /* r9 = &ar */

        mov     rsi, [r9+AR_IN]                 ; /* rsi = in */
        mov     rdi, [r9+AR_OUT]                ; /* rdi = out */
        mov     r10, [r9+AR_END]                ; /* r10 = end */
        mov     rbp, [r9+AR_LCODE]              ; /* rbp = lcode */
        mov     r11, [r9+AR_DCODE]              ; /* r11 = dcode */
        mov     rdx, [r9+AR_HOLD]               ; /* rdx = hold */
        mov     ebx, [r9+AR_BITS]               ; /* ebx = bits */
        mov     r12d, [r9+AR_LMASK]             ; /* r12d = lmask */
        mov     r13d, [r9+AR_DMASK]             ; /* r13d = dmask */
                                                ; /* r14d = len */
                                                ; /* r15d = dist */

        cld                                     ; string ops must run forward
        cmp     r10, rdi
        je      L_one_time                      ; /* if only one decode left */
        cmp     qword ptr [r9+AR_LAST], rsi     ; same flags as cmp last, in
        jne     L_do_loop

L_one_time:
        mov     r8, r12                         ; /* r8 = lmask */
        cmp     bl, 32
        ja      L_get_length_code_one_time      ; /* if (bits > 32) */

        lodsd                                   ; /* eax = *(uint *)in++ */
        mov     cl, bl                          ; /* cl = bits, needed for shifting */
        add     bl, 32                          ; /* bits += 32 */
        shl     rax, cl
        or      rdx, rax                        ; /* hold |= new 32 bits, shifted up by bits */
        jmp     L_get_length_code_one_time

ALIGN 4
L_while_test:
        cmp     r10, rdi
        jbe     L_break_loop                    ; /* stop when end is at or below out */
        cmp     qword ptr [r9+AR_LAST], rsi
        jbe     L_break_loop                    ; /* stop when last is at or below in */

L_do_loop:
        mov     r8, r12                         ; /* r8 = lmask */
        cmp     bl, 32
        ja      L_get_length_code               ; /* if (bits > 32) */

        lodsd                                   ; /* eax = *(uint *)in++ */
        mov     cl, bl                          ; /* cl = bits, needed for shifting */
        add     bl, 32                          ; /* bits += 32 */
        shl     rax, cl
        or      rdx, rax                        ; /* hold |= new 32 bits, shifted up by bits */

L_get_length_code:
        and     r8, rdx                         ; /* r8 &= hold */
        mov     eax, [rbp+r8*4]                 ; /* eax = lcode[hold & lmask] */

        mov     cl, ah                          ; /* cl = this.bits */
        sub     bl, ah                          ; /* bits -= this.bits */
        shr     rdx, cl                         ; /* hold >>= this.bits */

        test    al, al
        jnz     L_test_for_length_base          ; /* if (op != 0) 45.7% */

        mov     r8, r12                         ; /* r8 = lmask */
        shr     eax, 16                         ; /* output this.val char */
        stosb

L_get_length_code_one_time:
        and     r8, rdx                         ; /* r8 &= hold */
        mov     eax, [rbp+r8*4]                 ; /* eax = lcode[hold & lmask] */

L_dolen:
        mov     cl, ah                          ; /* cl = this.bits */
        sub     bl, ah                          ; /* bits -= this.bits */
        shr     rdx, cl                         ; /* hold >>= this.bits */

        test    al, al
        jnz     L_test_for_length_base          ; /* if (op != 0) 45.7% */

        shr     eax, 16                         ; /* output this.val char */
        stosb
        jmp     L_while_test

ALIGN 4
L_test_for_length_base:
        mov     r14d, eax                       ; /* len = this */
        shr     r14d, 16                        ; /* len = this.val */
        mov     cl, al

        test    al, 16
        jz      L_test_for_second_level_length  ; /* if ((op & 16) == 0) 8% */
        and     cl, 15                          ; /* op &= 15 */
        jz      L_decode_distance               ; /* if (!op) */

L_add_bits_to_len:
        sub     bl, cl
        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                             ; /* eax = mask of the low op bits */
        and     eax, edx                        ; /* eax &= hold */
        shr     rdx, cl
        add     r14d, eax                       ; /* len += hold & mask[op] */

L_decode_distance:
        mov     r8, r13                         ; /* r8 = dmask */
        cmp     bl, 32
        ja      L_get_distance_code             ; /* if (bits > 32) */

        lodsd                                   ; /* eax = *(uint *)in++ */
        mov     cl, bl                          ; /* cl = bits, needed for shifting */
        add     bl, 32                          ; /* bits += 32 */
        shl     rax, cl
        or      rdx, rax                        ; /* hold |= new 32 bits, shifted up by bits */

L_get_distance_code:
        and     r8, rdx                         ; /* r8 &= hold */
        mov     eax, [r11+r8*4]                 ; /* eax = dcode[hold & dmask] */

L_dodist:
        mov     r15d, eax                       ; /* dist = this */
        shr     r15d, 16                        ; /* dist = this.val */
        mov     cl, ah
        sub     bl, ah                          ; /* bits -= this.bits */
        shr     rdx, cl                         ; /* hold >>= this.bits */
        mov     cl, al                          ; /* cl = this.op */

        test    al, 16                          ; /* if ((op & 16) == 0) */
        jz      L_test_for_second_level_dist
        and     cl, 15                          ; /* op &= 15 */
        jz      L_check_dist_one

L_add_bits_to_dist:
        sub     bl, cl
        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                             ; /* eax = mask of the low op bits */
        and     eax, edx                        ; /* eax &= hold */
        shr     rdx, cl
        add     r15d, eax                       ; /* dist += hold & mask[op] */

L_check_window:
        mov     r8, rsi                         ; /* save in so from can use its reg */
        mov     rax, rdi
        sub     rax, [r9+AR_BEG]                ; /* nbytes = out - beg */

        cmp     eax, r15d
        jb      L_clip_window                   ; /* if (dist > nbytes) 4.2% */

        mov     ecx, r14d                       ; /* ecx = len */
        mov     rsi, rdi
        sub     rsi, r15                        ; /* from = out - dist */

        sar     ecx, 1                          ; /* ecx = len / 2, CF = len & 1 */
        jnc     L_copy_two                      ; /* if len % 2 == 0 */

        rep movsw
        mov     al, [rsi]                       ; /* copy the odd trailing byte */
        mov     [rdi], al
        inc     rdi

        mov     rsi, r8                         ; /* move in back to rsi, toss from */
        jmp     L_while_test

L_copy_two:
        rep movsw
        mov     rsi, r8                         ; /* move in back to rsi, toss from */
        jmp     L_while_test

ALIGN 4
L_check_dist_one:
        cmp     r15d, 1                         ; /* if dist 1, is a memset */
        jne     L_check_window
        cmp     [r9+AR_BEG], rdi                ; /* if out == beg, outside window */
        je      L_check_window

        mov     ecx, r14d                       ; /* ecx = len */
        mov     al, [rdi-1]
        mov     ah, al                          ; /* ax = previous byte, twice */

        sar     ecx, 1                          ; /* ecx = len / 2, CF = len & 1 */
        jnc     L_set_two
        mov     [rdi], al                       ; /* store the odd byte first */
        inc     rdi

L_set_two:
        rep stosw
        jmp     L_while_test

ALIGN 4
L_test_for_second_level_length:
        test    al, 64
        jnz     L_test_for_end_of_block         ; /* if ((op & 64) != 0) */

        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                             ; /* eax = mask of the low op bits */
        and     eax, edx                        ; /* eax &= hold */
        add     eax, r14d                       ; /* eax += len */
        mov     eax, [rbp+rax*4]                ; /* eax = lcode[val+(hold&mask[op])] */
        jmp     L_dolen

ALIGN 4
L_test_for_second_level_dist:
        test    al, 64
        jnz     L_invalid_distance_code         ; /* if ((op & 64) != 0) */

        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                             ; /* eax = mask of the low op bits */
        and     eax, edx                        ; /* eax &= hold */
        add     eax, r15d                       ; /* eax += dist */
        mov     eax, [r11+rax*4]                ; /* eax = dcode[val+(hold&mask[op])] */
        jmp     L_dodist

ALIGN 4
L_clip_window:
        mov     ecx, eax                        ; /* ecx = nbytes */
        mov     eax, [r9+AR_WSIZE]              ; /* eax = wsize, prepare for dist cmp */
        neg     ecx                             ; /* nbytes = -nbytes */

        cmp     eax, r15d
        jb      L_invalid_distance_too_far      ; /* if (dist > wsize) */

        add     ecx, r15d                       ; /* nbytes = dist - nbytes */
        cmp     dword ptr [r9+AR_WRITE], 0
        jne     L_wrap_around_window            ; /* if (write != 0) */

        mov     rsi, [r9+AR_WINDOW]             ; /* from = window */
        sub     eax, ecx                        ; /* eax -= nbytes */
        add     rsi, rax                        ; /* from += wsize - nbytes */

        mov     eax, r14d                       ; /* eax = len */
        cmp     r14d, ecx
        jbe     L_do_copy                       ; /* if (nbytes >= len) */

        sub     eax, ecx                        ; /* eax -= nbytes */
        rep movsb
        mov     rsi, rdi
        sub     rsi, r15                        ; /* from = &out[ -dist ] */
        jmp     L_do_copy

ALIGN 4
L_wrap_around_window:
        mov     eax, [r9+AR_WRITE]              ; /* eax = write */
        cmp     ecx, eax
        jbe     L_contiguous_in_window          ; /* if (write >= nbytes) */

        mov     esi, [r9+AR_WSIZE]              ; /* from = wsize */
        add     rsi, [r9+AR_WINDOW]             ; /* from += window */
        add     rsi, rax                        ; /* from += write */
        sub     rsi, rcx                        ; /* from -= nbytes */
        sub     ecx, eax                        ; /* nbytes -= write */

        mov     eax, r14d                       ; /* eax = len */
        cmp     eax, ecx
        jbe     L_do_copy                       ; /* if (nbytes >= len) */

        sub     eax, ecx                        ; /* len -= nbytes */
        rep movsb
        mov     rsi, [r9+AR_WINDOW]             ; /* from = window */
        mov     ecx, [r9+AR_WRITE]              ; /* nbytes = write */
        cmp     eax, ecx
        jbe     L_do_copy                       ; /* if (nbytes >= len) */

        sub     eax, ecx                        ; /* len -= nbytes */
        rep movsb
        mov     rsi, rdi
        sub     rsi, r15                        ; /* from = out - dist */
        jmp     L_do_copy

ALIGN 4
L_contiguous_in_window:
        mov     rsi, [r9+AR_WINDOW]             ; /* rsi = window */
        add     rsi, rax
        sub     rsi, rcx                        ; /* from += write - nbytes */

        mov     eax, r14d                       ; /* eax = len */
        cmp     eax, ecx
        jbe     L_do_copy                       ; /* if (nbytes >= len) */

        sub     eax, ecx                        ; /* len -= nbytes */
        rep movsb
        mov     rsi, rdi
        sub     rsi, r15                        ; /* from = out - dist */
        jmp     L_do_copy

ALIGN 4
L_do_copy:
        mov     ecx, eax                        ; /* ecx = len */
        rep movsb

        mov     rsi, r8                         ; /* move in back to rsi, toss from */
        jmp     L_while_test

L_test_for_end_of_block:
        test    al, 32
        jz      L_invalid_literal_length_code
        mov     dword ptr [r9+AR_STATUS], 1
        jmp     L_break_loop_with_status

L_invalid_literal_length_code:
        mov     dword ptr [r9+AR_STATUS], 2
        jmp     L_break_loop_with_status

L_invalid_distance_code:
        mov     dword ptr [r9+AR_STATUS], 3
        jmp     L_break_loop_with_status

L_invalid_distance_too_far:
        mov     dword ptr [r9+AR_STATUS], 4
        jmp     L_break_loop_with_status

L_break_loop:
        mov     dword ptr [r9+AR_STATUS], 0

L_break_loop_with_status:
; /* put in, out, bits, and hold back into ar */
        mov     [r9+AR_IN], rsi                 ; /* in */
        mov     [r9+AR_OUT], rdi                ; /* out */
        mov     [r9+AR_BITS], ebx               ; /* bits */
        mov     [r9+AR_HOLD], rdx               ; /* hold */

; Epilogue: restore the nonvolatile registers in reverse push order.
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rdi
        pop     rsi
        pop     rbp
        pop     rbx

        ret

; The lines below appear to be the operand/clobber list left over from the
; inline-assembly version this file was converted from; kept for reference.
;          :
;          : "m" (ar)
;          : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
;            "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
;    );

inffas8664fnc ENDP
;_TEXT ENDS
END