/*
;uInt longest_match_x64(
; deflate_state *s,
; IPos cur_match); // current match

; gvmat64.S -- Asm portion of the optimized longest_match for 32 bits x86_64
; (AMD64 on Athlon 64, Opteron, Phenom
; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
; this file is translation from gvmat64.asm to GCC 4.x (for Linux, Mac XCode)
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
;
; File written by Gilles Vollant, by converting to assembly the longest_match
; from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
; and by taking inspiration on asm686 with masm, optimised assembly code
; from Brian Raiter, written 1998
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
; claim that you wrote the original software. If you use this software
; in a product, an acknowledgment in the product documentation would be
; appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
; misrepresented as being the original software
; 3. This notice may not be removed or altered from any source distribution.
;
; http://www.zlib.net
; http://www.winimage.com/zLibDll
; http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; to compile this file for zLib, I use option:
; gcc -c -arch x86_64 gvmat64.S


;uInt longest_match(s, cur_match)
; deflate_state *s;
; IPos cur_match; // current match /
;
; with XCode for Mac, I had strange error with some jump on intel syntax
; this is why BEFORE_JMP and AFTER_JMP are used
*/

/*
; ALTERED VERSION (see restriction 2 above). Changes from the original:
;  - a .note.GNU-stack section is emitted on ELF targets (end of file) so
;    that linking this object does not request an executable stack;
;  - comments were corrected/extended; no instruction was changed.
*/

/*
; Jumps wrapped in BEFORE_JMP / AFTER_JMP are assembled in AT&T mode and the
; assembler is switched back to Intel syntax (no register prefix) afterwards.
*/
#define BEFORE_JMP .att_syntax
#define AFTER_JMP .intel_syntax noprefix

/*
; Unless NO_UNDERLINE is defined, both exported symbols get a leading
; underscore.
*/
#ifndef NO_UNDERLINE
# define match_init _match_init
# define longest_match _longest_match
#endif

.intel_syntax noprefix

.globl match_init, longest_match
.text
longest_match:



#define LocalVarsSize 96
/*
; registers written by this routine :
;     rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
; registers never written here      : r14,r15 (saved and restored anyway)
; rsp is never modified: every local below lives at a negative offset from
; rsp (rsp-88 .. rsp-16). This relies on the System V AMD64 red zone, so the
; routine must stay a leaf function.
; NOTE(review): do not build this file for an environment without a red
; zone - confirm before reusing it elsewhere.
*/

#define chainlenwmask (rsp + 8 - LocalVarsSize)
#define nicematch (rsp + 16 - LocalVarsSize)

#define save_rdi (rsp + 24 - LocalVarsSize)
#define save_rsi (rsp + 32 - LocalVarsSize)
#define save_rbx (rsp + 40 - LocalVarsSize)
#define save_rbp (rsp + 48 - LocalVarsSize)
#define save_r12 (rsp + 56 - LocalVarsSize)
#define save_r13 (rsp + 64 - LocalVarsSize)
#define save_r14 (rsp + 72 - LocalVarsSize)
#define save_r15 (rsp + 80 - LocalVarsSize)


/*
; all the +4 offsets are due to the addition of pending_buf_size (in zlib
; in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, remove the +4).
; Note : these value are good with a 8 bytes boundary pack structure
*/

#define MAX_MATCH 258
#define MIN_MATCH 3
#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)

/*
;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)

; all the +zlib1222add offsets are due to the addition of fields
; in zlib in the deflate_state structure since the asm code was first written
; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
*/



/* you can check the structure offset by running

#include <stdlib.h>
#include <stdio.h>
#include "deflate.h"

void print_depl()
{
deflate_state ds;
deflate_state *s=&ds;
printf("size pointer=%u\n",(int)sizeof(void*));

printf("#define dsWSize %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
printf("#define dsWMask %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
printf("#define dsWindow %u\n",(int)(((char*)&(s->window))-((char*)s)));
printf("#define dsPrev %u\n",(int)(((char*)&(s->prev))-((char*)s)));
printf("#define dsMatchLen %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
printf("#define dsPrevMatch %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
printf("#define dsStrStart %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
printf("#define dsMatchStart %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
printf("#define dsLookahead %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
printf("#define dsPrevLen %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
printf("#define dsMaxChainLen %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
printf("#define dsGoodMatch %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
printf("#define dsNiceMatch %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
}
*/

#define dsWSize 68
#define dsWMask 76
#define dsWindow 80
#define dsPrev 96
#define dsMatchLen 144
#define dsPrevMatch 148
#define dsStrStart 156
#define dsMatchStart 160
#define dsLookahead 164
#define dsPrevLen 168
#define dsMaxChainLen 172
#define dsGoodMatch 188
#define dsNiceMatch 192

/*
; Field accessors. All of them are relative to rcx, which holds the
; deflate_state pointer for the whole routine.
*/
#define window_size [ rcx + dsWSize]
#define WMask [ rcx + dsWMask]
#define window_ad [ rcx + dsWindow]
#define prev_ad [ rcx + dsPrev]
#define strstart [ rcx + dsStrStart]
#define match_start [ rcx + dsMatchStart]
#define Lookahead [ rcx + dsLookahead] //; 0ffffffffh on infozip
#define prev_length [ rcx + dsPrevLen]
#define max_chain_length [ rcx + dsMaxChainLen]
#define good_match [ rcx + dsGoodMatch]
#define nice_match [ rcx + dsNiceMatch]

/*
; windows:
; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)

; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.

;
; gcc on macosx-linux:
; see http://www.x86-64.org/documentation/abi-0.99.pdf
; param 1 in rdi, param 2 in rsi
; rbx, rsp, rbp, r12 to r15 must be preserved

;;; Save registers that the compiler may be using. (rsp is not adjusted;
;;; the save slots are the negative-offset locals defined above.)


;;; Retrieve the function arguments. r8d will hold cur_match
;;; throughout the entire function. rcx will hold the pointer to the
;;; deflate_state structure (this build receives it in rdi and copies it).

; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
; mac: param 1 in rdi, param 2 rsi
; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
*/
        mov [save_rbx],rbx
        mov [save_rbp],rbp


        mov rcx,rdi                     //; rcx = s (field macros are rcx-relative)

        mov r8d,esi                     //; r8d = cur_match, upper half of r8 cleared


        mov [save_r12],r12
        mov [save_r13],r13
        mov [save_r14],r14
        mov [save_r15],r15


//;;; uInt wmask = s->w_mask;
//;;; unsigned chain_length = s->max_chain_length;
//;;; if (s->prev_length >= s->good_match) {
//;;;     chain_length >>= 2;
//;;; }


        mov edi, prev_length
        mov esi, good_match
        mov eax, WMask
        mov ebx, max_chain_length
        cmp edi, esi
        jl LastMatchGood                //; skip the shift when prev_length < good_match
        shr ebx, 2
LastMatchGood:

//;;; chainlen is decremented once beforehand so that the function can
//;;; use the sign flag instead of the zero flag for the exit test.
//;;; It is then shifted into the high word, to make room for the wmask
//;;; value, which it will always accompany.

        dec ebx
        shl ebx, 16
        or ebx, eax                     //; ebx = ((chainlen - 1) << 16) | wmask

//;;; on zlib only
//;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;



        mov eax, nice_match
        mov [chainlenwmask], ebx
        mov r10d, Lookahead
        cmp r10d, eax
        cmovnl r10d, eax                //; r10d = nice_match unless lookahead is smaller
        mov [nicematch],r10d



//;;; register Bytef *scan = s->window + s->strstart;
        mov r10, window_ad              //; r10 = s->window (kept for the whole routine)
        mov ebp, strstart
        lea r13, [r10 + rbp]

//;;; Determine how many bytes the scan ptr is off from being
//;;; dword-aligned.

        mov r9,r13                      //; r9  = scan
        neg r13
        and r13,3                       //; r13 = (-scan) & 3 = bytes to skip before comparing

//;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
//;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;


        mov eax, window_size
        sub eax, MIN_LOOKAHEAD          //; eax = w_size - MIN_LOOKAHEAD


        xor edi,edi
        sub ebp, eax                    //; ebp = strstart - eax, flags consumed by cmovng below

        mov r11d, prev_length           //; mov does not touch the flags

        cmovng ebp,edi                  //; limit = 0 when the subtraction result is <= 0

//;;; int best_len = s->prev_length;      (r11d, loaded just above)


//;;; Store the sum of s->window + best_len in rsi.

        lea rsi,[r10+r11]

//;;; register ush scan_start = *(ushf*)scan;
//;;; register ush scan_end = *(ushf*)(scan+best_len-1);
//;;; Posf *prev = s->prev;

        movzx r12d,word ptr [r9]                //; r12w = scan_start
        movzx ebx, word ptr [r9 + r11 - 1]      //; bx   = scan_end

        mov rdi, prev_ad

//;;; Jump into the main loop.

        mov edx, [chainlenwmask]

        cmp bx,word ptr [rsi + r8 - 1]
        jz LookupLoopIsZero



//;;; LookupLoop1 / LookupLoop2 / LookupLoop4 are three unrolled copies of the
//;;; chain step documented in front of LookupLoop below.
LookupLoop1:
        and r8d, edx

        movzx r8d, word ptr [rdi + r8*2]
        cmp r8d, ebp
        jbe LeaveNow



        sub edx, 0x00010000
        BEFORE_JMP
        js LeaveNow
        AFTER_JMP

LoopEntry1:
        cmp bx,word ptr [rsi + r8 - 1]
        BEFORE_JMP
        jz LookupLoopIsZero
        AFTER_JMP

LookupLoop2:
        and r8d, edx

        movzx r8d, word ptr [rdi + r8*2]
        cmp r8d, ebp
        BEFORE_JMP
        jbe LeaveNow
        AFTER_JMP
        sub edx, 0x00010000
        BEFORE_JMP
        js LeaveNow
        AFTER_JMP

LoopEntry2:
        cmp bx,word ptr [rsi + r8 - 1]
        BEFORE_JMP
        jz LookupLoopIsZero
        AFTER_JMP

LookupLoop4:
        and r8d, edx

        movzx r8d, word ptr [rdi + r8*2]
        cmp r8d, ebp
        BEFORE_JMP
        jbe LeaveNow
        AFTER_JMP
        sub edx, 0x00010000
        BEFORE_JMP
        js LeaveNow
        AFTER_JMP

LoopEntry4:

        cmp bx,word ptr [rsi + r8 - 1]
        BEFORE_JMP
        jnz LookupLoop1
        jmp LookupLoopIsZero
        AFTER_JMP
/*
;;; do {
;;;     match = s->window + cur_match;
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
;;;         *(ushf*)match != scan_start) continue;
;;;     [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;;          && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; r8d = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; rsi = windowbestlen - i.e., (window + bestlen)
;;; rdi = prev
;;; ebp = limit
;;;
;;; One step: r8d is masked with edx, the next candidate is loaded as a
;;; 16-bit entry from prev[], the walk stops when that candidate is <= limit
;;; (unsigned) or when subtracting 0x10000 from edx turns it negative.
;;; NOTE(review): the mask also contains the chain counter in its upper
;;; half; this assumes cur_match always fits in 16 bits - confirm.
*/
.balign 16
LookupLoop:
        and r8d, edx

        movzx r8d, word ptr [rdi + r8*2]
        cmp r8d, ebp
        BEFORE_JMP
        jbe LeaveNow
        AFTER_JMP
        sub edx, 0x00010000
        BEFORE_JMP
        js LeaveNow
        AFTER_JMP

LoopEntry:

        cmp bx,word ptr [rsi + r8 - 1]
        BEFORE_JMP
        jnz LookupLoop1
        AFTER_JMP
LookupLoopIsZero:
        cmp r12w, word ptr [r10 + r8]   //; scan_start against the first two bytes of the match
        BEFORE_JMP
        jnz LookupLoop1
        AFTER_JMP


//;;; Store the current value of chainlen.
        mov [chainlenwmask], edx
/*
;;; Point rdi to the string under scrutiny, and rsi to the string we
;;; are hoping to match it up with. In actuality, rsi and rdi are
;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and rdx is
;;; initialized to -(MAX_MATCH_8 - scanalign).
;;; (MAX_MATCH_8 is 0x108 = 264 here; r13 is the scanalign value.)
*/
        lea rsi,[r8+r10]
        mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8)
        lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
        lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]

        prefetcht1 [rsi+rdx]
        prefetcht1 [rdi+rdx]

/*
;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust rdx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the rdi
;;; pointer. (rsi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.
;;;
;;; Each pass compares 3 x 8 bytes; rdx climbs from -264 in steps of 24 and
;;; reaching zero means no difference was found (-> LenMaximum).
*/

LoopCmps:
        mov rax, [rsi + rdx]
        xor rax, [rdi + rdx]
        jnz LeaveLoopCmps

        mov rax, [rsi + rdx + 8]
        xor rax, [rdi + rdx + 8]
        jnz LeaveLoopCmps8


        mov rax, [rsi + rdx + 8+8]
        xor rax, [rdi + rdx + 8+8]
        jnz LeaveLoopCmps16

        add rdx,8+8+8

        BEFORE_JMP
        jnz LoopCmps
        jmp LenMaximum
        AFTER_JMP

LeaveLoopCmps16: add rdx,8
LeaveLoopCmps8: add rdx,8
LeaveLoopCmps:

//;;; rax holds the XOR of the two differing qwords. Narrow down to the 16-bit
//;;; group containing the lowest set bit, advancing rdx past equal groups.
        test eax, 0x0000FFFF
        jnz LenLower

        test eax,0xffffffff

        jnz LenLower32

        add rdx,4
        shr rax,32
        or ax,ax
        BEFORE_JMP
        jnz LenLower
        AFTER_JMP

LenLower32:
        shr eax,16
        add rdx,2

LenLower:
        sub al, 1                       //; carry set only when the low byte was 0 (it matched)
        adc rdx, 0                      //; ...in which case one more byte is counted
//;;; Calculate the length of the match. If it is longer than MAX_MATCH,
//;;; then automatically accept it as the best possible match and leave.

        lea rax, [rdi + rdx]
        sub rax, r9                     //; eax = len
        cmp eax, MAX_MATCH
        BEFORE_JMP
        jge LenMaximum
        AFTER_JMP
/*
;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.
;///////////////////////////////////
*/
        cmp eax, r11d
        jg LongerMatch

        lea rsi,[r10+r11]               //; restore the loop registers clobbered by the compare

        mov rdi, prev_ad
        mov edx, [chainlenwmask]
        BEFORE_JMP
        jmp LookupLoop
        AFTER_JMP
/*
;;; s->match_start = cur_match;
;;; best_len = len;
;;; if (len >= nice_match) break;
;;; scan_end = *(ushf*)(scan+best_len-1);
*/
LongerMatch:
        mov r11d, eax
        mov match_start, r8d
        cmp eax, [nicematch]
        BEFORE_JMP
        jge LeaveNow
        AFTER_JMP

        lea rsi,[r10+rax]               //; rsi = window + new best_len

        movzx ebx, word ptr [r9 + rax - 1]
        mov rdi, prev_ad
        mov edx, [chainlenwmask]
        BEFORE_JMP
        jmp LookupLoop
        AFTER_JMP

//;;; Accept the current string, with the maximum possible length.

LenMaximum:
        mov r11d,MAX_MATCH
        mov match_start, r8d

//;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
//;;; return s->lookahead;

LeaveNow:
        mov eax, Lookahead
        cmp r11d, eax
        cmovng eax, r11d                //; eax = best_len when best_len <= lookahead



//;;; Restore the saved registers and return from whence we came.


//      mov rsi,[save_rsi]
//      mov rdi,[save_rdi]
        mov rbx,[save_rbx]
        mov rbp,[save_rbp]
        mov r12,[save_r12]
        mov r13,[save_r13]
        mov r14,[save_r14]
        mov r15,[save_r15]


        ret 0
//; please don't remove this string !
//; Your can freely use gvmat64 in any free or commercial app
//; but it is far better don't remove the string in the binary!
// db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0


//;;; match_init: nothing to set up in this implementation, returns at once.
match_init:
        ret 0

/*
; On ELF targets an object file without a .note.GNU-stack section makes the
; GNU linker assume that the code needs an executable stack. Nothing in this
; file executes from the stack, so emit the empty note. The guard keeps
; non-ELF builds (for instance Mach-O with XCode) unchanged.
*/
#if defined(__ELF__)
        .section .note.GNU-stack,"",@progbits
#endif