Home | History | Annotate | Line # | Download | only in gcc_gvmat64
      1  1.1  christos /*
      2  1.1  christos ;uInt longest_match_x64(
      3  1.1  christos ;    deflate_state *s,
      4  1.1  christos ;    IPos cur_match);                             // current match
      5  1.1  christos 
      6  1.1  christos ; gvmat64.S -- Asm portion of the optimized longest_match for 32 bits x86_64
      7  1.1  christos ;  (AMD64 on Athlon 64, Opteron, Phenom
      8  1.1  christos ;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
      9  1.1  christos ; this file is translation from gvmat64.asm to GCC 4.x (for Linux, Mac XCode)
     10  1.1  christos ; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
     11  1.1  christos ;
     12  1.1  christos ; File written by Gilles Vollant, by converting to assembly the longest_match
     13  1.1  christos ;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
     14  1.1  christos ;  and by taking inspiration on asm686 with masm, optimised assembly code
     15  1.1  christos ;        from Brian Raiter, written 1998
     16  1.1  christos ;
     17  1.1  christos ;  This software is provided 'as-is', without any express or implied
     18  1.1  christos ;  warranty.  In no event will the authors be held liable for any damages
     19  1.1  christos ;  arising from the use of this software.
     20  1.1  christos ;
     21  1.1  christos ;  Permission is granted to anyone to use this software for any purpose,
     22  1.1  christos ;  including commercial applications, and to alter it and redistribute it
     23  1.1  christos ;  freely, subject to the following restrictions:
     24  1.1  christos ;
     25  1.1  christos ;  1. The origin of this software must not be misrepresented; you must not
     26  1.1  christos ;     claim that you wrote the original software. If you use this software
     27  1.1  christos ;     in a product, an acknowledgment in the product documentation would be
     28  1.1  christos ;     appreciated but is not required.
     29  1.1  christos ;  2. Altered source versions must be plainly marked as such, and must not be
     30  1.1  christos ;     misrepresented as being the original software
     31  1.1  christos ;  3. This notice may not be removed or altered from any source distribution.
     32  1.1  christos ;
     33  1.1  christos ;         http://www.zlib.net
     34  1.1  christos ;         http://www.winimage.com/zLibDll
     35  1.1  christos ;         http://www.muppetlabs.com/~breadbox/software/assembly.html
     36  1.1  christos ;
     37  1.1  christos ; to compile this file for zLib, I use option:
     38  1.1  christos ;   gcc -c -arch x86_64 gvmat64.S
     39  1.1  christos 
     40  1.1  christos 
     41  1.1  christos ;uInt longest_match(s, cur_match)
     42  1.1  christos ;    deflate_state *s;
     43  1.1  christos ;    IPos cur_match;                             // current match /
     44  1.1  christos ;
     45  1.1  christos ; with XCode for Mac, I had strange error with some jump on intel syntax
     46  1.1  christos ; this is why BEFORE_JMP and AFTER_JMP are used
     47  1.1  christos  */
     48  1.1  christos 
     49  1.1  christos 
     50  1.1  christos #define BEFORE_JMP .att_syntax //; written before a jump: switch the assembler to AT&T syntax
     51  1.1  christos #define AFTER_JMP .intel_syntax noprefix //; written after the jump: switch back to Intel syntax
     52  1.1  christos //; Unless NO_UNDERLINE is defined, both exported symbols are renamed with a leading underscore.
     53  1.1  christos #ifndef NO_UNDERLINE
     54  1.1  christos #	define	match_init	_match_init
     55  1.1  christos #	define	longest_match	_longest_match
     56  1.1  christos #endif
     57  1.1  christos //; The body of this file is Intel syntax without % register prefixes.
     58  1.1  christos .intel_syntax noprefix
     59  1.1  christos //; Export both entry points and place the code in the text section.
     60  1.1  christos .globl	match_init, longest_match
     61  1.1  christos .text
     62  1.1  christos longest_match:
     63  1.1  christos //; uInt longest_match(deflate_state *s, IPos cur_match); gcc entry: s in rdi, cur_match in esi.
     64  1.1  christos //; Returns in eax the smaller of the best match length and s->lookahead (signed compare);
     65  1.1  christos //; s->match_start is written each time a longer match is accepted.
     66  1.1  christos #define LocalVarsSize 96
     67  1.1  christos /*
     68  1.1  christos ; registers used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
     69  1.1  christos ; free registers :  r14,r15 (saved and restored, but never written in between)
     70  1.1  christos ; rsp is never adjusted : locals and save slots sit at negative offsets from it
     71  1.1  christos */
     72  1.1  christos //; Slots span rsp-88 .. rsp-9 with rsp untouched; relies on the SysV red zone (no call is made here).
     73  1.1  christos #define chainlenwmask     (rsp + 8 - LocalVarsSize) //; dword: ((chain_length - 1) << 16) | wmask
     74  1.1  christos #define nicematch         (rsp + 16 - LocalVarsSize) //; dword: min(nice_match, lookahead)
     75  1.1  christos 
     76  1.1  christos #define save_rdi        (rsp + 24 - LocalVarsSize) //; referenced only by commented-out code
     77  1.1  christos #define save_rsi        (rsp + 32 - LocalVarsSize) //; referenced only by commented-out code
     78  1.1  christos #define save_rbx        (rsp + 40 - LocalVarsSize)
     79  1.1  christos #define save_rbp        (rsp + 48 - LocalVarsSize)
     80  1.1  christos #define save_r12        (rsp + 56 - LocalVarsSize)
     81  1.1  christos #define save_r13        (rsp + 64 - LocalVarsSize)
     82  1.1  christos #define save_r14        (rsp + 72 - LocalVarsSize)
     83  1.1  christos #define save_r15        (rsp + 80 - LocalVarsSize)
     84  1.1  christos 
     85  1.1  christos 
     86  1.1  christos /*
     87  1.1  christos ;  all the +4 offsets are due to the addition of pending_buf_size in zlib,
     88  1.1  christos ;  in the deflate_state structure, since the asm code was first written
     89  1.1  christos ;  (if you compile with zlib 1.0.4 or older, remove the +4).
     90  1.1  christos ;  Note: these values are valid for a structure packed on 8-byte boundaries
     91  1.1  christos */
     92  1.1  christos 
     93  1.1  christos #define    MAX_MATCH              258 //; a match of this length or more is accepted at once (LenMaximum)
     94  1.1  christos #define    MIN_MATCH              3 //; only used here to derive MIN_LOOKAHEAD
     95  1.1  christos #define    MIN_LOOKAHEAD          (MAX_MATCH+MIN_MATCH+1) //; = 262; subtracted from w_size when computing limit
     96  1.1  christos 
     97  1.1  christos /*
     98  1.1  christos ;;; Offsets for fields in the deflate_state structure. These numbers
     99  1.1  christos ;;; are calculated from the definition of deflate_state, with the
    100  1.1  christos ;;; assumption that the compiler will dword-align the fields. (Thus,
    101  1.1  christos ;;; changing the definition of deflate_state could easily cause this
    102  1.1  christos ;;; program to crash horribly, without so much as a warning at
    103  1.1  christos ;;; compile time. Sigh.)
    104  1.1  christos 
    105  1.1  christos ;  all the +zlib1222add offsets are due to the addition of fields
    106  1.1  christos ;  in zlib in the deflate_state structure since the asm code was first written
    107  1.1  christos ;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
    108  1.1  christos ;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
    109  1.1  christos ;  (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
    110  1.1  christos */
    111  1.1  christos 
    112  1.1  christos 
    113  1.1  christos 
    114  1.1  christos /* you can check the structure offset by running
    115  1.1  christos 
    116  1.1  christos #include <stdlib.h>
    117  1.1  christos #include <stdio.h>
    118  1.1  christos #include "deflate.h"
    119  1.1  christos 
    120  1.1  christos void print_depl()
    121  1.1  christos {
    122  1.1  christos deflate_state ds;
    123  1.1  christos deflate_state *s=&ds;
    124  1.1  christos printf("size pointer=%u\n",(int)sizeof(void*));
    125  1.1  christos 
    126  1.1  christos printf("#define dsWSize         %u\n",(int)(((char*)&(s->w_size))-((char*)s)));
    127  1.1  christos printf("#define dsWMask         %u\n",(int)(((char*)&(s->w_mask))-((char*)s)));
    128  1.1  christos printf("#define dsWindow        %u\n",(int)(((char*)&(s->window))-((char*)s)));
    129  1.1  christos printf("#define dsPrev          %u\n",(int)(((char*)&(s->prev))-((char*)s)));
    130  1.1  christos printf("#define dsMatchLen      %u\n",(int)(((char*)&(s->match_length))-((char*)s)));
    131  1.1  christos printf("#define dsPrevMatch     %u\n",(int)(((char*)&(s->prev_match))-((char*)s)));
    132  1.1  christos printf("#define dsStrStart      %u\n",(int)(((char*)&(s->strstart))-((char*)s)));
    133  1.1  christos printf("#define dsMatchStart    %u\n",(int)(((char*)&(s->match_start))-((char*)s)));
    134  1.1  christos printf("#define dsLookahead     %u\n",(int)(((char*)&(s->lookahead))-((char*)s)));
    135  1.1  christos printf("#define dsPrevLen       %u\n",(int)(((char*)&(s->prev_length))-((char*)s)));
    136  1.1  christos printf("#define dsMaxChainLen   %u\n",(int)(((char*)&(s->max_chain_length))-((char*)s)));
    137  1.1  christos printf("#define dsGoodMatch     %u\n",(int)(((char*)&(s->good_match))-((char*)s)));
    138  1.1  christos printf("#define dsNiceMatch     %u\n",(int)(((char*)&(s->nice_match))-((char*)s)));
    139  1.1  christos }
    140  1.1  christos */
    141  1.1  christos 
    142  1.1  christos #define dsWSize          68 //; byte offset of s->w_size; all offsets are hard-coded, re-check with print_depl
    143  1.1  christos #define dsWMask          76 //; s->w_mask
    144  1.1  christos #define dsWindow         80 //; s->window
    145  1.1  christos #define dsPrev           96 //; s->prev
    146  1.1  christos #define dsMatchLen       144 //; s->match_length (no accessor below; not used in this file)
    147  1.1  christos #define dsPrevMatch      148 //; s->prev_match (no accessor below; not used in this file)
    148  1.1  christos #define dsStrStart       156 //; s->strstart
    149  1.1  christos #define dsMatchStart     160 //; s->match_start
    150  1.1  christos #define dsLookahead      164 //; s->lookahead
    151  1.1  christos #define dsPrevLen        168 //; s->prev_length
    152  1.1  christos #define dsMaxChainLen    172 //; s->max_chain_length
    153  1.1  christos #define dsGoodMatch      188 //; s->good_match
    154  1.1  christos #define dsNiceMatch      192 //; s->nice_match
    155  1.1  christos //; Field accessors: each is rcx-relative, so rcx must hold the deflate_state pointer.
    156  1.1  christos #define window_size      [ rcx + dsWSize]
    157  1.1  christos #define WMask            [ rcx + dsWMask]
    158  1.1  christos #define window_ad        [ rcx + dsWindow] //; loaded as a 64-bit pointer
    159  1.1  christos #define prev_ad          [ rcx + dsPrev] //; loaded as a 64-bit pointer
    160  1.1  christos #define strstart         [ rcx + dsStrStart]
    161  1.1  christos #define match_start      [ rcx + dsMatchStart] //; the only field this file writes
    162  1.1  christos #define Lookahead        [ rcx + dsLookahead] //; 0ffffffffh on infozip
    163  1.1  christos #define prev_length      [ rcx + dsPrevLen]
    164  1.1  christos #define max_chain_length [ rcx + dsMaxChainLen]
    165  1.1  christos #define good_match       [ rcx + dsGoodMatch]
    166  1.1  christos #define nice_match       [ rcx + dsNiceMatch]
    167  1.1  christos 
    168  1.1  christos /*
    169  1.1  christos ; windows:
    170  1.1  christos ; parameter 1 in rcx(deflate state s), param 2 in rdx (cur match)
    171  1.1  christos 
    172  1.1  christos ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
    173  1.1  christos ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
    174  1.1  christos ;
    175  1.1  christos ; All registers must be preserved across the call, except for
    176  1.1  christos ;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
    177  1.1  christos 
    178  1.1  christos ;
    179  1.1  christos ; gcc on macosx-linux:
    180  1.1  christos ; see http://www.x86-64.org/documentation/abi-0.99.pdf
    181  1.1  christos ; param 1 in rdi, param 2 in rsi
    182  1.1  christos ; rbx, rsp, rbp, r12 to r15 must be preserved
    183  1.1  christos 
    184  1.1  christos ;;; Save registers that the compiler may be using. rsp is not adjusted:
    185  1.1  christos ;;; the save slots and locals sit at negative offsets below it.
    186  1.1  christos 
    187  1.1  christos 
    188  1.1  christos ;;; Retrieve the function arguments. r8d will hold cur_match
    189  1.1  christos ;;; throughout the entire function. rcx will hold the pointer to the
    190  1.1  christos ;;; deflate_state structure throughout the function (every field
    191  1.1  christos ;;; accessor macro is rcx-relative).
    192  1.1  christos 
    193  1.1  christos ; ms: parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
    194  1.1  christos ; mac: param 1 in rdi, param 2 rsi
    195  1.1  christos ; this clears the high 32 bits of r8, which can be garbage in both r8 and the incoming argument register
    196  1.1  christos */
    197  1.1  christos         mov [save_rbx],rbx              //; callee-saved registers go to slots below rsp; rsp itself is not moved
    198  1.1  christos         mov [save_rbp],rbp
    199  1.1  christos 
    200  1.1  christos 
    201  1.1  christos         mov rcx,rdi                     //; rcx = s; every field accessor macro is rcx-relative
    202  1.1  christos 
    203  1.1  christos         mov r8d,esi                     //; r8d = cur_match; the 32-bit move zeroes the upper half of r8
    204  1.1  christos 
    205  1.1  christos 
    206  1.1  christos         mov [save_r12],r12
    207  1.1  christos         mov [save_r13],r13
    208  1.1  christos         mov [save_r14],r14              //; r14 and r15 are saved and restored but never written in between
    209  1.1  christos         mov [save_r15],r15
    210  1.1  christos 
    211  1.1  christos 
    212  1.1  christos //;;; uInt wmask = s->w_mask;
    213  1.1  christos //;;; unsigned chain_length = s->max_chain_length;
    214  1.1  christos //;;; if (s->prev_length >= s->good_match) {
    215  1.1  christos //;;;     chain_length >>= 2;
    216  1.1  christos //;;; }
    217  1.1  christos 
    218  1.1  christos 
    219  1.1  christos         mov edi, prev_length            //; edi = s->prev_length
    220  1.1  christos         mov esi, good_match             //; esi = s->good_match
    221  1.1  christos         mov eax, WMask                  //; eax = wmask
    222  1.1  christos         mov ebx, max_chain_length       //; ebx = chain_length
    223  1.1  christos         cmp edi, esi
    224  1.1  christos         jl  LastMatchGood               //; signed compare: skip the shift when prev_length < good_match
    225  1.1  christos         shr ebx, 2                      //; chain_length >>= 2
    226  1.1  christos LastMatchGood:
    227  1.1  christos 
    228  1.1  christos //;;; chainlen is decremented once beforehand so that the function can
    229  1.1  christos //;;; use the sign flag instead of the zero flag for the exit test.
    230  1.1  christos //;;; It is then shifted into the high word, to make room for the wmask
    231  1.1  christos //;;; value, which it will always accompany.
    232  1.1  christos 
    233  1.1  christos         dec ebx
    234  1.1  christos         shl ebx, 16                     //; NOTE(review): packing assumes wmask < 0x10000 and chain_length - 1 < 0x8000 -- confirm
    235  1.1  christos         or  ebx, eax                    //; ebx = ((chain_length - 1) << 16) | wmask
    236  1.1  christos 
    237  1.1  christos //;;; on zlib only
    238  1.1  christos //;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
    239  1.1  christos 
    240  1.1  christos 
    241  1.1  christos 
    242  1.1  christos         mov eax, nice_match
    243  1.1  christos         mov [chainlenwmask], ebx        //; spill the packed value; ebx is reused for scan_end below
    244  1.1  christos         mov r10d, Lookahead
    245  1.1  christos         cmp r10d, eax
    246  1.1  christos         cmovnl r10d, eax                //; r10d = min(lookahead, nice_match); signed here, the C text casts to uInt
    247  1.1  christos         mov [nicematch],r10d
    248  1.1  christos 
    249  1.1  christos 
    250  1.1  christos 
    251  1.1  christos //;;; register Bytef *scan = s->window + s->strstart;
    252  1.1  christos         mov r10, window_ad              //; r10 = s->window; not written again after this point
    253  1.1  christos         mov ebp, strstart               //; ebp = s->strstart (upper half of rbp zeroed)
    254  1.1  christos         lea r13, [r10 + rbp]            //; r13 = scan
    255  1.1  christos 
    256  1.1  christos //;;; Determine how many bytes the scan ptr is off from being
    257  1.1  christos //;;; dword-aligned.
    258  1.1  christos 
    259  1.1  christos          mov r9,r13                     //; r9 = scan, kept for the rest of the function
    260  1.1  christos          neg r13
    261  1.1  christos          and r13,3                      //; r13 = (-scan) & 3 = bytes up to the next 4-byte boundary
    262  1.1  christos 
    263  1.1  christos //;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
    264  1.1  christos //;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
    265  1.1  christos 
    266  1.1  christos 
    267  1.1  christos         mov eax, window_size
    268  1.1  christos         sub eax, MIN_LOOKAHEAD          //; eax = w_size - MIN_LOOKAHEAD, standing in for MAX_DIST(s) of the C text
    269  1.1  christos 
    270  1.1  christos 
    271  1.1  christos         xor edi,edi                     //; edi = 0 (NIL) for the cmovng below
    272  1.1  christos         sub ebp, eax                    //; ebp = strstart - eax; sets the flags that cmovng consumes
    273  1.1  christos 
    274  1.1  christos         mov r11d, prev_length           //; r11d = best_len; mov leaves the flags untouched
    275  1.1  christos 
    276  1.1  christos         cmovng ebp,edi                  //; ebp = limit: clamp to 0 when the difference is <= 0 (signed)
    277  1.1  christos 
    278  1.1  christos //;;; int best_len = s->prev_length;
    279  1.1  christos 
    280  1.1  christos 
    281  1.1  christos //;;; Keep the sum s->window + best_len in rsi.
    282  1.1  christos 
    283  1.1  christos        lea  rsi,[r10+r11]
    284  1.1  christos 
    285  1.1  christos //;;; register ush scan_start = *(ushf*)scan;
    286  1.1  christos //;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
    287  1.1  christos //;;; Posf *prev = s->prev;
    288  1.1  christos 
    289  1.1  christos         movzx r12d,word ptr [r9]                //; r12w = scan_start
    290  1.1  christos         movzx ebx, word ptr [r9 + r11 - 1]      //; bx = scan_end
    291  1.1  christos 
    292  1.1  christos         mov rdi, prev_ad                //; rdi = s->prev
    293  1.1  christos 
    294  1.1  christos //;;; Jump into the main loop.
    295  1.1  christos 
    296  1.1  christos         mov edx, [chainlenwmask]        //; edx = packed chainlen/wmask
    297  1.1  christos 
    298  1.1  christos         cmp bx,word ptr [rsi + r8 - 1]  //; scan_end against the word at window + cur_match + best_len - 1
    299  1.1  christos         jz  LookupLoopIsZero
    300  1.1  christos 
    301  1.1  christos 
    302  1.1  christos 
    303  1.1  christos LookupLoop1:
    304  1.1  christos         and r8d, edx                    //; index into prev; NOTE(review): equals cur_match & wmask only while cur_match < 0x10000 -- confirm for the first pass
    305  1.1  christos 
    306  1.1  christos         movzx   r8d, word ptr [rdi + r8*2]      //; cur_match = prev[index] (16-bit entry, zero-extended)
    307  1.1  christos         cmp r8d, ebp
    308  1.1  christos         jbe LeaveNow                    //; unsigned: stop once cur_match <= limit
    309  1.1  christos 
    310  1.1  christos 
    311  1.1  christos 
    312  1.1  christos         sub edx, 0x00010000             //; decrement chainlen in the high word; sign set means the chain is exhausted
    313  1.1  christos 		BEFORE_JMP
    314  1.1  christos         js  LeaveNow
    315  1.1  christos 		AFTER_JMP
    316  1.1  christos //; LoopEntry1/2/4 and LoopEntry are not the target of any jump in this file.
    317  1.1  christos LoopEntry1:
    318  1.1  christos         cmp bx,word ptr [rsi + r8 - 1]  //; does the word at window + cur_match + best_len - 1 equal scan_end?
    319  1.1  christos 		BEFORE_JMP
    320  1.1  christos         jz  LookupLoopIsZero
    321  1.1  christos 		AFTER_JMP
    322  1.1  christos //; LookupLoop2 and LookupLoop4 repeat the LookupLoop1 step inline and are reached by fall-through.
    323  1.1  christos LookupLoop2:
    324  1.1  christos         and r8d, edx
    325  1.1  christos 
    326  1.1  christos         movzx   r8d, word ptr [rdi + r8*2]
    327  1.1  christos         cmp r8d, ebp
    328  1.1  christos 		BEFORE_JMP
    329  1.1  christos         jbe LeaveNow
    330  1.1  christos 		AFTER_JMP
    331  1.1  christos         sub edx, 0x00010000
    332  1.1  christos 		BEFORE_JMP
    333  1.1  christos         js  LeaveNow
    334  1.1  christos 		AFTER_JMP
    335  1.1  christos 
    336  1.1  christos LoopEntry2:
    337  1.1  christos         cmp bx,word ptr [rsi + r8 - 1]
    338  1.1  christos 		BEFORE_JMP
    339  1.1  christos         jz  LookupLoopIsZero
    340  1.1  christos 		AFTER_JMP
    341  1.1  christos 
    342  1.1  christos LookupLoop4:
    343  1.1  christos         and r8d, edx
    344  1.1  christos 
    345  1.1  christos         movzx   r8d, word ptr [rdi + r8*2]
    346  1.1  christos         cmp r8d, ebp
    347  1.1  christos 		BEFORE_JMP
    348  1.1  christos         jbe LeaveNow
    349  1.1  christos 		AFTER_JMP
    350  1.1  christos         sub edx, 0x00010000
    351  1.1  christos 		BEFORE_JMP
    352  1.1  christos         js  LeaveNow
    353  1.1  christos 		AFTER_JMP
    354  1.1  christos 
    355  1.1  christos LoopEntry4:
    356  1.1  christos 
    357  1.1  christos         cmp bx,word ptr [rsi + r8 - 1]
    358  1.1  christos 		BEFORE_JMP
    359  1.1  christos         jnz LookupLoop1
    360  1.1  christos         jmp LookupLoopIsZero
    361  1.1  christos 		AFTER_JMP
    362  1.1  christos /*
    363  1.1  christos ;;; do {
    364  1.1  christos ;;;     match = s->window + cur_match;
    365  1.1  christos ;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
    366  1.1  christos ;;;         *(ushf*)match != scan_start) continue;
    367  1.1  christos ;;;     [...]
    368  1.1  christos ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
    369  1.1  christos ;;;          && --chain_length != 0);
    370  1.1  christos ;;;
    371  1.1  christos ;;; Here is the inner loop of the function. The function will spend the
    372  1.1  christos ;;; majority of its time in this loop, and the majority of that time will
    373  1.1  christos ;;; be spent in the first ten instructions.
    374  1.1  christos ;;;
    375  1.1  christos ;;; Within this loop:
    376  1.1  christos ;;; ebx = scanend (compared as bx)
    377  1.1  christos ;;; r8d = curmatch
    378  1.1  christos ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
    379  1.1  christos ;;; rsi = windowbestlen - i.e., (window + bestlen)
    380  1.1  christos ;;; rdi = prev
    381  1.1  christos ;;; ebp = limit
    382  1.1  christos */
    383  1.1  christos .balign 16
    384  1.1  christos LookupLoop:
    385  1.1  christos         and r8d, edx                    //; same step as LookupLoop1; this aligned copy is the one entered by jmp LookupLoop
    386  1.1  christos 
    387  1.1  christos         movzx   r8d, word ptr [rdi + r8*2]
    388  1.1  christos         cmp r8d, ebp
    389  1.1  christos 		BEFORE_JMP
    390  1.1  christos         jbe LeaveNow
    391  1.1  christos 		AFTER_JMP
    392  1.1  christos         sub edx, 0x00010000
    393  1.1  christos 		BEFORE_JMP
    394  1.1  christos         js  LeaveNow
    395  1.1  christos 		AFTER_JMP
    396  1.1  christos 
    397  1.1  christos LoopEntry:
    398  1.1  christos 
    399  1.1  christos         cmp bx,word ptr [rsi + r8 - 1]
    400  1.1  christos 		BEFORE_JMP
    401  1.1  christos         jnz LookupLoop1
    402  1.1  christos 		AFTER_JMP
    403  1.1  christos LookupLoopIsZero:
    404  1.1  christos         cmp     r12w, word ptr [r10 + r8]       //; scan_end matched; now check scan_start against the word at window + cur_match
    405  1.1  christos 		BEFORE_JMP
    406  1.1  christos         jnz LookupLoop1
    407  1.1  christos 		AFTER_JMP
    408  1.1  christos 
    409  1.1  christos 
    410  1.1  christos //;;; Store the current value of chainlen (edx is reused as the compare offset below).
    411  1.1  christos         mov [chainlenwmask], edx
    412  1.1  christos /*
    413  1.1  christos ;;; Point rdi to the string under scrutiny, and rsi to the string we
    414  1.1  christos ;;; are hoping to match it up with. In actuality, rsi and rdi are
    415  1.1  christos ;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
    416  1.1  christos ;;; initialized to -(MAX_MATCH_8); scanalign is r13 and MAX_MATCH_8 is 0x108.
    417  1.1  christos */
    418  1.1  christos         lea rsi,[r8+r10]                //; rsi = match = window + cur_match
    419  1.1  christos         mov rdx, 0xfffffffffffffef8 //; -(MAX_MATCH_8)
    420  1.1  christos         lea rsi, [rsi + r13 + 0x0108] //;MAX_MATCH_8]
    421  1.1  christos         lea rdi, [r9 + r13 + 0x0108] //;MAX_MATCH_8]
    422  1.1  christos 
    423  1.1  christos         prefetcht1 [rsi+rdx]            //; rsi+rdx = match + r13 and rdi+rdx = scan + r13: where LoopCmps starts reading
    424  1.1  christos         prefetcht1 [rdi+rdx]
    425  1.1  christos 
    426  1.1  christos /*
    427  1.1  christos ;;; Test the strings for equality, 8 bytes at a time. At the end,
    428  1.1  christos ;;; adjust rdx so that it is offset to the exact byte that mismatched.
    429  1.1  christos ;;;
    430  1.1  christos ;;; We already know at this point that the first three bytes of the
    431  1.1  christos ;;; strings match each other, and they can be safely passed over before
    432  1.1  christos ;;; starting the compare loop. So what this code does is skip over 0-3
    433  1.1  christos ;;; bytes, as much as necessary in order to dword-align the rdi
    434  1.1  christos ;;; pointer. (rsi will still be misaligned three times out of four.)
    435  1.1  christos ;;;
    436  1.1  christos ;;; It should be confessed that this loop usually does not represent
    437  1.1  christos ;;; much of the total running time. Replacing it with a more
    438  1.1  christos ;;; straightforward "rep cmpsb" would not drastically degrade
    439  1.1  christos ;;; performance.
    440  1.1  christos */
    441  1.1  christos //; NOTE(review): whole qwords are read, up to 0x108 bytes from each start -- confirm the buffers have that much slack.
    442  1.1  christos LoopCmps:
    443  1.1  christos         mov rax, [rsi + rdx]            //; 8 bytes of the candidate match ...
    444  1.1  christos         xor rax, [rdi + rdx]            //; ... xor 8 bytes of scan: rax != 0 means a mismatch inside this qword
    445  1.1  christos         jnz LeaveLoopCmps
    446  1.1  christos 
    447  1.1  christos         mov rax, [rsi + rdx + 8]
    448  1.1  christos         xor rax, [rdi + rdx + 8]
    449  1.1  christos         jnz LeaveLoopCmps8
    450  1.1  christos 
    451  1.1  christos 
    452  1.1  christos         mov rax, [rsi + rdx + 8+8]
    453  1.1  christos         xor rax, [rdi + rdx + 8+8]
    454  1.1  christos         jnz LeaveLoopCmps16
    455  1.1  christos 
    456  1.1  christos         add rdx,8+8+8                   //; advance 24 bytes; rdx reaches 0 after 11 rounds (0x108 bytes all equal)
    457  1.1  christos 
    458  1.1  christos 		BEFORE_JMP
    459  1.1  christos         jnz  LoopCmps
    460  1.1  christos         jmp  LenMaximum
    461  1.1  christos 		AFTER_JMP
    462  1.1  christos //; The two adds below fall through so that rdx ends up at the start of the mismatching qword.
    463  1.1  christos LeaveLoopCmps16: add rdx,8
    464  1.1  christos LeaveLoopCmps8: add rdx,8
    465  1.1  christos LeaveLoopCmps:
    466  1.1  christos //; rax holds the xor difference; the lowest nonzero byte marks the first mismatching byte.
    467  1.1  christos         test    eax, 0x0000FFFF         //; difference within bytes 0-1?
    468  1.1  christos         jnz LenLower
    469  1.1  christos 
    470  1.1  christos         test eax,0xffffffff             //; difference within bytes 2-3?
    471  1.1  christos 
    472  1.1  christos         jnz LenLower32
    473  1.1  christos 
    474  1.1  christos         add rdx,4                       //; low dword is equal: skip it and bring the high dword down
    475  1.1  christos         shr rax,32
    476  1.1  christos         or ax,ax                        //; difference within bytes 4-5?
    477  1.1  christos 		BEFORE_JMP
    478  1.1  christos         jnz LenLower
    479  1.1  christos 		AFTER_JMP
    480  1.1  christos 
    481  1.1  christos LenLower32:
    482  1.1  christos         shr eax,16                      //; low word of eax is equal: move the upper word into ax
    483  1.1  christos         add rdx,2
    484  1.1  christos 
    485  1.1  christos LenLower:
    486  1.1  christos         sub al, 1                       //; carry is set only when al == 0, i.e. the first byte of the pair matched
    487  1.1  christos         adc rdx, 0                      //; in that case the mismatch is the second byte, so count one more
    488  1.1  christos //;;; Calculate the length of the match. If it is MAX_MATCH or longer,
    489  1.1  christos //;;; then automatically accept it as the best possible match and leave.
    490  1.1  christos 
    491  1.1  christos         lea rax, [rdi + rdx]            //; address of the first mismatching byte on the scan side
    492  1.1  christos         sub rax, r9                     //; eax = len = that address - scan
    493  1.1  christos         cmp eax, MAX_MATCH
    494  1.1  christos 		BEFORE_JMP
    495  1.1  christos         jge LenMaximum
    496  1.1  christos 		AFTER_JMP
    497  1.1  christos /*
    498  1.1  christos ;;; If the length of the match is not longer than the best match we
    499  1.1  christos ;;; have so far, then forget it and return to the lookup loop.
    500  1.1  christos ;///////////////////////////////////
    501  1.1  christos */
    502  1.1  christos         cmp eax, r11d
    503  1.1  christos         jg  LongerMatch
    504  1.1  christos 
    505  1.1  christos         lea rsi,[r10+r11]               //; not longer: rebuild rsi, rdi and edx, which the compare overwrote
    506  1.1  christos 
    507  1.1  christos         mov rdi, prev_ad
    508  1.1  christos         mov edx, [chainlenwmask]
    509  1.1  christos 		BEFORE_JMP
    510  1.1  christos         jmp LookupLoop
    511  1.1  christos 		AFTER_JMP
    512  1.1  christos /*
    513  1.1  christos ;;;         s->match_start = cur_match;
    514  1.1  christos ;;;         best_len = len;
    515  1.1  christos ;;;         if (len >= nice_match) break;
    516  1.1  christos ;;;         scan_end = *(ushf*)(scan+best_len-1);
    517  1.1  christos */
    518  1.1  christos LongerMatch:
    519  1.1  christos         mov r11d, eax                   //; best_len = len
    520  1.1  christos         mov match_start, r8d            //; s->match_start = cur_match
    521  1.1  christos         cmp eax, [nicematch]            //; the stored value is min(nice_match, lookahead)
    522  1.1  christos 		BEFORE_JMP
    523  1.1  christos         jge LeaveNow
    524  1.1  christos 		AFTER_JMP
    525  1.1  christos 
    526  1.1  christos         lea rsi,[r10+rax]               //; rsi = window + new best_len
    527  1.1  christos 
    528  1.1  christos         movzx   ebx, word ptr [r9 + rax - 1]    //; refresh scan_end for the new best_len
    529  1.1  christos         mov rdi, prev_ad
    530  1.1  christos         mov edx, [chainlenwmask]
    531  1.1  christos 		BEFORE_JMP
    532  1.1  christos         jmp LookupLoop
    533  1.1  christos 		AFTER_JMP
    534  1.1  christos 
    535  1.1  christos //;;; Accept the current string, with the maximum possible length.
    536  1.1  christos 
    537  1.1  christos LenMaximum:
    538  1.1  christos         mov r11d,MAX_MATCH              //; best_len = MAX_MATCH
    539  1.1  christos         mov match_start, r8d            //; s->match_start = cur_match
    540  1.1  christos 
    541  1.1  christos //;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
    542  1.1  christos //;;; return s->lookahead;
    543  1.1  christos 
    544  1.1  christos LeaveNow:
    545  1.1  christos         mov eax, Lookahead
    546  1.1  christos         cmp r11d, eax
    547  1.1  christos         cmovng eax, r11d                //; eax = best_len when best_len <= lookahead; signed here, the C text casts to uInt
    548  1.1  christos 
    549  1.1  christos 
    550  1.1  christos 
    551  1.1  christos //;;; Restore the saved registers (rsp was never changed) and return from whence we came.
    552  1.1  christos 
    553  1.1  christos //; rsi and rdi are not restored; the two loads below are commented out and their slots are never written.
    554  1.1  christos //        mov rsi,[save_rsi]
    555  1.1  christos //        mov rdi,[save_rdi]
    556  1.1  christos         mov rbx,[save_rbx]
    557  1.1  christos         mov rbp,[save_rbp]
    558  1.1  christos         mov r12,[save_r12]
    559  1.1  christos         mov r13,[save_r13]
    560  1.1  christos         mov r14,[save_r14]
    561  1.1  christos         mov r15,[save_r15]
    562  1.1  christos 
    563  1.1  christos 
    564  1.1  christos         ret 0
    565  1.1  christos //; please don't remove this string !
    566  1.1  christos //; You can freely use gvmat64 in any free or commercial app
    567  1.1  christos //; but it is far better not to remove the string from the binary!
    568  1.1  christos  //   db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
    569  1.1  christos 
    570  1.1  christos //; match_init: exported entry point that does nothing but return.
    571  1.1  christos match_init:
    572  1.1  christos   ret 0
    573  1.1  christos 
    574  1.1  christos 
    575