; inffasx64.asm -- hand-tuned assembler version of inffast.c (fast inflate
; decoding) for AMD64 on Windows, for use with the Microsoft C compiler.
;
; This file was automatically converted from the AMD64 portion of inffas86.c.
; It is called by inffas8664.c, which contains more information.
;
; Syntax: Microsoft Macro Assembler (x64), Intel operand order.
; ABI:    Microsoft x64 calling convention.
;
; To assemble this file:
;
;   ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
;
; ml64.exe ships with Visual Studio 2005/2008/2010 and with the Windows WDK
; (the WDK, including ml64 for AMD64, is available from
;  http://www.microsoft.com/whdc/Devtools/wdk/default.mspx at low cost).

; ----------------------------------------------------------------------
; Byte offsets into the argument record ("ar") whose address is passed in
; rcx.  The field names follow the register comments used by the decoder.
; ----------------------------------------------------------------------
AR_SAVED_RSP    EQU 0       ; qword: receives rsp as it was on entry
AR_SAVED_RBP    EQU 8       ; qword: receives rbp as it was on entry
AR_IN           EQU 16      ; qword: input pointer          (read and updated)
AR_LAST         EQU 24      ; qword: input limit for the main loop
AR_OUT          EQU 32      ; qword: output pointer         (read and updated)
AR_BEG          EQU 40      ; qword: start of the output produced so far
AR_END          EQU 48      ; qword: output limit for the main loop
AR_WINDOW       EQU 56      ; qword: sliding window base
AR_LCODE        EQU 64      ; qword: literal/length code table
AR_DCODE        EQU 72      ; qword: distance code table
AR_HOLD         EQU 80      ; qword: bit accumulator        (read and updated)
AR_BITS         EQU 88      ; dword: number of valid bits in hold (read and updated)
AR_WSIZE        EQU 92      ; dword: window size
AR_WRITE        EQU 96      ; dword: window write index
AR_LMASK        EQU 100     ; dword: index mask for lcode
AR_DMASK        EQU 104     ; dword: index mask for dcode
AR_STATUS       EQU 116     ; dword: exit status written by this routine

; Exit status values stored at AR_STATUS.
STATUS_LOOP_DONE        EQU 0   ; in/out limit reached, no error
STATUS_END_OF_BLOCK     EQU 1   ; length code with (op & 32) set
STATUS_BAD_LENGTH_CODE  EQU 2   ; invalid literal/length code
STATUS_BAD_DIST_CODE    EQU 3   ; invalid distance code
STATUS_DIST_TOO_FAR     EQU 4   ; distance larger than the window size

; Stack locals, addressed relative to rsp after the prologue.
LOC_AR          EQU 0       ; qword: pointer to the argument record
LOC_BEG         EQU 8       ; qword: copy of ar.beg
LOC_WINDOW      EQU 16      ; qword: copy of ar.window
LOC_WSIZE       EQU 24      ; dword: copy of ar.wsize
LOC_WRITE       EQU 28      ; dword: copy of ar.write
LOCALS_SIZE     EQU 40      ; 32 bytes used + 8 so rsp is 16-byte aligned
                            ; after the eight register pushes

.code

;-----------------------------------------------------------------------
; inffas8664fnc
;
; In:    rcx = pointer to the argument record (layout: AR_* above)
; Out:   nothing in registers; ar.in, ar.out, ar.hold, ar.bits and
;        ar.status are updated, and the entry rsp/rbp are stored at
;        AR_SAVED_RSP / AR_SAVED_RBP.
; Clobb: rax, rcx, rdx, r8, r9, r10, r11, flags (all volatile in this ABI)
;
; See http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
;
; The Microsoft x64 ABI has no red zone: memory below rsp may be
; overwritten at any time.  The non-volatile registers are therefore saved
; with ordinary pushes, rsp always remains a valid stack pointer, and the
; record fields needed during decoding are copied into stack locals.
; PROC FRAME plus the unwind directives below describe the prologue.
;
; Register roles inside the decoder:
;   rsi  = in            rdi  = out
;   r9   = last          r10  = end
;   rbp  = lcode         r11  = dcode
;   rdx  = hold          bl   = bits
;   r12  = lmask         r13  = dmask
;   r14d = len           r15d = dist
;   r8   = table index / saved "in" while rsi is used as copy source
;   rax, rcx = scratch
;-----------------------------------------------------------------------
inffas8664fnc PROC FRAME

        ; Stored before anything is pushed so the values are those the
        ; caller had on entry, exactly as this routine has always recorded.
        mov     [rcx+AR_SAVED_RBP], rbp
        mov     [rcx+AR_SAVED_RSP], rsp

        push    rbp
        .pushreg rbp
        push    rbx
        .pushreg rbx
        push    rsi
        .pushreg rsi
        push    rdi
        .pushreg rdi
        push    r12
        .pushreg r12
        push    r13
        .pushreg r13
        push    r14
        .pushreg r14
        push    r15
        .pushreg r15
        sub     rsp, LOCALS_SIZE
        .allocstack LOCALS_SIZE
        .endprolog

        ; Copy the fields that do not get a register of their own into
        ; locals.  They are only read, never written, during decoding.
        mov     [rsp+LOC_AR], rcx
        mov     rax, [rcx+AR_BEG]
        mov     [rsp+LOC_BEG], rax
        mov     rax, [rcx+AR_WINDOW]
        mov     [rsp+LOC_WINDOW], rax
        mov     eax, [rcx+AR_WSIZE]
        mov     [rsp+LOC_WSIZE], eax
        mov     eax, [rcx+AR_WRITE]
        mov     [rsp+LOC_WRITE], eax

        mov     rsi, [rcx+AR_IN]        ; rsi  = in
        mov     rdi, [rcx+AR_OUT]       ; rdi  = out
        mov     r9, [rcx+AR_LAST]       ; r9   = last
        mov     r10, [rcx+AR_END]       ; r10  = end
        mov     rbp, [rcx+AR_LCODE]     ; rbp  = lcode
        mov     r11, [rcx+AR_DCODE]     ; r11  = dcode
        mov     rdx, [rcx+AR_HOLD]      ; rdx  = hold
        mov     ebx, [rcx+AR_BITS]      ; ebx  = bits
        mov     r12d, [rcx+AR_LMASK]    ; r12d = lmask (zero-extended into r12)
        mov     r13d, [rcx+AR_DMASK]    ; r13d = dmask (zero-extended into r13)
                                        ; r14d = len
                                        ; r15d = dist

        cld                             ; lods/stos/movs must move forward
        cmp     r10, rdi
        je      L_one_time              ; if only one decode left
        cmp     r9, rsi
        jne     L_do_loop

L_one_time:
        mov     r8, r12                 ; r8 = lmask
        cmp     bl, 32
        ja      L_get_length_code_one_time

        lodsd                           ; eax = *(uint *)in++
        mov     cl, bl                  ; cl = bits, needed for the shift
        add     bl, 32                  ; bits += 32
        shl     rax, cl
        or      rdx, rax                ; hold |= *((uint *)in)++ << bits
        jmp     L_get_length_code_one_time

ALIGN 4
L_while_test:
        cmp     r10, rdi
        jbe     L_break_loop            ; stop when end <= out
        cmp     r9, rsi
        jbe     L_break_loop            ; stop when last <= in

L_do_loop:
        mov     r8, r12                 ; r8 = lmask
        cmp     bl, 32
        ja      L_get_length_code       ; if (32 < bits)

        lodsd                           ; eax = *(uint *)in++
        mov     cl, bl                  ; cl = bits, needed for the shift
        add     bl, 32                  ; bits += 32
        shl     rax, cl
        or      rdx, rax                ; hold |= *((uint *)in)++ << bits

L_get_length_code:
        and     r8, rdx                 ; r8 &= hold
        mov     eax, [rbp+r8*4]         ; eax = lcode[hold & lmask]

        mov     cl, ah                  ; cl = this.bits
        sub     bl, ah                  ; bits -= this.bits
        shr     rdx, cl                 ; hold >>= this.bits

        test    al, al
        jnz     L_test_for_length_base  ; if (op != 0) 45.7%

        mov     r8, r12                 ; r8 = lmask
        shr     eax, 16                 ; output this.val char
        stosb

        ; A literal was just emitted; decode a second symbol straight
        ; away without refilling hold.
L_get_length_code_one_time:
        and     r8, rdx                 ; r8 &= hold
        mov     eax, [rbp+r8*4]         ; eax = lcode[hold & lmask]

L_dolen:
        mov     cl, ah                  ; cl = this.bits
        sub     bl, ah                  ; bits -= this.bits
        shr     rdx, cl                 ; hold >>= this.bits

        test    al, al
        jnz     L_test_for_length_base  ; if (op != 0) 45.7%

        shr     eax, 16                 ; output this.val char
        stosb
        jmp     L_while_test

ALIGN 4
L_test_for_length_base:
        mov     r14d, eax               ; len = this
        shr     r14d, 16                ; len = this.val
        mov     cl, al

        test    al, 16
        jz      L_test_for_second_level_length ; if ((op & 16) == 0) 8%
        and     cl, 15                  ; op &= 15
        jz      L_decode_distance       ; if (!op): no extra length bits

L_add_bits_to_len:
        sub     bl, cl
        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                     ; eax = (1 << op) - 1
        and     eax, edx                ; eax &= hold
        shr     rdx, cl
        add     r14d, eax               ; len += hold & mask[op]

L_decode_distance:
        mov     r8, r13                 ; r8 = dmask
        cmp     bl, 32
        ja      L_get_distance_code     ; if (32 < bits)

        lodsd                           ; eax = *(uint *)in++
        mov     cl, bl                  ; cl = bits, needed for the shift
        add     bl, 32                  ; bits += 32
        shl     rax, cl
        or      rdx, rax                ; hold |= *((uint *)in)++ << bits

L_get_distance_code:
        and     r8, rdx                 ; r8 &= hold
        mov     eax, [r11+r8*4]         ; eax = dcode[hold & dmask]

L_dodist:
        mov     r15d, eax               ; dist = this
        shr     r15d, 16                ; dist = this.val
        mov     cl, ah
        sub     bl, ah                  ; bits -= this.bits
        shr     rdx, cl                 ; hold >>= this.bits
        mov     cl, al                  ; cl = this.op

        test    al, 16                  ; if ((op & 16) == 0)
        jz      L_test_for_second_level_dist
        and     cl, 15                  ; op &= 15
        jz      L_check_dist_one        ; no extra distance bits

L_add_bits_to_dist:
        sub     bl, cl
        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                     ; (1 << op) - 1
        and     eax, edx                ; eax &= hold
        shr     rdx, cl
        add     r15d, eax               ; dist += hold & ((1 << op) - 1)

L_check_window:
        mov     r8, rsi                 ; save in so "from" can use its register
        mov     rax, rdi
        sub     rax, [rsp+LOC_BEG]      ; nbytes = out - beg

        cmp     eax, r15d
        jb      L_clip_window           ; if (dist > nbytes) 4.2%

        mov     ecx, r14d               ; ecx = len
        mov     rsi, rdi
        sub     rsi, r15                ; from = out - dist

        sar     ecx, 1                  ; ecx = len / 2, CF = len & 1
        jnc     L_copy_two              ; if len % 2 == 0

        rep     movsw                   ; copy len/2 words ...
        mov     al, [rsi]               ; ... then the odd trailing byte
        mov     [rdi], al
        inc     rdi

        mov     rsi, r8                 ; move in back to rsi, toss from
        jmp     L_while_test

L_copy_two:
        rep     movsw
        mov     rsi, r8                 ; move in back to rsi, toss from
        jmp     L_while_test

ALIGN 4
L_check_dist_one:
        cmp     r15d, 1                 ; if dist 1, is a memset
        jne     L_check_window
        cmp     [rsp+LOC_BEG], rdi      ; if out == beg, outside window
        je      L_check_window

        mov     ecx, r14d               ; ecx = len
        mov     al, [rdi-1]
        mov     ah, al                  ; ax = previous byte, twice

        sar     ecx, 1                  ; ecx = len / 2, CF = len & 1
        jnc     L_set_two
        mov     [rdi], al               ; odd length: store one byte first
        inc     rdi

L_set_two:
        rep     stosw
        jmp     L_while_test

ALIGN 4
L_test_for_second_level_length:
        test    al, 64
        jnz     L_test_for_end_of_block ; if ((op & 64) != 0)

        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                     ; eax = (1 << op) - 1
        and     eax, edx                ; eax &= hold
        add     eax, r14d               ; eax += len
        mov     eax, [rbp+rax*4]        ; eax = lcode[val+(hold&mask[op])]
        jmp     L_dolen

ALIGN 4
L_test_for_second_level_dist:
        test    al, 64
        jnz     L_invalid_distance_code ; if ((op & 64) != 0)

        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                     ; eax = (1 << op) - 1
        and     eax, edx                ; eax &= hold
        add     eax, r15d               ; eax += dist
        mov     eax, [r11+rax*4]        ; eax = dcode[val+(hold&mask[op])]
        jmp     L_dodist

ALIGN 4
L_clip_window:
        mov     ecx, eax                ; ecx = nbytes
        mov     eax, [rsp+LOC_WSIZE]    ; eax = wsize, prepare for dist cmp
        neg     ecx                     ; nbytes = -nbytes

        cmp     eax, r15d
        jb      L_invalid_distance_too_far ; if (dist > wsize)

        add     ecx, r15d               ; nbytes = dist - nbytes
        cmp     dword ptr [rsp+LOC_WRITE], 0
        jne     L_wrap_around_window    ; if (write != 0)

        mov     rsi, [rsp+LOC_WINDOW]   ; from  = window
        sub     eax, ecx                ; eax  -= nbytes
        add     rsi, rax                ; from += wsize - nbytes

        mov     eax, r14d               ; eax = len
        cmp     r14d, ecx
        jbe     L_do_copy               ; if (nbytes >= len)

        sub     eax, ecx                ; eax -= nbytes
        rep     movsb
        mov     rsi, rdi
        sub     rsi, r15                ; from = &out[ -dist ]
        jmp     L_do_copy

ALIGN 4
L_wrap_around_window:
        mov     eax, [rsp+LOC_WRITE]    ; eax = write
        cmp     ecx, eax
        jbe     L_contiguous_in_window  ; if (write >= nbytes)

        mov     esi, [rsp+LOC_WSIZE]    ; from  = wsize
        add     rsi, [rsp+LOC_WINDOW]   ; from += window
        add     rsi, rax                ; from += write
        sub     rsi, rcx                ; from -= nbytes
        sub     ecx, eax                ; nbytes -= write

        mov     eax, r14d               ; eax = len
        cmp     eax, ecx
        jbe     L_do_copy               ; if (nbytes >= len)

        sub     eax, ecx                ; len -= nbytes
        rep     movsb
        mov     rsi, [rsp+LOC_WINDOW]   ; from = window
        mov     ecx, [rsp+LOC_WRITE]    ; nbytes = write
        cmp     eax, ecx
        jbe     L_do_copy               ; if (nbytes >= len)

        sub     eax, ecx                ; len -= nbytes
        rep     movsb
        mov     rsi, rdi
        sub     rsi, r15                ; from = out - dist
        jmp     L_do_copy

ALIGN 4
L_contiguous_in_window:
        mov     rsi, [rsp+LOC_WINDOW]   ; rsi = window
        add     rsi, rax
        sub     rsi, rcx                ; from += write - nbytes

        mov     eax, r14d               ; eax = len
        cmp     eax, ecx
        jbe     L_do_copy               ; if (nbytes >= len)

        sub     eax, ecx                ; len -= nbytes
        rep     movsb
        mov     rsi, rdi
        sub     rsi, r15                ; from = out - dist
        jmp     L_do_copy               ; skip the alignment padding below

ALIGN 4
L_do_copy:
        mov     ecx, eax                ; ecx = remaining len
        rep     movsb

        mov     rsi, r8                 ; move in back to rsi, toss from
        jmp     L_while_test

        ; Exit paths: eax carries the status to L_break_loop_with_status.
L_test_for_end_of_block:
        test    al, 32
        jz      L_invalid_literal_length_code
        mov     eax, STATUS_END_OF_BLOCK
        jmp     L_break_loop_with_status

L_invalid_literal_length_code:
        mov     eax, STATUS_BAD_LENGTH_CODE
        jmp     L_break_loop_with_status

L_invalid_distance_code:
        mov     eax, STATUS_BAD_DIST_CODE
        jmp     L_break_loop_with_status

L_invalid_distance_too_far:
        mov     eax, STATUS_DIST_TOO_FAR
        jmp     L_break_loop_with_status

L_break_loop:
        xor     eax, eax                ; STATUS_LOOP_DONE

L_break_loop_with_status:
        ; Put status, in, out, bits, and hold back into ar.
        mov     rcx, [rsp+LOC_AR]
        mov     [rcx+AR_STATUS], eax    ; status
        mov     [rcx+AR_IN], rsi        ; in
        mov     [rcx+AR_OUT], rdi       ; out
        mov     [rcx+AR_BITS], ebx      ; bits
        mov     [rcx+AR_HOLD], rdx      ; hold

        ; Epilogue: release the locals and restore the non-volatile
        ; registers in the reverse order of the prologue pushes.
        add     rsp, LOCALS_SIZE
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rdi
        pop     rsi
        pop     rbx
        pop     rbp
        ret

; Clobber list of the GCC inline-asm block this routine was converted from:
;          :
;          : "m" (ar)
;          : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
;            "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
;    );

inffas8664fnc   ENDP
;_TEXT  ENDS
END
    397