inffasx64.asm revision 1.1 1 1.1 christos ; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
2 1.1 christos ; version for AMD64 on Windows using Microsoft C compiler
3 1.1 christos ;
4 1.1 christos ; inffasx64.asm is automatically converted from the AMD64 portion of inffas86.c
5 1.1 christos ; inffasx64.asm is called by inffas8664.c, which contains more info.
6 1.1 christos
7 1.1 christos
8 1.1 christos ; to compile this file, I use the options
9 1.1 christos ; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
10 1.1 christos ; with Microsoft Macro Assembler (x64) for AMD64
11 1.1 christos ;
12 1.1 christos
13 1.1 christos ; This file compiles with Microsoft Macro Assembler (x64) for AMD64
14 1.1 christos ;
15 1.1 christos ; ml64.exe ships with Visual Studio 2005/2008/2010 and the Windows WDK
16 1.1 christos ;
17 1.1 christos ; (you can get Windows WDK with ml64 for AMD64 from
18 1.1 christos ; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
19 1.1 christos ;
20 1.1 christos
21 1.1 christos
22 1.1 christos .code
;-----------------------------------------------------------------------
; inffas8664fnc - fast-decoding inner loop (assembler version of
; inffast.c, per the file header).
;
; In:   rcx = address of the `ar` work area. It is copied to rax and then
;             installed as rsp for the duration of the loop.
; Out:  no register result; in, out, bits, hold and a status code are
;       written back into `ar` before returning.
;
; `ar` fields as accessed below (byte offset from &ar; names are the ones
; used by the inline comments):
;     +0  saved rsp (qword)      +8  saved rbp (qword)
;    +16  in        (qword)     +24  last      (qword)
;    +32  out       (qword)     +40  beg       (qword)
;    +48  end       (qword)     +56  window    (qword)
;    +64  lcode     (qword)     +72  dcode     (qword)
;    +80  hold      (qword)     +88  bits      (dword)
;    +92  wsize     (dword)     +96  write     (dword)
;   +100  lmask     (dword)    +104  dmask     (dword)
;   +116  status    (dword), set on exit:
;           0 = loop left through L_while_test (in/out limit reached)
;           1 = L_test_for_end_of_block with op bit 32 set
;           2 = invalid literal/length code
;           3 = invalid distance code
;           4 = invalid distance too far (dist > wsize)
;
; Table entries (lcode/dcode) are read as one dword:
;   al = op, ah = bits, bits 16..31 = val.
;
; Register roles inside the loop:
;   rsi = in     rdi = out    r9  = last   r10 = end
;   rbp = lcode  r11 = dcode  rdx = hold   bl  = bits
;   r12 = lmask  r13 = dmask  r14d = len   r15d = dist
;   rax, rcx, r8 = scratch (r8 also parks `in` during match copies)
;   rsp = &ar
;
; NOTE(review): rsi, rdi, r12-r15 and rbx are saved at [rsp-8]..[rsp-56],
; i.e. below the stack pointer, without adjusting rsp. The Microsoft x64
; ABI has no red zone, so that memory is not guaranteed to survive an
; asynchronous event - confirm this is acceptable.
; NOTE(review): rsp does not address a real stack between the two rsp
; switches; the routine contains no push/pop/call there, and none may be
; added.
;-----------------------------------------------------------------------
23 1.1 christos inffas8664fnc PROC
24 1.1 christos
25 1.1 christos ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
26 1.1 christos ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
27 1.1 christos ;
28 1.1 christos ; All registers must be preserved across the call, except for
29 1.1 christos ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
30 1.1 christos
31 1.1 christos
; Park the callee-saved registers below rsp (see NOTE(review) above).
32 1.1 christos mov [rsp-8],rsi
33 1.1 christos mov [rsp-16],rdi
34 1.1 christos mov [rsp-24],r12
35 1.1 christos mov [rsp-32],r13
36 1.1 christos mov [rsp-40],r14
37 1.1 christos mov [rsp-48],r15
38 1.1 christos mov [rsp-56],rbx
39 1.1 christos
; rax = &ar (first argument)
40 1.1 christos mov rax,rcx
41 1.1 christos
42 1.1 christos mov [rax+8], rbp ; /* save regs rbp and rsp */
43 1.1 christos mov [rax], rsp
44 1.1 christos
45 1.1 christos mov rsp, rax ; /* make rsp point to &ar */
46 1.1 christos
; Load the decoder state from ar into registers.
47 1.1 christos mov rsi, [rsp+16] ; /* rsi = in */
48 1.1 christos mov rdi, [rsp+32] ; /* rdi = out */
49 1.1 christos mov r9, [rsp+24] ; /* r9 = last */
50 1.1 christos mov r10, [rsp+48] ; /* r10 = end */
51 1.1 christos mov rbp, [rsp+64] ; /* rbp = lcode */
52 1.1 christos mov r11, [rsp+72] ; /* r11 = dcode */
53 1.1 christos mov rdx, [rsp+80] ; /* rdx = hold */
54 1.1 christos mov ebx, [rsp+88] ; /* ebx = bits */
55 1.1 christos mov r12d, [rsp+100] ; /* r12d = lmask */
56 1.1 christos mov r13d, [rsp+104] ; /* r13d = dmask */
57 1.1 christos ; /* r14d = len */
58 1.1 christos ; /* r15d = dist */
59 1.1 christos
60 1.1 christos
; String instructions below (lodsd/stosb/movs/stos) must move forward.
61 1.1 christos cld
; Take the one-time path when out == end or in == last; otherwise enter
; the main loop.
62 1.1 christos cmp r10, rdi
63 1.1 christos je L_one_time ; /* if only one decode left */
64 1.1 christos cmp r9, rsi
65 1.1 christos
66 1.1 christos jne L_do_loop
67 1.1 christos
68 1.1 christos
; One-time path: refill hold if bits <= 32, then decode a single
; literal/length symbol via L_get_length_code_one_time.
69 1.1 christos L_one_time:
70 1.1 christos mov r8, r12 ; /* r8 = lmask */
71 1.1 christos cmp bl, 32
72 1.1 christos ja L_get_length_code_one_time
73 1.1 christos
74 1.1 christos lodsd ; /* eax = *(uint *)in++ */
75 1.1 christos mov cl, bl ; /* cl = bits, needs it for shifting */
76 1.1 christos add bl, 32 ; /* bits += 32 */
77 1.1 christos shl rax, cl
78 1.1 christos or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
79 1.1 christos jmp L_get_length_code_one_time
80 1.1 christos
81 1.1 christos ALIGN 4
; Loop condition: leave when end <= out or last <= in (unsigned compares).
82 1.1 christos L_while_test:
83 1.1 christos cmp r10, rdi
84 1.1 christos jbe L_break_loop
85 1.1 christos cmp r9, rsi
86 1.1 christos jbe L_break_loop
87 1.1 christos
; Main loop body: refill hold with 32 more input bits when bits <= 32.
88 1.1 christos L_do_loop:
89 1.1 christos mov r8, r12 ; /* r8 = lmask */
90 1.1 christos cmp bl, 32
91 1.1 christos ja L_get_length_code ; /* if (32 < bits) */
92 1.1 christos
93 1.1 christos lodsd ; /* eax = *(uint *)in++ */
94 1.1 christos mov cl, bl ; /* cl = bits, needs it for shifting */
95 1.1 christos add bl, 32 ; /* bits += 32 */
96 1.1 christos shl rax, cl
97 1.1 christos or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
98 1.1 christos
; First literal/length lookup of this iteration.
99 1.1 christos L_get_length_code:
100 1.1 christos and r8, rdx ; /* r8 &= hold */
101 1.1 christos mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
102 1.1 christos
103 1.1 christos mov cl, ah ; /* cl = this.bits */
104 1.1 christos sub bl, ah ; /* bits -= this.bits */
105 1.1 christos shr rdx, cl ; /* hold >>= this.bits */
106 1.1 christos
107 1.1 christos test al, al
108 1.1 christos jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
109 1.1 christos
; op == 0: literal. Emit it, then fall through to decode a second symbol
; in the same iteration without another refill.
110 1.1 christos mov r8, r12 ; /* r8 = lmask */
111 1.1 christos shr eax, 16 ; /* output this.val char */
112 1.1 christos stosb
113 1.1 christos
114 1.1 christos L_get_length_code_one_time:
115 1.1 christos and r8, rdx ; /* r8 &= hold */
116 1.1 christos mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
117 1.1 christos
; L_dolen is also the re-entry point after a second-level length lookup;
; eax holds the table entry to process.
118 1.1 christos L_dolen:
119 1.1 christos mov cl, ah ; /* cl = this.bits */
120 1.1 christos sub bl, ah ; /* bits -= this.bits */
121 1.1 christos shr rdx, cl ; /* hold >>= this.bits */
122 1.1 christos
123 1.1 christos test al, al
124 1.1 christos jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
125 1.1 christos
126 1.1 christos shr eax, 16 ; /* output this.val char */
127 1.1 christos stosb
128 1.1 christos jmp L_while_test
129 1.1 christos
130 1.1 christos ALIGN 4
; op != 0: len = this.val. If op bit 16 is set this is a length base with
; (op & 15) extra bits; otherwise it is a second-level link or end/invalid.
131 1.1 christos L_test_for_length_base:
132 1.1 christos mov r14d, eax ; /* len = this */
133 1.1 christos shr r14d, 16 ; /* len = this.val */
134 1.1 christos mov cl, al
135 1.1 christos
136 1.1 christos test al, 16
137 1.1 christos jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
138 1.1 christos and cl, 15 ; /* op &= 15 */
139 1.1 christos jz L_decode_distance ; /* if (!op) */
140 1.1 christos
; Consume op extra bits from hold and add them to len.
; eax = (1 << op) - 1 is built with xor/inc/shl/dec.
141 1.1 christos L_add_bits_to_len:
142 1.1 christos sub bl, cl
143 1.1 christos xor eax, eax
144 1.1 christos inc eax
145 1.1 christos shl eax, cl
146 1.1 christos dec eax
147 1.1 christos and eax, edx ; /* eax &= hold */
148 1.1 christos shr rdx, cl
149 1.1 christos add r14d, eax ; /* len += hold & mask[op] */
150 1.1 christos
; Distance decode: refill hold if bits <= 32, then look up dcode.
151 1.1 christos L_decode_distance:
152 1.1 christos mov r8, r13 ; /* r8 = dmask */
153 1.1 christos cmp bl, 32
154 1.1 christos ja L_get_distance_code ; /* if (32 < bits) */
155 1.1 christos
156 1.1 christos lodsd ; /* eax = *(uint *)in++ */
157 1.1 christos mov cl, bl ; /* cl = bits, needs it for shifting */
158 1.1 christos add bl, 32 ; /* bits += 32 */
159 1.1 christos shl rax, cl
160 1.1 christos or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
161 1.1 christos
162 1.1 christos L_get_distance_code:
163 1.1 christos and r8, rdx ; /* r8 &= hold */
164 1.1 christos mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
165 1.1 christos
; L_dodist is also the re-entry point after a second-level distance lookup.
166 1.1 christos L_dodist:
167 1.1 christos mov r15d, eax ; /* dist = this */
168 1.1 christos shr r15d, 16 ; /* dist = this.val */
169 1.1 christos mov cl, ah
170 1.1 christos sub bl, ah ; /* bits -= this.bits */
171 1.1 christos shr rdx, cl ; /* hold >>= this.bits */
172 1.1 christos mov cl, al ; /* cl = this.op */
173 1.1 christos
174 1.1 christos test al, 16 ; /* if ((op & 16) == 0) */
175 1.1 christos jz L_test_for_second_level_dist
176 1.1 christos and cl, 15 ; /* op &= 15 */
; No extra bits: check for the dist == 1 special case first.
177 1.1 christos jz L_check_dist_one
178 1.1 christos
179 1.1 christos L_add_bits_to_dist:
180 1.1 christos sub bl, cl
181 1.1 christos xor eax, eax
182 1.1 christos inc eax
183 1.1 christos shl eax, cl
184 1.1 christos dec eax ; /* (1 << op) - 1 */
185 1.1 christos and eax, edx ; /* eax &= hold */
186 1.1 christos shr rdx, cl
187 1.1 christos add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
188 1.1 christos
; Match copy. rsi is needed as the copy source, so `in` is parked in r8
; and restored before returning to L_while_test.
189 1.1 christos L_check_window:
190 1.1 christos mov r8, rsi ; /* save in so from can use its reg */
191 1.1 christos mov rax, rdi
192 1.1 christos sub rax, [rsp+40] ; /* nbytes = out - beg */
193 1.1 christos
194 1.1 christos cmp eax, r15d
195 1.1 christos jb L_clip_window ; /* if (dist > nbytes) 4.2% */
196 1.1 christos
; Source lies entirely in the output already written: copy len bytes
; from out - dist, word-wise.
197 1.1 christos mov ecx, r14d ; /* ecx = len */
198 1.1 christos mov rsi, rdi
199 1.1 christos sub rsi, r15 ; /* from = out - dist */
200 1.1 christos
; ecx = len / 2; CF receives the low bit of len (set when len is odd).
201 1.1 christos sar ecx, 1
202 1.1 christos jnc L_copy_two ; /* if len % 2 == 0 */
203 1.1 christos
; Odd len: copy the words, then the one remaining byte.
204 1.1 christos rep movsw
205 1.1 christos mov al, [rsi]
206 1.1 christos mov [rdi], al
207 1.1 christos inc rdi
208 1.1 christos
209 1.1 christos mov rsi, r8 ; /* move in back to %rsi, toss from */
210 1.1 christos jmp L_while_test
211 1.1 christos
212 1.1 christos L_copy_two:
213 1.1 christos rep movsw
214 1.1 christos mov rsi, r8 ; /* move in back to %rsi, toss from */
215 1.1 christos jmp L_while_test
216 1.1 christos
217 1.1 christos ALIGN 4
; dist == 1 with at least one byte already written: fill len bytes with
; the previous output byte instead of copying. Any other case goes
; through the general L_check_window path.
218 1.1 christos L_check_dist_one:
219 1.1 christos cmp r15d, 1 ; /* if dist 1, is a memset */
220 1.1 christos jne L_check_window
221 1.1 christos cmp [rsp+40], rdi ; /* if out == beg, outside window */
222 1.1 christos je L_check_window
223 1.1 christos
224 1.1 christos mov ecx, r14d ; /* ecx = len */
225 1.1 christos mov al, [rdi-1]
; ax = fill byte replicated into both halves, for stosw
226 1.1 christos mov ah, al
227 1.1 christos
; ecx = len / 2; if len is odd (CF set) store one byte first.
228 1.1 christos sar ecx, 1
229 1.1 christos jnc L_set_two
230 1.1 christos mov [rdi], al
231 1.1 christos inc rdi
232 1.1 christos
233 1.1 christos L_set_two:
234 1.1 christos rep stosw
235 1.1 christos jmp L_while_test
236 1.1 christos
237 1.1 christos ALIGN 4
; op bit 16 clear on a length entry: bit 64 set means end-of-block or
; invalid; otherwise follow the second-level table link. cl still holds
; the unmasked op here. hold is not shifted; L_dolen consumes the bits of
; the entry that is loaded.
238 1.1 christos L_test_for_second_level_length:
239 1.1 christos test al, 64
240 1.1 christos jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
241 1.1 christos
242 1.1 christos xor eax, eax
243 1.1 christos inc eax
244 1.1 christos shl eax, cl
245 1.1 christos dec eax
246 1.1 christos and eax, edx ; /* eax &= hold */
247 1.1 christos add eax, r14d ; /* eax += len */
248 1.1 christos mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
249 1.1 christos jmp L_dolen
250 1.1 christos
251 1.1 christos ALIGN 4
; Same as above for a distance entry: bit 64 set is an invalid distance
; code, otherwise follow the second-level link and re-enter at L_dodist.
252 1.1 christos L_test_for_second_level_dist:
253 1.1 christos test al, 64
254 1.1 christos jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
255 1.1 christos
256 1.1 christos xor eax, eax
257 1.1 christos inc eax
258 1.1 christos shl eax, cl
259 1.1 christos dec eax
260 1.1 christos and eax, edx ; /* eax &= hold */
261 1.1 christos add eax, r15d ; /* eax += dist */
262 1.1 christos mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
263 1.1 christos jmp L_dodist
264 1.1 christos
265 1.1 christos ALIGN 4
; dist reaches back before beg: part (or all) of the match comes from the
; window. On entry eax = out - beg. After the adjustment below,
; ecx = dist - (out - beg) = number of bytes to take from the window.
266 1.1 christos L_clip_window:
267 1.1 christos mov ecx, eax ; /* ecx = nbytes */
268 1.1 christos mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
269 1.1 christos neg ecx ; /* nbytes = -nbytes */
270 1.1 christos
271 1.1 christos cmp eax, r15d
272 1.1 christos jb L_invalid_distance_too_far ; /* if (dist > wsize) */
273 1.1 christos
274 1.1 christos add ecx, r15d ; /* nbytes = dist - nbytes */
275 1.1 christos cmp dword ptr [rsp+96], 0
276 1.1 christos jne L_wrap_around_window ; /* if (write != 0) */
277 1.1 christos
; write == 0: the wanted bytes are the last nbytes of the window.
278 1.1 christos mov rsi, [rsp+56] ; /* from = window */
279 1.1 christos sub eax, ecx ; /* eax -= nbytes */
280 1.1 christos add rsi, rax ; /* from += wsize - nbytes */
281 1.1 christos
282 1.1 christos mov eax, r14d ; /* eax = len */
283 1.1 christos cmp r14d, ecx
284 1.1 christos jbe L_do_copy ; /* if (nbytes >= len) */
285 1.1 christos
; len > nbytes: copy nbytes from the window, then the rest from out - dist.
286 1.1 christos sub eax, ecx ; /* eax -= nbytes */
287 1.1 christos rep movsb
288 1.1 christos mov rsi, rdi
289 1.1 christos sub rsi, r15 ; /* from = &out[ -dist ] */
290 1.1 christos jmp L_do_copy
291 1.1 christos
292 1.1 christos ALIGN 4
; write != 0. If nbytes <= write the source is contiguous before the
; write position; otherwise it starts near the end of the window and
; wraps to its beginning.
293 1.1 christos L_wrap_around_window:
294 1.1 christos mov eax, [rsp+96] ; /* eax = write */
295 1.1 christos cmp ecx, eax
296 1.1 christos jbe L_contiguous_in_window ; /* if (write >= nbytes) */
297 1.1 christos
298 1.1 christos mov esi, [rsp+92] ; /* from = wsize */
299 1.1 christos add rsi, [rsp+56] ; /* from += window */
300 1.1 christos add rsi, rax ; /* from += write */
301 1.1 christos sub rsi, rcx ; /* from -= nbytes */
302 1.1 christos sub ecx, eax ; /* nbytes -= write */
303 1.1 christos
304 1.1 christos mov eax, r14d ; /* eax = len */
305 1.1 christos cmp eax, ecx
306 1.1 christos jbe L_do_copy ; /* if (nbytes >= len) */
307 1.1 christos
; Copy the tail part of the window, then continue from its start.
308 1.1 christos sub eax, ecx ; /* len -= nbytes */
309 1.1 christos rep movsb
310 1.1 christos mov rsi, [rsp+56] ; /* from = window */
311 1.1 christos mov ecx, [rsp+96] ; /* nbytes = write */
312 1.1 christos cmp eax, ecx
313 1.1 christos jbe L_do_copy ; /* if (nbytes >= len) */
314 1.1 christos
; Still more to copy: take write bytes from the window start, then the
; rest from out - dist.
315 1.1 christos sub eax, ecx ; /* len -= nbytes */
316 1.1 christos rep movsb
317 1.1 christos mov rsi, rdi
318 1.1 christos sub rsi, r15 ; /* from = out - dist */
319 1.1 christos jmp L_do_copy
320 1.1 christos
321 1.1 christos ALIGN 4
; On entry eax = write, ecx = nbytes (nbytes <= write).
322 1.1 christos L_contiguous_in_window:
323 1.1 christos mov rsi, [rsp+56] ; /* rsi = window */
324 1.1 christos add rsi, rax
325 1.1 christos sub rsi, rcx ; /* from += write - nbytes */
326 1.1 christos
327 1.1 christos mov eax, r14d ; /* eax = len */
328 1.1 christos cmp eax, ecx
329 1.1 christos jbe L_do_copy ; /* if (nbytes >= len) */
330 1.1 christos
331 1.1 christos sub eax, ecx ; /* len -= nbytes */
332 1.1 christos rep movsb
333 1.1 christos mov rsi, rdi
334 1.1 christos sub rsi, r15 ; /* from = out - dist */
335 1.1 christos jmp L_do_copy ; /* copy the remaining len bytes from out */
336 1.1 christos
337 1.1 christos ALIGN 4
; Final byte copy of eax bytes from rsi, then restore `in` from r8.
338 1.1 christos L_do_copy:
339 1.1 christos mov ecx, eax ; /* ecx = len */
340 1.1 christos rep movsb
341 1.1 christos
342 1.1 christos mov rsi, r8 ; /* move in back to %rsi, toss from */
343 1.1 christos jmp L_while_test
344 1.1 christos
; Exit paths: store a status code at ar+116 and join the common epilogue.
345 1.1 christos L_test_for_end_of_block:
346 1.1 christos test al, 32
347 1.1 christos jz L_invalid_literal_length_code
348 1.1 christos mov dword ptr [rsp+116], 1
349 1.1 christos jmp L_break_loop_with_status
350 1.1 christos
351 1.1 christos L_invalid_literal_length_code:
352 1.1 christos mov dword ptr [rsp+116], 2
353 1.1 christos jmp L_break_loop_with_status
354 1.1 christos
355 1.1 christos L_invalid_distance_code:
356 1.1 christos mov dword ptr [rsp+116], 3
357 1.1 christos jmp L_break_loop_with_status
358 1.1 christos
359 1.1 christos L_invalid_distance_too_far:
360 1.1 christos mov dword ptr [rsp+116], 4
361 1.1 christos jmp L_break_loop_with_status
362 1.1 christos
363 1.1 christos L_break_loop:
364 1.1 christos mov dword ptr [rsp+116], 0
365 1.1 christos
366 1.1 christos L_break_loop_with_status:
367 1.1 christos ; /* put in, out, bits, and hold back into ar and restore rsp */
368 1.1 christos mov [rsp+16], rsi ; /* in */
369 1.1 christos mov [rsp+32], rdi ; /* out */
370 1.1 christos mov [rsp+88], ebx ; /* bits */
371 1.1 christos mov [rsp+80], rdx ; /* hold */
372 1.1 christos
373 1.1 christos mov rax, [rsp] ; /* restore rbp and rsp */
374 1.1 christos mov rbp, [rsp+8]
375 1.1 christos mov rsp, rax
376 1.1 christos
377 1.1 christos
378 1.1 christos
; Reload the callee-saved registers parked below rsp at entry
; (see NOTE(review) in the header about memory below rsp).
379 1.1 christos mov rsi,[rsp-8]
380 1.1 christos mov rdi,[rsp-16]
381 1.1 christos mov r12,[rsp-24]
382 1.1 christos mov r13,[rsp-32]
383 1.1 christos mov r14,[rsp-40]
384 1.1 christos mov r15,[rsp-48]
385 1.1 christos mov rbx,[rsp-56]
386 1.1 christos
387 1.1 christos ret 0
388 1.1 christos ; :
389 1.1 christos ; : "m" (ar)
390 1.1 christos ; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
391 1.1 christos ; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
392 1.1 christos ; );
393 1.1 christos
394 1.1 christos inffas8664fnc ENDP
395 1.1 christos ;_TEXT ENDS
396 1.1 christos END
397