1 1.1 christos /* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding 2 1.1 christos * version for AMD64 on Windows using Microsoft C compiler 3 1.1 christos * 4 1.1 christos * Copyright (C) 1995-2003 Mark Adler 5 1.1 christos * For conditions of distribution and use, see copyright notice in zlib.h 6 1.1 christos * 7 1.1 christos * Copyright (C) 2003 Chris Anderson <christop (at) charm.net> 8 1.1 christos * Please use the copyright conditions above. 9 1.1 christos * 10 1.1 christos * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant 11 1.1 christos * 12 1.1 christos * inffas8664.c call function inffas8664fnc in inffasx64.asm 13 1.1 christos * inffasx64.asm is automatically convert from AMD64 portion of inffas86.c 14 1.1 christos * 15 1.1 christos * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also 16 1.1 christos * slightly quicker on x86 systems because, instead of using rep movsb to copy 17 1.1 christos * data, it uses rep movsw, which moves data in 2-byte chunks instead of single 18 1.1 christos * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates 19 1.1 christos * from http://fedora.linux.duke.edu/fc1_x86_64 20 1.1 christos * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with 21 1.1 christos * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version, 22 1.1 christos * when decompressing mozilla-source-1.3.tar.gz. 23 1.1 christos * 24 1.1 christos * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from 25 1.1 christos * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at 26 1.1 christos * the moment. I have successfully compiled and tested this code with gcc2.96, 27 1.1 christos * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S 28 1.1 christos * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX 29 1.1 christos * enabled. I will attempt to merge the MMX code into this version. Newer 30 1.1 christos * versions of this and inffast.S can be found at 31 1.1 christos * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ 32 1.1 christos * 33 1.1 christos */ 34 1.1 christos 35 1.1 christos #include <stdio.h> 36 1.1 christos #include "zutil.h" 37 1.1 christos #include "inftrees.h" 38 1.1 christos #include "inflate.h" 39 1.1 christos #include "inffast.h" 40 1.1 christos 41 1.1 christos /* Mark Adler's comments from inffast.c: */ 42 1.1 christos 43 1.1 christos /* 44 1.1 christos Decode literal, length, and distance codes and write out the resulting 45 1.1 christos literal and match bytes until either not enough input or output is 46 1.1 christos available, an end-of-block is encountered, or a data error is encountered. 47 1.1 christos When large enough input and output buffers are supplied to inflate(), for 48 1.1 christos example, a 16K input buffer and a 64K output buffer, more than 95% of the 49 1.1 christos inflate execution time is spent in this routine. 50 1.1 christos 51 1.1 christos Entry assumptions: 52 1.1 christos 53 1.1 christos state->mode == LEN 54 1.1 christos strm->avail_in >= 6 55 1.1 christos strm->avail_out >= 258 56 1.1 christos start >= strm->avail_out 57 1.1 christos state->bits < 8 58 1.1 christos 59 1.1 christos On return, state->mode is one of: 60 1.1 christos 61 1.1 christos LEN -- ran out of enough output space or enough available input 62 1.1 christos TYPE -- reached end of block code, inflate() to interpret next block 63 1.1 christos BAD -- error in block data 64 1.1 christos 65 1.1 christos Notes: 66 1.1 christos 67 1.1 christos - The maximum input bits used by a length/distance pair is 15 bits for the 68 1.1 christos length code, 5 bits for the length extra, 15 bits for the distance code, 69 1.1 christos and 13 bits for the distance extra. This totals 48 bits, or six bytes. 70 1.1 christos Therefore if strm->avail_in >= 6, then there is enough input to avoid 71 1.1 christos checking for available input while decoding. 72 1.1 christos 73 1.1 christos - The maximum bytes that a single length/distance pair can output is 258 74 1.1 christos bytes, which is the maximum length that can be coded. inflate_fast() 75 1.1 christos requires strm->avail_out >= 258 for each loop to avoid checking for 76 1.1 christos output space. 77 1.1 christos */ 78 1.1 christos 79 1.1 christos 80 1.1 christos 81 1.1 christos typedef struct inffast_ar { 82 1.1 christos /* 64 32 x86 x86_64 */ 83 1.1 christos /* ar offset register */ 84 1.1 christos /* 0 0 */ void *esp; /* esp save */ 85 1.1 christos /* 8 4 */ void *ebp; /* ebp save */ 86 1.1 christos /* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */ 87 1.1 christos /* 24 12 */ unsigned char FAR *last; /* r9 while in < last */ 88 1.1 christos /* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */ 89 1.1 christos /* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */ 90 1.1 christos /* 48 24 */ unsigned char FAR *end; /* r10 while out < end */ 91 1.1 christos /* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */ 92 1.1 christos /* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */ 93 1.1 christos /* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */ 94 1.1 christos /* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */ 95 1.1 christos /* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */ 96 1.1 christos /* 92 48 */ unsigned wsize; /* window size */ 97 1.1 christos /* 96 52 */ unsigned write; /* window write index */ 98 1.1 christos /*100 56 */ unsigned lmask; /* r12 mask for lcode */ 99 1.1 christos /*104 60 */ unsigned dmask; /* r13 mask for dcode */ 100 1.1 christos /*108 64 */ unsigned len; /* r14 match length */ 101 1.1 christos /*112 68 */ unsigned dist; /* r15 match distance */ 102 1.1 christos /*116 72 */ unsigned status; /* set when state chng*/ 103 1.1 christos } type_ar; 104 1.1 christos #ifdef ASMINF 105 1.1 christos 106 1.1 christos void inflate_fast(strm, start) 107 1.1 christos z_streamp strm; 108 1.1 christos unsigned start; /* inflate()'s starting value for strm->avail_out */ 109 1.1 christos { 110 1.1 christos struct inflate_state FAR *state; 111 1.1 christos type_ar ar; 112 1.1 christos void inffas8664fnc(struct inffast_ar * par); 113 1.1 christos 114 1.1 christos 115 1.1 christos 116 1.1 christos #if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64)) 117 1.1 christos #define PAD_AVAIL_IN 6 118 1.1 christos #define PAD_AVAIL_OUT 258 119 1.1 christos #else 120 1.1 christos #define PAD_AVAIL_IN 5 121 1.1 christos #define PAD_AVAIL_OUT 257 122 1.1 christos #endif 123 1.1 christos 124 1.1 christos /* copy state to local variables */ 125 1.1 christos state = (struct inflate_state FAR *)strm->state; 126 1.1 christos 127 1.1 christos ar.in = strm->next_in; 128 1.1 christos ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN); 129 1.1 christos ar.out = strm->next_out; 130 1.1 christos ar.beg = ar.out - (start - strm->avail_out); 131 1.1 christos ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT); 132 1.1 christos ar.wsize = state->wsize; 133 1.1 christos ar.write = state->wnext; 134 1.1 christos ar.window = state->window; 135 1.1 christos ar.hold = state->hold; 136 1.1 christos ar.bits = state->bits; 137 1.1 christos ar.lcode = state->lencode; 138 1.1 christos ar.dcode = state->distcode; 139 1.1 christos ar.lmask = (1U << state->lenbits) - 1; 140 1.1 christos ar.dmask = (1U << state->distbits) - 1; 141 1.1 christos 142 1.1 christos /* decode literals and length/distances until end-of-block or not enough 143 1.1 christos input data or output space */ 144 1.1 christos 145 1.1 christos /* align in on 1/2 hold size boundary */ 146 1.1 christos while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) { 147 1.1 christos ar.hold += (unsigned long)*ar.in++ << ar.bits; 148 1.1 christos ar.bits += 8; 149 1.1 christos } 150 1.1 christos 151 1.1 christos inffas8664fnc(&ar); 152 1.1 christos 153 1.1 christos if (ar.status > 1) { 154 1.1 christos if (ar.status == 2) 155 1.1 christos strm->msg = "invalid literal/length code"; 156 1.1 christos else if (ar.status == 3) 157 1.1 christos strm->msg = "invalid distance code"; 158 1.1 christos else 159 1.1 christos strm->msg = "invalid distance too far back"; 160 1.1 christos state->mode = BAD; 161 1.1 christos } 162 1.1 christos else if ( ar.status == 1 ) { 163 1.1 christos state->mode = TYPE; 164 1.1 christos } 165 1.1 christos 166 1.1 christos /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ 167 1.1 christos ar.len = ar.bits >> 3; 168 1.1 christos ar.in -= ar.len; 169 1.1 christos ar.bits -= ar.len << 3; 170 1.1 christos ar.hold &= (1U << ar.bits) - 1; 171 1.1 christos 172 1.1 christos /* update state and return */ 173 1.1 christos strm->next_in = ar.in; 174 1.1 christos strm->next_out = ar.out; 175 1.1 christos strm->avail_in = (unsigned)(ar.in < ar.last ? 176 1.1 christos PAD_AVAIL_IN + (ar.last - ar.in) : 177 1.1 christos PAD_AVAIL_IN - (ar.in - ar.last)); 178 1.1 christos strm->avail_out = (unsigned)(ar.out < ar.end ? 179 1.1 christos PAD_AVAIL_OUT + (ar.end - ar.out) : 180 1.1 christos PAD_AVAIL_OUT - (ar.out - ar.end)); 181 1.1 christos state->hold = (unsigned long)ar.hold; 182 1.1 christos state->bits = ar.bits; 183 1.1 christos return; 184 1.1 christos } 185 1.1 christos 186 1.1 christos #endif 187