Home | History | Annotate | Line # | Download | only in src
      1 /*	$NetBSD: dosbuf.c,v 1.1.1.1 2016/01/10 21:36:21 christos Exp $	*/
      2 
      3 /* Messy DOS-specific code for correctly treating binary, Unix text
      4    and DOS text files.
      5 
      6    This has several aspects:
      7 
      8      * Guessing the file type (unless the user tells us);
      9      * Stripping CR characters from DOS text files (otherwise regex
     10        functions won't work correctly);
     11      * Reporting correct byte count with -b for any kind of file.
     12 
     13 */
     14 
/* How the contents of the current input file are classified.  */
typedef enum
{
  UNKNOWN = 0,     /* not decided yet; the type is guessed from the data */
  DOS_BINARY = 1,  /* a NUL byte was seen */
  DOS_TEXT = 2,    /* a CR-LF pair was seen; CRs are stripped on input */
  UNIX_TEXT = 3    /* neither a NUL byte nor a CR-LF pair was seen */
}
File_type;
     18 
/* One entry of the table used to translate an offset in the
   CR-stripped data back into an offset in the file: entry I covers
   the offsets from entry I-1's `pos' up to, but not including, its
   own `pos', and `add' is what gets added to them.  */
struct dos_map
{
  /* position in buffer passed to matcher */
  off_t pos;

  /* how much to add when reporting char position */
  off_t add;
};
     23 
     24 static int       dos_report_unix_offset = 0;
     25 
     26 static File_type dos_file_type     = UNKNOWN;
     27 static File_type dos_use_file_type = UNKNOWN;
     28 static off_t     dos_stripped_crs  = 0;
     29 static struct dos_map *dos_pos_map;
     30 static int       dos_pos_map_size  = 0;
     31 static int       dos_pos_map_used  = 0;
     32 static int       inp_map_idx = 0, out_map_idx = 1;
     33 
     34 /* Guess DOS file type by looking at its contents.  */
     35 static inline File_type
     36 guess_type (char *buf, register size_t buflen)
     37 {
     38   int crlf_seen = 0;
     39   register char *bp = buf;
     40 
     41   while (buflen--)
     42     {
     43       /* Treat a file as binary if it has a NUL character.  */
     44       if (!*bp)
     45         return DOS_BINARY;
     46 
     47       /* CR before LF means DOS text file (unless we later see
     48          binary characters).  */
     49       else if (*bp == '\r' && buflen && bp[1] == '\n')
     50         crlf_seen = 1;
     51 
     52       bp++;
     53     }
     54 
     55   return crlf_seen ? DOS_TEXT : UNIX_TEXT;
     56 }
     57 
     58 /* Convert external DOS file representation to internal.
     59    Return the count of characters left in the buffer.
     60    Build table to map character positions when reporting byte counts.  */
     61 static inline int
     62 undossify_input (register char *buf, size_t buflen)
     63 {
     64   int chars_left = 0;
     65 
     66   if (totalcc == 0)
     67     {
     68       /* New file: forget everything we knew about character
     69          position mapping table and file type.  */
     70       inp_map_idx = 0;
     71       out_map_idx = 1;
     72       dos_pos_map_used = 0;
     73       dos_stripped_crs = 0;
     74       dos_file_type = dos_use_file_type;
     75     }
     76 
     77   /* Guess if this file is binary, unless we already know that.  */
     78   if (dos_file_type == UNKNOWN)
     79     dos_file_type = guess_type(buf, buflen);
     80 
     81   /* If this file is to be treated as DOS Text, strip the CR characters
     82      and maybe build the table for character position mapping on output.  */
     83   if (dos_file_type == DOS_TEXT)
     84     {
     85       char   *destp   = buf;
     86 
     87       while (buflen--)
     88         {
     89           if (*buf != '\r')
     90             {
     91               *destp++ = *buf++;
     92               chars_left++;
     93             }
     94           else
     95             {
     96               buf++;
     97               if (out_byte && !dos_report_unix_offset)
     98                 {
     99                   dos_stripped_crs++;
    100                   while (buflen && *buf == '\r')
    101                     {
    102                       dos_stripped_crs++;
    103                       buflen--;
    104                       buf++;
    105                     }
    106                   if (inp_map_idx >= dos_pos_map_size - 1)
    107                     {
    108                       dos_pos_map_size = inp_map_idx ? inp_map_idx * 2 : 1000;
    109                       dos_pos_map =
    110                         (struct dos_map *)xrealloc((char *)dos_pos_map,
    111 						   dos_pos_map_size *
    112 						   sizeof(struct dos_map));
    113                     }
    114 
    115                   if (!inp_map_idx)
    116                     {
    117                       /* Add sentinel entry.  */
    118                       dos_pos_map[inp_map_idx].pos = 0;
    119                       dos_pos_map[inp_map_idx++].add = 0;
    120 
    121                       /* Initialize first real entry.  */
    122                       dos_pos_map[inp_map_idx].add = 0;
    123                     }
    124 
    125                   /* Put the new entry.  If the stripped CR characters
    126                      precede a Newline (the usual case), pretend that
    127                      they were found *after* the Newline.  This makes
    128                      displayed byte offsets more reasonable in some
    129                      cases, and fits better the intuitive notion that
    130                      the line ends *before* the CR, not *after* it.  */
    131                   inp_map_idx++;
    132                   dos_pos_map[inp_map_idx-1].pos =
    133                     (*buf == '\n' ? destp + 1 : destp ) - bufbeg + totalcc;
    134                   dos_pos_map[inp_map_idx].add = dos_stripped_crs;
    135                   dos_pos_map_used = inp_map_idx;
    136 
    137                   /* The following will be updated on the next pass.  */
    138                   dos_pos_map[inp_map_idx].pos = destp - bufbeg + totalcc + 1;
    139                 }
    140             }
    141         }
    142 
    143       return chars_left;
    144     }
    145 
    146   return buflen;
    147 }
    148 
    149 /* Convert internal byte count into external.  */
    150 static inline off_t
    151 dossified_pos (off_t byteno)
    152 {
    153   off_t pos_lo;
    154   off_t pos_hi;
    155 
    156   if (dos_file_type != DOS_TEXT || dos_report_unix_offset)
    157     return byteno;
    158 
    159   /* Optimization: usually the file will be scanned sequentially.
    160      So in most cases, this byte position will be found in the
    161      table near the previous one, as recorded in `out_map_idx'.  */
    162   pos_lo = dos_pos_map[out_map_idx-1].pos;
    163   pos_hi = dos_pos_map[out_map_idx].pos;
    164 
    165   /* If the initial guess failed, search up or down, as
    166      appropriate, beginning with the previous place.  */
    167   if (byteno >= pos_hi)
    168     {
    169       out_map_idx++;
    170       while (out_map_idx < dos_pos_map_used &&
    171              byteno >= dos_pos_map[out_map_idx].pos)
    172         out_map_idx++;
    173     }
    174 
    175   else if (byteno < pos_lo)
    176     {
    177       out_map_idx--;
    178       while (out_map_idx > 1 && byteno < dos_pos_map[out_map_idx-1].pos)
    179         out_map_idx--;
    180     }
    181 
    182   return byteno + dos_pos_map[out_map_idx].add;
    183 }
    184