Home | History | Annotate | Line # | Download | only in libgettextpo
linebreak.c revision 1.1
      1  1.1  christos /* linebreak.c - line breaking of Unicode strings
      2  1.1  christos    Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc.
      3  1.1  christos    Written by Bruno Haible <haible (at) clisp.cons.org>, 2001.
      4  1.1  christos 
      5  1.1  christos This program is free software; you can redistribute it and/or modify
      6  1.1  christos it under the terms of the GNU General Public License as published by
      7  1.1  christos the Free Software Foundation; either version 2, or (at your option)
      8  1.1  christos any later version.
      9  1.1  christos 
     10  1.1  christos This program is distributed in the hope that it will be useful,
     11  1.1  christos but WITHOUT ANY WARRANTY; without even the implied warranty of
     12  1.1  christos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13  1.1  christos GNU General Public License for more details.
     14  1.1  christos 
     15  1.1  christos You should have received a copy of the GNU General Public License
     16  1.1  christos along with this program; if not, write to the Free Software Foundation,
     17  1.1  christos Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
     18  1.1  christos 
     19  1.1  christos #include <config.h>
     20  1.1  christos 
     21  1.1  christos /* Specification.  */
     22  1.1  christos #include "linebreak.h"
     23  1.1  christos 
     24  1.1  christos #include <stdlib.h>
     25  1.1  christos #include <string.h>
     26  1.1  christos #include "c-ctype.h"
     27  1.1  christos #include "xsize.h"
     28  1.1  christos 
     29  1.1  christos #include "utf8-ucs4.h"
     30  1.1  christos 
     31  1.1  christos #ifdef unused
     32  1.1  christos #include "utf16-ucs4.h"
     33  1.1  christos 
/* Fetch one character from the UCS-4 string S into *PUC.
   Always consumes exactly one unit; N is accepted only so that the
   signature has the same shape as the other *_mbtouc decoders and is
   not examined.  Returns the number of units consumed (always 1).  */
static inline int
u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
{
  (void) n;
  puc[0] = s[0];
  return 1;
}
     40  1.1  christos #endif
     41  1.1  christos 
     42  1.1  christos 
     43  1.1  christos /* Help GCC to generate good code for string comparisons with
     44  1.1  christos    immediate strings. */
     45  1.1  christos #if defined (__GNUC__) && defined (__OPTIMIZE__)
     46  1.1  christos 
/* Last link of the streqN chain: compare the tails of S1 and S2 starting
   at byte offset 9 with strcmp.  Returns 1 if the tails are equal, else 0.
   Both strings must be at least 9 bytes long.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  const char *tail1 = s1 + 9;
  const char *tail2 = s2 + 9;

  return !strcmp (tail1, tail2);
}
     52  1.1  christos 
/* Compare byte 8 of S1 against S28.
   Returns 0 if they differ, 1 if both are NUL, and otherwise the result
   of streq9 on the remaining tails.  */
static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  if (s1[8] != s28)
    return 0;
  if (s28 == 0)
    return 1;
  return streq9 (s1, s2);
}
     66  1.1  christos 
/* Compare byte 7 of S1 against S27.
   Returns 0 if they differ, 1 if both are NUL, and otherwise continues
   the comparison with streq8.  */
static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  if (s1[7] != s27)
    return 0;
  if (s27 == 0)
    return 1;
  return streq8 (s1, s2, s28);
}
     80  1.1  christos 
/* Compare byte 6 of S1 against S26.
   Returns 0 if they differ, 1 if both are NUL, and otherwise continues
   the comparison with streq7.  */
static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  if (s1[6] != s26)
    return 0;
  if (s26 == 0)
    return 1;
  return streq7 (s1, s2, s27, s28);
}
     94  1.1  christos 
/* Compare byte 5 of S1 against S25.
   Returns 0 if they differ, 1 if both are NUL, and otherwise continues
   the comparison with streq6.  */
static inline int
streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
{
  if (s1[5] != s25)
    return 0;
  if (s25 == 0)
    return 1;
  return streq6 (s1, s2, s26, s27, s28);
}
    108  1.1  christos 
/* Compare byte 4 of S1 against S24.
   Returns 0 if they differ, 1 if both are NUL, and otherwise continues
   the comparison with streq5.  */
static inline int
streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[4] != s24)
    return 0;
  if (s24 == 0)
    return 1;
  return streq5 (s1, s2, s25, s26, s27, s28);
}
    122  1.1  christos 
/* Compare byte 3 of S1 against S23.
   Returns 0 if they differ, 1 if both are NUL, and otherwise continues
   the comparison with streq4.  */
static inline int
streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[3] != s23)
    return 0;
  if (s23 == 0)
    return 1;
  return streq4 (s1, s2, s24, s25, s26, s27, s28);
}
    136  1.1  christos 
/* Compare byte 2 of S1 against S22.
   Returns 0 if they differ, 1 if both are NUL, and otherwise continues
   the comparison with streq3.  */
static inline int
streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[2] != s22)
    return 0;
  if (s22 == 0)
    return 1;
  return streq3 (s1, s2, s23, s24, s25, s26, s27, s28);
}
    150  1.1  christos 
/* Compare byte 1 of S1 against S21.
   Returns 0 if they differ, 1 if both are NUL, and otherwise continues
   the comparison with streq2.  */
static inline int
streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[1] != s21)
    return 0;
  if (s21 == 0)
    return 1;
  return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28);
}
    164  1.1  christos 
/* Entry point of the streqN chain: compare byte 0 of S1 against S20.
   Returns 0 if they differ, 1 if both are NUL, and otherwise continues
   the comparison with streq1.  S20..S28 are passed as separate character
   arguments; S2 itself is only handed down the chain.  */
static inline int
streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  if (s1[0] != s20)
    return 0;
  if (s20 == 0)
    return 1;
  return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28);
}
    178  1.1  christos 
/* Test whether S1 equals S2.  S20..S28 are compared one at a time
   against bytes 0..8 of S1 by the streqN chain; callers in this file pass
   the first nine characters of the literal S2, padded with 0.  */
#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)
    181  1.1  christos 
    182  1.1  christos #else
    183  1.1  christos 
/* Fallback definition: a plain strcmp of S1 and S2.  The per-character
   arguments S20..S28 are accepted for interface compatibility and
   ignored.  */
#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp (s1, s2) == 0)
    186  1.1  christos 
    187  1.1  christos #endif
    188  1.1  christos 
    189  1.1  christos 
/* Return 1 if ENCODING is exactly one of the legacy CJK encoding names
   listed below (case-sensitive comparison), 0 otherwise.  */
static int
is_cjk_encoding (const char *encoding)
{
  /* Legacy Japanese encodings */
  if (STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0))
    return 1;

  /* Legacy Chinese encodings */
  if (STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0))
    return 1;
  if (STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0))
    return 1;
  if (STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0))
    return 1;
  if (STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0))
    return 1;

  /* Legacy Korean encodings */
  if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
    return 1;
  if (STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0))
    return 1;
  if (STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0))
    return 1;

  return 0;
}
    208  1.1  christos 
/* Return 1 if ENCODING is exactly the string "UTF-8" (case-sensitive),
   0 otherwise.  */
static int
is_utf8_encoding (const char *encoding)
{
  return STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)
         ? 1
         : 0;
}
    216  1.1  christos 
    217  1.1  christos 
/* Determine number of column positions required for UC.
   As implemented below, the result is -1 for a control character
   (0 < UC < 0xa0 flagged in the non-spacing table), 0 for a non-spacing,
   format control or zero-width character, 2 for a double-width character,
   and 1 otherwise.  ENCODING is only consulted to decide whether
   characters in the range 0x00A1..0xFF60 (except 0x20A9) count as
   double-width because a legacy CJK encoding is in use.  */
int uc_width (unsigned int uc, const char *encoding);
    220  1.1  christos 
/*
 * Non-spacing attribute table.
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 *
 * Layout, as consumed by uc_width:
 * - The table is a sequence of 16 chunks of 64 bytes.  Each chunk is a
 *   bitmap covering 512 consecutive code points.
 * - Within chunk number IND, code point UC is represented by bit (UC & 7)
 *   of byte 64*IND + ((UC >> 3) & 63).
 * - The chunk number for a given code point is looked up in
 *   nonspacing_table_ind[UC >> 9]; 512-code-point ranges without a chunk
 *   have no entry here.
 * - A set bit makes uc_width return 0, or -1 when 0 < UC < 0xa0.
 */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
  0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};
/* Index into nonspacing_table_data, one entry per 512 code points.
   uc_width looks up entry [UC >> 9] for UC < 240*512 (i.e. UC <= 0x1dfff).
   A non-negative entry is the number of the 64-byte chunk in
   nonspacing_table_data that holds the bitmap for that range; -1 means
   there is no chunk for the range and uc_width skips the bitmap test.
   Each row below holds 8 entries, i.e. covers 0x1000 code points.  */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};
    409  1.1  christos 
    410  1.1  christos /* Determine number of column positions required for UC. */
    411  1.1  christos int
    412  1.1  christos uc_width (unsigned int uc, const char *encoding)
    413  1.1  christos {
    414  1.1  christos   /* Test for non-spacing or control character.  */
    415  1.1  christos   if ((uc >> 9) < 240)
    416  1.1  christos     {
    417  1.1  christos       int ind = nonspacing_table_ind[uc >> 9];
    418  1.1  christos       if (ind >= 0)
    419  1.1  christos 	if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
    420  1.1  christos 	  {
    421  1.1  christos 	    if (uc > 0 && uc < 0xa0)
    422  1.1  christos 	      return -1;
    423  1.1  christos 	    else
    424  1.1  christos 	      return 0;
    425  1.1  christos 	  }
    426  1.1  christos     }
    427  1.1  christos   else if ((uc >> 9) == (0xe0000 >> 9))
    428  1.1  christos     {
    429  1.1  christos       if (uc < 0xe0100
    430  1.1  christos 	  ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
    431  1.1  christos 	  : (uc <= 0xe01ef))
    432  1.1  christos 	return 0;
    433  1.1  christos     }
    434  1.1  christos   /* Test for double-width character.
    435  1.1  christos    * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
    436  1.1  christos    * and            "grep '^....;[^WF]' EastAsianWidth.txt"
    437  1.1  christos    */
    438  1.1  christos   if (uc >= 0x1100
    439  1.1  christos       && ((uc < 0x1160) /* Hangul Jamo */
    440  1.1  christos 	  || (uc >= 0x2e80 && uc < 0x4dc0  /* CJK */
    441  1.1  christos 	      && !(uc == 0x303f))
    442  1.1  christos 	  || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
    443  1.1  christos 	  || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
    444  1.1  christos 	  || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
    445  1.1  christos 	  || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
    446  1.1  christos 	  || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
    447  1.1  christos 	  || (uc >= 0xffe0 && uc < 0xffe7)
    448  1.1  christos 	  || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
    449  1.1  christos 	  || (uc >= 0x30000 && uc <= 0x3fffd)
    450  1.1  christos      )   )
    451  1.1  christos     return 2;
    452  1.1  christos   /* In ancient CJK encodings, Cyrillic and most other characters are
    453  1.1  christos      double-width as well.  */
    454  1.1  christos   if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
    455  1.1  christos       && is_cjk_encoding (encoding))
    456  1.1  christos     return 2;
    457  1.1  christos   return 1;
    458  1.1  christos }
    459  1.1  christos 
    460  1.1  christos 
    461  1.1  christos #ifdef unused
    462  1.1  christos 
    463  1.1  christos /* Determine number of column positions required for first N units
    464  1.1  christos    (or fewer if S ends before this) in S.  */
    465  1.1  christos 
/* Sum the column widths of the characters in the first N units of the
   UTF-8 string S, stopping early at a NUL character.  Characters for
   which uc_width reports a negative width (control characters) do not
   contribute to the sum.  */
int
u8_width (const unsigned char *s, size_t n, const char *encoding)
{
  const unsigned char *const limit = s + n;
  int total = 0;

  for (; s < limit; )
    {
      unsigned int uc;
      int cols;

      /* Decode one character and step over its units.  */
      s += u8_mbtouc (&uc, s, limit - s);

      if (uc == 0)
        break; /* end of string reached */

      cols = uc_width (uc, encoding);
      if (cols < 0)
        continue; /* ignore control characters in the string */
      total += cols;
    }

  return total;
}
    489  1.1  christos 
/* Sum the column widths of the characters in the first N units of the
   UTF-16 string S, stopping early at a NUL character.  Characters for
   which uc_width reports a negative width (control characters) do not
   contribute to the sum.  */
int
u16_width (const unsigned short *s, size_t n, const char *encoding)
{
  const unsigned short *const limit = s + n;
  int total = 0;

  for (; s < limit; )
    {
      unsigned int uc;
      int cols;

      /* Decode one character and step over its units.  */
      s += u16_mbtouc (&uc, s, limit - s);

      if (uc == 0)
        break; /* end of string reached */

      cols = uc_width (uc, encoding);
      if (cols < 0)
        continue; /* ignore control characters in the string */
      total += cols;
    }

  return total;
}
    513  1.1  christos 
/* Sum the column widths of the first N characters of the UCS-4 string S,
   stopping early at a NUL character.  Characters for which uc_width
   reports a negative width (control characters) do not contribute to
   the sum.  */
int
u32_width (const unsigned int *s, size_t n, const char *encoding)
{
  const unsigned int *const limit = s + n;
  int total = 0;

  for (; s < limit; s++)
    {
      const unsigned int uc = *s;
      int cols;

      if (uc == 0)
        break; /* end of string reached */

      cols = uc_width (uc, encoding);
      if (cols < 0)
        continue; /* ignore control characters in the string */
      total += cols;
    }

  return total;
}
    535  1.1  christos 
    536  1.1  christos #endif
    537  1.1  christos 
    538  1.1  christos 
    539  1.1  christos /* Determine the line break points in S, and store the result at p[0..n-1].  */
    540  1.1  christos /* We don't support line breaking of complex-context dependent characters
    541  1.1  christos    (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
    542  1.1  christos 
    543  1.1  christos /* Line breaking classification.  */
    544  1.1  christos 
    545  1.1  christos enum
    546  1.1  christos {
    547  1.1  christos   /* Values >= 20 are resolved at run time. */
    548  1.1  christos   LBP_BK =  0, /* mandatory break */
    549  1.1  christos /*LBP_CR,         carriage return - not used here because it's a DOSism */
    550  1.1  christos /*LBP_LF,         line feed - not used here because it's a DOSism */
    551  1.1  christos   LBP_CM = 20, /* attached characters and combining marks */
    552  1.1  christos /*LBP_SG,         surrogates - not used here because they are not characters */
    553  1.1  christos   LBP_ZW =  1, /* zero width space */
    554  1.1  christos   LBP_IN =  2, /* inseparable */
    555  1.1  christos   LBP_GL =  3, /* non-breaking (glue) */
    556  1.1  christos   LBP_CB = 22, /* contingent break opportunity */
    557  1.1  christos   LBP_SP = 21, /* space */
    558  1.1  christos   LBP_BA =  4, /* break opportunity after */
    559  1.1  christos   LBP_BB =  5, /* break opportunity before */
    560  1.1  christos   LBP_B2 =  6, /* break opportunity before and after */
    561  1.1  christos   LBP_HY =  7, /* hyphen */
    562  1.1  christos   LBP_NS =  8, /* non starter */
    563  1.1  christos   LBP_OP =  9, /* opening punctuation */
    564  1.1  christos   LBP_CL = 10, /* closing punctuation */
    565  1.1  christos   LBP_QU = 11, /* ambiguous quotation */
    566  1.1  christos   LBP_EX = 12, /* exclamation/interrogation */
    567  1.1  christos   LBP_ID = 13, /* ideographic */
    568  1.1  christos   LBP_NU = 14, /* numeric */
    569  1.1  christos   LBP_IS = 15, /* infix separator (numeric) */
    570  1.1  christos   LBP_SY = 16, /* symbols allowing breaks */
    571  1.1  christos   LBP_AL = 17, /* ordinary alphabetic and symbol characters */
    572  1.1  christos   LBP_PR = 18, /* prefix (numeric) */
    573  1.1  christos   LBP_PO = 19, /* postfix (numeric) */
    574  1.1  christos   LBP_SA = 23, /* complex context (South East Asian) */
    575  1.1  christos   LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
    576  1.1  christos   LBP_XX = 25  /* unknown */
    577  1.1  christos };
    578  1.1  christos 
    579  1.1  christos #include "lbrkprop.h"
    580  1.1  christos 
    581  1.1  christos static inline unsigned char
    582  1.1  christos lbrkprop_lookup (unsigned int uc)
    583  1.1  christos {
    584  1.1  christos   unsigned int index1 = uc >> lbrkprop_header_0;
    585  1.1  christos   if (index1 < lbrkprop_header_1)
    586  1.1  christos     {
    587  1.1  christos       int lookup1 = lbrkprop.level1[index1];
    588  1.1  christos       if (lookup1 >= 0)
    589  1.1  christos         {
    590  1.1  christos           unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
    591  1.1  christos           int lookup2 = lbrkprop.level2[lookup1 + index2];
    592  1.1  christos           if (lookup2 >= 0)
    593  1.1  christos             {
    594  1.1  christos               unsigned int index3 = uc & lbrkprop_header_4;
    595  1.1  christos               return lbrkprop.level3[lookup2 + index3];
    596  1.1  christos             }
    597  1.1  christos         }
    598  1.1  christos     }
    599  1.1  christos   return LBP_XX;
    600  1.1  christos }
    601  1.1  christos 
    602  1.1  christos /* Table indexed by two line breaking classifications.  */
    603  1.1  christos #define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
    604  1.1  christos #define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
    605  1.1  christos #define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
    606  1.1  christos static const unsigned char lbrk_table[19][19] = {
    607  1.1  christos                                 /* after */
    608  1.1  christos         /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
    609  1.1  christos /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
    610  1.1  christos /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
    611  1.1  christos /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
    612  1.1  christos /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
    613  1.1  christos /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
    614  1.1  christos /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
    615  1.1  christos /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
    616  1.1  christos /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
    617  1.1  christos /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
    618  1.1  christos /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
    619  1.1  christos /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
    620  1.1  christos /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
    621  1.1  christos /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
    622  1.1  christos /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
    623  1.1  christos /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
    624  1.1  christos /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
    625  1.1  christos /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
    626  1.1  christos /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
    627  1.1  christos /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
    628  1.1  christos /* "" */
    629  1.1  christos /* before */
    630  1.1  christos };
    631  1.1  christos /* Note: The (B2,B2) entry should probably be D instead of P.  */
    632  1.1  christos /* Note: The (PR,ID) entry should probably be D instead of I.  */
    633  1.1  christos 
    634  1.1  christos void
    635  1.1  christos u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
    636  1.1  christos {
    637  1.1  christos   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
    638  1.1  christos   const unsigned char *s_end = s + n;
    639  1.1  christos   int last_prop = LBP_BK; /* line break property of last non-space character */
    640  1.1  christos   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
    641  1.1  christos   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
    642  1.1  christos 
    643  1.1  christos   /* Don't break inside multibyte characters.  */
    644  1.1  christos   memset (p, UC_BREAK_PROHIBITED, n);
    645  1.1  christos 
    646  1.1  christos   while (s < s_end)
    647  1.1  christos     {
    648  1.1  christos       unsigned int uc;
    649  1.1  christos       int count = u8_mbtouc (&uc, s, s_end - s);
    650  1.1  christos       int prop = lbrkprop_lookup (uc);
    651  1.1  christos 
    652  1.1  christos       if (prop == LBP_BK)
    653  1.1  christos         {
    654  1.1  christos           /* Mandatory break.  */
    655  1.1  christos           *p = UC_BREAK_MANDATORY;
    656  1.1  christos           last_prop = LBP_BK;
    657  1.1  christos           seen_space = NULL;
    658  1.1  christos           seen_space2 = NULL;
    659  1.1  christos         }
    660  1.1  christos       else
    661  1.1  christos         {
    662  1.1  christos           char *q;
    663  1.1  christos 
    664  1.1  christos           /* Resolve property values whose behaviour is not fixed.  */
    665  1.1  christos           switch (prop)
    666  1.1  christos             {
    667  1.1  christos               case LBP_AI:
    668  1.1  christos                 /* Resolve ambiguous.  */
    669  1.1  christos                 prop = LBP_AI_REPLACEMENT;
    670  1.1  christos                 break;
    671  1.1  christos               case LBP_CB:
    672  1.1  christos                 /* This is arbitrary.  */
    673  1.1  christos                 prop = LBP_ID;
    674  1.1  christos                 break;
    675  1.1  christos               case LBP_SA:
    676  1.1  christos                 /* We don't handle complex scripts yet.
    677  1.1  christos                    Treat LBP_SA like LBP_XX.  */
    678  1.1  christos               case LBP_XX:
    679  1.1  christos                 /* This is arbitrary.  */
    680  1.1  christos                 prop = LBP_AL;
    681  1.1  christos                 break;
    682  1.1  christos             }
    683  1.1  christos 
    684  1.1  christos           /* Deal with combining characters.  */
    685  1.1  christos           q = p;
    686  1.1  christos           if (prop == LBP_CM)
    687  1.1  christos             {
    688  1.1  christos               /* Don't break just before a combining character.  */
    689  1.1  christos               *p = UC_BREAK_PROHIBITED;
    690  1.1  christos               /* A combining character turns a preceding space into LBP_AL.  */
    691  1.1  christos               if (seen_space != NULL)
    692  1.1  christos                 {
    693  1.1  christos                   q = seen_space;
    694  1.1  christos                   seen_space = seen_space2;
    695  1.1  christos                   prop = LBP_AL;
    696  1.1  christos                   goto lookup_via_table;
    697  1.1  christos                 }
    698  1.1  christos             }
    699  1.1  christos           else if (prop == LBP_SP)
    700  1.1  christos             {
    701  1.1  christos               /* Don't break just before a space.  */
    702  1.1  christos               *p = UC_BREAK_PROHIBITED;
    703  1.1  christos               seen_space2 = seen_space;
    704  1.1  christos               seen_space = p;
    705  1.1  christos             }
    706  1.1  christos           else
    707  1.1  christos             {
    708  1.1  christos              lookup_via_table:
    709  1.1  christos               /* prop must be usable as an index for table 7.3 of UTR #14.  */
    710  1.1  christos               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
    711  1.1  christos                 abort ();
    712  1.1  christos 
    713  1.1  christos               if (last_prop == LBP_BK)
    714  1.1  christos                 {
    715  1.1  christos                   /* Don't break at the beginning of a line.  */
    716  1.1  christos                   *q = UC_BREAK_PROHIBITED;
    717  1.1  christos                 }
    718  1.1  christos               else
    719  1.1  christos                 {
    720  1.1  christos                   switch (lbrk_table [last_prop-1] [prop-1])
    721  1.1  christos                     {
    722  1.1  christos                       case D:
    723  1.1  christos                         *q = UC_BREAK_POSSIBLE;
    724  1.1  christos                         break;
    725  1.1  christos                       case I:
    726  1.1  christos                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
    727  1.1  christos                         break;
    728  1.1  christos                       case P:
    729  1.1  christos                         *q = UC_BREAK_PROHIBITED;
    730  1.1  christos                         break;
    731  1.1  christos                       default:
    732  1.1  christos                         abort ();
    733  1.1  christos                     }
    734  1.1  christos                 }
    735  1.1  christos               last_prop = prop;
    736  1.1  christos               seen_space = NULL;
    737  1.1  christos               seen_space2 = NULL;
    738  1.1  christos             }
    739  1.1  christos         }
    740  1.1  christos 
    741  1.1  christos       s += count;
    742  1.1  christos       p += count;
    743  1.1  christos     }
    744  1.1  christos }
    745  1.1  christos 
    746  1.1  christos #ifdef unused
    747  1.1  christos 
    748  1.1  christos void
    749  1.1  christos u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
    750  1.1  christos {
    751  1.1  christos   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
    752  1.1  christos   const unsigned short *s_end = s + n;
    753  1.1  christos   int last_prop = LBP_BK; /* line break property of last non-space character */
    754  1.1  christos   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
    755  1.1  christos   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
    756  1.1  christos 
    757  1.1  christos   /* Don't break inside multibyte characters.  */
    758  1.1  christos   memset (p, UC_BREAK_PROHIBITED, n);
    759  1.1  christos 
    760  1.1  christos   while (s < s_end)
    761  1.1  christos     {
    762  1.1  christos       unsigned int uc;
    763  1.1  christos       int count = u16_mbtouc (&uc, s, s_end - s);
    764  1.1  christos       int prop = lbrkprop_lookup (uc);
    765  1.1  christos 
    766  1.1  christos       if (prop == LBP_BK)
    767  1.1  christos         {
    768  1.1  christos           /* Mandatory break.  */
    769  1.1  christos           *p = UC_BREAK_MANDATORY;
    770  1.1  christos           last_prop = LBP_BK;
    771  1.1  christos           seen_space = NULL;
    772  1.1  christos           seen_space2 = NULL;
    773  1.1  christos         }
    774  1.1  christos       else
    775  1.1  christos         {
    776  1.1  christos           char *q;
    777  1.1  christos 
    778  1.1  christos           /* Resolve property values whose behaviour is not fixed.  */
    779  1.1  christos           switch (prop)
    780  1.1  christos             {
    781  1.1  christos               case LBP_AI:
    782  1.1  christos                 /* Resolve ambiguous.  */
    783  1.1  christos                 prop = LBP_AI_REPLACEMENT;
    784  1.1  christos                 break;
    785  1.1  christos               case LBP_CB:
    786  1.1  christos                 /* This is arbitrary.  */
    787  1.1  christos                 prop = LBP_ID;
    788  1.1  christos                 break;
    789  1.1  christos               case LBP_SA:
    790  1.1  christos                 /* We don't handle complex scripts yet.
    791  1.1  christos                    Treat LBP_SA like LBP_XX.  */
    792  1.1  christos               case LBP_XX:
    793  1.1  christos                 /* This is arbitrary.  */
    794  1.1  christos                 prop = LBP_AL;
    795  1.1  christos                 break;
    796  1.1  christos             }
    797  1.1  christos 
    798  1.1  christos           /* Deal with combining characters.  */
    799  1.1  christos           q = p;
    800  1.1  christos           if (prop == LBP_CM)
    801  1.1  christos             {
    802  1.1  christos               /* Don't break just before a combining character.  */
    803  1.1  christos               *p = UC_BREAK_PROHIBITED;
    804  1.1  christos               /* A combining character turns a preceding space into LBP_AL.  */
    805  1.1  christos               if (seen_space != NULL)
    806  1.1  christos                 {
    807  1.1  christos                   q = seen_space;
    808  1.1  christos                   seen_space = seen_space2;
    809  1.1  christos                   prop = LBP_AL;
    810  1.1  christos                   goto lookup_via_table;
    811  1.1  christos                 }
    812  1.1  christos             }
    813  1.1  christos           else if (prop == LBP_SP)
    814  1.1  christos             {
    815  1.1  christos               /* Don't break just before a space.  */
    816  1.1  christos               *p = UC_BREAK_PROHIBITED;
    817  1.1  christos               seen_space2 = seen_space;
    818  1.1  christos               seen_space = p;
    819  1.1  christos             }
    820  1.1  christos           else
    821  1.1  christos             {
    822  1.1  christos              lookup_via_table:
    823  1.1  christos               /* prop must be usable as an index for table 7.3 of UTR #14.  */
    824  1.1  christos               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
    825  1.1  christos                 abort ();
    826  1.1  christos 
    827  1.1  christos               if (last_prop == LBP_BK)
    828  1.1  christos                 {
    829  1.1  christos                   /* Don't break at the beginning of a line.  */
    830  1.1  christos                   *q = UC_BREAK_PROHIBITED;
    831  1.1  christos                 }
    832  1.1  christos               else
    833  1.1  christos                 {
    834  1.1  christos                   switch (lbrk_table [last_prop-1] [prop-1])
    835  1.1  christos                     {
    836  1.1  christos                       case D:
    837  1.1  christos                         *q = UC_BREAK_POSSIBLE;
    838  1.1  christos                         break;
    839  1.1  christos                       case I:
    840  1.1  christos                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
    841  1.1  christos                         break;
    842  1.1  christos                       case P:
    843  1.1  christos                         *q = UC_BREAK_PROHIBITED;
    844  1.1  christos                         break;
    845  1.1  christos                       default:
    846  1.1  christos                         abort ();
    847  1.1  christos                     }
    848  1.1  christos                 }
    849  1.1  christos               last_prop = prop;
    850  1.1  christos               seen_space = NULL;
    851  1.1  christos               seen_space2 = NULL;
    852  1.1  christos             }
    853  1.1  christos         }
    854  1.1  christos 
    855  1.1  christos       s += count;
    856  1.1  christos       p += count;
    857  1.1  christos     }
    858  1.1  christos }
    859  1.1  christos 
    860  1.1  christos void
    861  1.1  christos u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
    862  1.1  christos {
    863  1.1  christos   int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
    864  1.1  christos   const unsigned int *s_end = s + n;
    865  1.1  christos   int last_prop = LBP_BK; /* line break property of last non-space character */
    866  1.1  christos   char *seen_space = NULL; /* Was a space seen after the last non-space character? */
    867  1.1  christos   char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
    868  1.1  christos 
    869  1.1  christos   while (s < s_end)
    870  1.1  christos     {
    871  1.1  christos       unsigned int uc = *s;
    872  1.1  christos       int prop = lbrkprop_lookup (uc);
    873  1.1  christos 
    874  1.1  christos       if (prop == LBP_BK)
    875  1.1  christos         {
    876  1.1  christos           /* Mandatory break.  */
    877  1.1  christos           *p = UC_BREAK_MANDATORY;
    878  1.1  christos           last_prop = LBP_BK;
    879  1.1  christos           seen_space = NULL;
    880  1.1  christos           seen_space2 = NULL;
    881  1.1  christos         }
    882  1.1  christos       else
    883  1.1  christos         {
    884  1.1  christos           char *q;
    885  1.1  christos 
    886  1.1  christos           /* Resolve property values whose behaviour is not fixed.  */
    887  1.1  christos           switch (prop)
    888  1.1  christos             {
    889  1.1  christos               case LBP_AI:
    890  1.1  christos                 /* Resolve ambiguous.  */
    891  1.1  christos                 prop = LBP_AI_REPLACEMENT;
    892  1.1  christos                 break;
    893  1.1  christos               case LBP_CB:
    894  1.1  christos                 /* This is arbitrary.  */
    895  1.1  christos                 prop = LBP_ID;
    896  1.1  christos                 break;
    897  1.1  christos               case LBP_SA:
    898  1.1  christos                 /* We don't handle complex scripts yet.
    899  1.1  christos                    Treat LBP_SA like LBP_XX.  */
    900  1.1  christos               case LBP_XX:
    901  1.1  christos                 /* This is arbitrary.  */
    902  1.1  christos                 prop = LBP_AL;
    903  1.1  christos                 break;
    904  1.1  christos             }
    905  1.1  christos 
    906  1.1  christos           /* Deal with combining characters.  */
    907  1.1  christos           q = p;
    908  1.1  christos           if (prop == LBP_CM)
    909  1.1  christos             {
    910  1.1  christos               /* Don't break just before a combining character.  */
    911  1.1  christos               *p = UC_BREAK_PROHIBITED;
    912  1.1  christos               /* A combining character turns a preceding space into LBP_AL.  */
    913  1.1  christos               if (seen_space != NULL)
    914  1.1  christos                 {
    915  1.1  christos                   q = seen_space;
    916  1.1  christos                   seen_space = seen_space2;
    917  1.1  christos                   prop = LBP_AL;
    918  1.1  christos                   goto lookup_via_table;
    919  1.1  christos                 }
    920  1.1  christos             }
    921  1.1  christos           else if (prop == LBP_SP)
    922  1.1  christos             {
    923  1.1  christos               /* Don't break just before a space.  */
    924  1.1  christos               *p = UC_BREAK_PROHIBITED;
    925  1.1  christos               seen_space2 = seen_space;
    926  1.1  christos               seen_space = p;
    927  1.1  christos             }
    928  1.1  christos           else
    929  1.1  christos             {
    930  1.1  christos              lookup_via_table:
    931  1.1  christos               /* prop must be usable as an index for table 7.3 of UTR #14.  */
    932  1.1  christos               if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
    933  1.1  christos                 abort ();
    934  1.1  christos 
    935  1.1  christos               if (last_prop == LBP_BK)
    936  1.1  christos                 {
    937  1.1  christos                   /* Don't break at the beginning of a line.  */
    938  1.1  christos                   *q = UC_BREAK_PROHIBITED;
    939  1.1  christos                 }
    940  1.1  christos               else
    941  1.1  christos                 {
    942  1.1  christos                   switch (lbrk_table [last_prop-1] [prop-1])
    943  1.1  christos                     {
    944  1.1  christos                       case D:
    945  1.1  christos                         *q = UC_BREAK_POSSIBLE;
    946  1.1  christos                         break;
    947  1.1  christos                       case I:
    948  1.1  christos                         *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
    949  1.1  christos                         break;
    950  1.1  christos                       case P:
    951  1.1  christos                         *q = UC_BREAK_PROHIBITED;
    952  1.1  christos                         break;
    953  1.1  christos                       default:
    954  1.1  christos                         abort ();
    955  1.1  christos                     }
    956  1.1  christos                 }
    957  1.1  christos               last_prop = prop;
    958  1.1  christos               seen_space = NULL;
    959  1.1  christos               seen_space2 = NULL;
    960  1.1  christos             }
    961  1.1  christos         }
    962  1.1  christos 
    963  1.1  christos       s++;
    964  1.1  christos       p++;
    965  1.1  christos     }
    966  1.1  christos }
    967  1.1  christos 
    968  1.1  christos #endif
    969  1.1  christos 
    970  1.1  christos 
    971  1.1  christos /* Choose the best line breaks, assuming the uc_width function.
    972  1.1  christos    Return the column after the end of the string.  */
    973  1.1  christos 
    974  1.1  christos int
    975  1.1  christos u8_width_linebreaks (const unsigned char *s, size_t n,
    976  1.1  christos                      int width, int start_column, int at_end_columns,
    977  1.1  christos                      const char *o, const char *encoding,
    978  1.1  christos                      char *p)
    979  1.1  christos {
    980  1.1  christos   const unsigned char *s_end;
    981  1.1  christos   char *last_p;
    982  1.1  christos   int last_column;
    983  1.1  christos   int piece_width;
    984  1.1  christos 
    985  1.1  christos   u8_possible_linebreaks (s, n, encoding, p);
    986  1.1  christos 
    987  1.1  christos   s_end = s + n;
    988  1.1  christos   last_p = NULL;
    989  1.1  christos   last_column = start_column;
    990  1.1  christos   piece_width = 0;
    991  1.1  christos   while (s < s_end)
    992  1.1  christos     {
    993  1.1  christos       unsigned int uc;
    994  1.1  christos       int count = u8_mbtouc (&uc, s, s_end - s);
    995  1.1  christos 
    996  1.1  christos       /* Respect the override.  */
    997  1.1  christos       if (o != NULL && *o != UC_BREAK_UNDEFINED)
    998  1.1  christos         *p = *o;
    999  1.1  christos 
   1000  1.1  christos       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
   1001  1.1  christos         {
   1002  1.1  christos           /* An atomic piece of text ends here.  */
   1003  1.1  christos           if (last_p != NULL && last_column + piece_width > width)
   1004  1.1  christos             {
   1005  1.1  christos               /* Insert a line break.  */
   1006  1.1  christos               *last_p = UC_BREAK_POSSIBLE;
   1007  1.1  christos               last_column = 0;
   1008  1.1  christos             }
   1009  1.1  christos         }
   1010  1.1  christos 
   1011  1.1  christos       if (*p == UC_BREAK_MANDATORY)
   1012  1.1  christos         {
   1013  1.1  christos           /* uc is a line break character.  */
   1014  1.1  christos           /* Start a new piece at column 0.  */
   1015  1.1  christos           last_p = NULL;
   1016  1.1  christos           last_column = 0;
   1017  1.1  christos           piece_width = 0;
   1018  1.1  christos         }
   1019  1.1  christos       else
   1020  1.1  christos         {
   1021  1.1  christos           /* uc is not a line break character.  */
   1022  1.1  christos           int w;
   1023  1.1  christos 
   1024  1.1  christos           if (*p == UC_BREAK_POSSIBLE)
   1025  1.1  christos             {
   1026  1.1  christos               /* Start a new piece.  */
   1027  1.1  christos               last_p = p;
   1028  1.1  christos               last_column += piece_width;
   1029  1.1  christos               piece_width = 0;
   1030  1.1  christos               /* No line break for the moment, may be turned into
   1031  1.1  christos                  UC_BREAK_POSSIBLE later, via last_p. */
   1032  1.1  christos             }
   1033  1.1  christos 
   1034  1.1  christos           *p = UC_BREAK_PROHIBITED;
   1035  1.1  christos 
   1036  1.1  christos           w = uc_width (uc, encoding);
   1037  1.1  christos           if (w >= 0) /* ignore control characters in the string */
   1038  1.1  christos             piece_width += w;
   1039  1.1  christos          }
   1040  1.1  christos 
   1041  1.1  christos       s += count;
   1042  1.1  christos       p += count;
   1043  1.1  christos       if (o != NULL)
   1044  1.1  christos         o += count;
   1045  1.1  christos     }
   1046  1.1  christos 
   1047  1.1  christos   /* The last atomic piece of text ends here.  */
   1048  1.1  christos   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
   1049  1.1  christos     {
   1050  1.1  christos       /* Insert a line break.  */
   1051  1.1  christos       *last_p = UC_BREAK_POSSIBLE;
   1052  1.1  christos       last_column = 0;
   1053  1.1  christos     }
   1054  1.1  christos 
   1055  1.1  christos   return last_column + piece_width;
   1056  1.1  christos }
   1057  1.1  christos 
   1058  1.1  christos #ifdef unused
   1059  1.1  christos 
   1060  1.1  christos int
   1061  1.1  christos u16_width_linebreaks (const unsigned short *s, size_t n,
   1062  1.1  christos                       int width, int start_column, int at_end_columns,
   1063  1.1  christos                       const char *o, const char *encoding,
   1064  1.1  christos                       char *p)
   1065  1.1  christos {
   1066  1.1  christos   const unsigned short *s_end;
   1067  1.1  christos   char *last_p;
   1068  1.1  christos   int last_column;
   1069  1.1  christos   int piece_width;
   1070  1.1  christos 
   1071  1.1  christos   u16_possible_linebreaks (s, n, encoding, p);
   1072  1.1  christos 
   1073  1.1  christos   s_end = s + n;
   1074  1.1  christos   last_p = NULL;
   1075  1.1  christos   last_column = start_column;
   1076  1.1  christos   piece_width = 0;
   1077  1.1  christos   while (s < s_end)
   1078  1.1  christos     {
   1079  1.1  christos       unsigned int uc;
   1080  1.1  christos       int count = u16_mbtouc (&uc, s, s_end - s);
   1081  1.1  christos 
   1082  1.1  christos       /* Respect the override.  */
   1083  1.1  christos       if (o != NULL && *o != UC_BREAK_UNDEFINED)
   1084  1.1  christos         *p = *o;
   1085  1.1  christos 
   1086  1.1  christos       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
   1087  1.1  christos         {
   1088  1.1  christos           /* An atomic piece of text ends here.  */
   1089  1.1  christos           if (last_p != NULL && last_column + piece_width > width)
   1090  1.1  christos             {
   1091  1.1  christos               /* Insert a line break.  */
   1092  1.1  christos               *last_p = UC_BREAK_POSSIBLE;
   1093  1.1  christos               last_column = 0;
   1094  1.1  christos             }
   1095  1.1  christos         }
   1096  1.1  christos 
   1097  1.1  christos       if (*p == UC_BREAK_MANDATORY)
   1098  1.1  christos         {
   1099  1.1  christos           /* uc is a line break character.  */
   1100  1.1  christos           /* Start a new piece at column 0.  */
   1101  1.1  christos           last_p = NULL;
   1102  1.1  christos           last_column = 0;
   1103  1.1  christos           piece_width = 0;
   1104  1.1  christos         }
   1105  1.1  christos       else
   1106  1.1  christos         {
   1107  1.1  christos           /* uc is not a line break character.  */
   1108  1.1  christos           int w;
   1109  1.1  christos 
   1110  1.1  christos           if (*p == UC_BREAK_POSSIBLE)
   1111  1.1  christos             {
   1112  1.1  christos               /* Start a new piece.  */
   1113  1.1  christos               last_p = p;
   1114  1.1  christos               last_column += piece_width;
   1115  1.1  christos               piece_width = 0;
   1116  1.1  christos               /* No line break for the moment, may be turned into
   1117  1.1  christos                  UC_BREAK_POSSIBLE later, via last_p. */
   1118  1.1  christos             }
   1119  1.1  christos 
   1120  1.1  christos           *p = UC_BREAK_PROHIBITED;
   1121  1.1  christos 
   1122  1.1  christos           w = uc_width (uc, encoding);
   1123  1.1  christos           if (w >= 0) /* ignore control characters in the string */
   1124  1.1  christos             piece_width += w;
   1125  1.1  christos          }
   1126  1.1  christos 
   1127  1.1  christos       s += count;
   1128  1.1  christos       p += count;
   1129  1.1  christos       if (o != NULL)
   1130  1.1  christos         o += count;
   1131  1.1  christos     }
   1132  1.1  christos 
   1133  1.1  christos   /* The last atomic piece of text ends here.  */
   1134  1.1  christos   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
   1135  1.1  christos     {
   1136  1.1  christos       /* Insert a line break.  */
   1137  1.1  christos       *last_p = UC_BREAK_POSSIBLE;
   1138  1.1  christos       last_column = 0;
   1139  1.1  christos     }
   1140  1.1  christos 
   1141  1.1  christos   return last_column + piece_width;
   1142  1.1  christos }
   1143  1.1  christos 
   1144  1.1  christos int
   1145  1.1  christos u32_width_linebreaks (const unsigned int *s, size_t n,
   1146  1.1  christos                       int width, int start_column, int at_end_columns,
   1147  1.1  christos                       const char *o, const char *encoding,
   1148  1.1  christos                       char *p)
   1149  1.1  christos {
   1150  1.1  christos   const unsigned int *s_end;
   1151  1.1  christos   char *last_p;
   1152  1.1  christos   int last_column;
   1153  1.1  christos   int piece_width;
   1154  1.1  christos 
   1155  1.1  christos   u32_possible_linebreaks (s, n, encoding, p);
   1156  1.1  christos 
   1157  1.1  christos   s_end = s + n;
   1158  1.1  christos   last_p = NULL;
   1159  1.1  christos   last_column = start_column;
   1160  1.1  christos   piece_width = 0;
   1161  1.1  christos   while (s < s_end)
   1162  1.1  christos     {
   1163  1.1  christos       unsigned int uc = *s;
   1164  1.1  christos 
   1165  1.1  christos       /* Respect the override.  */
   1166  1.1  christos       if (o != NULL && *o != UC_BREAK_UNDEFINED)
   1167  1.1  christos         *p = *o;
   1168  1.1  christos 
   1169  1.1  christos       if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
   1170  1.1  christos         {
   1171  1.1  christos           /* An atomic piece of text ends here.  */
   1172  1.1  christos           if (last_p != NULL && last_column + piece_width > width)
   1173  1.1  christos             {
   1174  1.1  christos               /* Insert a line break.  */
   1175  1.1  christos               *last_p = UC_BREAK_POSSIBLE;
   1176  1.1  christos               last_column = 0;
   1177  1.1  christos             }
   1178  1.1  christos         }
   1179  1.1  christos 
   1180  1.1  christos       if (*p == UC_BREAK_MANDATORY)
   1181  1.1  christos         {
   1182  1.1  christos           /* uc is a line break character.  */
   1183  1.1  christos           /* Start a new piece at column 0.  */
   1184  1.1  christos           last_p = NULL;
   1185  1.1  christos           last_column = 0;
   1186  1.1  christos           piece_width = 0;
   1187  1.1  christos         }
   1188  1.1  christos       else
   1189  1.1  christos         {
   1190  1.1  christos           /* uc is not a line break character.  */
   1191  1.1  christos           int w;
   1192  1.1  christos 
   1193  1.1  christos           if (*p == UC_BREAK_POSSIBLE)
   1194  1.1  christos             {
   1195  1.1  christos               /* Start a new piece.  */
   1196  1.1  christos               last_p = p;
   1197  1.1  christos               last_column += piece_width;
   1198  1.1  christos               piece_width = 0;
   1199  1.1  christos               /* No line break for the moment, may be turned into
   1200  1.1  christos                  UC_BREAK_POSSIBLE later, via last_p. */
   1201  1.1  christos             }
   1202  1.1  christos 
   1203  1.1  christos           *p = UC_BREAK_PROHIBITED;
   1204  1.1  christos 
   1205  1.1  christos           w = uc_width (uc, encoding);
   1206  1.1  christos           if (w >= 0) /* ignore control characters in the string */
   1207  1.1  christos             piece_width += w;
   1208  1.1  christos          }
   1209  1.1  christos 
   1210  1.1  christos       s++;
   1211  1.1  christos       p++;
   1212  1.1  christos       if (o != NULL)
   1213  1.1  christos         o++;
   1214  1.1  christos     }
   1215  1.1  christos 
   1216  1.1  christos   /* The last atomic piece of text ends here.  */
   1217  1.1  christos   if (last_p != NULL && last_column + piece_width + at_end_columns > width)
   1218  1.1  christos     {
   1219  1.1  christos       /* Insert a line break.  */
   1220  1.1  christos       *last_p = UC_BREAK_POSSIBLE;
   1221  1.1  christos       last_column = 0;
   1222  1.1  christos     }
   1223  1.1  christos 
   1224  1.1  christos   return last_column + piece_width;
   1225  1.1  christos }
   1226  1.1  christos 
   1227  1.1  christos #endif
   1228  1.1  christos 
   1229  1.1  christos 
   1230  1.1  christos #ifdef TEST1
   1231  1.1  christos 
   1232  1.1  christos #include <stdio.h>
   1233  1.1  christos 
   1234  1.1  christos /* Read the contents of an input stream, and return it, terminated with a NUL
   1235  1.1  christos    byte. */
   1236  1.1  christos char *
   1237  1.1  christos read_file (FILE *stream)
   1238  1.1  christos {
   1239  1.1  christos #define BUFSIZE 4096
   1240  1.1  christos   char *buf = NULL;
   1241  1.1  christos   int alloc = 0;
   1242  1.1  christos   int size = 0;
   1243  1.1  christos   int count;
   1244  1.1  christos 
   1245  1.1  christos   while (! feof (stream))
   1246  1.1  christos     {
   1247  1.1  christos       if (size + BUFSIZE > alloc)
   1248  1.1  christos         {
   1249  1.1  christos           alloc = alloc + alloc / 2;
   1250  1.1  christos           if (alloc < size + BUFSIZE)
   1251  1.1  christos             alloc = size + BUFSIZE;
   1252  1.1  christos           buf = realloc (buf, alloc);
   1253  1.1  christos           if (buf == NULL)
   1254  1.1  christos             {
   1255  1.1  christos               fprintf (stderr, "out of memory\n");
   1256  1.1  christos               exit (1);
   1257  1.1  christos             }
   1258  1.1  christos         }
   1259  1.1  christos       count = fread (buf + size, 1, BUFSIZE, stream);
   1260  1.1  christos       if (count == 0)
   1261  1.1  christos         {
   1262  1.1  christos           if (ferror (stream))
   1263  1.1  christos             {
   1264  1.1  christos               perror ("fread");
   1265  1.1  christos               exit (1);
   1266  1.1  christos             }
   1267  1.1  christos         }
   1268  1.1  christos       else
   1269  1.1  christos         size += count;
   1270  1.1  christos     }
   1271  1.1  christos   buf = realloc (buf, size + 1);
   1272  1.1  christos   if (buf == NULL)
   1273  1.1  christos     {
   1274  1.1  christos       fprintf (stderr, "out of memory\n");
   1275  1.1  christos       exit (1);
   1276  1.1  christos     }
   1277  1.1  christos   buf[size] = '\0';
   1278  1.1  christos   return buf;
   1279  1.1  christos #undef BUFSIZE
   1280  1.1  christos }
   1281  1.1  christos 
   1282  1.1  christos int
   1283  1.1  christos main (int argc, char * argv[])
   1284  1.1  christos {
   1285  1.1  christos   if (argc == 1)
   1286  1.1  christos     {
   1287  1.1  christos       /* Display all the break opportunities in the input string.  */
   1288  1.1  christos       char *input = read_file (stdin);
   1289  1.1  christos       int length = strlen (input);
   1290  1.1  christos       char *breaks = malloc (length);
   1291  1.1  christos       int i;
   1292  1.1  christos 
   1293  1.1  christos       u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
   1294  1.1  christos 
   1295  1.1  christos       for (i = 0; i < length; i++)
   1296  1.1  christos         {
   1297  1.1  christos           switch (breaks[i])
   1298  1.1  christos             {
   1299  1.1  christos               case UC_BREAK_POSSIBLE:
   1300  1.1  christos                 /* U+2027 in UTF-8 encoding */
   1301  1.1  christos                 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
   1302  1.1  christos                 break;
   1303  1.1  christos               case UC_BREAK_MANDATORY:
   1304  1.1  christos                 /* U+21B2 (or U+21B5) in UTF-8 encoding */
   1305  1.1  christos                 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
   1306  1.1  christos                 break;
   1307  1.1  christos               case UC_BREAK_PROHIBITED:
   1308  1.1  christos                 break;
   1309  1.1  christos               default:
   1310  1.1  christos                 abort ();
   1311  1.1  christos             }
   1312  1.1  christos           putc (input[i], stdout);
   1313  1.1  christos         }
   1314  1.1  christos 
   1315  1.1  christos       free (breaks);
   1316  1.1  christos 
   1317  1.1  christos       return 0;
   1318  1.1  christos     }
   1319  1.1  christos   else if (argc == 2)
   1320  1.1  christos     {
   1321  1.1  christos       /* Insert line breaks for a given width.  */
   1322  1.1  christos       int width = atoi (argv[1]);
   1323  1.1  christos       char *input = read_file (stdin);
   1324  1.1  christos       int length = strlen (input);
   1325  1.1  christos       char *breaks = malloc (length);
   1326  1.1  christos       int i;
   1327  1.1  christos 
   1328  1.1  christos       u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
   1329  1.1  christos 
   1330  1.1  christos       for (i = 0; i < length; i++)
   1331  1.1  christos         {
   1332  1.1  christos           switch (breaks[i])
   1333  1.1  christos             {
   1334  1.1  christos               case UC_BREAK_POSSIBLE:
   1335  1.1  christos                 putc ('\n', stdout);
   1336  1.1  christos                 break;
   1337  1.1  christos               case UC_BREAK_MANDATORY:
   1338  1.1  christos                 break;
   1339  1.1  christos               case UC_BREAK_PROHIBITED:
   1340  1.1  christos                 break;
   1341  1.1  christos               default:
   1342  1.1  christos                 abort ();
   1343  1.1  christos             }
   1344  1.1  christos           putc (input[i], stdout);
   1345  1.1  christos         }
   1346  1.1  christos 
   1347  1.1  christos       free (breaks);
   1348  1.1  christos 
   1349  1.1  christos       return 0;
   1350  1.1  christos     }
   1351  1.1  christos   else
   1352  1.1  christos     return 1;
   1353  1.1  christos }
   1354  1.1  christos 
   1355  1.1  christos #endif /* TEST1 */
   1356  1.1  christos 
   1357  1.1  christos 
   1358  1.1  christos /* Now the same thing with an arbitrary encoding.
   1359  1.1  christos 
   1360  1.1  christos    We convert the input string to Unicode.
   1361  1.1  christos 
   1362  1.1  christos    The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
   1363  1.1  christos    UTF-16BE, UTF-16LE, UTF-7.  UCS-2 supports only characters up to
   1364  1.1  christos    \U0000FFFF.  UTF-16 and variants support only characters up to
   1365  1.1  christos    \U0010FFFF.  UTF-7 is way too complex and not supported by glibc-2.1.
   1366  1.1  christos    UCS-4 specification leaves doubts about endianness and byte order mark.
   1367  1.1  christos    glibc currently interprets it as big endian without byte order mark,
   1368  1.1  christos    but this is not backed by an RFC.  So we use UTF-8. It supports
   1369  1.1  christos    characters up to \U7FFFFFFF and is unambiguously defined.  */
   1370  1.1  christos 
   1371  1.1  christos #if HAVE_ICONV
   1372  1.1  christos 
   1373  1.1  christos #include <iconv.h>
   1374  1.1  christos #include <errno.h>
   1375  1.1  christos 
   1376  1.1  christos /* Luckily, the encoding's name is platform independent.  */
   1377  1.1  christos #define UTF8_NAME "UTF-8"
   1378  1.1  christos 
   1379  1.1  christos /* Return the length of a string after conversion through an iconv_t.  */
   1380  1.1  christos static size_t
   1381  1.1  christos iconv_string_length (iconv_t cd, const char *s, size_t n)
   1382  1.1  christos {
   1383  1.1  christos #define TMPBUFSIZE 4096
   1384  1.1  christos   size_t count = 0;
   1385  1.1  christos   char tmpbuf[TMPBUFSIZE];
   1386  1.1  christos   const char *inptr = s;
   1387  1.1  christos   size_t insize = n;
   1388  1.1  christos   while (insize > 0)
   1389  1.1  christos     {
   1390  1.1  christos       char *outptr = tmpbuf;
   1391  1.1  christos       size_t outsize = TMPBUFSIZE;
   1392  1.1  christos       size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
   1393  1.1  christos       if (res == (size_t)(-1) && errno != E2BIG)
   1394  1.1  christos         return (size_t)(-1);
   1395  1.1  christos       count += outptr - tmpbuf;
   1396  1.1  christos     }
   1397  1.1  christos   /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
   1398  1.1  christos #if defined _LIBICONV_VERSION \
   1399  1.1  christos     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
   1400  1.1  christos   {
   1401  1.1  christos     char *outptr = tmpbuf;
   1402  1.1  christos     size_t outsize = TMPBUFSIZE;
   1403  1.1  christos     size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
   1404  1.1  christos     if (res == (size_t)(-1))
   1405  1.1  christos       return (size_t)(-1);
   1406  1.1  christos     count += outptr - tmpbuf;
   1407  1.1  christos   }
   1408  1.1  christos   /* Return to the initial state.  */
   1409  1.1  christos   iconv (cd, NULL, NULL, NULL, NULL);
   1410  1.1  christos #endif
   1411  1.1  christos   return count;
   1412  1.1  christos #undef TMPBUFSIZE
   1413  1.1  christos }
   1414  1.1  christos 
   1415  1.1  christos static void
   1416  1.1  christos iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
   1417  1.1  christos                               size_t *offtable, char *t, size_t m)
   1418  1.1  christos {
   1419  1.1  christos   size_t i;
   1420  1.1  christos   const char *s_end;
   1421  1.1  christos   const char *inptr;
   1422  1.1  christos   char *outptr;
   1423  1.1  christos   size_t outsize;
   1424  1.1  christos   /* Avoid glibc-2.1 bug.  */
   1425  1.1  christos #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
   1426  1.1  christos   const size_t extra = 1;
   1427  1.1  christos #else
   1428  1.1  christos   const size_t extra = 0;
   1429  1.1  christos #endif
   1430  1.1  christos 
   1431  1.1  christos   for (i = 0; i < n; i++)
   1432  1.1  christos     offtable[i] = (size_t)(-1);
   1433  1.1  christos 
   1434  1.1  christos   s_end = s + n;
   1435  1.1  christos   inptr = s;
   1436  1.1  christos   outptr = t;
   1437  1.1  christos   outsize = m + extra;
   1438  1.1  christos   while (inptr < s_end)
   1439  1.1  christos     {
   1440  1.1  christos       const char *saved_inptr;
   1441  1.1  christos       size_t insize;
   1442  1.1  christos       size_t res;
   1443  1.1  christos 
   1444  1.1  christos       offtable[inptr - s] = outptr - t;
   1445  1.1  christos 
   1446  1.1  christos       saved_inptr = inptr;
   1447  1.1  christos       res = (size_t)(-1);
   1448  1.1  christos       for (insize = 1; inptr + insize <= s_end; insize++)
   1449  1.1  christos         {
   1450  1.1  christos           res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
   1451  1.1  christos           if (!(res == (size_t)(-1) && errno == EINVAL))
   1452  1.1  christos             break;
   1453  1.1  christos           /* We expect that no input bytes have been consumed so far.  */
   1454  1.1  christos           if (inptr != saved_inptr)
   1455  1.1  christos             abort ();
   1456  1.1  christos         }
   1457  1.1  christos       /* After we verified the convertibility and computed the translation's
   1458  1.1  christos          size m, there shouldn't be any conversion error here. */
   1459  1.1  christos       if (res == (size_t)(-1))
   1460  1.1  christos         abort ();
   1461  1.1  christos     }
   1462  1.1  christos   /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
   1463  1.1  christos #if defined _LIBICONV_VERSION \
   1464  1.1  christos     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
   1465  1.1  christos   if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
   1466  1.1  christos     abort ();
   1467  1.1  christos #endif
   1468  1.1  christos   /* We should have produced exactly m output bytes.  */
   1469  1.1  christos   if (outsize != extra)
   1470  1.1  christos     abort ();
   1471  1.1  christos }
   1472  1.1  christos 
   1473  1.1  christos #endif /* HAVE_ICONV */
   1474  1.1  christos 
   1475  1.1  christos #if C_CTYPE_ASCII
   1476  1.1  christos 
   1477  1.1  christos /* Tests whether a string is entirely ASCII.  Returns 1 if yes.
   1478  1.1  christos    Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding.  */
   1479  1.1  christos static int
   1480  1.1  christos is_all_ascii (const char *s, size_t n)
   1481  1.1  christos {
   1482  1.1  christos   for (; n > 0; s++, n--)
   1483  1.1  christos     {
   1484  1.1  christos       unsigned char c = (unsigned char) *s;
   1485  1.1  christos 
   1486  1.1  christos       if (!(c_isprint (c) || c_isspace (c)))
   1487  1.1  christos 	return 0;
   1488  1.1  christos     }
   1489  1.1  christos   return 1;
   1490  1.1  christos }
   1491  1.1  christos 
   1492  1.1  christos #endif /* C_CTYPE_ASCII */
   1493  1.1  christos 
   1494  1.1  christos #if defined unused || defined TEST2
   1495  1.1  christos 
   1496  1.1  christos void
   1497  1.1  christos mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
   1498  1.1  christos                          char *p)
   1499  1.1  christos {
   1500  1.1  christos   if (n == 0)
   1501  1.1  christos     return;
   1502  1.1  christos   if (is_utf8_encoding (encoding))
   1503  1.1  christos     u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
   1504  1.1  christos   else
   1505  1.1  christos     {
   1506  1.1  christos #if HAVE_ICONV
   1507  1.1  christos       iconv_t to_utf8;
   1508  1.1  christos       /* Avoid glibc-2.1 bug with EUC-KR.  */
   1509  1.1  christos # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
   1510  1.1  christos       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
   1511  1.1  christos 	to_utf8 = (iconv_t)(-1);
   1512  1.1  christos       else
   1513  1.1  christos # endif
   1514  1.1  christos       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
   1515  1.1  christos          GB18030.  */
   1516  1.1  christos # if defined __sun && !defined _LIBICONV_VERSION
   1517  1.1  christos       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
   1518  1.1  christos           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
   1519  1.1  christos           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
   1520  1.1  christos           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
   1521  1.1  christos           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
   1522  1.1  christos           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
   1523  1.1  christos         to_utf8 = (iconv_t)(-1);
   1524  1.1  christos       else
   1525  1.1  christos # endif
   1526  1.1  christos       to_utf8 = iconv_open (UTF8_NAME, encoding);
   1527  1.1  christos       if (to_utf8 != (iconv_t)(-1))
   1528  1.1  christos         {
   1529  1.1  christos           /* Determine the length of the resulting UTF-8 string.  */
   1530  1.1  christos           size_t m = iconv_string_length (to_utf8, s, n);
   1531  1.1  christos           if (m != (size_t)(-1))
   1532  1.1  christos             {
   1533  1.1  christos               /* Convert the string to UTF-8 and build a translation table
   1534  1.1  christos                  from offsets into s to offsets into the translated string.  */
   1535  1.1  christos 	      size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
   1536  1.1  christos               char *memory =
   1537  1.1  christos 		(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
   1538  1.1  christos               if (memory != NULL)
   1539  1.1  christos                 {
   1540  1.1  christos                   size_t *offtable = (size_t *) memory;
   1541  1.1  christos                   char *t = (char *) (offtable + n);
   1542  1.1  christos                   char *q = (char *) (t + m);
   1543  1.1  christos                   size_t i;
   1544  1.1  christos 
   1545  1.1  christos                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
   1546  1.1  christos 
   1547  1.1  christos                   /* Determine the possible line breaks of the UTF-8 string.  */
   1548  1.1  christos                   u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
   1549  1.1  christos 
   1550  1.1  christos                   /* Translate the result back to the original string.  */
   1551  1.1  christos                   memset (p, UC_BREAK_PROHIBITED, n);
   1552  1.1  christos                   for (i = 0; i < n; i++)
   1553  1.1  christos                     if (offtable[i] != (size_t)(-1))
   1554  1.1  christos                       p[i] = q[offtable[i]];
   1555  1.1  christos 
   1556  1.1  christos                   free (memory);
   1557  1.1  christos                   iconv_close (to_utf8);
   1558  1.1  christos                   return;
   1559  1.1  christos                 }
   1560  1.1  christos             }
   1561  1.1  christos           iconv_close (to_utf8);
   1562  1.1  christos         }
   1563  1.1  christos #endif
   1564  1.1  christos       /* Impossible to convert.  */
   1565  1.1  christos #if C_CTYPE_ASCII
   1566  1.1  christos       if (is_all_ascii (s, n))
   1567  1.1  christos 	{
   1568  1.1  christos 	  /* ASCII is a subset of UTF-8.  */
   1569  1.1  christos 	  u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
   1570  1.1  christos 	  return;
   1571  1.1  christos 	}
   1572  1.1  christos #endif
   1573  1.1  christos       /* We have a non-ASCII string and cannot convert it.
   1574  1.1  christos 	 Don't produce line breaks except those already present in the
   1575  1.1  christos 	 input string.  All we assume here is that the encoding is
   1576  1.1  christos 	 minimally ASCII compatible.  */
   1577  1.1  christos       {
   1578  1.1  christos         const char *s_end = s + n;
   1579  1.1  christos         while (s < s_end)
   1580  1.1  christos           {
   1581  1.1  christos             *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
   1582  1.1  christos             s++;
   1583  1.1  christos             p++;
   1584  1.1  christos           }
   1585  1.1  christos       }
   1586  1.1  christos     }
   1587  1.1  christos }
   1588  1.1  christos 
   1589  1.1  christos #endif
   1590  1.1  christos 
   1591  1.1  christos int
   1592  1.1  christos mbs_width_linebreaks (const char *s, size_t n,
   1593  1.1  christos                       int width, int start_column, int at_end_columns,
   1594  1.1  christos                       const char *o, const char *encoding,
   1595  1.1  christos                       char *p)
   1596  1.1  christos {
   1597  1.1  christos   if (n == 0)
   1598  1.1  christos     return start_column;
   1599  1.1  christos   if (is_utf8_encoding (encoding))
   1600  1.1  christos     return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
   1601  1.1  christos   else
   1602  1.1  christos     {
   1603  1.1  christos #if HAVE_ICONV
   1604  1.1  christos       iconv_t to_utf8;
   1605  1.1  christos       /* Avoid glibc-2.1 bug with EUC-KR.  */
   1606  1.1  christos # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
   1607  1.1  christos       if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
   1608  1.1  christos 	to_utf8 = (iconv_t)(-1);
   1609  1.1  christos       else
   1610  1.1  christos # endif
   1611  1.1  christos       /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
   1612  1.1  christos          GB18030.  */
   1613  1.1  christos # if defined __sun && !defined _LIBICONV_VERSION
   1614  1.1  christos       if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
   1615  1.1  christos           || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
   1616  1.1  christos           || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
   1617  1.1  christos           || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
   1618  1.1  christos           || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
   1619  1.1  christos           || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
   1620  1.1  christos         to_utf8 = (iconv_t)(-1);
   1621  1.1  christos       else
   1622  1.1  christos # endif
   1623  1.1  christos       to_utf8 = iconv_open (UTF8_NAME, encoding);
   1624  1.1  christos       if (to_utf8 != (iconv_t)(-1))
   1625  1.1  christos         {
   1626  1.1  christos           /* Determine the length of the resulting UTF-8 string.  */
   1627  1.1  christos           size_t m = iconv_string_length (to_utf8, s, n);
   1628  1.1  christos           if (m != (size_t)(-1))
   1629  1.1  christos             {
   1630  1.1  christos               /* Convert the string to UTF-8 and build a translation table
   1631  1.1  christos                  from offsets into s to offsets into the translated string.  */
   1632  1.1  christos 	      size_t memory_size =
   1633  1.1  christos 		xsum4 (xtimes (n, sizeof (size_t)), m, m,
   1634  1.1  christos 		       (o != NULL ? m : 0));
   1635  1.1  christos 	      char *memory =
   1636  1.1  christos 		(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
   1637  1.1  christos               if (memory != NULL)
   1638  1.1  christos                 {
   1639  1.1  christos                   size_t *offtable = (size_t *) memory;
   1640  1.1  christos                   char *t = (char *) (offtable + n);
   1641  1.1  christos                   char *q = (char *) (t + m);
   1642  1.1  christos                   char *o8 = (o != NULL ? (char *) (q + m) : NULL);
   1643  1.1  christos                   int res_column;
   1644  1.1  christos                   size_t i;
   1645  1.1  christos 
   1646  1.1  christos                   iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
   1647  1.1  christos 
   1648  1.1  christos                   /* Translate the overrides to the UTF-8 string.  */
   1649  1.1  christos                   if (o != NULL)
   1650  1.1  christos                     {
   1651  1.1  christos                       memset (o8, UC_BREAK_UNDEFINED, m);
   1652  1.1  christos                       for (i = 0; i < n; i++)
   1653  1.1  christos                         if (offtable[i] != (size_t)(-1))
   1654  1.1  christos                           o8[offtable[i]] = o[i];
   1655  1.1  christos                     }
   1656  1.1  christos 
   1657  1.1  christos                   /* Determine the line breaks of the UTF-8 string.  */
   1658  1.1  christos                   res_column =
   1659  1.1  christos                     u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
   1660  1.1  christos 
   1661  1.1  christos                   /* Translate the result back to the original string.  */
   1662  1.1  christos                   memset (p, UC_BREAK_PROHIBITED, n);
   1663  1.1  christos                   for (i = 0; i < n; i++)
   1664  1.1  christos                     if (offtable[i] != (size_t)(-1))
   1665  1.1  christos                       p[i] = q[offtable[i]];
   1666  1.1  christos 
   1667  1.1  christos                   free (memory);
   1668  1.1  christos                   iconv_close (to_utf8);
   1669  1.1  christos                   return res_column;
   1670  1.1  christos                 }
   1671  1.1  christos             }
   1672  1.1  christos           iconv_close (to_utf8);
   1673  1.1  christos         }
   1674  1.1  christos #endif
   1675  1.1  christos       /* Impossible to convert.  */
   1676  1.1  christos #if C_CTYPE_ASCII
   1677  1.1  christos       if (is_all_ascii (s, n))
   1678  1.1  christos 	{
   1679  1.1  christos 	  /* ASCII is a subset of UTF-8.  */
   1680  1.1  christos 	  return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
   1681  1.1  christos 	}
   1682  1.1  christos #endif
   1683  1.1  christos       /* We have a non-ASCII string and cannot convert it.
   1684  1.1  christos 	 Don't produce line breaks except those already present in the
   1685  1.1  christos 	 input string.  All we assume here is that the encoding is
   1686  1.1  christos 	 minimally ASCII compatible.  */
   1687  1.1  christos       {
   1688  1.1  christos         const char *s_end = s + n;
   1689  1.1  christos         while (s < s_end)
   1690  1.1  christos           {
   1691  1.1  christos             *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
   1692  1.1  christos                   ? UC_BREAK_MANDATORY
   1693  1.1  christos                   : UC_BREAK_PROHIBITED);
   1694  1.1  christos             s++;
   1695  1.1  christos             p++;
   1696  1.1  christos             if (o != NULL)
   1697  1.1  christos               o++;
   1698  1.1  christos           }
   1699  1.1  christos         /* We cannot compute widths in this case.  */
   1700  1.1  christos         return start_column;
   1701  1.1  christos       }
   1702  1.1  christos     }
   1703  1.1  christos }
   1704  1.1  christos 
   1705  1.1  christos 
   1706  1.1  christos #ifdef TEST2
   1707  1.1  christos 
   1708  1.1  christos #include <stdio.h>
   1709  1.1  christos #include <locale.h>
   1710  1.1  christos 
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.
   STREAM is read from its current position until end of file.
   The result is obtained from realloc(); the caller owns it and releases it
   with free().
   If memory runs out or a read error occurs, a message is written to stderr
   and the program exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  /* Sizes are kept in size_t: fread() returns size_t, and an 'int' counter
     would overflow on inputs of 2 GiB or more.  */
  size_t alloc = 0;
  size_t size = 0;
  char *shrunk;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more chunk of BUFSIZE bytes,
         growing the buffer by a factor of 1.5 when possible.  */
      if (size + BUFSIZE > alloc)
        {
          size_t new_alloc = alloc + alloc / 2;
          char *new_buf;

          if (new_alloc < size + BUFSIZE)
            new_alloc = size + BUFSIZE;
          new_buf = realloc (buf, new_alloc);
          if (new_buf == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = new_buf;
          alloc = new_alloc;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;

      /* Report a read error right away, while errno still describes it,
         even if some bytes were delivered before the failure.  */
      if (ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Trim the buffer to the exact size, plus one byte for the terminator.
     This also allocates the single byte needed for an empty stream.  */
  shrunk = realloc (buf, size + 1);
  if (shrunk == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  shrunk[size] = '\0';
  return shrunk;
#undef BUFSIZE
}
   1758  1.1  christos 
   1759  1.1  christos int
   1760  1.1  christos main (int argc, char * argv[])
   1761  1.1  christos {
   1762  1.1  christos   setlocale (LC_CTYPE, "");
   1763  1.1  christos   if (argc == 1)
   1764  1.1  christos     {
   1765  1.1  christos       /* Display all the break opportunities in the input string.  */
   1766  1.1  christos       char *input = read_file (stdin);
   1767  1.1  christos       int length = strlen (input);
   1768  1.1  christos       char *breaks = malloc (length);
   1769  1.1  christos       int i;
   1770  1.1  christos 
   1771  1.1  christos       mbs_possible_linebreaks (input, length, locale_charset (), breaks);
   1772  1.1  christos 
   1773  1.1  christos       for (i = 0; i < length; i++)
   1774  1.1  christos         {
   1775  1.1  christos           switch (breaks[i])
   1776  1.1  christos             {
   1777  1.1  christos               case UC_BREAK_POSSIBLE:
   1778  1.1  christos                 putc ('|', stdout);
   1779  1.1  christos                 break;
   1780  1.1  christos               case UC_BREAK_MANDATORY:
   1781  1.1  christos                 break;
   1782  1.1  christos               case UC_BREAK_PROHIBITED:
   1783  1.1  christos                 break;
   1784  1.1  christos               default:
   1785  1.1  christos                 abort ();
   1786  1.1  christos             }
   1787  1.1  christos           putc (input[i], stdout);
   1788  1.1  christos         }
   1789  1.1  christos 
   1790  1.1  christos       free (breaks);
   1791  1.1  christos 
   1792  1.1  christos       return 0;
   1793  1.1  christos     }
   1794  1.1  christos   else if (argc == 2)
   1795  1.1  christos     {
   1796  1.1  christos       /* Insert line breaks for a given width.  */
   1797  1.1  christos       int width = atoi (argv[1]);
   1798  1.1  christos       char *input = read_file (stdin);
   1799  1.1  christos       int length = strlen (input);
   1800  1.1  christos       char *breaks = malloc (length);
   1801  1.1  christos       int i;
   1802  1.1  christos 
   1803  1.1  christos       mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
   1804  1.1  christos 
   1805  1.1  christos       for (i = 0; i < length; i++)
   1806  1.1  christos         {
   1807  1.1  christos           switch (breaks[i])
   1808  1.1  christos             {
   1809  1.1  christos               case UC_BREAK_POSSIBLE:
   1810  1.1  christos                 putc ('\n', stdout);
   1811  1.1  christos                 break;
   1812  1.1  christos               case UC_BREAK_MANDATORY:
   1813  1.1  christos                 break;
   1814  1.1  christos               case UC_BREAK_PROHIBITED:
   1815  1.1  christos                 break;
   1816  1.1  christos               default:
   1817  1.1  christos                 abort ();
   1818  1.1  christos             }
   1819  1.1  christos           putc (input[i], stdout);
   1820  1.1  christos         }
   1821  1.1  christos 
   1822  1.1  christos       free (breaks);
   1823  1.1  christos 
   1824  1.1  christos       return 0;
   1825  1.1  christos     }
   1826  1.1  christos   else
   1827  1.1  christos     return 1;
   1828  1.1  christos }
   1829  1.1  christos 
   1830  1.1  christos #endif /* TEST2 */
   1831