1 1.1 christos /* linebreak.c - line breaking of Unicode strings 2 1.1 christos Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc. 3 1.1 christos Written by Bruno Haible <haible (at) clisp.cons.org>, 2001. 4 1.1 christos 5 1.1 christos This program is free software; you can redistribute it and/or modify 6 1.1 christos it under the terms of the GNU General Public License as published by 7 1.1 christos the Free Software Foundation; either version 2, or (at your option) 8 1.1 christos any later version. 9 1.1 christos 10 1.1 christos This program is distributed in the hope that it will be useful, 11 1.1 christos but WITHOUT ANY WARRANTY; without even the implied warranty of 12 1.1 christos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 1.1 christos GNU General Public License for more details. 14 1.1 christos 15 1.1 christos You should have received a copy of the GNU General Public License 16 1.1 christos along with this program; if not, write to the Free Software Foundation, 17 1.1 christos Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 1.1 christos 19 1.1 christos #include <config.h> 20 1.1 christos 21 1.1 christos /* Specification. */ 22 1.1 christos #include "linebreak.h" 23 1.1 christos 24 1.1 christos #include <stdlib.h> 25 1.1 christos #include <string.h> 26 1.1 christos #include "c-ctype.h" 27 1.1 christos #include "xsize.h" 28 1.1 christos 29 1.1 christos #include "utf8-ucs4.h" 30 1.1 christos 31 1.1 christos #ifdef unused 32 1.1 christos #include "utf16-ucs4.h" 33 1.1 christos 34 1.1 christos static inline int 35 1.1 christos u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n) 36 1.1 christos { 37 1.1 christos *puc = *s; 38 1.1 christos return 1; 39 1.1 christos } 40 1.1 christos #endif 41 1.1 christos 42 1.1 christos 43 1.1 christos /* Help GCC to generate good code for string comparisons with 44 1.1 christos immediate strings. */ 45 1.1 christos #if defined (__GNUC__) && defined (__OPTIMIZE__) 46 1.1 christos 47 1.1 christos static inline int 48 1.1 christos streq9 (const char *s1, const char *s2) 49 1.1 christos { 50 1.1 christos return strcmp (s1 + 9, s2 + 9) == 0; 51 1.1 christos } 52 1.1 christos 53 1.1 christos static inline int 54 1.1 christos streq8 (const char *s1, const char *s2, char s28) 55 1.1 christos { 56 1.1 christos if (s1[8] == s28) 57 1.1 christos { 58 1.1 christos if (s28 == 0) 59 1.1 christos return 1; 60 1.1 christos else 61 1.1 christos return streq9 (s1, s2); 62 1.1 christos } 63 1.1 christos else 64 1.1 christos return 0; 65 1.1 christos } 66 1.1 christos 67 1.1 christos static inline int 68 1.1 christos streq7 (const char *s1, const char *s2, char s27, char s28) 69 1.1 christos { 70 1.1 christos if (s1[7] == s27) 71 1.1 christos { 72 1.1 christos if (s27 == 0) 73 1.1 christos return 1; 74 1.1 christos else 75 1.1 christos return streq8 (s1, s2, s28); 76 1.1 christos } 77 1.1 christos else 78 1.1 christos return 0; 79 1.1 christos } 80 1.1 christos 81 1.1 christos static inline int 82 1.1 christos streq6 (const char *s1, const char *s2, char s26, char s27, char s28) 83 1.1 christos { 84 1.1 christos if (s1[6] == s26) 85 1.1 christos { 86 1.1 christos if (s26 == 0) 87 1.1 christos return 1; 88 1.1 christos else 89 1.1 christos return streq7 (s1, s2, s27, s28); 90 1.1 christos } 91 1.1 christos else 92 1.1 christos return 0; 93 1.1 christos } 94 1.1 christos 95 1.1 christos static inline int 96 1.1 christos streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28) 97 1.1 christos { 98 1.1 christos if (s1[5] == s25) 99 1.1 christos { 100 1.1 christos if (s25 == 0) 101 1.1 christos return 1; 102 1.1 christos else 103 1.1 christos return streq6 (s1, s2, s26, s27, s28); 104 1.1 christos } 105 1.1 christos else 106 1.1 christos return 0; 107 1.1 christos } 108 1.1 christos 109 1.1 christos static inline int 110 1.1 christos streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28) 111 1.1 christos { 112 1.1 christos if (s1[4] == s24) 113 1.1 christos { 114 1.1 christos if (s24 == 0) 115 1.1 christos return 1; 116 1.1 christos else 117 1.1 christos return streq5 (s1, s2, s25, s26, s27, s28); 118 1.1 christos } 119 1.1 christos else 120 1.1 christos return 0; 121 1.1 christos } 122 1.1 christos 123 1.1 christos static inline int 124 1.1 christos streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28) 125 1.1 christos { 126 1.1 christos if (s1[3] == s23) 127 1.1 christos { 128 1.1 christos if (s23 == 0) 129 1.1 christos return 1; 130 1.1 christos else 131 1.1 christos return streq4 (s1, s2, s24, s25, s26, s27, s28); 132 1.1 christos } 133 1.1 christos else 134 1.1 christos return 0; 135 1.1 christos } 136 1.1 christos 137 1.1 christos static inline int 138 1.1 christos streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28) 139 1.1 christos { 140 1.1 christos if (s1[2] == s22) 141 1.1 christos { 142 1.1 christos if (s22 == 0) 143 1.1 christos return 1; 144 1.1 christos else 145 1.1 christos return streq3 (s1, s2, s23, s24, s25, s26, s27, s28); 146 1.1 christos } 147 1.1 christos else 148 1.1 christos return 0; 149 1.1 christos } 150 1.1 christos 151 1.1 christos static inline int 152 1.1 christos streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28) 153 1.1 christos { 154 1.1 christos if (s1[1] == s21) 155 1.1 christos { 156 1.1 christos if (s21 == 0) 157 1.1 christos return 1; 158 1.1 christos else 159 1.1 christos return streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28); 160 1.1 christos } 161 1.1 christos else 162 1.1 christos return 0; 163 1.1 christos } 164 1.1 christos 165 1.1 christos static inline int 166 1.1 christos streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28) 167 1.1 christos { 168 1.1 christos if (s1[0] == s20) 169 1.1 christos { 170 1.1 christos if (s20 == 0) 171 1.1 christos return 1; 172 1.1 christos else 173 1.1 christos return streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28); 174 1.1 christos } 175 1.1 christos else 176 1.1 christos return 0; 177 1.1 christos } 178 1.1 christos 179 1.1 christos #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \ 180 1.1 christos streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28) 181 1.1 christos 182 1.1 christos #else 183 1.1 christos 184 1.1 christos #define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \ 185 1.1 christos (strcmp (s1, s2) == 0) 186 1.1 christos 187 1.1 christos #endif 188 1.1 christos 189 1.1 christos 190 1.1 christos static int 191 1.1 christos is_cjk_encoding (const char *encoding) 192 1.1 christos { 193 1.1 christos if (0 194 1.1 christos /* Legacy Japanese encodings */ 195 1.1 christos || STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0) 196 1.1 christos /* Legacy Chinese encodings */ 197 1.1 christos || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 198 1.1 christos || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) 199 1.1 christos || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) 200 1.1 christos || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) 201 1.1 christos /* Legacy Korean encodings */ 202 1.1 christos || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0) 203 1.1 christos || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0) 204 1.1 christos || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0)) 205 1.1 christos return 1; 206 1.1 christos return 0; 207 1.1 christos } 208 1.1 christos 209 1.1 christos static int 210 1.1 christos is_utf8_encoding (const char *encoding) 211 1.1 christos { 212 1.1 christos if (STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0 ,0)) 213 1.1 christos return 1; 214 1.1 christos return 0; 215 1.1 christos } 216 1.1 christos 217 1.1 christos 218 1.1 christos /* Determine number of column positions required for UC. */ 219 1.1 christos int uc_width (unsigned int uc, const char *encoding); 220 1.1 christos 221 1.1 christos /* 222 1.1 christos * Non-spacing attribute table. 223 1.1 christos * Consists of: 224 1.1 christos * - Non-spacing characters; generated from PropList.txt or 225 1.1 christos * "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt" 226 1.1 christos * - Format control characters; generated from 227 1.1 christos * "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt" 228 1.1 christos * - Zero width characters; generated from 229 1.1 christos * "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt" 230 1.1 christos */ 231 1.1 christos static const unsigned char nonspacing_table_data[16*64] = { 232 1.1 christos /* 0x0000-0x01ff */ 233 1.1 christos 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */ 234 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */ 235 1.1 christos 0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */ 236 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */ 237 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */ 238 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */ 239 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */ 240 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */ 241 1.1 christos /* 0x0200-0x03ff */ 242 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */ 243 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */ 244 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */ 245 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */ 246 1.1 christos 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */ 247 1.1 christos 0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */ 248 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */ 249 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */ 250 1.1 christos /* 0x0400-0x05ff */ 251 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */ 252 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */ 253 1.1 christos 0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */ 254 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */ 255 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */ 256 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */ 257 1.1 christos 0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */ 258 1.1 christos 0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */ 259 1.1 christos /* 0x0600-0x07ff */ 260 1.1 christos 0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */ 261 1.1 christos 0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */ 262 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */ 263 1.1 christos 0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */ 264 1.1 christos 0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */ 265 1.1 christos 0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */ 266 1.1 christos 0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */ 267 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */ 268 1.1 christos /* 0x0800-0x09ff */ 269 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */ 270 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */ 271 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */ 272 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */ 273 1.1 christos 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */ 274 1.1 christos 0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */ 275 1.1 christos 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */ 276 1.1 christos 0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */ 277 1.1 christos /* 0x0a00-0x0bff */ 278 1.1 christos 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */ 279 1.1 christos 0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */ 280 1.1 christos 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */ 281 1.1 christos 0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */ 282 1.1 christos 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */ 283 1.1 christos 0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */ 284 1.1 christos 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */ 285 1.1 christos 0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */ 286 1.1 christos /* 0x0c00-0x0dff */ 287 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */ 288 1.1 christos 0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */ 289 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */ 290 1.1 christos 0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */ 291 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */ 292 1.1 christos 0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */ 293 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */ 294 1.1 christos 0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */ 295 1.1 christos /* 0x0e00-0x0fff */ 296 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */ 297 1.1 christos 0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */ 298 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */ 299 1.1 christos 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */ 300 1.1 christos 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */ 301 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */ 302 1.1 christos 0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */ 303 1.1 christos 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */ 304 1.1 christos /* 0x1000-0x11ff */ 305 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */ 306 1.1 christos 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */ 307 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */ 308 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */ 309 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */ 310 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */ 311 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */ 312 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */ 313 1.1 christos /* 0x1600-0x17ff */ 314 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */ 315 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */ 316 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */ 317 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */ 318 1.1 christos 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */ 319 1.1 christos 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */ 320 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */ 321 1.1 christos 0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */ 322 1.1 christos /* 0x1800-0x19ff */ 323 1.1 christos 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */ 324 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */ 325 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */ 326 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */ 327 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */ 328 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */ 329 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */ 330 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */ 331 1.1 christos /* 0x2000-0x21ff */ 332 1.1 christos 0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */ 333 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */ 334 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */ 335 1.1 christos 0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */ 336 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */ 337 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */ 338 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */ 339 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */ 340 1.1 christos /* 0x3000-0x31ff */ 341 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */ 342 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */ 343 1.1 christos 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */ 344 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */ 345 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */ 346 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */ 347 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */ 348 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */ 349 1.1 christos /* 0xfa00-0xfbff */ 350 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */ 351 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */ 352 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */ 353 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */ 354 1.1 christos 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */ 355 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */ 356 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */ 357 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */ 358 1.1 christos /* 0xfe00-0xffff */ 359 1.1 christos 0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */ 360 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */ 361 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */ 362 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */ 363 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */ 364 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */ 365 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */ 366 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */ 367 1.1 christos /* 0x1d000-0x1d1ff */ 368 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */ 369 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */ 370 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */ 371 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */ 372 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */ 373 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */ 374 1.1 christos 0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */ 375 1.1 christos 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* 0x1d1c0-0x1d1ff */ 376 1.1 christos }; 377 1.1 christos static const signed char nonspacing_table_ind[240] = { 378 1.1 christos 0, 1, 2, 3, 4, 5, 6, 7, /* 0x0000-0x0fff */ 379 1.1 christos 8, -1, -1, 9, 10, -1, -1, -1, /* 0x1000-0x1fff */ 380 1.1 christos 11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */ 381 1.1 christos 12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */ 382 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */ 383 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */ 384 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */ 385 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */ 386 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */ 387 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */ 388 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */ 389 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */ 390 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */ 391 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */ 392 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */ 393 1.1 christos -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */ 394 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */ 395 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */ 396 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */ 397 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */ 398 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */ 399 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */ 400 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */ 401 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */ 402 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */ 403 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */ 404 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */ 405 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */ 406 1.1 christos -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */ 407 1.1 christos 15, -1, -1, -1, -1, -1, -1, -1 /* 0x1d000-0x1dfff */ 408 1.1 christos }; 409 1.1 christos 410 1.1 christos /* Determine number of column positions required for UC. */ 411 1.1 christos int 412 1.1 christos uc_width (unsigned int uc, const char *encoding) 413 1.1 christos { 414 1.1 christos /* Test for non-spacing or control character. */ 415 1.1 christos if ((uc >> 9) < 240) 416 1.1 christos { 417 1.1 christos int ind = nonspacing_table_ind[uc >> 9]; 418 1.1 christos if (ind >= 0) 419 1.1 christos if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1) 420 1.1 christos { 421 1.1 christos if (uc > 0 && uc < 0xa0) 422 1.1 christos return -1; 423 1.1 christos else 424 1.1 christos return 0; 425 1.1 christos } 426 1.1 christos } 427 1.1 christos else if ((uc >> 9) == (0xe0000 >> 9)) 428 1.1 christos { 429 1.1 christos if (uc < 0xe0100 430 1.1 christos ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001) 431 1.1 christos : (uc <= 0xe01ef)) 432 1.1 christos return 0; 433 1.1 christos } 434 1.1 christos /* Test for double-width character. 435 1.1 christos * Generated from "grep '^....;[WF]' EastAsianWidth.txt" 436 1.1 christos * and "grep '^....;[^WF]' EastAsianWidth.txt" 437 1.1 christos */ 438 1.1 christos if (uc >= 0x1100 439 1.1 christos && ((uc < 0x1160) /* Hangul Jamo */ 440 1.1 christos || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */ 441 1.1 christos && !(uc == 0x303f)) 442 1.1 christos || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */ 443 1.1 christos || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */ 444 1.1 christos || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */ 445 1.1 christos || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */ 446 1.1 christos || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */ 447 1.1 christos || (uc >= 0xffe0 && uc < 0xffe7) 448 1.1 christos || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */ 449 1.1 christos || (uc >= 0x30000 && uc <= 0x3fffd) 450 1.1 christos ) ) 451 1.1 christos return 2; 452 1.1 christos /* In ancient CJK encodings, Cyrillic and most other characters are 453 1.1 christos double-width as well. */ 454 1.1 christos if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9 455 1.1 christos && is_cjk_encoding (encoding)) 456 1.1 christos return 2; 457 1.1 christos return 1; 458 1.1 christos } 459 1.1 christos 460 1.1 christos 461 1.1 christos #ifdef unused 462 1.1 christos 463 1.1 christos /* Determine number of column positions required for first N units 464 1.1 christos (or fewer if S ends before this) in S. */ 465 1.1 christos 466 1.1 christos int 467 1.1 christos u8_width (const unsigned char *s, size_t n, const char *encoding) 468 1.1 christos { 469 1.1 christos const unsigned char *s_end = s + n; 470 1.1 christos int width = 0; 471 1.1 christos 472 1.1 christos while (s < s_end) 473 1.1 christos { 474 1.1 christos unsigned int uc; 475 1.1 christos int w; 476 1.1 christos 477 1.1 christos s += u8_mbtouc (&uc, s, s_end - s); 478 1.1 christos 479 1.1 christos if (uc == 0) 480 1.1 christos break; /* end of string reached */ 481 1.1 christos 482 1.1 christos w = uc_width (uc, encoding); 483 1.1 christos if (w >= 0) /* ignore control characters in the string */ 484 1.1 christos width += w; 485 1.1 christos } 486 1.1 christos 487 1.1 christos return width; 488 1.1 christos } 489 1.1 christos 490 1.1 christos int 491 1.1 christos u16_width (const unsigned short *s, size_t n, const char *encoding) 492 1.1 christos { 493 1.1 christos const unsigned short *s_end = s + n; 494 1.1 christos int width = 0; 495 1.1 christos 496 1.1 christos while (s < s_end) 497 1.1 christos { 498 1.1 christos unsigned int uc; 499 1.1 christos int w; 500 1.1 christos 501 1.1 christos s += u16_mbtouc (&uc, s, s_end - s); 502 1.1 christos 503 1.1 christos if (uc == 0) 504 1.1 christos break; /* end of string reached */ 505 1.1 christos 506 1.1 christos w = uc_width (uc, encoding); 507 1.1 christos if (w >= 0) /* ignore control characters in the string */ 508 1.1 christos width += w; 509 1.1 christos } 510 1.1 christos 511 1.1 christos return width; 512 1.1 christos } 513 1.1 christos 514 1.1 christos int 515 1.1 christos u32_width (const unsigned int *s, size_t n, const char *encoding) 516 1.1 christos { 517 1.1 christos const unsigned int *s_end = s + n; 518 1.1 christos int width = 0; 519 1.1 christos 520 1.1 christos while (s < s_end) 521 1.1 christos { 522 1.1 christos unsigned int uc = *s++; 523 1.1 christos int w; 524 1.1 christos 525 1.1 christos if (uc == 0) 526 1.1 christos break; /* end of string reached */ 527 1.1 christos 528 1.1 christos w = uc_width (uc, encoding); 529 1.1 christos if (w >= 0) /* ignore control characters in the string */ 530 1.1 christos width += w; 531 1.1 christos } 532 1.1 christos 533 1.1 christos return width; 534 1.1 christos } 535 1.1 christos 536 1.1 christos #endif 537 1.1 christos 538 1.1 christos 539 1.1 christos /* Determine the line break points in S, and store the result at p[0..n-1]. */ 540 1.1 christos /* We don't support line breaking of complex-context dependent characters 541 1.1 christos (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */ 542 1.1 christos 543 1.1 christos /* Line breaking classification. */ 544 1.1 christos 545 1.1 christos enum 546 1.1 christos { 547 1.1 christos /* Values >= 20 are resolved at run time. */ 548 1.1 christos LBP_BK = 0, /* mandatory break */ 549 1.1 christos /*LBP_CR, carriage return - not used here because it's a DOSism */ 550 1.1 christos /*LBP_LF, line feed - not used here because it's a DOSism */ 551 1.1 christos LBP_CM = 20, /* attached characters and combining marks */ 552 1.1 christos /*LBP_SG, surrogates - not used here because they are not characters */ 553 1.1 christos LBP_ZW = 1, /* zero width space */ 554 1.1 christos LBP_IN = 2, /* inseparable */ 555 1.1 christos LBP_GL = 3, /* non-breaking (glue) */ 556 1.1 christos LBP_CB = 22, /* contingent break opportunity */ 557 1.1 christos LBP_SP = 21, /* space */ 558 1.1 christos LBP_BA = 4, /* break opportunity after */ 559 1.1 christos LBP_BB = 5, /* break opportunity before */ 560 1.1 christos LBP_B2 = 6, /* break opportunity before and after */ 561 1.1 christos LBP_HY = 7, /* hyphen */ 562 1.1 christos LBP_NS = 8, /* non starter */ 563 1.1 christos LBP_OP = 9, /* opening punctuation */ 564 1.1 christos LBP_CL = 10, /* closing punctuation */ 565 1.1 christos LBP_QU = 11, /* ambiguous quotation */ 566 1.1 christos LBP_EX = 12, /* exclamation/interrogation */ 567 1.1 christos LBP_ID = 13, /* ideographic */ 568 1.1 christos LBP_NU = 14, /* numeric */ 569 1.1 christos LBP_IS = 15, /* infix separator (numeric) */ 570 1.1 christos LBP_SY = 16, /* symbols allowing breaks */ 571 1.1 christos LBP_AL = 17, /* ordinary alphabetic and symbol characters */ 572 1.1 christos LBP_PR = 18, /* prefix (numeric) */ 573 1.1 christos LBP_PO = 19, /* postfix (numeric) */ 574 1.1 christos LBP_SA = 23, /* complex context (South East Asian) */ 575 1.1 christos LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */ 576 1.1 christos LBP_XX = 25 /* unknown */ 577 1.1 christos }; 578 1.1 christos 579 1.1 christos #include "lbrkprop.h" 580 1.1 christos 581 1.1 christos static inline unsigned char 582 1.1 christos lbrkprop_lookup (unsigned int uc) 583 1.1 christos { 584 1.1 christos unsigned int index1 = uc >> lbrkprop_header_0; 585 1.1 christos if (index1 < lbrkprop_header_1) 586 1.1 christos { 587 1.1 christos int lookup1 = lbrkprop.level1[index1]; 588 1.1 christos if (lookup1 >= 0) 589 1.1 christos { 590 1.1 christos unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3; 591 1.1 christos int lookup2 = lbrkprop.level2[lookup1 + index2]; 592 1.1 christos if (lookup2 >= 0) 593 1.1 christos { 594 1.1 christos unsigned int index3 = uc & lbrkprop_header_4; 595 1.1 christos return lbrkprop.level3[lookup2 + index3]; 596 1.1 christos } 597 1.1 christos } 598 1.1 christos } 599 1.1 christos return LBP_XX; 600 1.1 christos } 601 1.1 christos 602 1.1 christos /* Table indexed by two line breaking classifications. */ 603 1.1 christos #define D 1 /* direct break opportunity, empty in table 7.3 of UTR #14 */ 604 1.1 christos #define I 2 /* indirect break opportunity, '%' in table 7.3 of UTR #14 */ 605 1.1 christos #define P 3 /* prohibited break, '^' in table 7.3 of UTR #14 */ 606 1.1 christos static const unsigned char lbrk_table[19][19] = { 607 1.1 christos /* after */ 608 1.1 christos /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */ 609 1.1 christos /* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, }, 610 1.1 christos /* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 611 1.1 christos /* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, }, 612 1.1 christos /* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 613 1.1 christos /* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, }, 614 1.1 christos /* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 615 1.1 christos /* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 616 1.1 christos /* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 617 1.1 christos /* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, }, 618 1.1 christos /* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, }, 619 1.1 christos /* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, }, 620 1.1 christos /* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 621 1.1 christos /* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, }, 622 1.1 christos /* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, }, 623 1.1 christos /* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, }, 624 1.1 christos /* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, }, 625 1.1 christos /* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, }, 626 1.1 christos /* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, }, 627 1.1 christos /* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, }, 628 1.1 christos /* "" */ 629 1.1 christos /* before */ 630 1.1 christos }; 631 1.1 christos /* Note: The (B2,B2) entry should probably be D instead of P. */ 632 1.1 christos /* Note: The (PR,ID) entry should probably be D instead of I. */ 633 1.1 christos 634 1.1 christos void 635 1.1 christos u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p) 636 1.1 christos { 637 1.1 christos int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 638 1.1 christos const unsigned char *s_end = s + n; 639 1.1 christos int last_prop = LBP_BK; /* line break property of last non-space character */ 640 1.1 christos char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 641 1.1 christos char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 642 1.1 christos 643 1.1 christos /* Don't break inside multibyte characters. */ 644 1.1 christos memset (p, UC_BREAK_PROHIBITED, n); 645 1.1 christos 646 1.1 christos while (s < s_end) 647 1.1 christos { 648 1.1 christos unsigned int uc; 649 1.1 christos int count = u8_mbtouc (&uc, s, s_end - s); 650 1.1 christos int prop = lbrkprop_lookup (uc); 651 1.1 christos 652 1.1 christos if (prop == LBP_BK) 653 1.1 christos { 654 1.1 christos /* Mandatory break. */ 655 1.1 christos *p = UC_BREAK_MANDATORY; 656 1.1 christos last_prop = LBP_BK; 657 1.1 christos seen_space = NULL; 658 1.1 christos seen_space2 = NULL; 659 1.1 christos } 660 1.1 christos else 661 1.1 christos { 662 1.1 christos char *q; 663 1.1 christos 664 1.1 christos /* Resolve property values whose behaviour is not fixed. */ 665 1.1 christos switch (prop) 666 1.1 christos { 667 1.1 christos case LBP_AI: 668 1.1 christos /* Resolve ambiguous. */ 669 1.1 christos prop = LBP_AI_REPLACEMENT; 670 1.1 christos break; 671 1.1 christos case LBP_CB: 672 1.1 christos /* This is arbitrary. */ 673 1.1 christos prop = LBP_ID; 674 1.1 christos break; 675 1.1 christos case LBP_SA: 676 1.1 christos /* We don't handle complex scripts yet. 677 1.1 christos Treat LBP_SA like LBP_XX. */ 678 1.1 christos case LBP_XX: 679 1.1 christos /* This is arbitrary. */ 680 1.1 christos prop = LBP_AL; 681 1.1 christos break; 682 1.1 christos } 683 1.1 christos 684 1.1 christos /* Deal with combining characters. */ 685 1.1 christos q = p; 686 1.1 christos if (prop == LBP_CM) 687 1.1 christos { 688 1.1 christos /* Don't break just before a combining character. */ 689 1.1 christos *p = UC_BREAK_PROHIBITED; 690 1.1 christos /* A combining character turns a preceding space into LBP_AL. */ 691 1.1 christos if (seen_space != NULL) 692 1.1 christos { 693 1.1 christos q = seen_space; 694 1.1 christos seen_space = seen_space2; 695 1.1 christos prop = LBP_AL; 696 1.1 christos goto lookup_via_table; 697 1.1 christos } 698 1.1 christos } 699 1.1 christos else if (prop == LBP_SP) 700 1.1 christos { 701 1.1 christos /* Don't break just before a space. */ 702 1.1 christos *p = UC_BREAK_PROHIBITED; 703 1.1 christos seen_space2 = seen_space; 704 1.1 christos seen_space = p; 705 1.1 christos } 706 1.1 christos else 707 1.1 christos { 708 1.1 christos lookup_via_table: 709 1.1 christos /* prop must be usable as an index for table 7.3 of UTR #14. */ 710 1.1 christos if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 711 1.1 christos abort (); 712 1.1 christos 713 1.1 christos if (last_prop == LBP_BK) 714 1.1 christos { 715 1.1 christos /* Don't break at the beginning of a line. */ 716 1.1 christos *q = UC_BREAK_PROHIBITED; 717 1.1 christos } 718 1.1 christos else 719 1.1 christos { 720 1.1 christos switch (lbrk_table [last_prop-1] [prop-1]) 721 1.1 christos { 722 1.1 christos case D: 723 1.1 christos *q = UC_BREAK_POSSIBLE; 724 1.1 christos break; 725 1.1 christos case I: 726 1.1 christos *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 727 1.1 christos break; 728 1.1 christos case P: 729 1.1 christos *q = UC_BREAK_PROHIBITED; 730 1.1 christos break; 731 1.1 christos default: 732 1.1 christos abort (); 733 1.1 christos } 734 1.1 christos } 735 1.1 christos last_prop = prop; 736 1.1 christos seen_space = NULL; 737 1.1 christos seen_space2 = NULL; 738 1.1 christos } 739 1.1 christos } 740 1.1 christos 741 1.1 christos s += count; 742 1.1 christos p += count; 743 1.1 christos } 744 1.1 christos } 745 1.1 christos 746 1.1 christos #ifdef unused 747 1.1 christos 748 1.1 christos void 749 1.1 christos u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p) 750 1.1 christos { 751 1.1 christos int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 752 1.1 christos const unsigned short *s_end = s + n; 753 1.1 christos int last_prop = LBP_BK; /* line break property of last non-space character */ 754 1.1 christos char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 755 1.1 christos char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 756 1.1 christos 757 1.1 christos /* Don't break inside multibyte characters. */ 758 1.1 christos memset (p, UC_BREAK_PROHIBITED, n); 759 1.1 christos 760 1.1 christos while (s < s_end) 761 1.1 christos { 762 1.1 christos unsigned int uc; 763 1.1 christos int count = u16_mbtouc (&uc, s, s_end - s); 764 1.1 christos int prop = lbrkprop_lookup (uc); 765 1.1 christos 766 1.1 christos if (prop == LBP_BK) 767 1.1 christos { 768 1.1 christos /* Mandatory break. */ 769 1.1 christos *p = UC_BREAK_MANDATORY; 770 1.1 christos last_prop = LBP_BK; 771 1.1 christos seen_space = NULL; 772 1.1 christos seen_space2 = NULL; 773 1.1 christos } 774 1.1 christos else 775 1.1 christos { 776 1.1 christos char *q; 777 1.1 christos 778 1.1 christos /* Resolve property values whose behaviour is not fixed. */ 779 1.1 christos switch (prop) 780 1.1 christos { 781 1.1 christos case LBP_AI: 782 1.1 christos /* Resolve ambiguous. */ 783 1.1 christos prop = LBP_AI_REPLACEMENT; 784 1.1 christos break; 785 1.1 christos case LBP_CB: 786 1.1 christos /* This is arbitrary. */ 787 1.1 christos prop = LBP_ID; 788 1.1 christos break; 789 1.1 christos case LBP_SA: 790 1.1 christos /* We don't handle complex scripts yet. 791 1.1 christos Treat LBP_SA like LBP_XX. */ 792 1.1 christos case LBP_XX: 793 1.1 christos /* This is arbitrary. */ 794 1.1 christos prop = LBP_AL; 795 1.1 christos break; 796 1.1 christos } 797 1.1 christos 798 1.1 christos /* Deal with combining characters. */ 799 1.1 christos q = p; 800 1.1 christos if (prop == LBP_CM) 801 1.1 christos { 802 1.1 christos /* Don't break just before a combining character. */ 803 1.1 christos *p = UC_BREAK_PROHIBITED; 804 1.1 christos /* A combining character turns a preceding space into LBP_AL. */ 805 1.1 christos if (seen_space != NULL) 806 1.1 christos { 807 1.1 christos q = seen_space; 808 1.1 christos seen_space = seen_space2; 809 1.1 christos prop = LBP_AL; 810 1.1 christos goto lookup_via_table; 811 1.1 christos } 812 1.1 christos } 813 1.1 christos else if (prop == LBP_SP) 814 1.1 christos { 815 1.1 christos /* Don't break just before a space. */ 816 1.1 christos *p = UC_BREAK_PROHIBITED; 817 1.1 christos seen_space2 = seen_space; 818 1.1 christos seen_space = p; 819 1.1 christos } 820 1.1 christos else 821 1.1 christos { 822 1.1 christos lookup_via_table: 823 1.1 christos /* prop must be usable as an index for table 7.3 of UTR #14. */ 824 1.1 christos if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 825 1.1 christos abort (); 826 1.1 christos 827 1.1 christos if (last_prop == LBP_BK) 828 1.1 christos { 829 1.1 christos /* Don't break at the beginning of a line. */ 830 1.1 christos *q = UC_BREAK_PROHIBITED; 831 1.1 christos } 832 1.1 christos else 833 1.1 christos { 834 1.1 christos switch (lbrk_table [last_prop-1] [prop-1]) 835 1.1 christos { 836 1.1 christos case D: 837 1.1 christos *q = UC_BREAK_POSSIBLE; 838 1.1 christos break; 839 1.1 christos case I: 840 1.1 christos *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 841 1.1 christos break; 842 1.1 christos case P: 843 1.1 christos *q = UC_BREAK_PROHIBITED; 844 1.1 christos break; 845 1.1 christos default: 846 1.1 christos abort (); 847 1.1 christos } 848 1.1 christos } 849 1.1 christos last_prop = prop; 850 1.1 christos seen_space = NULL; 851 1.1 christos seen_space2 = NULL; 852 1.1 christos } 853 1.1 christos } 854 1.1 christos 855 1.1 christos s += count; 856 1.1 christos p += count; 857 1.1 christos } 858 1.1 christos } 859 1.1 christos 860 1.1 christos void 861 1.1 christos u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p) 862 1.1 christos { 863 1.1 christos int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 864 1.1 christos const unsigned int *s_end = s + n; 865 1.1 christos int last_prop = LBP_BK; /* line break property of last non-space character */ 866 1.1 christos char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 867 1.1 christos char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 868 1.1 christos 869 1.1 christos while (s < s_end) 870 1.1 christos { 871 1.1 christos unsigned int uc = *s; 872 1.1 christos int prop = lbrkprop_lookup (uc); 873 1.1 christos 874 1.1 christos if (prop == LBP_BK) 875 1.1 christos { 876 1.1 christos /* Mandatory break. */ 877 1.1 christos *p = UC_BREAK_MANDATORY; 878 1.1 christos last_prop = LBP_BK; 879 1.1 christos seen_space = NULL; 880 1.1 christos seen_space2 = NULL; 881 1.1 christos } 882 1.1 christos else 883 1.1 christos { 884 1.1 christos char *q; 885 1.1 christos 886 1.1 christos /* Resolve property values whose behaviour is not fixed. */ 887 1.1 christos switch (prop) 888 1.1 christos { 889 1.1 christos case LBP_AI: 890 1.1 christos /* Resolve ambiguous. */ 891 1.1 christos prop = LBP_AI_REPLACEMENT; 892 1.1 christos break; 893 1.1 christos case LBP_CB: 894 1.1 christos /* This is arbitrary. */ 895 1.1 christos prop = LBP_ID; 896 1.1 christos break; 897 1.1 christos case LBP_SA: 898 1.1 christos /* We don't handle complex scripts yet. 899 1.1 christos Treat LBP_SA like LBP_XX. */ 900 1.1 christos case LBP_XX: 901 1.1 christos /* This is arbitrary. */ 902 1.1 christos prop = LBP_AL; 903 1.1 christos break; 904 1.1 christos } 905 1.1 christos 906 1.1 christos /* Deal with combining characters. */ 907 1.1 christos q = p; 908 1.1 christos if (prop == LBP_CM) 909 1.1 christos { 910 1.1 christos /* Don't break just before a combining character. */ 911 1.1 christos *p = UC_BREAK_PROHIBITED; 912 1.1 christos /* A combining character turns a preceding space into LBP_AL. */ 913 1.1 christos if (seen_space != NULL) 914 1.1 christos { 915 1.1 christos q = seen_space; 916 1.1 christos seen_space = seen_space2; 917 1.1 christos prop = LBP_AL; 918 1.1 christos goto lookup_via_table; 919 1.1 christos } 920 1.1 christos } 921 1.1 christos else if (prop == LBP_SP) 922 1.1 christos { 923 1.1 christos /* Don't break just before a space. */ 924 1.1 christos *p = UC_BREAK_PROHIBITED; 925 1.1 christos seen_space2 = seen_space; 926 1.1 christos seen_space = p; 927 1.1 christos } 928 1.1 christos else 929 1.1 christos { 930 1.1 christos lookup_via_table: 931 1.1 christos /* prop must be usable as an index for table 7.3 of UTR #14. */ 932 1.1 christos if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0]))) 933 1.1 christos abort (); 934 1.1 christos 935 1.1 christos if (last_prop == LBP_BK) 936 1.1 christos { 937 1.1 christos /* Don't break at the beginning of a line. */ 938 1.1 christos *q = UC_BREAK_PROHIBITED; 939 1.1 christos } 940 1.1 christos else 941 1.1 christos { 942 1.1 christos switch (lbrk_table [last_prop-1] [prop-1]) 943 1.1 christos { 944 1.1 christos case D: 945 1.1 christos *q = UC_BREAK_POSSIBLE; 946 1.1 christos break; 947 1.1 christos case I: 948 1.1 christos *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 949 1.1 christos break; 950 1.1 christos case P: 951 1.1 christos *q = UC_BREAK_PROHIBITED; 952 1.1 christos break; 953 1.1 christos default: 954 1.1 christos abort (); 955 1.1 christos } 956 1.1 christos } 957 1.1 christos last_prop = prop; 958 1.1 christos seen_space = NULL; 959 1.1 christos seen_space2 = NULL; 960 1.1 christos } 961 1.1 christos } 962 1.1 christos 963 1.1 christos s++; 964 1.1 christos p++; 965 1.1 christos } 966 1.1 christos } 967 1.1 christos 968 1.1 christos #endif 969 1.1 christos 970 1.1 christos 971 1.1 christos /* Choose the best line breaks, assuming the uc_width function. 972 1.1 christos Return the column after the end of the string. */ 973 1.1 christos 974 1.1 christos int 975 1.1 christos u8_width_linebreaks (const unsigned char *s, size_t n, 976 1.1 christos int width, int start_column, int at_end_columns, 977 1.1 christos const char *o, const char *encoding, 978 1.1 christos char *p) 979 1.1 christos { 980 1.1 christos const unsigned char *s_end; 981 1.1 christos char *last_p; 982 1.1 christos int last_column; 983 1.1 christos int piece_width; 984 1.1 christos 985 1.1 christos u8_possible_linebreaks (s, n, encoding, p); 986 1.1 christos 987 1.1 christos s_end = s + n; 988 1.1 christos last_p = NULL; 989 1.1 christos last_column = start_column; 990 1.1 christos piece_width = 0; 991 1.1 christos while (s < s_end) 992 1.1 christos { 993 1.1 christos unsigned int uc; 994 1.1 christos int count = u8_mbtouc (&uc, s, s_end - s); 995 1.1 christos 996 1.1 christos /* Respect the override. */ 997 1.1 christos if (o != NULL && *o != UC_BREAK_UNDEFINED) 998 1.1 christos *p = *o; 999 1.1 christos 1000 1.1 christos if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 1001 1.1 christos { 1002 1.1 christos /* An atomic piece of text ends here. */ 1003 1.1 christos if (last_p != NULL && last_column + piece_width > width) 1004 1.1 christos { 1005 1.1 christos /* Insert a line break. */ 1006 1.1 christos *last_p = UC_BREAK_POSSIBLE; 1007 1.1 christos last_column = 0; 1008 1.1 christos } 1009 1.1 christos } 1010 1.1 christos 1011 1.1 christos if (*p == UC_BREAK_MANDATORY) 1012 1.1 christos { 1013 1.1 christos /* uc is a line break character. */ 1014 1.1 christos /* Start a new piece at column 0. */ 1015 1.1 christos last_p = NULL; 1016 1.1 christos last_column = 0; 1017 1.1 christos piece_width = 0; 1018 1.1 christos } 1019 1.1 christos else 1020 1.1 christos { 1021 1.1 christos /* uc is not a line break character. */ 1022 1.1 christos int w; 1023 1.1 christos 1024 1.1 christos if (*p == UC_BREAK_POSSIBLE) 1025 1.1 christos { 1026 1.1 christos /* Start a new piece. */ 1027 1.1 christos last_p = p; 1028 1.1 christos last_column += piece_width; 1029 1.1 christos piece_width = 0; 1030 1.1 christos /* No line break for the moment, may be turned into 1031 1.1 christos UC_BREAK_POSSIBLE later, via last_p. */ 1032 1.1 christos } 1033 1.1 christos 1034 1.1 christos *p = UC_BREAK_PROHIBITED; 1035 1.1 christos 1036 1.1 christos w = uc_width (uc, encoding); 1037 1.1 christos if (w >= 0) /* ignore control characters in the string */ 1038 1.1 christos piece_width += w; 1039 1.1 christos } 1040 1.1 christos 1041 1.1 christos s += count; 1042 1.1 christos p += count; 1043 1.1 christos if (o != NULL) 1044 1.1 christos o += count; 1045 1.1 christos } 1046 1.1 christos 1047 1.1 christos /* The last atomic piece of text ends here. */ 1048 1.1 christos if (last_p != NULL && last_column + piece_width + at_end_columns > width) 1049 1.1 christos { 1050 1.1 christos /* Insert a line break. */ 1051 1.1 christos *last_p = UC_BREAK_POSSIBLE; 1052 1.1 christos last_column = 0; 1053 1.1 christos } 1054 1.1 christos 1055 1.1 christos return last_column + piece_width; 1056 1.1 christos } 1057 1.1 christos 1058 1.1 christos #ifdef unused 1059 1.1 christos 1060 1.1 christos int 1061 1.1 christos u16_width_linebreaks (const unsigned short *s, size_t n, 1062 1.1 christos int width, int start_column, int at_end_columns, 1063 1.1 christos const char *o, const char *encoding, 1064 1.1 christos char *p) 1065 1.1 christos { 1066 1.1 christos const unsigned short *s_end; 1067 1.1 christos char *last_p; 1068 1.1 christos int last_column; 1069 1.1 christos int piece_width; 1070 1.1 christos 1071 1.1 christos u16_possible_linebreaks (s, n, encoding, p); 1072 1.1 christos 1073 1.1 christos s_end = s + n; 1074 1.1 christos last_p = NULL; 1075 1.1 christos last_column = start_column; 1076 1.1 christos piece_width = 0; 1077 1.1 christos while (s < s_end) 1078 1.1 christos { 1079 1.1 christos unsigned int uc; 1080 1.1 christos int count = u16_mbtouc (&uc, s, s_end - s); 1081 1.1 christos 1082 1.1 christos /* Respect the override. */ 1083 1.1 christos if (o != NULL && *o != UC_BREAK_UNDEFINED) 1084 1.1 christos *p = *o; 1085 1.1 christos 1086 1.1 christos if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 1087 1.1 christos { 1088 1.1 christos /* An atomic piece of text ends here. */ 1089 1.1 christos if (last_p != NULL && last_column + piece_width > width) 1090 1.1 christos { 1091 1.1 christos /* Insert a line break. */ 1092 1.1 christos *last_p = UC_BREAK_POSSIBLE; 1093 1.1 christos last_column = 0; 1094 1.1 christos } 1095 1.1 christos } 1096 1.1 christos 1097 1.1 christos if (*p == UC_BREAK_MANDATORY) 1098 1.1 christos { 1099 1.1 christos /* uc is a line break character. */ 1100 1.1 christos /* Start a new piece at column 0. */ 1101 1.1 christos last_p = NULL; 1102 1.1 christos last_column = 0; 1103 1.1 christos piece_width = 0; 1104 1.1 christos } 1105 1.1 christos else 1106 1.1 christos { 1107 1.1 christos /* uc is not a line break character. */ 1108 1.1 christos int w; 1109 1.1 christos 1110 1.1 christos if (*p == UC_BREAK_POSSIBLE) 1111 1.1 christos { 1112 1.1 christos /* Start a new piece. */ 1113 1.1 christos last_p = p; 1114 1.1 christos last_column += piece_width; 1115 1.1 christos piece_width = 0; 1116 1.1 christos /* No line break for the moment, may be turned into 1117 1.1 christos UC_BREAK_POSSIBLE later, via last_p. */ 1118 1.1 christos } 1119 1.1 christos 1120 1.1 christos *p = UC_BREAK_PROHIBITED; 1121 1.1 christos 1122 1.1 christos w = uc_width (uc, encoding); 1123 1.1 christos if (w >= 0) /* ignore control characters in the string */ 1124 1.1 christos piece_width += w; 1125 1.1 christos } 1126 1.1 christos 1127 1.1 christos s += count; 1128 1.1 christos p += count; 1129 1.1 christos if (o != NULL) 1130 1.1 christos o += count; 1131 1.1 christos } 1132 1.1 christos 1133 1.1 christos /* The last atomic piece of text ends here. */ 1134 1.1 christos if (last_p != NULL && last_column + piece_width + at_end_columns > width) 1135 1.1 christos { 1136 1.1 christos /* Insert a line break. */ 1137 1.1 christos *last_p = UC_BREAK_POSSIBLE; 1138 1.1 christos last_column = 0; 1139 1.1 christos } 1140 1.1 christos 1141 1.1 christos return last_column + piece_width; 1142 1.1 christos } 1143 1.1 christos 1144 1.1 christos int 1145 1.1 christos u32_width_linebreaks (const unsigned int *s, size_t n, 1146 1.1 christos int width, int start_column, int at_end_columns, 1147 1.1 christos const char *o, const char *encoding, 1148 1.1 christos char *p) 1149 1.1 christos { 1150 1.1 christos const unsigned int *s_end; 1151 1.1 christos char *last_p; 1152 1.1 christos int last_column; 1153 1.1 christos int piece_width; 1154 1.1 christos 1155 1.1 christos u32_possible_linebreaks (s, n, encoding, p); 1156 1.1 christos 1157 1.1 christos s_end = s + n; 1158 1.1 christos last_p = NULL; 1159 1.1 christos last_column = start_column; 1160 1.1 christos piece_width = 0; 1161 1.1 christos while (s < s_end) 1162 1.1 christos { 1163 1.1 christos unsigned int uc = *s; 1164 1.1 christos 1165 1.1 christos /* Respect the override. */ 1166 1.1 christos if (o != NULL && *o != UC_BREAK_UNDEFINED) 1167 1.1 christos *p = *o; 1168 1.1 christos 1169 1.1 christos if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY) 1170 1.1 christos { 1171 1.1 christos /* An atomic piece of text ends here. */ 1172 1.1 christos if (last_p != NULL && last_column + piece_width > width) 1173 1.1 christos { 1174 1.1 christos /* Insert a line break. */ 1175 1.1 christos *last_p = UC_BREAK_POSSIBLE; 1176 1.1 christos last_column = 0; 1177 1.1 christos } 1178 1.1 christos } 1179 1.1 christos 1180 1.1 christos if (*p == UC_BREAK_MANDATORY) 1181 1.1 christos { 1182 1.1 christos /* uc is a line break character. */ 1183 1.1 christos /* Start a new piece at column 0. */ 1184 1.1 christos last_p = NULL; 1185 1.1 christos last_column = 0; 1186 1.1 christos piece_width = 0; 1187 1.1 christos } 1188 1.1 christos else 1189 1.1 christos { 1190 1.1 christos /* uc is not a line break character. */ 1191 1.1 christos int w; 1192 1.1 christos 1193 1.1 christos if (*p == UC_BREAK_POSSIBLE) 1194 1.1 christos { 1195 1.1 christos /* Start a new piece. */ 1196 1.1 christos last_p = p; 1197 1.1 christos last_column += piece_width; 1198 1.1 christos piece_width = 0; 1199 1.1 christos /* No line break for the moment, may be turned into 1200 1.1 christos UC_BREAK_POSSIBLE later, via last_p. */ 1201 1.1 christos } 1202 1.1 christos 1203 1.1 christos *p = UC_BREAK_PROHIBITED; 1204 1.1 christos 1205 1.1 christos w = uc_width (uc, encoding); 1206 1.1 christos if (w >= 0) /* ignore control characters in the string */ 1207 1.1 christos piece_width += w; 1208 1.1 christos } 1209 1.1 christos 1210 1.1 christos s++; 1211 1.1 christos p++; 1212 1.1 christos if (o != NULL) 1213 1.1 christos o++; 1214 1.1 christos } 1215 1.1 christos 1216 1.1 christos /* The last atomic piece of text ends here. */ 1217 1.1 christos if (last_p != NULL && last_column + piece_width + at_end_columns > width) 1218 1.1 christos { 1219 1.1 christos /* Insert a line break. */ 1220 1.1 christos *last_p = UC_BREAK_POSSIBLE; 1221 1.1 christos last_column = 0; 1222 1.1 christos } 1223 1.1 christos 1224 1.1 christos return last_column + piece_width; 1225 1.1 christos } 1226 1.1 christos 1227 1.1 christos #endif 1228 1.1 christos 1229 1.1 christos 1230 1.1 christos #ifdef TEST1 1231 1.1 christos 1232 1.1 christos #include <stdio.h> 1233 1.1 christos 1234 1.1 christos /* Read the contents of an input stream, and return it, terminated with a NUL 1235 1.1 christos byte. */ 1236 1.1 christos char * 1237 1.1 christos read_file (FILE *stream) 1238 1.1 christos { 1239 1.1 christos #define BUFSIZE 4096 1240 1.1 christos char *buf = NULL; 1241 1.1 christos int alloc = 0; 1242 1.1 christos int size = 0; 1243 1.1 christos int count; 1244 1.1 christos 1245 1.1 christos while (! feof (stream)) 1246 1.1 christos { 1247 1.1 christos if (size + BUFSIZE > alloc) 1248 1.1 christos { 1249 1.1 christos alloc = alloc + alloc / 2; 1250 1.1 christos if (alloc < size + BUFSIZE) 1251 1.1 christos alloc = size + BUFSIZE; 1252 1.1 christos buf = realloc (buf, alloc); 1253 1.1 christos if (buf == NULL) 1254 1.1 christos { 1255 1.1 christos fprintf (stderr, "out of memory\n"); 1256 1.1 christos exit (1); 1257 1.1 christos } 1258 1.1 christos } 1259 1.1 christos count = fread (buf + size, 1, BUFSIZE, stream); 1260 1.1 christos if (count == 0) 1261 1.1 christos { 1262 1.1 christos if (ferror (stream)) 1263 1.1 christos { 1264 1.1 christos perror ("fread"); 1265 1.1 christos exit (1); 1266 1.1 christos } 1267 1.1 christos } 1268 1.1 christos else 1269 1.1 christos size += count; 1270 1.1 christos } 1271 1.1 christos buf = realloc (buf, size + 1); 1272 1.1 christos if (buf == NULL) 1273 1.1 christos { 1274 1.1 christos fprintf (stderr, "out of memory\n"); 1275 1.1 christos exit (1); 1276 1.1 christos } 1277 1.1 christos buf[size] = '\0'; 1278 1.1 christos return buf; 1279 1.1 christos #undef BUFSIZE 1280 1.1 christos } 1281 1.1 christos 1282 1.1 christos int 1283 1.1 christos main (int argc, char * argv[]) 1284 1.1 christos { 1285 1.1 christos if (argc == 1) 1286 1.1 christos { 1287 1.1 christos /* Display all the break opportunities in the input string. */ 1288 1.1 christos char *input = read_file (stdin); 1289 1.1 christos int length = strlen (input); 1290 1.1 christos char *breaks = malloc (length); 1291 1.1 christos int i; 1292 1.1 christos 1293 1.1 christos u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks); 1294 1.1 christos 1295 1.1 christos for (i = 0; i < length; i++) 1296 1.1 christos { 1297 1.1 christos switch (breaks[i]) 1298 1.1 christos { 1299 1.1 christos case UC_BREAK_POSSIBLE: 1300 1.1 christos /* U+2027 in UTF-8 encoding */ 1301 1.1 christos putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); 1302 1.1 christos break; 1303 1.1 christos case UC_BREAK_MANDATORY: 1304 1.1 christos /* U+21B2 (or U+21B5) in UTF-8 encoding */ 1305 1.1 christos putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout); 1306 1.1 christos break; 1307 1.1 christos case UC_BREAK_PROHIBITED: 1308 1.1 christos break; 1309 1.1 christos default: 1310 1.1 christos abort (); 1311 1.1 christos } 1312 1.1 christos putc (input[i], stdout); 1313 1.1 christos } 1314 1.1 christos 1315 1.1 christos free (breaks); 1316 1.1 christos 1317 1.1 christos return 0; 1318 1.1 christos } 1319 1.1 christos else if (argc == 2) 1320 1.1 christos { 1321 1.1 christos /* Insert line breaks for a given width. */ 1322 1.1 christos int width = atoi (argv[1]); 1323 1.1 christos char *input = read_file (stdin); 1324 1.1 christos int length = strlen (input); 1325 1.1 christos char *breaks = malloc (length); 1326 1.1 christos int i; 1327 1.1 christos 1328 1.1 christos u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks); 1329 1.1 christos 1330 1.1 christos for (i = 0; i < length; i++) 1331 1.1 christos { 1332 1.1 christos switch (breaks[i]) 1333 1.1 christos { 1334 1.1 christos case UC_BREAK_POSSIBLE: 1335 1.1 christos putc ('\n', stdout); 1336 1.1 christos break; 1337 1.1 christos case UC_BREAK_MANDATORY: 1338 1.1 christos break; 1339 1.1 christos case UC_BREAK_PROHIBITED: 1340 1.1 christos break; 1341 1.1 christos default: 1342 1.1 christos abort (); 1343 1.1 christos } 1344 1.1 christos putc (input[i], stdout); 1345 1.1 christos } 1346 1.1 christos 1347 1.1 christos free (breaks); 1348 1.1 christos 1349 1.1 christos return 0; 1350 1.1 christos } 1351 1.1 christos else 1352 1.1 christos return 1; 1353 1.1 christos } 1354 1.1 christos 1355 1.1 christos #endif /* TEST1 */ 1356 1.1 christos 1357 1.1 christos 1358 1.1 christos /* Now the same thing with an arbitrary encoding. 1359 1.1 christos 1360 1.1 christos We convert the input string to Unicode. 1361 1.1 christos 1362 1.1 christos The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16, 1363 1.1 christos UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to 1364 1.1 christos \U0000FFFF. UTF-16 and variants support only characters up to 1365 1.1 christos \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1. 1366 1.1 christos UCS-4 specification leaves doubts about endianness and byte order mark. 1367 1.1 christos glibc currently interprets it as big endian without byte order mark, 1368 1.1 christos but this is not backed by an RFC. So we use UTF-8. It supports 1369 1.1 christos characters up to \U7FFFFFFF and is unambiguously defined. */ 1370 1.1 christos 1371 1.1 christos #if HAVE_ICONV 1372 1.1 christos 1373 1.1 christos #include <iconv.h> 1374 1.1 christos #include <errno.h> 1375 1.1 christos 1376 1.1 christos /* Luckily, the encoding's name is platform independent. */ 1377 1.1 christos #define UTF8_NAME "UTF-8" 1378 1.1 christos 1379 1.1 christos /* Return the length of a string after conversion through an iconv_t. */ 1380 1.1 christos static size_t 1381 1.1 christos iconv_string_length (iconv_t cd, const char *s, size_t n) 1382 1.1 christos { 1383 1.1 christos #define TMPBUFSIZE 4096 1384 1.1 christos size_t count = 0; 1385 1.1 christos char tmpbuf[TMPBUFSIZE]; 1386 1.1 christos const char *inptr = s; 1387 1.1 christos size_t insize = n; 1388 1.1 christos while (insize > 0) 1389 1.1 christos { 1390 1.1 christos char *outptr = tmpbuf; 1391 1.1 christos size_t outsize = TMPBUFSIZE; 1392 1.1 christos size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); 1393 1.1 christos if (res == (size_t)(-1) && errno != E2BIG) 1394 1.1 christos return (size_t)(-1); 1395 1.1 christos count += outptr - tmpbuf; 1396 1.1 christos } 1397 1.1 christos /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */ 1398 1.1 christos #if defined _LIBICONV_VERSION \ 1399 1.1 christos || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) 1400 1.1 christos { 1401 1.1 christos char *outptr = tmpbuf; 1402 1.1 christos size_t outsize = TMPBUFSIZE; 1403 1.1 christos size_t res = iconv (cd, NULL, NULL, &outptr, &outsize); 1404 1.1 christos if (res == (size_t)(-1)) 1405 1.1 christos return (size_t)(-1); 1406 1.1 christos count += outptr - tmpbuf; 1407 1.1 christos } 1408 1.1 christos /* Return to the initial state. */ 1409 1.1 christos iconv (cd, NULL, NULL, NULL, NULL); 1410 1.1 christos #endif 1411 1.1 christos return count; 1412 1.1 christos #undef TMPBUFSIZE 1413 1.1 christos } 1414 1.1 christos 1415 1.1 christos static void 1416 1.1 christos iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n, 1417 1.1 christos size_t *offtable, char *t, size_t m) 1418 1.1 christos { 1419 1.1 christos size_t i; 1420 1.1 christos const char *s_end; 1421 1.1 christos const char *inptr; 1422 1.1 christos char *outptr; 1423 1.1 christos size_t outsize; 1424 1.1 christos /* Avoid glibc-2.1 bug. */ 1425 1.1 christos #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) 1426 1.1 christos const size_t extra = 1; 1427 1.1 christos #else 1428 1.1 christos const size_t extra = 0; 1429 1.1 christos #endif 1430 1.1 christos 1431 1.1 christos for (i = 0; i < n; i++) 1432 1.1 christos offtable[i] = (size_t)(-1); 1433 1.1 christos 1434 1.1 christos s_end = s + n; 1435 1.1 christos inptr = s; 1436 1.1 christos outptr = t; 1437 1.1 christos outsize = m + extra; 1438 1.1 christos while (inptr < s_end) 1439 1.1 christos { 1440 1.1 christos const char *saved_inptr; 1441 1.1 christos size_t insize; 1442 1.1 christos size_t res; 1443 1.1 christos 1444 1.1 christos offtable[inptr - s] = outptr - t; 1445 1.1 christos 1446 1.1 christos saved_inptr = inptr; 1447 1.1 christos res = (size_t)(-1); 1448 1.1 christos for (insize = 1; inptr + insize <= s_end; insize++) 1449 1.1 christos { 1450 1.1 christos res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize); 1451 1.1 christos if (!(res == (size_t)(-1) && errno == EINVAL)) 1452 1.1 christos break; 1453 1.1 christos /* We expect that no input bytes have been consumed so far. */ 1454 1.1 christos if (inptr != saved_inptr) 1455 1.1 christos abort (); 1456 1.1 christos } 1457 1.1 christos /* After we verified the convertibility and computed the translation's 1458 1.1 christos size m, there shouldn't be any conversion error here. */ 1459 1.1 christos if (res == (size_t)(-1)) 1460 1.1 christos abort (); 1461 1.1 christos } 1462 1.1 christos /* Avoid glibc-2.1 bug and Solaris 7 bug. */ 1463 1.1 christos #if defined _LIBICONV_VERSION \ 1464 1.1 christos || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun) 1465 1.1 christos if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1)) 1466 1.1 christos abort (); 1467 1.1 christos #endif 1468 1.1 christos /* We should have produced exactly m output bytes. */ 1469 1.1 christos if (outsize != extra) 1470 1.1 christos abort (); 1471 1.1 christos } 1472 1.1 christos 1473 1.1 christos #endif /* HAVE_ICONV */ 1474 1.1 christos 1475 1.1 christos #if C_CTYPE_ASCII 1476 1.1 christos 1477 1.1 christos /* Tests whether a string is entirely ASCII. Returns 1 if yes. 1478 1.1 christos Returns 0 if the string is in an 8-bit encoding or an ISO-2022 encoding. */ 1479 1.1 christos static int 1480 1.1 christos is_all_ascii (const char *s, size_t n) 1481 1.1 christos { 1482 1.1 christos for (; n > 0; s++, n--) 1483 1.1 christos { 1484 1.1 christos unsigned char c = (unsigned char) *s; 1485 1.1 christos 1486 1.1 christos if (!(c_isprint (c) || c_isspace (c))) 1487 1.1 christos return 0; 1488 1.1 christos } 1489 1.1 christos return 1; 1490 1.1 christos } 1491 1.1 christos 1492 1.1 christos #endif /* C_CTYPE_ASCII */ 1493 1.1 christos 1494 1.1 christos #if defined unused || defined TEST2 1495 1.1 christos 1496 1.1 christos void 1497 1.1 christos mbs_possible_linebreaks (const char *s, size_t n, const char *encoding, 1498 1.1 christos char *p) 1499 1.1 christos { 1500 1.1 christos if (n == 0) 1501 1.1 christos return; 1502 1.1 christos if (is_utf8_encoding (encoding)) 1503 1.1 christos u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); 1504 1.1 christos else 1505 1.1 christos { 1506 1.1 christos #if HAVE_ICONV 1507 1.1 christos iconv_t to_utf8; 1508 1.1 christos /* Avoid glibc-2.1 bug with EUC-KR. */ 1509 1.1 christos # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 1510 1.1 christos if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) 1511 1.1 christos to_utf8 = (iconv_t)(-1); 1512 1.1 christos else 1513 1.1 christos # endif 1514 1.1 christos /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, 1515 1.1 christos GB18030. */ 1516 1.1 christos # if defined __sun && !defined _LIBICONV_VERSION 1517 1.1 christos if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 1518 1.1 christos || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) 1519 1.1 christos || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) 1520 1.1 christos || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') 1521 1.1 christos || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) 1522 1.1 christos || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 1523 1.1 christos to_utf8 = (iconv_t)(-1); 1524 1.1 christos else 1525 1.1 christos # endif 1526 1.1 christos to_utf8 = iconv_open (UTF8_NAME, encoding); 1527 1.1 christos if (to_utf8 != (iconv_t)(-1)) 1528 1.1 christos { 1529 1.1 christos /* Determine the length of the resulting UTF-8 string. */ 1530 1.1 christos size_t m = iconv_string_length (to_utf8, s, n); 1531 1.1 christos if (m != (size_t)(-1)) 1532 1.1 christos { 1533 1.1 christos /* Convert the string to UTF-8 and build a translation table 1534 1.1 christos from offsets into s to offsets into the translated string. */ 1535 1.1 christos size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m); 1536 1.1 christos char *memory = 1537 1.1 christos (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL); 1538 1.1 christos if (memory != NULL) 1539 1.1 christos { 1540 1.1 christos size_t *offtable = (size_t *) memory; 1541 1.1 christos char *t = (char *) (offtable + n); 1542 1.1 christos char *q = (char *) (t + m); 1543 1.1 christos size_t i; 1544 1.1 christos 1545 1.1 christos iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); 1546 1.1 christos 1547 1.1 christos /* Determine the possible line breaks of the UTF-8 string. */ 1548 1.1 christos u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q); 1549 1.1 christos 1550 1.1 christos /* Translate the result back to the original string. */ 1551 1.1 christos memset (p, UC_BREAK_PROHIBITED, n); 1552 1.1 christos for (i = 0; i < n; i++) 1553 1.1 christos if (offtable[i] != (size_t)(-1)) 1554 1.1 christos p[i] = q[offtable[i]]; 1555 1.1 christos 1556 1.1 christos free (memory); 1557 1.1 christos iconv_close (to_utf8); 1558 1.1 christos return; 1559 1.1 christos } 1560 1.1 christos } 1561 1.1 christos iconv_close (to_utf8); 1562 1.1 christos } 1563 1.1 christos #endif 1564 1.1 christos /* Impossible to convert. */ 1565 1.1 christos #if C_CTYPE_ASCII 1566 1.1 christos if (is_all_ascii (s, n)) 1567 1.1 christos { 1568 1.1 christos /* ASCII is a subset of UTF-8. */ 1569 1.1 christos u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p); 1570 1.1 christos return; 1571 1.1 christos } 1572 1.1 christos #endif 1573 1.1 christos /* We have a non-ASCII string and cannot convert it. 1574 1.1 christos Don't produce line breaks except those already present in the 1575 1.1 christos input string. All we assume here is that the encoding is 1576 1.1 christos minimally ASCII compatible. */ 1577 1.1 christos { 1578 1.1 christos const char *s_end = s + n; 1579 1.1 christos while (s < s_end) 1580 1.1 christos { 1581 1.1 christos *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED); 1582 1.1 christos s++; 1583 1.1 christos p++; 1584 1.1 christos } 1585 1.1 christos } 1586 1.1 christos } 1587 1.1 christos } 1588 1.1 christos 1589 1.1 christos #endif 1590 1.1 christos 1591 1.1 christos int 1592 1.1 christos mbs_width_linebreaks (const char *s, size_t n, 1593 1.1 christos int width, int start_column, int at_end_columns, 1594 1.1 christos const char *o, const char *encoding, 1595 1.1 christos char *p) 1596 1.1 christos { 1597 1.1 christos if (n == 0) 1598 1.1 christos return start_column; 1599 1.1 christos if (is_utf8_encoding (encoding)) 1600 1.1 christos return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); 1601 1.1 christos else 1602 1.1 christos { 1603 1.1 christos #if HAVE_ICONV 1604 1.1 christos iconv_t to_utf8; 1605 1.1 christos /* Avoid glibc-2.1 bug with EUC-KR. */ 1606 1.1 christos # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION 1607 1.1 christos if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)) 1608 1.1 christos to_utf8 = (iconv_t)(-1); 1609 1.1 christos else 1610 1.1 christos # endif 1611 1.1 christos /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK, 1612 1.1 christos GB18030. */ 1613 1.1 christos # if defined __sun && !defined _LIBICONV_VERSION 1614 1.1 christos if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0) 1615 1.1 christos || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0) 1616 1.1 christos || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0) 1617 1.1 christos || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C') 1618 1.1 christos || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0) 1619 1.1 christos || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0)) 1620 1.1 christos to_utf8 = (iconv_t)(-1); 1621 1.1 christos else 1622 1.1 christos # endif 1623 1.1 christos to_utf8 = iconv_open (UTF8_NAME, encoding); 1624 1.1 christos if (to_utf8 != (iconv_t)(-1)) 1625 1.1 christos { 1626 1.1 christos /* Determine the length of the resulting UTF-8 string. */ 1627 1.1 christos size_t m = iconv_string_length (to_utf8, s, n); 1628 1.1 christos if (m != (size_t)(-1)) 1629 1.1 christos { 1630 1.1 christos /* Convert the string to UTF-8 and build a translation table 1631 1.1 christos from offsets into s to offsets into the translated string. */ 1632 1.1 christos size_t memory_size = 1633 1.1 christos xsum4 (xtimes (n, sizeof (size_t)), m, m, 1634 1.1 christos (o != NULL ? m : 0)); 1635 1.1 christos char *memory = 1636 1.1 christos (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL); 1637 1.1 christos if (memory != NULL) 1638 1.1 christos { 1639 1.1 christos size_t *offtable = (size_t *) memory; 1640 1.1 christos char *t = (char *) (offtable + n); 1641 1.1 christos char *q = (char *) (t + m); 1642 1.1 christos char *o8 = (o != NULL ? (char *) (q + m) : NULL); 1643 1.1 christos int res_column; 1644 1.1 christos size_t i; 1645 1.1 christos 1646 1.1 christos iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m); 1647 1.1 christos 1648 1.1 christos /* Translate the overrides to the UTF-8 string. */ 1649 1.1 christos if (o != NULL) 1650 1.1 christos { 1651 1.1 christos memset (o8, UC_BREAK_UNDEFINED, m); 1652 1.1 christos for (i = 0; i < n; i++) 1653 1.1 christos if (offtable[i] != (size_t)(-1)) 1654 1.1 christos o8[offtable[i]] = o[i]; 1655 1.1 christos } 1656 1.1 christos 1657 1.1 christos /* Determine the line breaks of the UTF-8 string. */ 1658 1.1 christos res_column = 1659 1.1 christos u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q); 1660 1.1 christos 1661 1.1 christos /* Translate the result back to the original string. */ 1662 1.1 christos memset (p, UC_BREAK_PROHIBITED, n); 1663 1.1 christos for (i = 0; i < n; i++) 1664 1.1 christos if (offtable[i] != (size_t)(-1)) 1665 1.1 christos p[i] = q[offtable[i]]; 1666 1.1 christos 1667 1.1 christos free (memory); 1668 1.1 christos iconv_close (to_utf8); 1669 1.1 christos return res_column; 1670 1.1 christos } 1671 1.1 christos } 1672 1.1 christos iconv_close (to_utf8); 1673 1.1 christos } 1674 1.1 christos #endif 1675 1.1 christos /* Impossible to convert. */ 1676 1.1 christos #if C_CTYPE_ASCII 1677 1.1 christos if (is_all_ascii (s, n)) 1678 1.1 christos { 1679 1.1 christos /* ASCII is a subset of UTF-8. */ 1680 1.1 christos return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p); 1681 1.1 christos } 1682 1.1 christos #endif 1683 1.1 christos /* We have a non-ASCII string and cannot convert it. 1684 1.1 christos Don't produce line breaks except those already present in the 1685 1.1 christos input string. All we assume here is that the encoding is 1686 1.1 christos minimally ASCII compatible. */ 1687 1.1 christos { 1688 1.1 christos const char *s_end = s + n; 1689 1.1 christos while (s < s_end) 1690 1.1 christos { 1691 1.1 christos *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n' 1692 1.1 christos ? UC_BREAK_MANDATORY 1693 1.1 christos : UC_BREAK_PROHIBITED); 1694 1.1 christos s++; 1695 1.1 christos p++; 1696 1.1 christos if (o != NULL) 1697 1.1 christos o++; 1698 1.1 christos } 1699 1.1 christos /* We cannot compute widths in this case. */ 1700 1.1 christos return start_column; 1701 1.1 christos } 1702 1.1 christos } 1703 1.1 christos } 1704 1.1 christos 1705 1.1 christos 1706 1.1 christos #ifdef TEST2 1707 1.1 christos 1708 1.1 christos #include <stdio.h> 1709 1.1 christos #include <locale.h> 1710 1.1 christos 1711 1.1 christos /* Read the contents of an input stream, and return it, terminated with a NUL 1712 1.1 christos byte. */ 1713 1.1 christos char * 1714 1.1 christos read_file (FILE *stream) 1715 1.1 christos { 1716 1.1 christos #define BUFSIZE 4096 1717 1.1 christos char *buf = NULL; 1718 1.1 christos int alloc = 0; 1719 1.1 christos int size = 0; 1720 1.1 christos int count; 1721 1.1 christos 1722 1.1 christos while (! feof (stream)) 1723 1.1 christos { 1724 1.1 christos if (size + BUFSIZE > alloc) 1725 1.1 christos { 1726 1.1 christos alloc = alloc + alloc / 2; 1727 1.1 christos if (alloc < size + BUFSIZE) 1728 1.1 christos alloc = size + BUFSIZE; 1729 1.1 christos buf = realloc (buf, alloc); 1730 1.1 christos if (buf == NULL) 1731 1.1 christos { 1732 1.1 christos fprintf (stderr, "out of memory\n"); 1733 1.1 christos exit (1); 1734 1.1 christos } 1735 1.1 christos } 1736 1.1 christos count = fread (buf + size, 1, BUFSIZE, stream); 1737 1.1 christos if (count == 0) 1738 1.1 christos { 1739 1.1 christos if (ferror (stream)) 1740 1.1 christos { 1741 1.1 christos perror ("fread"); 1742 1.1 christos exit (1); 1743 1.1 christos } 1744 1.1 christos } 1745 1.1 christos else 1746 1.1 christos size += count; 1747 1.1 christos } 1748 1.1 christos buf = realloc (buf, size + 1); 1749 1.1 christos if (buf == NULL) 1750 1.1 christos { 1751 1.1 christos fprintf (stderr, "out of memory\n"); 1752 1.1 christos exit (1); 1753 1.1 christos } 1754 1.1 christos buf[size] = '\0'; 1755 1.1 christos return buf; 1756 1.1 christos #undef BUFSIZE 1757 1.1 christos } 1758 1.1 christos 1759 1.1 christos int 1760 1.1 christos main (int argc, char * argv[]) 1761 1.1 christos { 1762 1.1 christos setlocale (LC_CTYPE, ""); 1763 1.1 christos if (argc == 1) 1764 1.1 christos { 1765 1.1 christos /* Display all the break opportunities in the input string. */ 1766 1.1 christos char *input = read_file (stdin); 1767 1.1 christos int length = strlen (input); 1768 1.1 christos char *breaks = malloc (length); 1769 1.1 christos int i; 1770 1.1 christos 1771 1.1 christos mbs_possible_linebreaks (input, length, locale_charset (), breaks); 1772 1.1 christos 1773 1.1 christos for (i = 0; i < length; i++) 1774 1.1 christos { 1775 1.1 christos switch (breaks[i]) 1776 1.1 christos { 1777 1.1 christos case UC_BREAK_POSSIBLE: 1778 1.1 christos putc ('|', stdout); 1779 1.1 christos break; 1780 1.1 christos case UC_BREAK_MANDATORY: 1781 1.1 christos break; 1782 1.1 christos case UC_BREAK_PROHIBITED: 1783 1.1 christos break; 1784 1.1 christos default: 1785 1.1 christos abort (); 1786 1.1 christos } 1787 1.1 christos putc (input[i], stdout); 1788 1.1 christos } 1789 1.1 christos 1790 1.1 christos free (breaks); 1791 1.1 christos 1792 1.1 christos return 0; 1793 1.1 christos } 1794 1.1 christos else if (argc == 2) 1795 1.1 christos { 1796 1.1 christos /* Insert line breaks for a given width. */ 1797 1.1 christos int width = atoi (argv[1]); 1798 1.1 christos char *input = read_file (stdin); 1799 1.1 christos int length = strlen (input); 1800 1.1 christos char *breaks = malloc (length); 1801 1.1 christos int i; 1802 1.1 christos 1803 1.1 christos mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks); 1804 1.1 christos 1805 1.1 christos for (i = 0; i < length; i++) 1806 1.1 christos { 1807 1.1 christos switch (breaks[i]) 1808 1.1 christos { 1809 1.1 christos case UC_BREAK_POSSIBLE: 1810 1.1 christos putc ('\n', stdout); 1811 1.1 christos break; 1812 1.1 christos case UC_BREAK_MANDATORY: 1813 1.1 christos break; 1814 1.1 christos case UC_BREAK_PROHIBITED: 1815 1.1 christos break; 1816 1.1 christos default: 1817 1.1 christos abort (); 1818 1.1 christos } 1819 1.1 christos putc (input[i], stdout); 1820 1.1 christos } 1821 1.1 christos 1822 1.1 christos free (breaks); 1823 1.1 christos 1824 1.1 christos return 0; 1825 1.1 christos } 1826 1.1 christos else 1827 1.1 christos return 1; 1828 1.1 christos } 1829 1.1 christos 1830 1.1 christos #endif /* TEST2 */ 1831