/* linebreak.c - line breaking of Unicode strings
   Copyright (C) 2001-2003, 2006 Free Software Foundation, Inc.
   Written by Bruno Haible <haible@clisp.cons.org>, 2001.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software Foundation,
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 1.1 christos
19 1.1 christos #include <config.h>
20 1.1 christos
21 1.1 christos /* Specification. */
22 1.1 christos #include "linebreak.h"
23 1.1 christos
24 1.1 christos #include <stdlib.h>
25 1.1 christos #include <string.h>
26 1.1 christos #include "c-ctype.h"
27 1.1 christos #include "xsize.h"
28 1.1 christos
29 1.1 christos #include "utf8-ucs4.h"
30 1.1 christos
31 1.1 christos #ifdef unused
32 1.1 christos #include "utf16-ucs4.h"
33 1.1 christos
/* Fetch one character from the 32-bit unit string S and store it in *PUC.
   Each unit is copied as-is, so exactly one unit is consumed; N (the number
   of units available) is accepted only for signature parity with the other
   *_mbtouc helpers and is not consulted.
   Returns the number of units consumed, which is always 1.  */
static inline int
u32_mbtouc (unsigned int *puc, const unsigned int *s, size_t n)
{
  (void) n;   /* intentionally unused */
  *puc = *s;
  return 1;
}
40 1.1 christos #endif
41 1.1 christos
42 1.1 christos
/* Help GCC to generate good code for string comparisons with
   immediate strings.

   STREQ (s1, s2, s20, ..., s28) yields nonzero iff the NUL-terminated
   string S1 equals the string S2.  S20..S28 must be the first nine
   characters of S2, padded with 0 once S2 has ended.

   With an optimizing GCC the comparison is unrolled character by character
   against the immediate values S20..S28; only when all nine match and are
   nonzero is the remainder compared with strcmp.  Each helper streqK reads
   s1[K] only after s1[0..K-1] matched nonzero characters, so S1 is never
   read past its terminating NUL.  Otherwise STREQ is a plain strcmp.  */
#if defined (__GNUC__) && defined (__OPTIMIZE__)

/* Compare the tails of S1 and S2 starting at offset 9.  */
static inline int
streq9 (const char *s1, const char *s2)
{
  return strcmp (s1 + 9, s2 + 9) == 0;
}

/* streqK: S1 and S2 are known to agree on offsets 0..K-1.  Check offset K
   against the immediate character and continue with offset K+1 unless the
   end of the string has been reached.  */
static inline int
streq8 (const char *s1, const char *s2, char s28)
{
  return s1[8] == s28 && (s28 == 0 || streq9 (s1, s2));
}

static inline int
streq7 (const char *s1, const char *s2, char s27, char s28)
{
  return s1[7] == s27 && (s27 == 0 || streq8 (s1, s2, s28));
}

static inline int
streq6 (const char *s1, const char *s2, char s26, char s27, char s28)
{
  return s1[6] == s26 && (s26 == 0 || streq7 (s1, s2, s27, s28));
}

static inline int
streq5 (const char *s1, const char *s2, char s25, char s26, char s27, char s28)
{
  return s1[5] == s25 && (s25 == 0 || streq6 (s1, s2, s26, s27, s28));
}

static inline int
streq4 (const char *s1, const char *s2, char s24, char s25, char s26, char s27, char s28)
{
  return s1[4] == s24 && (s24 == 0 || streq5 (s1, s2, s25, s26, s27, s28));
}

static inline int
streq3 (const char *s1, const char *s2, char s23, char s24, char s25, char s26, char s27, char s28)
{
  return s1[3] == s23
         && (s23 == 0 || streq4 (s1, s2, s24, s25, s26, s27, s28));
}

static inline int
streq2 (const char *s1, const char *s2, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  return s1[2] == s22
         && (s22 == 0 || streq3 (s1, s2, s23, s24, s25, s26, s27, s28));
}

static inline int
streq1 (const char *s1, const char *s2, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  return s1[1] == s21
         && (s21 == 0 || streq2 (s1, s2, s22, s23, s24, s25, s26, s27, s28));
}

static inline int
streq0 (const char *s1, const char *s2, char s20, char s21, char s22, char s23, char s24, char s25, char s26, char s27, char s28)
{
  return s1[0] == s20
         && (s20 == 0
             || streq1 (s1, s2, s21, s22, s23, s24, s25, s26, s27, s28));
}

#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  streq0 (s1, s2, s20, s21, s22, s23, s24, s25, s26, s27, s28)

#else

/* Portable fallback: the immediate characters are ignored.  */
#define STREQ(s1,s2,s20,s21,s22,s23,s24,s25,s26,s27,s28) \
  (strcmp ((s1), (s2)) == 0)

#endif
188 1.1 christos
189 1.1 christos
/* Return 1 if ENCODING is exactly one of the legacy CJK encoding names
   listed below, 0 otherwise.  ENCODING must be a NUL-terminated string.  */
static int
is_cjk_encoding (const char *encoding)
{
  return (/* Legacy Japanese encodings */
          STREQ (encoding, "EUC-JP", 'E', 'U', 'C', '-', 'J', 'P', 0, 0, 0)
          /* Legacy Chinese encodings */
          || STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
          || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
          || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
          || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
          /* Legacy Korean encodings */
          || STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0)
          || STREQ (encoding, "CP949", 'C', 'P', '9', '4', '9', 0, 0, 0, 0)
          || STREQ (encoding, "JOHAB", 'J', 'O', 'H', 'A', 'B', 0, 0, 0, 0));
}
208 1.1 christos
/* Return 1 if ENCODING is exactly the string "UTF-8", 0 otherwise.
   ENCODING must be a NUL-terminated string.  */
static int
is_utf8_encoding (const char *encoding)
{
  return STREQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0);
}
216 1.1 christos
217 1.1 christos
/* Determine number of column positions required for UC.
   ENCODING is the name of the character encoding in use.  */
int uc_width (unsigned int uc, const char *encoding);
220 1.1 christos
/*
 * Non-spacing attribute table.
 * Consists of:
 * - Non-spacing characters; generated from PropList.txt or
 *   "grep '^[^;]*;[^;]*;[^;]*;[^;]*;NSM;' UnicodeData.txt"
 * - Format control characters; generated from
 *   "grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt"
 * - Zero width characters; generated from
 *   "grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt"
 *
 * Layout: 16 blocks of 64 bytes.  Each block is a bitmap covering 512
 * consecutive code points (one bit per code point, least significant bit
 * first within a byte).  Which 512-code-point range a block describes is
 * given by nonspacing_table_ind.
 */
static const unsigned char nonspacing_table_data[16*64] = {
  /* 0x0000-0x01ff */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, /* 0x0000-0x003f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0x0040-0x007f */
  0xff, 0xff, 0xff, 0xff, 0x00, 0x20, 0x00, 0x00, /* 0x0080-0x00bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00c0-0x00ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0100-0x013f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0140-0x017f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0180-0x01bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x01c0-0x01ff */
  /* 0x0200-0x03ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0200-0x023f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0240-0x027f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0280-0x02bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x02c0-0x02ff */
  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* 0x0300-0x033f */
  0xff, 0xff, 0xff, 0xe0, 0xff, 0xff, 0x00, 0x00, /* 0x0340-0x037f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0380-0x03bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x03c0-0x03ff */
  /* 0x0400-0x05ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0400-0x043f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0440-0x047f */
  0x78, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0480-0x04bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x04c0-0x04ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0500-0x053f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0540-0x057f */
  0x00, 0x00, 0xfe, 0xff, 0xfb, 0xff, 0xff, 0xbb, /* 0x0580-0x05bf */
  0x16, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x05c0-0x05ff */
  /* 0x0600-0x07ff */
  0x0f, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0600-0x063f */
  0x00, 0xf8, 0xff, 0x01, 0x00, 0x00, 0x01, 0x00, /* 0x0640-0x067f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0680-0x06bf */
  0x00, 0x00, 0xc0, 0xff, 0x9f, 0x3d, 0x00, 0x00, /* 0x06c0-0x06ff */
  0x00, 0x80, 0x02, 0x00, 0x00, 0x00, 0xff, 0xff, /* 0x0700-0x073f */
  0xff, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0740-0x077f */
  0x00, 0x00, 0x00, 0x00, 0xc0, 0xff, 0x01, 0x00, /* 0x0780-0x07bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x07c0-0x07ff */
  /* 0x0800-0x09ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0800-0x083f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0840-0x087f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0880-0x08bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08c0-0x08ff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0900-0x093f */
  0xfe, 0x21, 0x1e, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0940-0x097f */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0980-0x09bf */
  0x1e, 0x20, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x09c0-0x09ff */
  /* 0x0a00-0x0bff */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a00-0x0a3f */
  0x86, 0x39, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, /* 0x0a40-0x0a7f */
  0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0a80-0x0abf */
  0xbe, 0x21, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, /* 0x0ac0-0x0aff */
  0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x90, /* 0x0b00-0x0b3f */
  0x0e, 0x20, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b40-0x0b7f */
  0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0b80-0x0bbf */
  0x01, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0bc0-0x0bff */
  /* 0x0c00-0x0dff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xc0, /* 0x0c00-0x0c3f */
  0xc1, 0x3d, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0c40-0x0c7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, /* 0x0c80-0x0cbf */
  0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0cc0-0x0cff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d00-0x0d3f */
  0x0e, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d40-0x0d7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0d80-0x0dbf */
  0x00, 0x04, 0x5c, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0dc0-0x0dff */
  /* 0x0e00-0x0fff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x07, /* 0x0e00-0x0e3f */
  0x80, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0e40-0x0e7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf2, 0x1b, /* 0x0e80-0x0ebf */
  0x00, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0ec0-0x0eff */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0xa0, 0x02, /* 0x0f00-0x0f3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfe, 0x7f, /* 0x0f40-0x0f7f */
  0xdf, 0x00, 0xff, 0xfe, 0xff, 0xff, 0xff, 0x1f, /* 0x0f80-0x0fbf */
  0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x0fc0-0x0fff */
  /* 0x1000-0x11ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xc5, 0x02, /* 0x1000-0x103f */
  0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, /* 0x1040-0x107f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1080-0x10bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10c0-0x10ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1100-0x113f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1140-0x117f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1180-0x11bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x11c0-0x11ff */
  /* 0x1600-0x17ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1600-0x163f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1640-0x167f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1680-0x16bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x16c0-0x16ff */
  0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00, /* 0x1700-0x173f */
  0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, /* 0x1740-0x177f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x3f, /* 0x1780-0x17bf */
  0x40, 0xfe, 0x0f, 0x20, 0x00, 0x00, 0x00, 0x00, /* 0x17c0-0x17ff */
  /* 0x1800-0x19ff */
  0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1800-0x183f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1840-0x187f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, /* 0x1880-0x18bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18c0-0x18ff */
  0x00, 0x00, 0x00, 0x00, 0x87, 0x0f, 0x04, 0x0e, /* 0x1900-0x193f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1940-0x197f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1980-0x19bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x19c0-0x19ff */
  /* 0x2000-0x21ff */
  0x00, 0xf8, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, /* 0x2000-0x203f */
  0x00, 0x00, 0x00, 0x00, 0x0f, 0xfc, 0x00, 0x00, /* 0x2040-0x207f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2080-0x20bf */
  0x00, 0x00, 0xff, 0xff, 0xff, 0x07, 0x00, 0x00, /* 0x20c0-0x20ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2100-0x213f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2140-0x217f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x2180-0x21bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x21c0-0x21ff */
  /* 0x3000-0x31ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x00, /* 0x3000-0x303f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3040-0x307f */
  0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, /* 0x3080-0x30bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30c0-0x30ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3100-0x313f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3140-0x317f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x3180-0x31bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x31c0-0x31ff */
  /* 0xfa00-0xfbff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa00-0xfa3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa40-0xfa7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfa80-0xfabf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfac0-0xfaff */
  0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, /* 0xfb00-0xfb3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb40-0xfb7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfb80-0xfbbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfbc0-0xfbff */
  /* 0xfe00-0xffff */
  0xff, 0xff, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, /* 0xfe00-0xfe3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe40-0xfe7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xfe80-0xfebf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, /* 0xfec0-0xfeff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff00-0xff3f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff40-0xff7f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xff80-0xffbf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, /* 0xffc0-0xffff */
  /* 0x1d000-0x1d1ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d000-0x1d03f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d040-0x1d07f */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d080-0x1d0bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d0c0-0x1d0ff */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x1d100-0x1d13f */
  0x00, 0x00, 0x00, 0x00, 0x80, 0x03, 0x00, 0xf8, /* 0x1d140-0x1d17f */
  0xe7, 0x0f, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, /* 0x1d180-0x1d1bf */
  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0x1d1c0-0x1d1ff */
};
/* Index into nonspacing_table_data: entry K describes the 512 code points
   starting at K * 0x200.  A value >= 0 is the number of the 64-byte block
   in nonspacing_table_data holding that range's bitmap; -1 means the range
   has no bitmap.  The table covers 0x0000-0x1dfff.  */
static const signed char nonspacing_table_ind[240] = {
   0,  1,  2,  3,  4,  5,  6,  7, /* 0x0000-0x0fff */
   8, -1, -1,  9, 10, -1, -1, -1, /* 0x1000-0x1fff */
  11, -1, -1, -1, -1, -1, -1, -1, /* 0x2000-0x2fff */
  12, -1, -1, -1, -1, -1, -1, -1, /* 0x3000-0x3fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x4000-0x4fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x5000-0x5fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x6000-0x6fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x7000-0x7fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x8000-0x8fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x9000-0x9fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xa000-0xafff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xb000-0xbfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xc000-0xcfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xd000-0xdfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0xe000-0xefff */
  -1, -1, -1, -1, -1, 13, -1, 14, /* 0xf000-0xffff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10000-0x10fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x11000-0x11fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x12000-0x12fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x13000-0x13fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x14000-0x14fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x15000-0x15fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x16000-0x16fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x17000-0x17fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x18000-0x18fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x19000-0x19fff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1a000-0x1afff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1b000-0x1bfff */
  -1, -1, -1, -1, -1, -1, -1, -1, /* 0x1c000-0x1cfff */
  15, -1, -1, -1, -1, -1, -1, -1  /* 0x1d000-0x1dfff */
};
409 1.1 christos
410 1.1 christos /* Determine number of column positions required for UC. */
411 1.1 christos int
412 1.1 christos uc_width (unsigned int uc, const char *encoding)
413 1.1 christos {
414 1.1 christos /* Test for non-spacing or control character. */
415 1.1 christos if ((uc >> 9) < 240)
416 1.1 christos {
417 1.1 christos int ind = nonspacing_table_ind[uc >> 9];
418 1.1 christos if (ind >= 0)
419 1.1 christos if ((nonspacing_table_data[64*ind + ((uc >> 3) & 63)] >> (uc & 7)) & 1)
420 1.1 christos {
421 1.1 christos if (uc > 0 && uc < 0xa0)
422 1.1 christos return -1;
423 1.1 christos else
424 1.1 christos return 0;
425 1.1 christos }
426 1.1 christos }
427 1.1 christos else if ((uc >> 9) == (0xe0000 >> 9))
428 1.1 christos {
429 1.1 christos if (uc < 0xe0100
430 1.1 christos ? (uc >= 0xe0020 ? uc <= 0xe007f : uc == 0xe0001)
431 1.1 christos : (uc <= 0xe01ef))
432 1.1 christos return 0;
433 1.1 christos }
434 1.1 christos /* Test for double-width character.
435 1.1 christos * Generated from "grep '^....;[WF]' EastAsianWidth.txt"
436 1.1 christos * and "grep '^....;[^WF]' EastAsianWidth.txt"
437 1.1 christos */
438 1.1 christos if (uc >= 0x1100
439 1.1 christos && ((uc < 0x1160) /* Hangul Jamo */
440 1.1 christos || (uc >= 0x2e80 && uc < 0x4dc0 /* CJK */
441 1.1 christos && !(uc == 0x303f))
442 1.1 christos || (uc >= 0x4e00 && uc < 0xa4d0) /* CJK ... Yi */
443 1.1 christos || (uc >= 0xac00 && uc < 0xd7a4) /* Hangul Syllables */
444 1.1 christos || (uc >= 0xf900 && uc < 0xfb00) /* CJK Compatibility Ideographs */
445 1.1 christos || (uc >= 0xfe30 && uc < 0xfe70) /* CJK Compatibility Forms */
446 1.1 christos || (uc >= 0xff00 && uc < 0xff61) /* Fullwidth Forms */
447 1.1 christos || (uc >= 0xffe0 && uc < 0xffe7)
448 1.1 christos || (uc >= 0x20000 && uc <= 0x2fffd) /* CJK, CJK Compatibility Ideographs */
449 1.1 christos || (uc >= 0x30000 && uc <= 0x3fffd)
450 1.1 christos ) )
451 1.1 christos return 2;
452 1.1 christos /* In ancient CJK encodings, Cyrillic and most other characters are
453 1.1 christos double-width as well. */
454 1.1 christos if (uc >= 0x00A1 && uc < 0xFF61 && uc != 0x20A9
455 1.1 christos && is_cjk_encoding (encoding))
456 1.1 christos return 2;
457 1.1 christos return 1;
458 1.1 christos }
459 1.1 christos
460 1.1 christos
461 1.1 christos #ifdef unused
462 1.1 christos
/* Determine number of column positions required for first N units
   (or fewer if S ends before this) in S.

   S is decoded with u8_mbtouc; scanning stops at the first NUL character
   or after N units.  Characters for which uc_width returns a negative
   value (control characters) contribute nothing to the sum.  */

int
u8_width (const unsigned char *s, size_t n, const char *encoding)
{
  const unsigned char *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc;
      int w;

      s += u8_mbtouc (&uc, s, s_end - s);

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
489 1.1 christos
/* Determine number of column positions required for the first N 16-bit
   units (or fewer if S ends before this) in S.

   S is decoded with u16_mbtouc; scanning stops at the first NUL character
   or after N units.  Characters for which uc_width returns a negative
   value (control characters) contribute nothing to the sum.  */
int
u16_width (const unsigned short *s, size_t n, const char *encoding)
{
  const unsigned short *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc;
      int w;

      s += u16_mbtouc (&uc, s, s_end - s);

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
513 1.1 christos
/* Determine number of column positions required for the first N 32-bit
   units (or fewer if S ends before this) in S.

   Each unit is taken as one character; scanning stops at the first zero
   unit or after N units.  Characters for which uc_width returns a negative
   value (control characters) contribute nothing to the sum.  */
int
u32_width (const unsigned int *s, size_t n, const char *encoding)
{
  const unsigned int *s_end = s + n;
  int width = 0;

  while (s < s_end)
    {
      unsigned int uc = *s++;
      int w;

      if (uc == 0)
        break; /* end of string reached */

      w = uc_width (uc, encoding);
      if (w >= 0) /* ignore control characters in the string */
        width += w;
    }

  return width;
}
535 1.1 christos
536 1.1 christos #endif
537 1.1 christos
538 1.1 christos
539 1.1 christos /* Determine the line break points in S, and store the result at p[0..n-1]. */
540 1.1 christos /* We don't support line breaking of complex-context dependent characters
541 1.1 christos (Thai, Lao, Myanmar, Khmer) yet, because it requires dictionary lookup. */
542 1.1 christos
/* Line breaking classification.

   The numeric values are not in declaration order: classes numbered
   1..19 are the ones that index the 19x19 pair table (as value - 1),
   LBP_BK is 0, and classes numbered 20 and above are resolved at run
   time.  */

enum
{
  /* Values >= 20 are resolved at run time. */
  LBP_BK =  0, /* mandatory break */
/*LBP_CR,         carriage return - not used here because it's a DOSism */
/*LBP_LF,         line feed - not used here because it's a DOSism */
  LBP_CM = 20, /* attached characters and combining marks */
/*LBP_SG,         surrogates - not used here because they are not characters */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SP = 21, /* space */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
578 1.1 christos
579 1.1 christos #include "lbrkprop.h"
580 1.1 christos
581 1.1 christos static inline unsigned char
582 1.1 christos lbrkprop_lookup (unsigned int uc)
583 1.1 christos {
584 1.1 christos unsigned int index1 = uc >> lbrkprop_header_0;
585 1.1 christos if (index1 < lbrkprop_header_1)
586 1.1 christos {
587 1.1 christos int lookup1 = lbrkprop.level1[index1];
588 1.1 christos if (lookup1 >= 0)
589 1.1 christos {
590 1.1 christos unsigned int index2 = (uc >> lbrkprop_header_2) & lbrkprop_header_3;
591 1.1 christos int lookup2 = lbrkprop.level2[lookup1 + index2];
592 1.1 christos if (lookup2 >= 0)
593 1.1 christos {
594 1.1 christos unsigned int index3 = uc & lbrkprop_header_4;
595 1.1 christos return lbrkprop.level3[lookup2 + index3];
596 1.1 christos }
597 1.1 christos }
598 1.1 christos }
599 1.1 christos return LBP_XX;
600 1.1 christos }
601 1.1 christos
/* Table indexed by two line breaking classifications.
   The first index is the class of the character before the potential
   break, the second the class of the character after it; both are the
   LBP_* value minus 1 (LBP_ZW .. LBP_PO).  */
#define D 1  /* direct break opportunity, empty in table 7.3 of UTR #14 */
#define I 2  /* indirect break opportunity, '%' in table 7.3 of UTR #14 */
#define P 3  /* prohibited break,           '^' in table 7.3 of UTR #14 */
static const unsigned char lbrk_table[19][19] = {
                                /* after */
        /* ZW IN GL BA BB B2 HY NS OP CL QU EX ID NU IS SY AL PR PO */
/* ZW */ { P, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, },
/* IN */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* GL */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* BA */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* BB */ { P, I, I, I, I, I, I, I, I, P, I, P, I, I, P, P, I, I, I, },
/* B2 */ { P, D, I, I, D, P, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* HY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* NS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* OP */ { P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, P, },
/* CL */ { P, D, I, I, D, D, I, P, D, P, I, P, D, D, P, P, D, D, I, },
/* QU */ { P, I, I, I, I, I, I, I, P, P, I, P, I, I, P, P, I, I, I, },
/* EX */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* ID */ { P, I, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, I, },
/* NU */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, I, },
/* IS */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* SY */ { P, D, I, I, D, D, I, I, D, P, I, P, D, I, P, P, D, D, D, },
/* AL */ { P, I, I, I, D, D, I, I, D, P, I, P, D, I, P, P, I, D, D, },
/* PR */ { P, D, I, I, D, D, I, I, I, P, I, P, I, I, P, P, I, D, D, },
/* PO */ { P, D, I, I, D, D, I, I, D, P, I, P, D, D, P, P, D, D, D, },
/* "" */
/* before */
};
/* Note: The (B2,B2) entry should probably be D instead of P.  */
/* Note: The (PR,ID) entry should probably be D instead of I.  */
633 1.1 christos
634 1.1 christos void
635 1.1 christos u8_possible_linebreaks (const unsigned char *s, size_t n, const char *encoding, char *p)
636 1.1 christos {
637 1.1 christos int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
638 1.1 christos const unsigned char *s_end = s + n;
639 1.1 christos int last_prop = LBP_BK; /* line break property of last non-space character */
640 1.1 christos char *seen_space = NULL; /* Was a space seen after the last non-space character? */
641 1.1 christos char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
642 1.1 christos
643 1.1 christos /* Don't break inside multibyte characters. */
644 1.1 christos memset (p, UC_BREAK_PROHIBITED, n);
645 1.1 christos
646 1.1 christos while (s < s_end)
647 1.1 christos {
648 1.1 christos unsigned int uc;
649 1.1 christos int count = u8_mbtouc (&uc, s, s_end - s);
650 1.1 christos int prop = lbrkprop_lookup (uc);
651 1.1 christos
652 1.1 christos if (prop == LBP_BK)
653 1.1 christos {
654 1.1 christos /* Mandatory break. */
655 1.1 christos *p = UC_BREAK_MANDATORY;
656 1.1 christos last_prop = LBP_BK;
657 1.1 christos seen_space = NULL;
658 1.1 christos seen_space2 = NULL;
659 1.1 christos }
660 1.1 christos else
661 1.1 christos {
662 1.1 christos char *q;
663 1.1 christos
664 1.1 christos /* Resolve property values whose behaviour is not fixed. */
665 1.1 christos switch (prop)
666 1.1 christos {
667 1.1 christos case LBP_AI:
668 1.1 christos /* Resolve ambiguous. */
669 1.1 christos prop = LBP_AI_REPLACEMENT;
670 1.1 christos break;
671 1.1 christos case LBP_CB:
672 1.1 christos /* This is arbitrary. */
673 1.1 christos prop = LBP_ID;
674 1.1 christos break;
675 1.1 christos case LBP_SA:
676 1.1 christos /* We don't handle complex scripts yet.
677 1.1 christos Treat LBP_SA like LBP_XX. */
678 1.1 christos case LBP_XX:
679 1.1 christos /* This is arbitrary. */
680 1.1 christos prop = LBP_AL;
681 1.1 christos break;
682 1.1 christos }
683 1.1 christos
684 1.1 christos /* Deal with combining characters. */
685 1.1 christos q = p;
686 1.1 christos if (prop == LBP_CM)
687 1.1 christos {
688 1.1 christos /* Don't break just before a combining character. */
689 1.1 christos *p = UC_BREAK_PROHIBITED;
690 1.1 christos /* A combining character turns a preceding space into LBP_AL. */
691 1.1 christos if (seen_space != NULL)
692 1.1 christos {
693 1.1 christos q = seen_space;
694 1.1 christos seen_space = seen_space2;
695 1.1 christos prop = LBP_AL;
696 1.1 christos goto lookup_via_table;
697 1.1 christos }
698 1.1 christos }
699 1.1 christos else if (prop == LBP_SP)
700 1.1 christos {
701 1.1 christos /* Don't break just before a space. */
702 1.1 christos *p = UC_BREAK_PROHIBITED;
703 1.1 christos seen_space2 = seen_space;
704 1.1 christos seen_space = p;
705 1.1 christos }
706 1.1 christos else
707 1.1 christos {
708 1.1 christos lookup_via_table:
709 1.1 christos /* prop must be usable as an index for table 7.3 of UTR #14. */
710 1.1 christos if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
711 1.1 christos abort ();
712 1.1 christos
713 1.1 christos if (last_prop == LBP_BK)
714 1.1 christos {
715 1.1 christos /* Don't break at the beginning of a line. */
716 1.1 christos *q = UC_BREAK_PROHIBITED;
717 1.1 christos }
718 1.1 christos else
719 1.1 christos {
720 1.1 christos switch (lbrk_table [last_prop-1] [prop-1])
721 1.1 christos {
722 1.1 christos case D:
723 1.1 christos *q = UC_BREAK_POSSIBLE;
724 1.1 christos break;
725 1.1 christos case I:
726 1.1 christos *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
727 1.1 christos break;
728 1.1 christos case P:
729 1.1 christos *q = UC_BREAK_PROHIBITED;
730 1.1 christos break;
731 1.1 christos default:
732 1.1 christos abort ();
733 1.1 christos }
734 1.1 christos }
735 1.1 christos last_prop = prop;
736 1.1 christos seen_space = NULL;
737 1.1 christos seen_space2 = NULL;
738 1.1 christos }
739 1.1 christos }
740 1.1 christos
741 1.1 christos s += count;
742 1.1 christos p += count;
743 1.1 christos }
744 1.1 christos }
745 1.1 christos
746 1.1 christos #ifdef unused
747 1.1 christos
748 1.1 christos void
749 1.1 christos u16_possible_linebreaks (const unsigned short *s, size_t n, const char *encoding, char *p)
750 1.1 christos {
751 1.1 christos int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
752 1.1 christos const unsigned short *s_end = s + n;
753 1.1 christos int last_prop = LBP_BK; /* line break property of last non-space character */
754 1.1 christos char *seen_space = NULL; /* Was a space seen after the last non-space character? */
755 1.1 christos char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
756 1.1 christos
757 1.1 christos /* Don't break inside multibyte characters. */
758 1.1 christos memset (p, UC_BREAK_PROHIBITED, n);
759 1.1 christos
760 1.1 christos while (s < s_end)
761 1.1 christos {
762 1.1 christos unsigned int uc;
763 1.1 christos int count = u16_mbtouc (&uc, s, s_end - s);
764 1.1 christos int prop = lbrkprop_lookup (uc);
765 1.1 christos
766 1.1 christos if (prop == LBP_BK)
767 1.1 christos {
768 1.1 christos /* Mandatory break. */
769 1.1 christos *p = UC_BREAK_MANDATORY;
770 1.1 christos last_prop = LBP_BK;
771 1.1 christos seen_space = NULL;
772 1.1 christos seen_space2 = NULL;
773 1.1 christos }
774 1.1 christos else
775 1.1 christos {
776 1.1 christos char *q;
777 1.1 christos
778 1.1 christos /* Resolve property values whose behaviour is not fixed. */
779 1.1 christos switch (prop)
780 1.1 christos {
781 1.1 christos case LBP_AI:
782 1.1 christos /* Resolve ambiguous. */
783 1.1 christos prop = LBP_AI_REPLACEMENT;
784 1.1 christos break;
785 1.1 christos case LBP_CB:
786 1.1 christos /* This is arbitrary. */
787 1.1 christos prop = LBP_ID;
788 1.1 christos break;
789 1.1 christos case LBP_SA:
790 1.1 christos /* We don't handle complex scripts yet.
791 1.1 christos Treat LBP_SA like LBP_XX. */
792 1.1 christos case LBP_XX:
793 1.1 christos /* This is arbitrary. */
794 1.1 christos prop = LBP_AL;
795 1.1 christos break;
796 1.1 christos }
797 1.1 christos
798 1.1 christos /* Deal with combining characters. */
799 1.1 christos q = p;
800 1.1 christos if (prop == LBP_CM)
801 1.1 christos {
802 1.1 christos /* Don't break just before a combining character. */
803 1.1 christos *p = UC_BREAK_PROHIBITED;
804 1.1 christos /* A combining character turns a preceding space into LBP_AL. */
805 1.1 christos if (seen_space != NULL)
806 1.1 christos {
807 1.1 christos q = seen_space;
808 1.1 christos seen_space = seen_space2;
809 1.1 christos prop = LBP_AL;
810 1.1 christos goto lookup_via_table;
811 1.1 christos }
812 1.1 christos }
813 1.1 christos else if (prop == LBP_SP)
814 1.1 christos {
815 1.1 christos /* Don't break just before a space. */
816 1.1 christos *p = UC_BREAK_PROHIBITED;
817 1.1 christos seen_space2 = seen_space;
818 1.1 christos seen_space = p;
819 1.1 christos }
820 1.1 christos else
821 1.1 christos {
822 1.1 christos lookup_via_table:
823 1.1 christos /* prop must be usable as an index for table 7.3 of UTR #14. */
824 1.1 christos if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
825 1.1 christos abort ();
826 1.1 christos
827 1.1 christos if (last_prop == LBP_BK)
828 1.1 christos {
829 1.1 christos /* Don't break at the beginning of a line. */
830 1.1 christos *q = UC_BREAK_PROHIBITED;
831 1.1 christos }
832 1.1 christos else
833 1.1 christos {
834 1.1 christos switch (lbrk_table [last_prop-1] [prop-1])
835 1.1 christos {
836 1.1 christos case D:
837 1.1 christos *q = UC_BREAK_POSSIBLE;
838 1.1 christos break;
839 1.1 christos case I:
840 1.1 christos *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
841 1.1 christos break;
842 1.1 christos case P:
843 1.1 christos *q = UC_BREAK_PROHIBITED;
844 1.1 christos break;
845 1.1 christos default:
846 1.1 christos abort ();
847 1.1 christos }
848 1.1 christos }
849 1.1 christos last_prop = prop;
850 1.1 christos seen_space = NULL;
851 1.1 christos seen_space2 = NULL;
852 1.1 christos }
853 1.1 christos }
854 1.1 christos
855 1.1 christos s += count;
856 1.1 christos p += count;
857 1.1 christos }
858 1.1 christos }
859 1.1 christos
860 1.1 christos void
861 1.1 christos u32_possible_linebreaks (const unsigned int *s, size_t n, const char *encoding, char *p)
862 1.1 christos {
863 1.1 christos int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL);
864 1.1 christos const unsigned int *s_end = s + n;
865 1.1 christos int last_prop = LBP_BK; /* line break property of last non-space character */
866 1.1 christos char *seen_space = NULL; /* Was a space seen after the last non-space character? */
867 1.1 christos char *seen_space2 = NULL; /* At least two spaces after the last non-space? */
868 1.1 christos
869 1.1 christos while (s < s_end)
870 1.1 christos {
871 1.1 christos unsigned int uc = *s;
872 1.1 christos int prop = lbrkprop_lookup (uc);
873 1.1 christos
874 1.1 christos if (prop == LBP_BK)
875 1.1 christos {
876 1.1 christos /* Mandatory break. */
877 1.1 christos *p = UC_BREAK_MANDATORY;
878 1.1 christos last_prop = LBP_BK;
879 1.1 christos seen_space = NULL;
880 1.1 christos seen_space2 = NULL;
881 1.1 christos }
882 1.1 christos else
883 1.1 christos {
884 1.1 christos char *q;
885 1.1 christos
886 1.1 christos /* Resolve property values whose behaviour is not fixed. */
887 1.1 christos switch (prop)
888 1.1 christos {
889 1.1 christos case LBP_AI:
890 1.1 christos /* Resolve ambiguous. */
891 1.1 christos prop = LBP_AI_REPLACEMENT;
892 1.1 christos break;
893 1.1 christos case LBP_CB:
894 1.1 christos /* This is arbitrary. */
895 1.1 christos prop = LBP_ID;
896 1.1 christos break;
897 1.1 christos case LBP_SA:
898 1.1 christos /* We don't handle complex scripts yet.
899 1.1 christos Treat LBP_SA like LBP_XX. */
900 1.1 christos case LBP_XX:
901 1.1 christos /* This is arbitrary. */
902 1.1 christos prop = LBP_AL;
903 1.1 christos break;
904 1.1 christos }
905 1.1 christos
906 1.1 christos /* Deal with combining characters. */
907 1.1 christos q = p;
908 1.1 christos if (prop == LBP_CM)
909 1.1 christos {
910 1.1 christos /* Don't break just before a combining character. */
911 1.1 christos *p = UC_BREAK_PROHIBITED;
912 1.1 christos /* A combining character turns a preceding space into LBP_AL. */
913 1.1 christos if (seen_space != NULL)
914 1.1 christos {
915 1.1 christos q = seen_space;
916 1.1 christos seen_space = seen_space2;
917 1.1 christos prop = LBP_AL;
918 1.1 christos goto lookup_via_table;
919 1.1 christos }
920 1.1 christos }
921 1.1 christos else if (prop == LBP_SP)
922 1.1 christos {
923 1.1 christos /* Don't break just before a space. */
924 1.1 christos *p = UC_BREAK_PROHIBITED;
925 1.1 christos seen_space2 = seen_space;
926 1.1 christos seen_space = p;
927 1.1 christos }
928 1.1 christos else
929 1.1 christos {
930 1.1 christos lookup_via_table:
931 1.1 christos /* prop must be usable as an index for table 7.3 of UTR #14. */
932 1.1 christos if (!(prop >= 1 && prop <= sizeof(lbrk_table) / sizeof(lbrk_table[0])))
933 1.1 christos abort ();
934 1.1 christos
935 1.1 christos if (last_prop == LBP_BK)
936 1.1 christos {
937 1.1 christos /* Don't break at the beginning of a line. */
938 1.1 christos *q = UC_BREAK_PROHIBITED;
939 1.1 christos }
940 1.1 christos else
941 1.1 christos {
942 1.1 christos switch (lbrk_table [last_prop-1] [prop-1])
943 1.1 christos {
944 1.1 christos case D:
945 1.1 christos *q = UC_BREAK_POSSIBLE;
946 1.1 christos break;
947 1.1 christos case I:
948 1.1 christos *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED);
949 1.1 christos break;
950 1.1 christos case P:
951 1.1 christos *q = UC_BREAK_PROHIBITED;
952 1.1 christos break;
953 1.1 christos default:
954 1.1 christos abort ();
955 1.1 christos }
956 1.1 christos }
957 1.1 christos last_prop = prop;
958 1.1 christos seen_space = NULL;
959 1.1 christos seen_space2 = NULL;
960 1.1 christos }
961 1.1 christos }
962 1.1 christos
963 1.1 christos s++;
964 1.1 christos p++;
965 1.1 christos }
966 1.1 christos }
967 1.1 christos
968 1.1 christos #endif
969 1.1 christos
970 1.1 christos
971 1.1 christos /* Choose the best line breaks, assuming the uc_width function.
972 1.1 christos Return the column after the end of the string. */
973 1.1 christos
974 1.1 christos int
975 1.1 christos u8_width_linebreaks (const unsigned char *s, size_t n,
976 1.1 christos int width, int start_column, int at_end_columns,
977 1.1 christos const char *o, const char *encoding,
978 1.1 christos char *p)
979 1.1 christos {
980 1.1 christos const unsigned char *s_end;
981 1.1 christos char *last_p;
982 1.1 christos int last_column;
983 1.1 christos int piece_width;
984 1.1 christos
985 1.1 christos u8_possible_linebreaks (s, n, encoding, p);
986 1.1 christos
987 1.1 christos s_end = s + n;
988 1.1 christos last_p = NULL;
989 1.1 christos last_column = start_column;
990 1.1 christos piece_width = 0;
991 1.1 christos while (s < s_end)
992 1.1 christos {
993 1.1 christos unsigned int uc;
994 1.1 christos int count = u8_mbtouc (&uc, s, s_end - s);
995 1.1 christos
996 1.1 christos /* Respect the override. */
997 1.1 christos if (o != NULL && *o != UC_BREAK_UNDEFINED)
998 1.1 christos *p = *o;
999 1.1 christos
1000 1.1 christos if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1001 1.1 christos {
1002 1.1 christos /* An atomic piece of text ends here. */
1003 1.1 christos if (last_p != NULL && last_column + piece_width > width)
1004 1.1 christos {
1005 1.1 christos /* Insert a line break. */
1006 1.1 christos *last_p = UC_BREAK_POSSIBLE;
1007 1.1 christos last_column = 0;
1008 1.1 christos }
1009 1.1 christos }
1010 1.1 christos
1011 1.1 christos if (*p == UC_BREAK_MANDATORY)
1012 1.1 christos {
1013 1.1 christos /* uc is a line break character. */
1014 1.1 christos /* Start a new piece at column 0. */
1015 1.1 christos last_p = NULL;
1016 1.1 christos last_column = 0;
1017 1.1 christos piece_width = 0;
1018 1.1 christos }
1019 1.1 christos else
1020 1.1 christos {
1021 1.1 christos /* uc is not a line break character. */
1022 1.1 christos int w;
1023 1.1 christos
1024 1.1 christos if (*p == UC_BREAK_POSSIBLE)
1025 1.1 christos {
1026 1.1 christos /* Start a new piece. */
1027 1.1 christos last_p = p;
1028 1.1 christos last_column += piece_width;
1029 1.1 christos piece_width = 0;
1030 1.1 christos /* No line break for the moment, may be turned into
1031 1.1 christos UC_BREAK_POSSIBLE later, via last_p. */
1032 1.1 christos }
1033 1.1 christos
1034 1.1 christos *p = UC_BREAK_PROHIBITED;
1035 1.1 christos
1036 1.1 christos w = uc_width (uc, encoding);
1037 1.1 christos if (w >= 0) /* ignore control characters in the string */
1038 1.1 christos piece_width += w;
1039 1.1 christos }
1040 1.1 christos
1041 1.1 christos s += count;
1042 1.1 christos p += count;
1043 1.1 christos if (o != NULL)
1044 1.1 christos o += count;
1045 1.1 christos }
1046 1.1 christos
1047 1.1 christos /* The last atomic piece of text ends here. */
1048 1.1 christos if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1049 1.1 christos {
1050 1.1 christos /* Insert a line break. */
1051 1.1 christos *last_p = UC_BREAK_POSSIBLE;
1052 1.1 christos last_column = 0;
1053 1.1 christos }
1054 1.1 christos
1055 1.1 christos return last_column + piece_width;
1056 1.1 christos }
1057 1.1 christos
1058 1.1 christos #ifdef unused
1059 1.1 christos
1060 1.1 christos int
1061 1.1 christos u16_width_linebreaks (const unsigned short *s, size_t n,
1062 1.1 christos int width, int start_column, int at_end_columns,
1063 1.1 christos const char *o, const char *encoding,
1064 1.1 christos char *p)
1065 1.1 christos {
1066 1.1 christos const unsigned short *s_end;
1067 1.1 christos char *last_p;
1068 1.1 christos int last_column;
1069 1.1 christos int piece_width;
1070 1.1 christos
1071 1.1 christos u16_possible_linebreaks (s, n, encoding, p);
1072 1.1 christos
1073 1.1 christos s_end = s + n;
1074 1.1 christos last_p = NULL;
1075 1.1 christos last_column = start_column;
1076 1.1 christos piece_width = 0;
1077 1.1 christos while (s < s_end)
1078 1.1 christos {
1079 1.1 christos unsigned int uc;
1080 1.1 christos int count = u16_mbtouc (&uc, s, s_end - s);
1081 1.1 christos
1082 1.1 christos /* Respect the override. */
1083 1.1 christos if (o != NULL && *o != UC_BREAK_UNDEFINED)
1084 1.1 christos *p = *o;
1085 1.1 christos
1086 1.1 christos if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1087 1.1 christos {
1088 1.1 christos /* An atomic piece of text ends here. */
1089 1.1 christos if (last_p != NULL && last_column + piece_width > width)
1090 1.1 christos {
1091 1.1 christos /* Insert a line break. */
1092 1.1 christos *last_p = UC_BREAK_POSSIBLE;
1093 1.1 christos last_column = 0;
1094 1.1 christos }
1095 1.1 christos }
1096 1.1 christos
1097 1.1 christos if (*p == UC_BREAK_MANDATORY)
1098 1.1 christos {
1099 1.1 christos /* uc is a line break character. */
1100 1.1 christos /* Start a new piece at column 0. */
1101 1.1 christos last_p = NULL;
1102 1.1 christos last_column = 0;
1103 1.1 christos piece_width = 0;
1104 1.1 christos }
1105 1.1 christos else
1106 1.1 christos {
1107 1.1 christos /* uc is not a line break character. */
1108 1.1 christos int w;
1109 1.1 christos
1110 1.1 christos if (*p == UC_BREAK_POSSIBLE)
1111 1.1 christos {
1112 1.1 christos /* Start a new piece. */
1113 1.1 christos last_p = p;
1114 1.1 christos last_column += piece_width;
1115 1.1 christos piece_width = 0;
1116 1.1 christos /* No line break for the moment, may be turned into
1117 1.1 christos UC_BREAK_POSSIBLE later, via last_p. */
1118 1.1 christos }
1119 1.1 christos
1120 1.1 christos *p = UC_BREAK_PROHIBITED;
1121 1.1 christos
1122 1.1 christos w = uc_width (uc, encoding);
1123 1.1 christos if (w >= 0) /* ignore control characters in the string */
1124 1.1 christos piece_width += w;
1125 1.1 christos }
1126 1.1 christos
1127 1.1 christos s += count;
1128 1.1 christos p += count;
1129 1.1 christos if (o != NULL)
1130 1.1 christos o += count;
1131 1.1 christos }
1132 1.1 christos
1133 1.1 christos /* The last atomic piece of text ends here. */
1134 1.1 christos if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1135 1.1 christos {
1136 1.1 christos /* Insert a line break. */
1137 1.1 christos *last_p = UC_BREAK_POSSIBLE;
1138 1.1 christos last_column = 0;
1139 1.1 christos }
1140 1.1 christos
1141 1.1 christos return last_column + piece_width;
1142 1.1 christos }
1143 1.1 christos
1144 1.1 christos int
1145 1.1 christos u32_width_linebreaks (const unsigned int *s, size_t n,
1146 1.1 christos int width, int start_column, int at_end_columns,
1147 1.1 christos const char *o, const char *encoding,
1148 1.1 christos char *p)
1149 1.1 christos {
1150 1.1 christos const unsigned int *s_end;
1151 1.1 christos char *last_p;
1152 1.1 christos int last_column;
1153 1.1 christos int piece_width;
1154 1.1 christos
1155 1.1 christos u32_possible_linebreaks (s, n, encoding, p);
1156 1.1 christos
1157 1.1 christos s_end = s + n;
1158 1.1 christos last_p = NULL;
1159 1.1 christos last_column = start_column;
1160 1.1 christos piece_width = 0;
1161 1.1 christos while (s < s_end)
1162 1.1 christos {
1163 1.1 christos unsigned int uc = *s;
1164 1.1 christos
1165 1.1 christos /* Respect the override. */
1166 1.1 christos if (o != NULL && *o != UC_BREAK_UNDEFINED)
1167 1.1 christos *p = *o;
1168 1.1 christos
1169 1.1 christos if (*p == UC_BREAK_POSSIBLE || *p == UC_BREAK_MANDATORY)
1170 1.1 christos {
1171 1.1 christos /* An atomic piece of text ends here. */
1172 1.1 christos if (last_p != NULL && last_column + piece_width > width)
1173 1.1 christos {
1174 1.1 christos /* Insert a line break. */
1175 1.1 christos *last_p = UC_BREAK_POSSIBLE;
1176 1.1 christos last_column = 0;
1177 1.1 christos }
1178 1.1 christos }
1179 1.1 christos
1180 1.1 christos if (*p == UC_BREAK_MANDATORY)
1181 1.1 christos {
1182 1.1 christos /* uc is a line break character. */
1183 1.1 christos /* Start a new piece at column 0. */
1184 1.1 christos last_p = NULL;
1185 1.1 christos last_column = 0;
1186 1.1 christos piece_width = 0;
1187 1.1 christos }
1188 1.1 christos else
1189 1.1 christos {
1190 1.1 christos /* uc is not a line break character. */
1191 1.1 christos int w;
1192 1.1 christos
1193 1.1 christos if (*p == UC_BREAK_POSSIBLE)
1194 1.1 christos {
1195 1.1 christos /* Start a new piece. */
1196 1.1 christos last_p = p;
1197 1.1 christos last_column += piece_width;
1198 1.1 christos piece_width = 0;
1199 1.1 christos /* No line break for the moment, may be turned into
1200 1.1 christos UC_BREAK_POSSIBLE later, via last_p. */
1201 1.1 christos }
1202 1.1 christos
1203 1.1 christos *p = UC_BREAK_PROHIBITED;
1204 1.1 christos
1205 1.1 christos w = uc_width (uc, encoding);
1206 1.1 christos if (w >= 0) /* ignore control characters in the string */
1207 1.1 christos piece_width += w;
1208 1.1 christos }
1209 1.1 christos
1210 1.1 christos s++;
1211 1.1 christos p++;
1212 1.1 christos if (o != NULL)
1213 1.1 christos o++;
1214 1.1 christos }
1215 1.1 christos
1216 1.1 christos /* The last atomic piece of text ends here. */
1217 1.1 christos if (last_p != NULL && last_column + piece_width + at_end_columns > width)
1218 1.1 christos {
1219 1.1 christos /* Insert a line break. */
1220 1.1 christos *last_p = UC_BREAK_POSSIBLE;
1221 1.1 christos last_column = 0;
1222 1.1 christos }
1223 1.1 christos
1224 1.1 christos return last_column + piece_width;
1225 1.1 christos }
1226 1.1 christos
1227 1.1 christos #endif
1228 1.1 christos
1229 1.1 christos
1230 1.1 christos #ifdef TEST1
1231 1.1 christos
1232 1.1 christos #include <stdio.h>
1233 1.1 christos
/* Read the contents of an input stream, and return it, terminated with a NUL
   byte.  The result is allocated with realloc; the caller owns it and
   releases it with free.  On a read error or when memory runs out, a message
   is printed to stderr and the process exits with status 1, so the function
   never returns NULL.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;  /* bytes allocated for buf */
  size_t size = 0;   /* bytes of buf filled so far; always <= alloc */
  char *resized;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more full chunk.  */
      if (alloc - size < (size_t) BUFSIZE)
        {
          size_t wanted;
          size_t grown;

          /* Refuse sizes that would wrap around in size_t.  */
          if (size > (size_t) -1 - (size_t) BUFSIZE - 1)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          wanted = size + BUFSIZE;
          /* Grow geometrically, but at least up to what is needed now.  */
          grown = alloc + alloc / 2;
          if (grown < wanted)
            grown = wanted;

          resized = realloc (buf, grown);
          if (resized == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = resized;
          alloc = grown;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      size += count;
      /* A short read is either end of file, which ends the loop through
         feof, or an error.  */
      if (count < (size_t) BUFSIZE && ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
    }

  /* Trim to the exact size, plus room for the terminating NUL.  */
  resized = realloc (buf, size + 1);
  if (resized == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  buf = resized;
  buf[size] = '\0';
  return buf;
#undef BUFSIZE
}
1281 1.1 christos
1282 1.1 christos int
1283 1.1 christos main (int argc, char * argv[])
1284 1.1 christos {
1285 1.1 christos if (argc == 1)
1286 1.1 christos {
1287 1.1 christos /* Display all the break opportunities in the input string. */
1288 1.1 christos char *input = read_file (stdin);
1289 1.1 christos int length = strlen (input);
1290 1.1 christos char *breaks = malloc (length);
1291 1.1 christos int i;
1292 1.1 christos
1293 1.1 christos u8_possible_linebreaks ((unsigned char *) input, length, "UTF-8", breaks);
1294 1.1 christos
1295 1.1 christos for (i = 0; i < length; i++)
1296 1.1 christos {
1297 1.1 christos switch (breaks[i])
1298 1.1 christos {
1299 1.1 christos case UC_BREAK_POSSIBLE:
1300 1.1 christos /* U+2027 in UTF-8 encoding */
1301 1.1 christos putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout);
1302 1.1 christos break;
1303 1.1 christos case UC_BREAK_MANDATORY:
1304 1.1 christos /* U+21B2 (or U+21B5) in UTF-8 encoding */
1305 1.1 christos putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout);
1306 1.1 christos break;
1307 1.1 christos case UC_BREAK_PROHIBITED:
1308 1.1 christos break;
1309 1.1 christos default:
1310 1.1 christos abort ();
1311 1.1 christos }
1312 1.1 christos putc (input[i], stdout);
1313 1.1 christos }
1314 1.1 christos
1315 1.1 christos free (breaks);
1316 1.1 christos
1317 1.1 christos return 0;
1318 1.1 christos }
1319 1.1 christos else if (argc == 2)
1320 1.1 christos {
1321 1.1 christos /* Insert line breaks for a given width. */
1322 1.1 christos int width = atoi (argv[1]);
1323 1.1 christos char *input = read_file (stdin);
1324 1.1 christos int length = strlen (input);
1325 1.1 christos char *breaks = malloc (length);
1326 1.1 christos int i;
1327 1.1 christos
1328 1.1 christos u8_width_linebreaks ((unsigned char *) input, length, width, 0, 0, NULL, "UTF-8", breaks);
1329 1.1 christos
1330 1.1 christos for (i = 0; i < length; i++)
1331 1.1 christos {
1332 1.1 christos switch (breaks[i])
1333 1.1 christos {
1334 1.1 christos case UC_BREAK_POSSIBLE:
1335 1.1 christos putc ('\n', stdout);
1336 1.1 christos break;
1337 1.1 christos case UC_BREAK_MANDATORY:
1338 1.1 christos break;
1339 1.1 christos case UC_BREAK_PROHIBITED:
1340 1.1 christos break;
1341 1.1 christos default:
1342 1.1 christos abort ();
1343 1.1 christos }
1344 1.1 christos putc (input[i], stdout);
1345 1.1 christos }
1346 1.1 christos
1347 1.1 christos free (breaks);
1348 1.1 christos
1349 1.1 christos return 0;
1350 1.1 christos }
1351 1.1 christos else
1352 1.1 christos return 1;
1353 1.1 christos }
1354 1.1 christos
1355 1.1 christos #endif /* TEST1 */
1356 1.1 christos
1357 1.1 christos
1358 1.1 christos /* Now the same thing with an arbitrary encoding.
1359 1.1 christos
1360 1.1 christos We convert the input string to Unicode.
1361 1.1 christos
1362 1.1 christos The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
1363 1.1 christos UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
1364 1.1 christos \U0000FFFF. UTF-16 and variants support only characters up to
1365 1.1 christos \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
1366 1.1 christos UCS-4 specification leaves doubts about endianness and byte order mark.
1367 1.1 christos glibc currently interprets it as big endian without byte order mark,
1368 1.1 christos but this is not backed by an RFC. So we use UTF-8. It supports
1369 1.1 christos characters up to \U7FFFFFFF and is unambiguously defined. */
1370 1.1 christos
1371 1.1 christos #if HAVE_ICONV
1372 1.1 christos
1373 1.1 christos #include <iconv.h>
1374 1.1 christos #include <errno.h>
1375 1.1 christos
1376 1.1 christos /* Luckily, the encoding's name is platform independent. */
1377 1.1 christos #define UTF8_NAME "UTF-8"
1378 1.1 christos
1379 1.1 christos /* Return the length of a string after conversion through an iconv_t. */
1380 1.1 christos static size_t
1381 1.1 christos iconv_string_length (iconv_t cd, const char *s, size_t n)
1382 1.1 christos {
1383 1.1 christos #define TMPBUFSIZE 4096
1384 1.1 christos size_t count = 0;
1385 1.1 christos char tmpbuf[TMPBUFSIZE];
1386 1.1 christos const char *inptr = s;
1387 1.1 christos size_t insize = n;
1388 1.1 christos while (insize > 0)
1389 1.1 christos {
1390 1.1 christos char *outptr = tmpbuf;
1391 1.1 christos size_t outsize = TMPBUFSIZE;
1392 1.1 christos size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1393 1.1 christos if (res == (size_t)(-1) && errno != E2BIG)
1394 1.1 christos return (size_t)(-1);
1395 1.1 christos count += outptr - tmpbuf;
1396 1.1 christos }
1397 1.1 christos /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
1398 1.1 christos #if defined _LIBICONV_VERSION \
1399 1.1 christos || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1400 1.1 christos {
1401 1.1 christos char *outptr = tmpbuf;
1402 1.1 christos size_t outsize = TMPBUFSIZE;
1403 1.1 christos size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
1404 1.1 christos if (res == (size_t)(-1))
1405 1.1 christos return (size_t)(-1);
1406 1.1 christos count += outptr - tmpbuf;
1407 1.1 christos }
1408 1.1 christos /* Return to the initial state. */
1409 1.1 christos iconv (cd, NULL, NULL, NULL, NULL);
1410 1.1 christos #endif
1411 1.1 christos return count;
1412 1.1 christos #undef TMPBUFSIZE
1413 1.1 christos }
1414 1.1 christos
1415 1.1 christos static void
1416 1.1 christos iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
1417 1.1 christos size_t *offtable, char *t, size_t m)
1418 1.1 christos {
1419 1.1 christos size_t i;
1420 1.1 christos const char *s_end;
1421 1.1 christos const char *inptr;
1422 1.1 christos char *outptr;
1423 1.1 christos size_t outsize;
1424 1.1 christos /* Avoid glibc-2.1 bug. */
1425 1.1 christos #if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
1426 1.1 christos const size_t extra = 1;
1427 1.1 christos #else
1428 1.1 christos const size_t extra = 0;
1429 1.1 christos #endif
1430 1.1 christos
1431 1.1 christos for (i = 0; i < n; i++)
1432 1.1 christos offtable[i] = (size_t)(-1);
1433 1.1 christos
1434 1.1 christos s_end = s + n;
1435 1.1 christos inptr = s;
1436 1.1 christos outptr = t;
1437 1.1 christos outsize = m + extra;
1438 1.1 christos while (inptr < s_end)
1439 1.1 christos {
1440 1.1 christos const char *saved_inptr;
1441 1.1 christos size_t insize;
1442 1.1 christos size_t res;
1443 1.1 christos
1444 1.1 christos offtable[inptr - s] = outptr - t;
1445 1.1 christos
1446 1.1 christos saved_inptr = inptr;
1447 1.1 christos res = (size_t)(-1);
1448 1.1 christos for (insize = 1; inptr + insize <= s_end; insize++)
1449 1.1 christos {
1450 1.1 christos res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
1451 1.1 christos if (!(res == (size_t)(-1) && errno == EINVAL))
1452 1.1 christos break;
1453 1.1 christos /* We expect that no input bytes have been consumed so far. */
1454 1.1 christos if (inptr != saved_inptr)
1455 1.1 christos abort ();
1456 1.1 christos }
1457 1.1 christos /* After we verified the convertibility and computed the translation's
1458 1.1 christos size m, there shouldn't be any conversion error here. */
1459 1.1 christos if (res == (size_t)(-1))
1460 1.1 christos abort ();
1461 1.1 christos }
1462 1.1 christos /* Avoid glibc-2.1 bug and Solaris 7 bug. */
1463 1.1 christos #if defined _LIBICONV_VERSION \
1464 1.1 christos || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
1465 1.1 christos if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
1466 1.1 christos abort ();
1467 1.1 christos #endif
1468 1.1 christos /* We should have produced exactly m output bytes. */
1469 1.1 christos if (outsize != extra)
1470 1.1 christos abort ();
1471 1.1 christos }
1472 1.1 christos
1473 1.1 christos #endif /* HAVE_ICONV */
1474 1.1 christos
1475 1.1 christos #if C_CTYPE_ASCII
1476 1.1 christos
/* Checks whether each of the N bytes starting at S is a printable or
   whitespace character according to c_isprint / c_isspace.
   Returns 1 if all of them are (an empty string trivially qualifies).
   Returns 0 at the first byte that is neither, which is what happens for
   strings in an 8-bit encoding or an ISO-2022 encoding.  */
static int
is_all_ascii (const char *s, size_t n)
{
  const char *limit = s + n;
  const char *cursor;

  for (cursor = s; cursor < limit; cursor++)
    {
      /* Classify the byte as an unsigned value, so that bytes >= 0x80 are
         never seen as negative numbers.  */
      unsigned char byte = (unsigned char) *cursor;

      if (c_isprint (byte))
        continue;
      if (c_isspace (byte))
        continue;
      return 0;
    }
  return 1;
}
1491 1.1 christos
1492 1.1 christos #endif /* C_CTYPE_ASCII */
1493 1.1 christos
1494 1.1 christos #if defined unused || defined TEST2
1495 1.1 christos
1496 1.1 christos void
1497 1.1 christos mbs_possible_linebreaks (const char *s, size_t n, const char *encoding,
1498 1.1 christos char *p)
1499 1.1 christos {
1500 1.1 christos if (n == 0)
1501 1.1 christos return;
1502 1.1 christos if (is_utf8_encoding (encoding))
1503 1.1 christos u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1504 1.1 christos else
1505 1.1 christos {
1506 1.1 christos #if HAVE_ICONV
1507 1.1 christos iconv_t to_utf8;
1508 1.1 christos /* Avoid glibc-2.1 bug with EUC-KR. */
1509 1.1 christos # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1510 1.1 christos if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1511 1.1 christos to_utf8 = (iconv_t)(-1);
1512 1.1 christos else
1513 1.1 christos # endif
1514 1.1 christos /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1515 1.1 christos GB18030. */
1516 1.1 christos # if defined __sun && !defined _LIBICONV_VERSION
1517 1.1 christos if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1518 1.1 christos || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1519 1.1 christos || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1520 1.1 christos || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1521 1.1 christos || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1522 1.1 christos || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1523 1.1 christos to_utf8 = (iconv_t)(-1);
1524 1.1 christos else
1525 1.1 christos # endif
1526 1.1 christos to_utf8 = iconv_open (UTF8_NAME, encoding);
1527 1.1 christos if (to_utf8 != (iconv_t)(-1))
1528 1.1 christos {
1529 1.1 christos /* Determine the length of the resulting UTF-8 string. */
1530 1.1 christos size_t m = iconv_string_length (to_utf8, s, n);
1531 1.1 christos if (m != (size_t)(-1))
1532 1.1 christos {
1533 1.1 christos /* Convert the string to UTF-8 and build a translation table
1534 1.1 christos from offsets into s to offsets into the translated string. */
1535 1.1 christos size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
1536 1.1 christos char *memory =
1537 1.1 christos (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1538 1.1 christos if (memory != NULL)
1539 1.1 christos {
1540 1.1 christos size_t *offtable = (size_t *) memory;
1541 1.1 christos char *t = (char *) (offtable + n);
1542 1.1 christos char *q = (char *) (t + m);
1543 1.1 christos size_t i;
1544 1.1 christos
1545 1.1 christos iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1546 1.1 christos
1547 1.1 christos /* Determine the possible line breaks of the UTF-8 string. */
1548 1.1 christos u8_possible_linebreaks ((const unsigned char *) t, m, encoding, q);
1549 1.1 christos
1550 1.1 christos /* Translate the result back to the original string. */
1551 1.1 christos memset (p, UC_BREAK_PROHIBITED, n);
1552 1.1 christos for (i = 0; i < n; i++)
1553 1.1 christos if (offtable[i] != (size_t)(-1))
1554 1.1 christos p[i] = q[offtable[i]];
1555 1.1 christos
1556 1.1 christos free (memory);
1557 1.1 christos iconv_close (to_utf8);
1558 1.1 christos return;
1559 1.1 christos }
1560 1.1 christos }
1561 1.1 christos iconv_close (to_utf8);
1562 1.1 christos }
1563 1.1 christos #endif
1564 1.1 christos /* Impossible to convert. */
1565 1.1 christos #if C_CTYPE_ASCII
1566 1.1 christos if (is_all_ascii (s, n))
1567 1.1 christos {
1568 1.1 christos /* ASCII is a subset of UTF-8. */
1569 1.1 christos u8_possible_linebreaks ((const unsigned char *) s, n, encoding, p);
1570 1.1 christos return;
1571 1.1 christos }
1572 1.1 christos #endif
1573 1.1 christos /* We have a non-ASCII string and cannot convert it.
1574 1.1 christos Don't produce line breaks except those already present in the
1575 1.1 christos input string. All we assume here is that the encoding is
1576 1.1 christos minimally ASCII compatible. */
1577 1.1 christos {
1578 1.1 christos const char *s_end = s + n;
1579 1.1 christos while (s < s_end)
1580 1.1 christos {
1581 1.1 christos *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
1582 1.1 christos s++;
1583 1.1 christos p++;
1584 1.1 christos }
1585 1.1 christos }
1586 1.1 christos }
1587 1.1 christos }
1588 1.1 christos
1589 1.1 christos #endif
1590 1.1 christos
1591 1.1 christos int
1592 1.1 christos mbs_width_linebreaks (const char *s, size_t n,
1593 1.1 christos int width, int start_column, int at_end_columns,
1594 1.1 christos const char *o, const char *encoding,
1595 1.1 christos char *p)
1596 1.1 christos {
1597 1.1 christos if (n == 0)
1598 1.1 christos return start_column;
1599 1.1 christos if (is_utf8_encoding (encoding))
1600 1.1 christos return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1601 1.1 christos else
1602 1.1 christos {
1603 1.1 christos #if HAVE_ICONV
1604 1.1 christos iconv_t to_utf8;
1605 1.1 christos /* Avoid glibc-2.1 bug with EUC-KR. */
1606 1.1 christos # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
1607 1.1 christos if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
1608 1.1 christos to_utf8 = (iconv_t)(-1);
1609 1.1 christos else
1610 1.1 christos # endif
1611 1.1 christos /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
1612 1.1 christos GB18030. */
1613 1.1 christos # if defined __sun && !defined _LIBICONV_VERSION
1614 1.1 christos if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
1615 1.1 christos || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
1616 1.1 christos || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
1617 1.1 christos || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
1618 1.1 christos || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
1619 1.1 christos || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
1620 1.1 christos to_utf8 = (iconv_t)(-1);
1621 1.1 christos else
1622 1.1 christos # endif
1623 1.1 christos to_utf8 = iconv_open (UTF8_NAME, encoding);
1624 1.1 christos if (to_utf8 != (iconv_t)(-1))
1625 1.1 christos {
1626 1.1 christos /* Determine the length of the resulting UTF-8 string. */
1627 1.1 christos size_t m = iconv_string_length (to_utf8, s, n);
1628 1.1 christos if (m != (size_t)(-1))
1629 1.1 christos {
1630 1.1 christos /* Convert the string to UTF-8 and build a translation table
1631 1.1 christos from offsets into s to offsets into the translated string. */
1632 1.1 christos size_t memory_size =
1633 1.1 christos xsum4 (xtimes (n, sizeof (size_t)), m, m,
1634 1.1 christos (o != NULL ? m : 0));
1635 1.1 christos char *memory =
1636 1.1 christos (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
1637 1.1 christos if (memory != NULL)
1638 1.1 christos {
1639 1.1 christos size_t *offtable = (size_t *) memory;
1640 1.1 christos char *t = (char *) (offtable + n);
1641 1.1 christos char *q = (char *) (t + m);
1642 1.1 christos char *o8 = (o != NULL ? (char *) (q + m) : NULL);
1643 1.1 christos int res_column;
1644 1.1 christos size_t i;
1645 1.1 christos
1646 1.1 christos iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
1647 1.1 christos
1648 1.1 christos /* Translate the overrides to the UTF-8 string. */
1649 1.1 christos if (o != NULL)
1650 1.1 christos {
1651 1.1 christos memset (o8, UC_BREAK_UNDEFINED, m);
1652 1.1 christos for (i = 0; i < n; i++)
1653 1.1 christos if (offtable[i] != (size_t)(-1))
1654 1.1 christos o8[offtable[i]] = o[i];
1655 1.1 christos }
1656 1.1 christos
1657 1.1 christos /* Determine the line breaks of the UTF-8 string. */
1658 1.1 christos res_column =
1659 1.1 christos u8_width_linebreaks ((const unsigned char *) t, m, width, start_column, at_end_columns, o8, encoding, q);
1660 1.1 christos
1661 1.1 christos /* Translate the result back to the original string. */
1662 1.1 christos memset (p, UC_BREAK_PROHIBITED, n);
1663 1.1 christos for (i = 0; i < n; i++)
1664 1.1 christos if (offtable[i] != (size_t)(-1))
1665 1.1 christos p[i] = q[offtable[i]];
1666 1.1 christos
1667 1.1 christos free (memory);
1668 1.1 christos iconv_close (to_utf8);
1669 1.1 christos return res_column;
1670 1.1 christos }
1671 1.1 christos }
1672 1.1 christos iconv_close (to_utf8);
1673 1.1 christos }
1674 1.1 christos #endif
1675 1.1 christos /* Impossible to convert. */
1676 1.1 christos #if C_CTYPE_ASCII
1677 1.1 christos if (is_all_ascii (s, n))
1678 1.1 christos {
1679 1.1 christos /* ASCII is a subset of UTF-8. */
1680 1.1 christos return u8_width_linebreaks ((const unsigned char *) s, n, width, start_column, at_end_columns, o, encoding, p);
1681 1.1 christos }
1682 1.1 christos #endif
1683 1.1 christos /* We have a non-ASCII string and cannot convert it.
1684 1.1 christos Don't produce line breaks except those already present in the
1685 1.1 christos input string. All we assume here is that the encoding is
1686 1.1 christos minimally ASCII compatible. */
1687 1.1 christos {
1688 1.1 christos const char *s_end = s + n;
1689 1.1 christos while (s < s_end)
1690 1.1 christos {
1691 1.1 christos *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
1692 1.1 christos ? UC_BREAK_MANDATORY
1693 1.1 christos : UC_BREAK_PROHIBITED);
1694 1.1 christos s++;
1695 1.1 christos p++;
1696 1.1 christos if (o != NULL)
1697 1.1 christos o++;
1698 1.1 christos }
1699 1.1 christos /* We cannot compute widths in this case. */
1700 1.1 christos return start_column;
1701 1.1 christos }
1702 1.1 christos }
1703 1.1 christos }
1704 1.1 christos
1705 1.1 christos
1706 1.1 christos #ifdef TEST2
1707 1.1 christos
1708 1.1 christos #include <stdio.h>
1709 1.1 christos #include <locale.h>
1710 1.1 christos
/* Read the whole contents of STREAM and return it in a freshly allocated
   buffer, terminated with a NUL byte.  The caller owns the buffer and
   releases it with free().
   On a read error, or when memory cannot be obtained, a message is written
   to stderr and the process exits with status 1.  */
char *
read_file (FILE *stream)
{
#define BUFSIZE 4096
  char *buf = NULL;
  size_t alloc = 0;   /* bytes allocated at BUF; always >= size */
  size_t size = 0;    /* bytes of BUF filled so far */
  char *result;

  while (! feof (stream))
    {
      size_t count;

      /* Make sure there is room for one more full chunk.  */
      if (alloc - size < BUFSIZE)
        {
          size_t wanted;
          char *grown;

          /* Refuse sizes whose computation would wrap around; one byte is
             kept in reserve for the terminating NUL.  */
          if (size > (size_t)(-1) - BUFSIZE - 1)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          wanted = size + BUFSIZE;
          /* Grow by 50% when that is representable and larger.  */
          if (alloc <= (size_t)(-1) - alloc / 2
              && alloc + alloc / 2 > wanted)
            wanted = alloc + alloc / 2;

          grown = realloc (buf, wanted);
          if (grown == NULL)
            {
              fprintf (stderr, "out of memory\n");
              exit (1);
            }
          buf = grown;
          alloc = wanted;
        }

      count = fread (buf + size, 1, BUFSIZE, stream);
      /* A short read is either end of file (the loop condition handles it)
         or an error, which must not be mistaken for end of input.  */
      if (ferror (stream))
        {
          perror ("fread");
          exit (1);
        }
      size += count;
    }

  /* Trim the buffer to the exact size, plus the terminating NUL.  */
  result = realloc (buf, size + 1);
  if (result == NULL)
    {
      fprintf (stderr, "out of memory\n");
      exit (1);
    }
  result[size] = '\0';
  return result;
#undef BUFSIZE
}
1758 1.1 christos
1759 1.1 christos int
1760 1.1 christos main (int argc, char * argv[])
1761 1.1 christos {
1762 1.1 christos setlocale (LC_CTYPE, "");
1763 1.1 christos if (argc == 1)
1764 1.1 christos {
1765 1.1 christos /* Display all the break opportunities in the input string. */
1766 1.1 christos char *input = read_file (stdin);
1767 1.1 christos int length = strlen (input);
1768 1.1 christos char *breaks = malloc (length);
1769 1.1 christos int i;
1770 1.1 christos
1771 1.1 christos mbs_possible_linebreaks (input, length, locale_charset (), breaks);
1772 1.1 christos
1773 1.1 christos for (i = 0; i < length; i++)
1774 1.1 christos {
1775 1.1 christos switch (breaks[i])
1776 1.1 christos {
1777 1.1 christos case UC_BREAK_POSSIBLE:
1778 1.1 christos putc ('|', stdout);
1779 1.1 christos break;
1780 1.1 christos case UC_BREAK_MANDATORY:
1781 1.1 christos break;
1782 1.1 christos case UC_BREAK_PROHIBITED:
1783 1.1 christos break;
1784 1.1 christos default:
1785 1.1 christos abort ();
1786 1.1 christos }
1787 1.1 christos putc (input[i], stdout);
1788 1.1 christos }
1789 1.1 christos
1790 1.1 christos free (breaks);
1791 1.1 christos
1792 1.1 christos return 0;
1793 1.1 christos }
1794 1.1 christos else if (argc == 2)
1795 1.1 christos {
1796 1.1 christos /* Insert line breaks for a given width. */
1797 1.1 christos int width = atoi (argv[1]);
1798 1.1 christos char *input = read_file (stdin);
1799 1.1 christos int length = strlen (input);
1800 1.1 christos char *breaks = malloc (length);
1801 1.1 christos int i;
1802 1.1 christos
1803 1.1 christos mbs_width_linebreaks (input, length, width, 0, 0, NULL, locale_charset (), breaks);
1804 1.1 christos
1805 1.1 christos for (i = 0; i < length; i++)
1806 1.1 christos {
1807 1.1 christos switch (breaks[i])
1808 1.1 christos {
1809 1.1 christos case UC_BREAK_POSSIBLE:
1810 1.1 christos putc ('\n', stdout);
1811 1.1 christos break;
1812 1.1 christos case UC_BREAK_MANDATORY:
1813 1.1 christos break;
1814 1.1 christos case UC_BREAK_PROHIBITED:
1815 1.1 christos break;
1816 1.1 christos default:
1817 1.1 christos abort ();
1818 1.1 christos }
1819 1.1 christos putc (input[i], stdout);
1820 1.1 christos }
1821 1.1 christos
1822 1.1 christos free (breaks);
1823 1.1 christos
1824 1.1 christos return 0;
1825 1.1 christos }
1826 1.1 christos else
1827 1.1 christos return 1;
1828 1.1 christos }
1829 1.1 christos
1830 1.1 christos #endif /* TEST2 */
1831