1 1.1 christos #!/usr/local/bin/clisp -C 2 1.1 christos 3 1.1 christos ;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table. 4 1.1 christos ;;; Bruno Haible 2000-12-28 5 1.1 christos 6 1.1 christos (defparameter add-comments nil) 7 1.1 christos 8 1.1 christos (defstruct unicode-char 9 1.1 christos (code nil :type integer) 10 1.1 christos (name nil :type string) 11 1.1 christos word-indices 12 1.1 christos word-indices-index 13 1.1 christos ) 14 1.1 christos 15 1.1 christos (defstruct word-list 16 1.1 christos (hashed nil :type hash-table) 17 1.1 christos (sorted nil :type list) 18 1.1 christos size ; number of characters total 19 1.1 christos length ; number of words 20 1.1 christos ) 21 1.1 christos 22 1.1 christos (defun main (inputfile outputfile) 23 1.1 christos (declare (type string inputfile outputfile)) 24 1.1 christos #+UNICODE (setq *default-file-encoding* charset:utf-8) 25 1.1 christos (let ((all-chars '())) 26 1.1 christos ;; Read all characters and names from the input file. 27 1.1 christos (with-open-file (istream inputfile :direction :input) 28 1.1 christos (loop 29 1.1 christos (let ((line (read-line istream nil nil))) 30 1.1 christos (unless line (return)) 31 1.1 christos (let* ((i1 (position #\; line)) 32 1.1 christos (i2 (position #\; line :start (1+ i1))) 33 1.1 christos (code-string (subseq line 0 i1)) 34 1.1 christos (code (parse-integer code-string :radix 16)) 35 1.1 christos (name-string (subseq line (1+ i1) i2))) 36 1.1 christos ; Ignore characters whose name starts with "<". 37 1.1 christos (unless (eql (char name-string 0) #\<) 38 1.1 christos ; Also ignore Hangul syllables; they are treated specially. 39 1.1 christos (unless (<= #xAC00 code #xD7A3) 40 1.1 christos ; Also ignore CJK compatibility ideographs; they are treated 41 1.1 christos ; specially as well. 42 1.1 christos (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A) 43 1.1 christos (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D)) 44 1.1 christos ; Transform the code so that it fits in 16 bits. In 45 1.1 christos ; Unicode 3.1 the following ranges are used. 46 1.1 christos ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4 47 1.1 christos ; 0x0A000..0x0A4FF >>12= 0x0A -> 0x5 48 1.1 christos ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6 49 1.1 christos ; 0x10300..0x104FF >>12= 0x10 -> 0x7 50 1.1 christos ; 0x1D000..0x1D7DD >>12= 0x1D -> 0x8 51 1.1 christos ; 0x2F800..0x2FAFF >>12= 0x2F -> 0x9 52 1.1 christos ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xA 53 1.1 christos (flet ((transform (x) 54 1.1 christos (dpb 55 1.1 christos (case (ash x -12) 56 1.1 christos ((#x00 #x01 #x02 #x03 #x04) (ash x -12)) 57 1.1 christos (#x0A 5) 58 1.1 christos (#x0F 6) 59 1.1 christos (#x10 7) 60 1.1 christos (#x1D 8) 61 1.1 christos (#x2F 9) 62 1.1 christos (#xE0 #xA) 63 1.1 christos (t (error "Update the transform function for 0x~5,'0X" x)) 64 1.1 christos ) 65 1.1 christos (byte 8 12) 66 1.1 christos x 67 1.1 christos )) ) 68 1.1 christos (push (make-unicode-char :code (transform code) 69 1.1 christos :name name-string) 70 1.1 christos all-chars 71 1.1 christos ) ) ) ) ) 72 1.1 christos ) ) ) ) 73 1.1 christos (setq all-chars (nreverse all-chars)) 74 1.1 christos ;; Split into words. 75 1.1 christos (let ((words-by-length (make-array 0 :adjustable t))) 76 1.1 christos (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars))) 77 1.1 christos (let ((i1 0)) 78 1.1 christos (loop 79 1.1 christos (when (>= i1 (length name)) (return)) 80 1.1 christos (let ((i2 (or (position #\Space name :start i1) (length name)))) 81 1.1 christos (let* ((word (subseq name i1 i2)) 82 1.1 christos (len (length word))) 83 1.1 christos (when (>= len (length words-by-length)) 84 1.1 christos (adjust-array words-by-length (1+ len)) 85 1.1 christos ) 86 1.1 christos (unless (aref words-by-length len) 87 1.1 christos (setf (aref words-by-length len) 88 1.1 christos (make-word-list 89 1.1 christos :hashed (make-hash-table :test #'equal) 90 1.1 christos :sorted '() 91 1.1 christos ) ) ) 92 1.1 christos (let ((word-list (aref words-by-length len))) 93 1.1 christos (unless (gethash word (word-list-hashed word-list)) 94 1.1 christos (setf (gethash word (word-list-hashed word-list)) t) 95 1.1 christos (push word (word-list-sorted word-list)) 96 1.1 christos ) ) 97 1.1 christos ) 98 1.1 christos (setq i1 (1+ i2)) 99 1.1 christos ) ) ) ) 100 1.1 christos ;; Sort the word lists. 101 1.1 christos (dotimes (len (length words-by-length)) 102 1.1 christos (unless (aref words-by-length len) 103 1.1 christos (setf (aref words-by-length len) 104 1.1 christos (make-word-list 105 1.1 christos :hashed (make-hash-table :test #'equal) 106 1.1 christos :sorted '() 107 1.1 christos ) ) ) 108 1.1 christos (let ((word-list (aref words-by-length len))) 109 1.1 christos (setf (word-list-sorted word-list) 110 1.1 christos (sort (word-list-sorted word-list) #'string<) 111 1.1 christos ) 112 1.1 christos (setf (word-list-size word-list) 113 1.1 christos (reduce #'+ (mapcar #'length (word-list-sorted word-list))) 114 1.1 christos ) 115 1.1 christos (setf (word-list-length word-list) 116 1.1 christos (length (word-list-sorted word-list)) 117 1.1 christos ) ) ) 118 1.1 christos ;; Output the tables. 119 1.1 christos (with-open-file (ostream outputfile :direction :output 120 1.1 christos #+UNICODE :external-format #+UNICODE charset:ascii) 121 1.1 christos (format ostream "/*~%") 122 1.1 christos (format ostream " * ~A~%" (file-namestring outputfile)) 123 1.1 christos (format ostream " *~%") 124 1.1 christos (format ostream " * Unicode character name table.~%") 125 1.1 christos (format ostream " * Generated automatically by the gen-uninames utility.~%") 126 1.1 christos (format ostream " */~%") 127 1.1 christos (format ostream "~%") 128 1.1 christos (format ostream "static const char unicode_name_words[~D] = {~%" 129 1.1 christos (let ((sum 0)) 130 1.1 christos (dotimes (len (length words-by-length)) 131 1.1 christos (let ((word-list (aref words-by-length len))) 132 1.1 christos (incf sum (word-list-size word-list)) 133 1.1 christos ) ) 134 1.1 christos sum 135 1.1 christos ) ) 136 1.1 christos (dotimes (len (length words-by-length)) 137 1.1 christos (let ((word-list (aref words-by-length len))) 138 1.1 christos (dolist (word (word-list-sorted word-list)) 139 1.1 christos (format ostream " ~{ '~C',~}~%" (coerce word 'list)) 140 1.1 christos ) ) ) 141 1.1 christos (format ostream "};~%") 142 1.1 christos (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%" 143 1.1 christos (let ((sum 0)) 144 1.1 christos (dotimes (len (length words-by-length)) 145 1.1 christos (let ((word-list (aref words-by-length len))) 146 1.1 christos (incf sum (word-list-length word-list)) 147 1.1 christos ) ) 148 1.1 christos sum 149 1.1 christos ) ) 150 1.1 christos #| ; Redundant data 151 1.1 christos (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%" 152 1.1 christos (let ((sum 0)) 153 1.1 christos (dotimes (len (length words-by-length)) 154 1.1 christos (let ((word-list (aref words-by-length len))) 155 1.1 christos (incf sum (word-list-length word-list)) 156 1.1 christos ) ) 157 1.1 christos sum 158 1.1 christos ) ) 159 1.1 christos (dotimes (len (length words-by-length)) 160 1.1 christos (let ((word-list (aref words-by-length len))) 161 1.1 christos (when (word-list-sorted word-list) 162 1.1 christos (format ostream " ") 163 1.1 christos (do ((l (word-list-sorted word-list) (cdr l)) 164 1.1 christos (offset 0 (+ offset (length (car l))))) 165 1.1 christos ((endp l)) 166 1.1 christos (format ostream "~<~% ~0,79:; ~D,~>" offset) 167 1.1 christos ) 168 1.1 christos (format ostream "~%") 169 1.1 christos ) ) ) 170 1.1 christos (format ostream "};~%") 171 1.1 christos |# 172 1.1 christos (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%" 173 1.1 christos (1+ (length words-by-length)) 174 1.1 christos ) 175 1.1 christos (let ((extra-offset 0) 176 1.1 christos (ind-offset 0)) 177 1.1 christos (dotimes (len (length words-by-length)) 178 1.1 christos (let ((word-list (aref words-by-length len))) 179 1.1 christos (format ostream " { ~D, ~D },~%" extra-offset ind-offset) 180 1.1 christos (incf extra-offset (word-list-size word-list)) 181 1.1 christos (incf ind-offset (word-list-length word-list)) 182 1.1 christos ) ) 183 1.1 christos (format ostream " { ~D, ~D }~%" extra-offset ind-offset) 184 1.1 christos ) 185 1.1 christos (format ostream "};~%") 186 1.1 christos (let ((ind-offset 0)) 187 1.1 christos (dotimes (len (length words-by-length)) 188 1.1 christos (let ((word-list (aref words-by-length len))) 189 1.1 christos (dolist (word (word-list-sorted word-list)) 190 1.1 christos (setf (gethash word (word-list-hashed word-list)) ind-offset) 191 1.1 christos (incf ind-offset) 192 1.1 christos ) ) ) ) 193 1.1 christos (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY")) 194 1.1 christos (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word 195 1.1 christos (gethash word (word-list-hashed (aref words-by-length (length word)))) 196 1.1 christos ) ) 197 1.1 christos ;; Compute the word-indices for every unicode-char. 198 1.1 christos (dolist (uc all-chars) 199 1.1 christos (let ((name (unicode-char-name uc)) 200 1.1 christos (indices '())) 201 1.1 christos (let ((i1 0)) 202 1.1 christos (loop 203 1.1 christos (when (>= i1 (length name)) (return)) 204 1.1 christos (let ((i2 (or (position #\Space name :start i1) (length name)))) 205 1.1 christos (let* ((word (subseq name i1 i2)) 206 1.1 christos (len (length word))) 207 1.1 christos (push (gethash word (word-list-hashed (aref words-by-length len))) 208 1.1 christos indices 209 1.1 christos ) 210 1.1 christos ) 211 1.1 christos (setq i1 (1+ i2)) 212 1.1 christos ) ) ) 213 1.1 christos (setf (unicode-char-word-indices uc) 214 1.1 christos (coerce (nreverse indices) 'vector) 215 1.1 christos ) 216 1.1 christos ) ) 217 1.1 christos ;; Sort the list of unicode-chars by word-indices. 218 1.1 christos (setq all-chars 219 1.1 christos (sort all-chars 220 1.1 christos (lambda (vec1 vec2) 221 1.1 christos (let ((len1 (length vec1)) 222 1.1 christos (len2 (length vec2))) 223 1.1 christos (do ((i 0 (1+ i))) 224 1.1 christos (nil) 225 1.1 christos (if (< i len2) 226 1.1 christos (if (< i len1) 227 1.1 christos (cond ((< (aref vec1 i) (aref vec2 i)) (return t)) 228 1.1 christos ((> (aref vec1 i) (aref vec2 i)) (return nil)) 229 1.1 christos ) 230 1.1 christos (return t) 231 1.1 christos ) 232 1.1 christos (return nil) 233 1.1 christos ) ) ) ) 234 1.1 christos :key #'unicode-char-word-indices 235 1.1 christos ) ) 236 1.1 christos ;; Output the word-indices. 237 1.1 christos (format ostream "static const uint16_t unicode_names[~D] = {~%" 238 1.1 christos (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) 239 1.1 christos ) 240 1.1 christos (let ((i 0)) 241 1.1 christos (dolist (uc all-chars) 242 1.1 christos (format ostream " ~{ ~D,~}" 243 1.1 christos (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0))) 244 1.1 christos (coerce (unicode-char-word-indices uc) 'list) 245 1.1 christos ) 246 1.1 christos ) 247 1.1 christos (when add-comments 248 1.1 christos (format ostream "~40T/* ~A */" (unicode-char-name uc)) 249 1.1 christos ) 250 1.1 christos (format ostream "~%") 251 1.1 christos (setf (unicode-char-word-indices-index uc) i) 252 1.1 christos (incf i (length (unicode-char-word-indices uc))) 253 1.1 christos ) ) 254 1.1 christos (format ostream "};~%") 255 1.1 christos (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%" 256 1.1 christos (length all-chars) 257 1.1 christos ) 258 1.1 christos (dolist (uc all-chars) 259 1.1 christos (format ostream " { 0x~4,'0X, ~D }," 260 1.1 christos (unicode-char-code uc) 261 1.1 christos (unicode-char-word-indices-index uc) 262 1.1 christos ) 263 1.1 christos (when add-comments 264 1.1 christos (format ostream "~21T/* ~A */" (unicode-char-name uc)) 265 1.1 christos ) 266 1.1 christos (format ostream "~%") 267 1.1 christos ) 268 1.1 christos (format ostream "};~%") 269 1.1 christos (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%" 270 1.1 christos (length all-chars) 271 1.1 christos ) 272 1.1 christos (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code)) 273 1.1 christos (format ostream " { 0x~4,'0X, ~D }," 274 1.1 christos (unicode-char-code uc) 275 1.1 christos (unicode-char-word-indices-index uc) 276 1.1 christos ) 277 1.1 christos (when add-comments 278 1.1 christos (format ostream "~21T/* ~A */" (unicode-char-name uc)) 279 1.1 christos ) 280 1.1 christos (format ostream "~%") 281 1.1 christos ) 282 1.1 christos (format ostream "};~%") 283 1.1 christos (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%" 284 1.1 christos (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars)) 285 1.1 christos ) 286 1.1 christos (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%" 287 1.1 christos (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars)) 288 1.1 christos ) 289 1.1 christos ) 290 1.1 christos ) ) ) 291 1.1 christos 292 1.1 christos (main (first *args*) (second *args*)) 293