Home | History | Annotate | Line # | Download | only in libuniname
      1  1.1  christos #!/usr/local/bin/clisp -C
      2  1.1  christos 
      3  1.1  christos ;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table.
      4  1.1  christos ;;; Bruno Haible 2000-12-28
      5  1.1  christos 
      6  1.1  christos (defparameter add-comments nil)
      7  1.1  christos 
      8  1.1  christos (defstruct unicode-char
      9  1.1  christos   (code nil :type integer)
     10  1.1  christos   (name nil :type string)
     11  1.1  christos   word-indices
     12  1.1  christos   word-indices-index
     13  1.1  christos )
     14  1.1  christos 
     15  1.1  christos (defstruct word-list
     16  1.1  christos   (hashed nil :type hash-table)
     17  1.1  christos   (sorted nil :type list)
     18  1.1  christos   size                          ; number of characters total
     19  1.1  christos   length                        ; number of words
     20  1.1  christos )
     21  1.1  christos 
     22  1.1  christos (defun main (inputfile outputfile)
     23  1.1  christos   (declare (type string inputfile outputfile))
     24  1.1  christos   #+UNICODE (setq *default-file-encoding* charset:utf-8)
     25  1.1  christos   (let ((all-chars '()))
     26  1.1  christos     ;; Read all characters and names from the input file.
     27  1.1  christos     (with-open-file (istream inputfile :direction :input)
     28  1.1  christos       (loop
     29  1.1  christos         (let ((line (read-line istream nil nil)))
     30  1.1  christos           (unless line (return))
     31  1.1  christos           (let* ((i1 (position #\; line))
     32  1.1  christos                  (i2 (position #\; line :start (1+ i1)))
     33  1.1  christos                  (code-string (subseq line 0 i1))
     34  1.1  christos                  (code (parse-integer code-string :radix 16))
     35  1.1  christos                  (name-string (subseq line (1+ i1) i2)))
     36  1.1  christos             ; Ignore characters whose name starts with "<".
     37  1.1  christos             (unless (eql (char name-string 0) #\<)
     38  1.1  christos               ; Also ignore Hangul syllables; they are treated specially.
     39  1.1  christos               (unless (<= #xAC00 code #xD7A3)
     40  1.1  christos                 ; Also ignore CJK compatibility ideographs; they are treated
     41  1.1  christos                 ; specially as well.
     42  1.1  christos                 (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
     43  1.1  christos                             (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
     44  1.1  christos                   ; Transform the code so that it fits in 16 bits. In
     45  1.1  christos                   ; Unicode 3.1 the following ranges are used.
     46  1.1  christos                   ;   0x00000..0x04DFF  >>12=  0x00..0x04  -> 0x0..0x4
     47  1.1  christos                   ;   0x0A000..0x0A4FF  >>12=  0x0A        -> 0x5
     48  1.1  christos                   ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x6
     49  1.1  christos                   ;   0x10300..0x104FF  >>12=  0x10        -> 0x7
     50  1.1  christos                   ;   0x1D000..0x1D7DD  >>12=  0x1D        -> 0x8
     51  1.1  christos                   ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0x9
     52  1.1  christos                   ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0xA
     53  1.1  christos                   (flet ((transform (x)
     54  1.1  christos                            (dpb
     55  1.1  christos                              (case (ash x -12)
     56  1.1  christos                                ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
     57  1.1  christos                                (#x0A 5)
     58  1.1  christos                                (#x0F 6)
     59  1.1  christos                                (#x10 7)
     60  1.1  christos                                (#x1D 8)
     61  1.1  christos                                (#x2F 9)
     62  1.1  christos                                (#xE0 #xA)
     63  1.1  christos                                (t (error "Update the transform function for 0x~5,'0X" x))
     64  1.1  christos                              )
     65  1.1  christos                              (byte 8 12)
     66  1.1  christos                              x
     67  1.1  christos                         )) )
     68  1.1  christos                     (push (make-unicode-char :code (transform code)
     69  1.1  christos                                              :name name-string)
     70  1.1  christos                           all-chars
     71  1.1  christos             ) ) ) ) )
     72  1.1  christos     ) ) ) )
     73  1.1  christos     (setq all-chars (nreverse all-chars))
     74  1.1  christos     ;; Split into words.
     75  1.1  christos     (let ((words-by-length (make-array 0 :adjustable t)))
     76  1.1  christos       (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
     77  1.1  christos         (let ((i1 0))
     78  1.1  christos           (loop
     79  1.1  christos             (when (>= i1 (length name)) (return))
     80  1.1  christos             (let ((i2 (or (position #\Space name :start i1) (length name))))
     81  1.1  christos               (let* ((word (subseq name i1 i2))
     82  1.1  christos                      (len (length word)))
     83  1.1  christos                 (when (>= len (length words-by-length))
     84  1.1  christos                   (adjust-array words-by-length (1+ len))
     85  1.1  christos                 )
     86  1.1  christos                 (unless (aref words-by-length len)
     87  1.1  christos                   (setf (aref words-by-length len)
     88  1.1  christos                         (make-word-list
     89  1.1  christos                           :hashed (make-hash-table :test #'equal)
     90  1.1  christos                           :sorted '()
     91  1.1  christos                 ) )     )
     92  1.1  christos                 (let ((word-list (aref words-by-length len)))
     93  1.1  christos                   (unless (gethash word (word-list-hashed word-list))
     94  1.1  christos                     (setf (gethash word (word-list-hashed word-list)) t)
     95  1.1  christos                     (push word (word-list-sorted word-list))
     96  1.1  christos                 ) )
     97  1.1  christos               )
     98  1.1  christos               (setq i1 (1+ i2))
     99  1.1  christos       ) ) ) )
    100  1.1  christos       ;; Sort the word lists.
    101  1.1  christos       (dotimes (len (length words-by-length))
    102  1.1  christos         (unless (aref words-by-length len)
    103  1.1  christos           (setf (aref words-by-length len)
    104  1.1  christos                 (make-word-list
    105  1.1  christos                   :hashed (make-hash-table :test #'equal)
    106  1.1  christos                   :sorted '()
    107  1.1  christos         ) )     )
    108  1.1  christos         (let ((word-list (aref words-by-length len)))
    109  1.1  christos           (setf (word-list-sorted word-list)
    110  1.1  christos                 (sort (word-list-sorted word-list) #'string<)
    111  1.1  christos           )
    112  1.1  christos           (setf (word-list-size word-list)
    113  1.1  christos                 (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
    114  1.1  christos           )
    115  1.1  christos           (setf (word-list-length word-list)
    116  1.1  christos                 (length (word-list-sorted word-list))
    117  1.1  christos       ) ) )
    118  1.1  christos       ;; Output the tables.
    119  1.1  christos       (with-open-file (ostream outputfile :direction :output
    120  1.1  christos                        #+UNICODE :external-format #+UNICODE charset:ascii)
    121  1.1  christos         (format ostream "/*~%")
    122  1.1  christos         (format ostream " * ~A~%" (file-namestring outputfile))
    123  1.1  christos         (format ostream " *~%")
    124  1.1  christos         (format ostream " * Unicode character name table.~%")
    125  1.1  christos         (format ostream " * Generated automatically by the gen-uninames utility.~%")
    126  1.1  christos         (format ostream " */~%")
    127  1.1  christos         (format ostream "~%")
    128  1.1  christos         (format ostream "static const char unicode_name_words[~D] = {~%"
    129  1.1  christos                         (let ((sum 0))
    130  1.1  christos                           (dotimes (len (length words-by-length))
    131  1.1  christos                             (let ((word-list (aref words-by-length len)))
    132  1.1  christos                               (incf sum (word-list-size word-list))
    133  1.1  christos                           ) )
    134  1.1  christos                           sum
    135  1.1  christos         )               )
    136  1.1  christos         (dotimes (len (length words-by-length))
    137  1.1  christos           (let ((word-list (aref words-by-length len)))
    138  1.1  christos             (dolist (word (word-list-sorted word-list))
    139  1.1  christos               (format ostream " ~{ '~C',~}~%" (coerce word 'list))
    140  1.1  christos         ) ) )
    141  1.1  christos         (format ostream "};~%")
    142  1.1  christos         (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
    143  1.1  christos                         (let ((sum 0))
    144  1.1  christos                           (dotimes (len (length words-by-length))
    145  1.1  christos                             (let ((word-list (aref words-by-length len)))
    146  1.1  christos                               (incf sum (word-list-length word-list))
    147  1.1  christos                           ) )
    148  1.1  christos                           sum
    149  1.1  christos         )               )
    150  1.1  christos         #| ; Redundant data
    151  1.1  christos         (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
    152  1.1  christos                         (let ((sum 0))
    153  1.1  christos                           (dotimes (len (length words-by-length))
    154  1.1  christos                             (let ((word-list (aref words-by-length len)))
    155  1.1  christos                               (incf sum (word-list-length word-list))
    156  1.1  christos                           ) )
    157  1.1  christos                           sum
    158  1.1  christos         )               )
    159  1.1  christos         (dotimes (len (length words-by-length))
    160  1.1  christos           (let ((word-list (aref words-by-length len)))
    161  1.1  christos             (when (word-list-sorted word-list)
    162  1.1  christos               (format ostream " ")
    163  1.1  christos               (do ((l (word-list-sorted word-list) (cdr l))
    164  1.1  christos                    (offset 0 (+ offset (length (car l)))))
    165  1.1  christos                   ((endp l))
    166  1.1  christos                 (format ostream "~<~% ~0,79:; ~D,~>" offset)
    167  1.1  christos               )
    168  1.1  christos               (format ostream "~%")
    169  1.1  christos         ) ) )
    170  1.1  christos         (format ostream "};~%")
    171  1.1  christos         |#
    172  1.1  christos         (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
    173  1.1  christos                         (1+ (length words-by-length))
    174  1.1  christos         )
    175  1.1  christos         (let ((extra-offset 0)
    176  1.1  christos               (ind-offset 0))
    177  1.1  christos           (dotimes (len (length words-by-length))
    178  1.1  christos             (let ((word-list (aref words-by-length len)))
    179  1.1  christos               (format ostream "  { ~D, ~D },~%" extra-offset ind-offset)
    180  1.1  christos               (incf extra-offset (word-list-size word-list))
    181  1.1  christos               (incf ind-offset (word-list-length word-list))
    182  1.1  christos           ) )
    183  1.1  christos           (format ostream "  { ~D, ~D }~%" extra-offset ind-offset)
    184  1.1  christos         )
    185  1.1  christos         (format ostream "};~%")
    186  1.1  christos         (let ((ind-offset 0))
    187  1.1  christos           (dotimes (len (length words-by-length))
    188  1.1  christos             (let ((word-list (aref words-by-length len)))
    189  1.1  christos               (dolist (word (word-list-sorted word-list))
    190  1.1  christos                 (setf (gethash word (word-list-hashed word-list)) ind-offset)
    191  1.1  christos                 (incf ind-offset)
    192  1.1  christos         ) ) ) )
    193  1.1  christos         (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
    194  1.1  christos           (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
    195  1.1  christos                           (gethash word (word-list-hashed (aref words-by-length (length word))))
    196  1.1  christos         ) )
    197  1.1  christos         ;; Compute the word-indices for every unicode-char.
    198  1.1  christos         (dolist (uc all-chars)
    199  1.1  christos           (let ((name (unicode-char-name uc))
    200  1.1  christos                 (indices '()))
    201  1.1  christos             (let ((i1 0))
    202  1.1  christos               (loop
    203  1.1  christos                 (when (>= i1 (length name)) (return))
    204  1.1  christos                 (let ((i2 (or (position #\Space name :start i1) (length name))))
    205  1.1  christos                   (let* ((word (subseq name i1 i2))
    206  1.1  christos                          (len (length word)))
    207  1.1  christos                     (push (gethash word (word-list-hashed (aref words-by-length len)))
    208  1.1  christos                           indices
    209  1.1  christos                     )
    210  1.1  christos                   )
    211  1.1  christos                   (setq i1 (1+ i2))
    212  1.1  christos             ) ) )
    213  1.1  christos             (setf (unicode-char-word-indices uc)
    214  1.1  christos                   (coerce (nreverse indices) 'vector)
    215  1.1  christos             )
    216  1.1  christos         ) )
    217  1.1  christos         ;; Sort the list of unicode-chars by word-indices.
    218  1.1  christos         (setq all-chars
    219  1.1  christos               (sort all-chars
    220  1.1  christos                     (lambda (vec1 vec2)
    221  1.1  christos                       (let ((len1 (length vec1))
    222  1.1  christos                             (len2 (length vec2)))
    223  1.1  christos                         (do ((i 0 (1+ i)))
    224  1.1  christos                             (nil)
    225  1.1  christos                           (if (< i len2)
    226  1.1  christos                             (if (< i len1)
    227  1.1  christos                               (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
    228  1.1  christos                                     ((> (aref vec1 i) (aref vec2 i)) (return nil))
    229  1.1  christos                               )
    230  1.1  christos                               (return t)
    231  1.1  christos                             )
    232  1.1  christos                             (return nil)
    233  1.1  christos                     ) ) ) )
    234  1.1  christos                     :key #'unicode-char-word-indices
    235  1.1  christos         )     )
    236  1.1  christos         ;; Output the word-indices.
    237  1.1  christos         (format ostream "static const uint16_t unicode_names[~D] = {~%"
    238  1.1  christos                         (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
    239  1.1  christos         )
    240  1.1  christos         (let ((i 0))
    241  1.1  christos           (dolist (uc all-chars)
    242  1.1  christos             (format ostream " ~{ ~D,~}"
    243  1.1  christos                             (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
    244  1.1  christos                                      (coerce (unicode-char-word-indices uc) 'list)
    245  1.1  christos                             )
    246  1.1  christos             )
    247  1.1  christos             (when add-comments
    248  1.1  christos               (format ostream "~40T/* ~A */" (unicode-char-name uc))
    249  1.1  christos             )
    250  1.1  christos             (format ostream "~%")
    251  1.1  christos             (setf (unicode-char-word-indices-index uc) i)
    252  1.1  christos             (incf i (length (unicode-char-word-indices uc)))
    253  1.1  christos         ) )
    254  1.1  christos         (format ostream "};~%")
    255  1.1  christos         (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%"
    256  1.1  christos                         (length all-chars)
    257  1.1  christos         )
    258  1.1  christos         (dolist (uc all-chars)
    259  1.1  christos           (format ostream "  { 0x~4,'0X, ~D },"
    260  1.1  christos                           (unicode-char-code uc)
    261  1.1  christos                           (unicode-char-word-indices-index uc)
    262  1.1  christos           )
    263  1.1  christos           (when add-comments
    264  1.1  christos             (format ostream "~21T/* ~A */" (unicode-char-name uc))
    265  1.1  christos           )
    266  1.1  christos           (format ostream "~%")
    267  1.1  christos         )
    268  1.1  christos         (format ostream "};~%")
    269  1.1  christos         (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%"
    270  1.1  christos                         (length all-chars)
    271  1.1  christos         )
    272  1.1  christos         (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
    273  1.1  christos           (format ostream "  { 0x~4,'0X, ~D },"
    274  1.1  christos                           (unicode-char-code uc)
    275  1.1  christos                           (unicode-char-word-indices-index uc)
    276  1.1  christos           )
    277  1.1  christos           (when add-comments
    278  1.1  christos             (format ostream "~21T/* ~A */" (unicode-char-name uc))
    279  1.1  christos           )
    280  1.1  christos           (format ostream "~%")
    281  1.1  christos         )
    282  1.1  christos         (format ostream "};~%")
    283  1.1  christos         (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
    284  1.1  christos                         (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
    285  1.1  christos         )
    286  1.1  christos         (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
    287  1.1  christos                         (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
    288  1.1  christos         )
    289  1.1  christos       )
    290  1.1  christos ) ) )
    291  1.1  christos 
    292  1.1  christos (main (first *args*) (second *args*))
    293