convmap.pl revision f2e35a3a
1#!/usr/bin/perl -w
2# $XTermId: convmap.pl,v 1.15 2018/09/09 17:22:24 tom Exp $
3#
4# Generate keysym2ucs.c file
5#
6# See also:
7# http://mail.nl.linux.org/linux-utf8/2001-04/msg00248.html
8#
9# $XFree86: xc/programs/xterm/unicode/convmap.pl,v 1.5 2000/01/24 22:22:05 dawes Exp $
10
11use strict;
12
# Package-level state shared by the reading passes and the output pass below.
our (
    $keysym,                    # loop variable of the table-output loop
    %name,                      # Unicode code point  -> character name
    %keysym_to_ucs,             # X11 keysym value    -> Unicode code point
    %keysym_to_keysymname,      # X11 keysym value    -> keysym name
);
17
# Forward declaration, so that the prototype is already known where utf8()
# calls itself below.
sub utf8 ($);

# utf8(CODE)
#
# Encode the integer CODE as a UTF-8 sequence and return it as a string
# with one character (value 0..255) per encoded byte.
#
# The original 31-bit definition of UTF-8 is used, so values up to
# 0x7fffffff are encoded, using up to six bytes.  Lead bytes by length:
#
#   1 byte   0xxxxxxx            4 bytes  11110xxx (0xf0)
#   2 bytes  110xxxxx (0xc0)     5 bytes  111110xx (0xf8)
#   3 bytes  1110xxxx (0xe0)     6 bytes  1111110x (0xfc)
#
# Values outside 0..0x7fffffff cannot be encoded; for those the encoding
# of U+FFFD (REPLACEMENT CHARACTER) is returned instead.
sub utf8 ($) {
    my $c = shift(@_);

    # Negative values would otherwise fall into the one-byte branch.
    return utf8(0xfffd) if $c < 0;

    if ($c < 0x80) {
        return sprintf("%c", $c);
    } elsif ($c < 0x800) {
        return sprintf("%c%c", 0xc0 | ($c >> 6), 0x80 | ($c & 0x3f));
    } elsif ($c < 0x10000) {
        return sprintf("%c%c%c",
                       0xe0 |  ($c >> 12),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x200000) {
        return sprintf("%c%c%c%c",
                       0xf0 |  ($c >> 18),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x4000000) {
        return sprintf("%c%c%c%c%c",
                       0xf8 |  ($c >> 24),
                       0x80 | (($c >> 18) & 0x3f),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } elsif ($c < 0x80000000) {
        # The six-byte lead byte is 1111110x, i.e. 0xfc plus the top bit
        # (0xfe/0xff never occur in UTF-8).
        return sprintf("%c%c%c%c%c%c",
                       0xfc |  ($c >> 30),
                       0x80 | (($c >> 24) & 0x3f),
                       0x80 | (($c >> 18) & 0x3f),
                       0x80 | (($c >> 12) & 0x3f),
                       0x80 | (($c >>  6) & 0x3f),
                       0x80 | ( $c        & 0x3f));
    } else {
        return utf8(0xfffd);
    }
}
58
my $unicodedata = "UnicodeData.txt";

# Read the list of all Unicode names: fill %name with the second field of
# every record (the character name), keyed by the numeric value of the
# first field (the code point in hexadecimal).
open(my $udata, '<', $unicodedata)
    or die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
            "Please make sure that you have downloaded the file\n" .
            "ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
while (<$udata>) {
    # A valid record has exactly 15 semicolon-separated fields.  Only the
    # first two are used, the remaining 13 are merely checked for shape.
    if (/^([0-9A-F]{4,6});([^;]*);(?:[^;]*;){12}[^;]*$/) {
        $name{hex($1)} = $2;
    } else {
        die("Syntax error in line '$_' in file '$unicodedata'");
    }
}
close($udata);
75
# read mapping (from http://wsinwp07.win.tue.nl:1234/unicode/keysym.map)
#
# Each mapping line has the form
#     0xKKKK UUUUU-style pair:  "0x<4 hex digits>  U<4 hex digits>  [# name]"
# and sets $keysym_to_ucs{keysym} to the code point and
# $keysym_to_keysymname{keysym} to the comment text (without the leading
# '#').  Comment-only and blank lines are skipped; anything else is fatal.
open(my $map, '<', "keysym.map") or die ("Can't open map file:\n$!\n");
while (<$map>) {
    if (/^0x([0-9a-f]{4})\s+U([0-9a-f]{4})\s*(\#.*)?$/) {
        my $keysym = hex($1);
        my $ucs = hex($2);
        # The trailing comment is optional; use an empty name rather than
        # undef so that neither the substitution below nor the later
        # output of the name works on an undefined value.
        my $comment = defined($3) ? $3 : "";
        $comment =~ s/^#\s*//;
        $keysym_to_ucs{$keysym} = $ucs;
        $keysym_to_keysymname{$keysym} = $comment;
    } elsif (/^\s*\#/ || /^\s*$/) {
        # comment or blank line: nothing to do
    } else {
        die("Syntax error in 'list' in line\n$_\n");
    }
}
close($map);
92
# read entries in keysymdef.h
#
# Every "#define XK_<name> 0x<hex>" line that is not marked
# "/* deprecated */" sets $keysym_to_keysymname{value} to <name>,
# replacing any name already stored for that keysym value.
open(my $defs, '<', "/usr/include/X11/keysymdef.h")
    or die ("Can't open keysymdef.h:\n$!\n");
while (<$defs>) {
    next if /\/\* deprecated \*\//;
    if (/^\#define\s+XK_([A-Za-z_0-9]+)\s+0x([0-9a-fA-F]+)\s*(?:\/.*)?$/) {
        $keysym_to_keysymname{hex($2)} = $1;
    }
}
close($defs);
104
# Emit the fixed head of the generated C file, up to and including the
# opening of the keysymtab[] initializer.  The heredoc is interpolating,
# so '$' and '@' in the text are written escaped.
print <<"EOT";
/* \$XTermId\$
 * This module converts keysym values into the corresponding ISO 10646
 * (UCS, Unicode) values.
 *
 * The array keysymtab[] contains pairs of X11 keysym values for graphical
 * characters and the corresponding Unicode value. The function
 * keysym2ucs() maps a keysym onto a Unicode value using a binary search,
 * therefore keysymtab[] must remain SORTED by keysym value.
 *
 * The keysym -> UTF-8 conversion will hopefully one day be provided
 * by Xlib via XmbLookupString() and should ideally not have to be
 * done in X applications. But we are not there yet.
 *
 * We allow to represent any UCS character in the range U-00000000 to
 * U-00FFFFFF by a keysym value in the range 0x01000000 to 0x01ffffff.
 * This admittedly does not cover the entire 31-bit space of UCS, but
 * it does cover all of the characters up to U-10FFFF, which can be
 * represented by UTF-16, and more, and it is very unlikely that higher
 * UCS codes will ever be assigned by ISO. So to get Unicode character
 * U+ABCD you can directly use keysym 0x0100abcd.
 *
 * NOTE: The comments in the table below contain the actual character
 * encoded in UTF-8, so for viewing and editing best use an editor in
 * UTF-8 mode.
 *
 * Author: Markus G. Kuhn <mkuhn\@acm.org>, University of Cambridge, April 2001
 *
 * Special thanks to Richard Verhoeven <river\@win.tue.nl> for preparing
 * an initial draft of the mapping table.
 *
 * This software is in the public domain. Share and enjoy!
 *
 * AUTOMATICALLY GENERATED FILE, DO NOT EDIT !!! (unicode/convmap.pl)
 */

#ifndef KEYSYM2UCS_INCLUDED

#include "keysym2ucs.h"
#define VISIBLE /* */

#else

#define VISIBLE static

#endif

static struct codepair {
  unsigned short keysym;
  unsigned short ucs;
} keysymtab[] = {
EOT
157
# Emit the body of keysymtab[]: one line per known keysym, in ascending
# numeric order (the generated keysym2ucs() does a binary search on it).
# A keysym with a Unicode mapping becomes a table entry whose comment shows
# the keysym name, the character itself in UTF-8 and its Unicode name; a
# keysym without a mapping is only listed as a C comment.
for my $keysym (sort {$a <=> $b} keys(%keysym_to_keysymname)) {
    # Only keysyms 0x0100..0xefff go into the table; values below 0x100
    # are covered by the 1:1 Latin-1 check in the generated keysym2ucs().
    next if $keysym >= 0xf000 || $keysym < 0x100;

    my $ucs = $keysym_to_ucs{$keysym};

    # A keysym may be known without a name; print an empty name then
    # instead of formatting an undefined value.
    my $keysymname = $keysym_to_keysymname{$keysym};
    $keysymname = "" unless defined($keysymname);

    if ($ucs) {
        printf("  { 0x%04x, 0x%04x }, /*%28s %s %s */\n",
               $keysym, $ucs, $keysymname, utf8($ucs),
               defined($name{$ucs}) ? $name{$ucs} : "???" );
    } else {
        printf("/*  0x%04x   %39s ? ??? */\n",
               $keysym, $keysymname);
    }
}
170
# Emit the fixed tail of the generated C file: the end of the keysymtab[]
# initializer and the keysym2ucs() lookup function.  Nothing in this text
# needs interpolation, so a literal (single-quoted) heredoc is used.
print <<'EOT';
};

VISIBLE
long keysym2ucs(KeySym keysym)
{
    int min = 0;
    int max = sizeof(keysymtab) / sizeof(struct codepair) - 1;

    /* first check for Latin-1 characters (1:1 mapping) */
    if ((keysym >= 0x0020 && keysym <= 0x007e) ||
        (keysym >= 0x00a0 && keysym <= 0x00ff))
        return (long) keysym;

    /* also check for directly encoded 24-bit UCS characters */
    if ((keysym & 0xff000000) == 0x01000000)
        return (long) (keysym & 0x00ffffff);

    /* binary search in table */
    while (max >= min) {
        int mid = (min + max) / 2;
        if (keysymtab[mid].keysym < keysym)
            min = mid + 1;
        else if (keysymtab[mid].keysym > keysym)
            max = mid - 1;
        else {
            /* found it */
            return keysymtab[mid].ucs;
        }
    }

    /* no matching Unicode value found */
    return -1;
}
EOT
206