#!/usr/bin/perl -w
# convmap.pl revision f2e35a3a
# $XTermId: convmap.pl,v 1.15 2018/09/09 17:22:24 tom Exp $
#
# Generate keysym2ucs.c file
#
# See also:
# http://mail.nl.linux.org/linux-utf8/2001-04/msg00248.html
#
# $XFree86: xc/programs/xterm/unicode/convmap.pl,v 1.5 2000/01/24 22:22:05 dawes Exp $

use strict;

# Package-level state shared by the top-level passes of this script.
our $keysym;                    # iteration variable of the table-generating loop
our %name;                      # UCS code point => character name (UnicodeData.txt)
our %keysym_to_ucs;             # keysym value   => UCS code point (keysym.map)
our %keysym_to_keysymname;      # keysym value   => keysym name (keysym.map comment,
                                #                   overridden by keysymdef.h)

# Forward declaration: makes the one-scalar prototype known before the
# definition, so calls compiled earlier (including the one inside the
# definition itself) are checked against it.
sub utf8 ($);

# utf8($c) - encode the UCS code point $c as a UTF-8 byte sequence.
#
# Uses the original (pre-RFC 3629) UTF-8 layout, i.e. sequences of up to six
# bytes covering code points up to 0x7fffffff.  Values of 0x80000000 and
# above cannot be encoded and yield the encoding of U+FFFD instead.
#
# Returns a string in which every character is one byte (0x00..0xff) of the
# encoded sequence.
sub utf8 ($) {
    my $c = shift(@_);

    # Single byte: plain ASCII.
    return sprintf("%c", $c) if $c < 0x80;

    # Multi-byte forms: [ exclusive upper limit, lead-byte marker,
    #                     number of continuation bytes ].
    # The six-byte marker is 0xfc (1111110x); 0xfe is never a valid lead byte.
    my @forms = (
        [ 0x800,      0xc0, 1 ],
        [ 0x10000,    0xe0, 2 ],
        [ 0x200000,   0xf0, 3 ],
        [ 0x4000000,  0xf8, 4 ],
        [ 0x80000000, 0xfc, 5 ],
    );

    for my $form (@forms) {
        my ($limit, $marker, $tail) = @$form;
        next if $c >= $limit;

        # Lead byte carries the topmost payload bits; every continuation
        # byte carries the next six bits, most significant group first.
        my @bytes = ($marker | ($c >> (6 * $tail)));
        for (my $i = $tail - 1; $i >= 0; $i--) {
            push(@bytes, 0x80 | (($c >> (6 * $i)) & 0x3f));
        }
        return join('', map { chr($_) } @bytes);
    }

    # Out of range: UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER).
    return "\xef\xbf\xbd";
}

my $unicodedata = "UnicodeData.txt";

# read list of all Unicode names
#
# Every line must consist of exactly 15 semicolon-separated fields; the first
# is the code point (4 to 6 uppercase hex digits) and the second its name.
# Anything else aborts the run.  The file is opened once, read-only, through
# a lexical handle (the previous code retried the very same path a second
# time, which could never succeed where the first attempt had failed).
open(my $udata, '<', $unicodedata)
    or die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
            "Please make sure that you have downloaded the file\n" .
            "ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
while (my $line = <$udata>) {
    # Field 1 = code point, field 2 = name, then 13 further fields.
    if ($line =~ /^([0-9A-F]{4,6});([^;]*)(?:;[^;]*){13}$/) {
        $name{hex($1)} = $2;
    } else {
        die("Syntax error in line '$line' in file '$unicodedata'");
    }
}
close($udata);

# read mapping (from http://wsinwp07.win.tue.nl:1234/unicode/keysym.map)
#
# Accepted line forms:
#   0xKKKK UXXXX [# keysym name]   - keysym KKKK maps to code point XXXX
#   # ...  or an empty line        - ignored
# Any other line aborts the run.
open(my $map, '<', 'keysym.map') or die ("Can't open map file:\n$!\n");
while (my $line = <$map>) {
    if ($line =~ /^0x([0-9a-f]{4})\s+U([0-9a-f]{4})\s*(\#.*)?$/) {
        my $code = hex($1);
        my $ucs  = hex($2);
        # The trailing comment is optional; fall back to an empty name
        # instead of carrying undef into the name table.
        my $comment = defined($3) ? $3 : '';
        $comment =~ s/^#\s*//;
        $keysym_to_ucs{$code} = $ucs;
        $keysym_to_keysymname{$code} = $comment;
    } elsif ($line =~ /^\s*\#/ || $line =~ /^\s*$/) {
        # comment or blank line: nothing to record
    } else {
        die("Syntax error in 'list' in line\n$line\n");
    }
}
close($map);

# read entries in keysymdef.h
#
# Each "#define XK_<name> 0x<hex>" line that is not marked as deprecated
# records <name> for that keysym value, replacing any name taken from
# keysym.map.  Lines of any other shape are skipped silently.
open(my $defs, '<', '/usr/include/X11/keysymdef.h')
    or die ("Can't open keysymdef.h:\n$!\n");
while (my $line = <$defs>) {
    next unless $line =~ /^\#define\s+XK_([A-Za-z_0-9]+)\s+0x([0-9a-fA-F]+)\s*(\/.*)?$/;
    # Save the captures before running another match.
    my $keysymname = $1;
    my $code = hex($2);
    next if $line =~ /\/\* deprecated \*\//;
    $keysym_to_keysymname{$code} = $keysymname;
}
close($defs);

# Emit the fixed leading part of the generated C file: the descriptive
# comment, the VISIBLE macro selection, and the opening of keysymtab[].
# The heredoc interpolates, so '$' and '@' are backslash-escaped to reach
# the output literally.
# NOTE(review): the escaped \$XTermId\$ presumably also keeps revision-control
# keyword expansion from rewriting the marker inside this script - confirm.
print <<EOT;
/* \$XTermId\$
 * This module converts keysym values into the corresponding ISO 10646
 * (UCS, Unicode) values.
 *
 * The array keysymtab[] contains pairs of X11 keysym values for graphical
 * characters and the corresponding Unicode value. The function
 * keysym2ucs() maps a keysym onto a Unicode value using a binary search,
 * therefore keysymtab[] must remain SORTED by keysym value.
 *
 * The keysym -> UTF-8 conversion will hopefully one day be provided
 * by Xlib via XmbLookupString() and should ideally not have to be
 * done in X applications. But we are not there yet.
 *
 * We allow to represent any UCS character in the range U-00000000 to
 * U-00FFFFFF by a keysym value in the range 0x01000000 to 0x01ffffff.
 * This admittedly does not cover the entire 31-bit space of UCS, but
 * it does cover all of the characters up to U-10FFFF, which can be
 * represented by UTF-16, and more, and it is very unlikely that higher
 * UCS codes will ever be assigned by ISO. So to get Unicode character
 * U+ABCD you can directly use keysym 0x0100abcd.
 *
 * NOTE: The comments in the table below contain the actual character
 * encoded in UTF-8, so for viewing and editing best use an editor in
 * UTF-8 mode.
 *
 * Author: Markus G. Kuhn <mkuhn\@acm.org>, University of Cambridge, April 2001
 *
 * Special thanks to Richard Verhoeven <river\@win.tue.nl> for preparing
 * an initial draft of the mapping table.
 *
 * This software is in the public domain. Share and enjoy!
 *
 * AUTOMATICALLY GENERATED FILE, DO NOT EDIT !!! (unicode/convmap.pl)
 */

#ifndef KEYSYM2UCS_INCLUDED

#include "keysym2ucs.h"
#define VISIBLE /* */

#else

#define VISIBLE static

#endif

static struct codepair {
  unsigned short keysym;
  unsigned short ucs;
} keysymtab[] = {
EOT

# Emit the body of keysymtab[], in ascending numeric keysym order (the
# generated keysym2ucs() does a binary search over the table).
#
# Only keysyms in 0x0100..0xefff are listed; the generated C code maps the
# Latin-1 ranges below 0x100 one-to-one without consulting the table.
# A keysym with a non-zero code point becomes a table row whose trailing
# comment shows the keysym name, the character itself in UTF-8, and its
# Unicode name ("???" when UnicodeData.txt has none).  A keysym with no code
# point, or with code point 0, becomes a commented-out placeholder line.
for $keysym (sort {$a <=> $b} keys(%keysym_to_keysymname)) {
    next if $keysym < 0x100 || $keysym >= 0xf000;

    my $keysymname = $keysym_to_keysymname{$keysym};
    my $ucs = $keysym_to_ucs{$keysym};

    # Deliberately a truthiness test: a mapping to 0 counts as "unmapped".
    unless ($ucs) {
        printf("/*  0x%04x   %39s ? ??? */\n", $keysym, $keysymname);
        next;
    }

    my $ucsname = defined($name{$ucs}) ? $name{$ucs} : "???";
    printf("  { 0x%04x, 0x%04x }, /*%28s %s %s */\n",
           $keysym, $ucs, $keysymname, utf8($ucs), $ucsname);
}

# Emit the fixed trailing part of the generated C file: the end of
# keysymtab[] and the keysym2ucs() lookup function.  The text contains no
# '$' or '@', so the interpolating heredoc passes it through unchanged.
print <<EOT;
};

VISIBLE
long keysym2ucs(KeySym keysym)
{
    int min = 0;
    int max = sizeof(keysymtab) / sizeof(struct codepair) - 1;

    /* first check for Latin-1 characters (1:1 mapping) */
    if ((keysym >= 0x0020 && keysym <= 0x007e) ||
        (keysym >= 0x00a0 && keysym <= 0x00ff))
        return (long) keysym;

    /* also check for directly encoded 24-bit UCS characters */
    if ((keysym & 0xff000000) == 0x01000000)
        return (long) (keysym & 0x00ffffff);

    /* binary search in table */
    while (max >= min) {
        int mid = (min + max) / 2;
        if (keysymtab[mid].keysym < keysym)
            min = mid + 1;
        else if (keysymtab[mid].keysym > keysym)
            max = mid - 1;
        else {
            /* found it */
            return keysymtab[mid].ucs;
        }
    }

    /* no matching Unicode value found */
    return -1;
}
EOT