#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2010-2013 Various Authors
# Copyright 2010 Johannes Weißl
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
import sys
import re
import os.path
import urllib2
from optparse import OptionParser
# Some letters don't have a decomposition, but can't be composed on all
# keyboards. This dictionary maps them to an ASCII character which
# *looks* similar.
#
# Every key and value is a one-character unicode string. main() feeds the
# table to unidata_add_mapping() and passes the keys' code points to
# filter_unidata() as the entries that must always be kept.
special_decompositions = {
    # Letters and signs replaced by a similar-looking ASCII letter
    u'Æ': u'A',
    u'Ð': u'D',
    u'×': u'x',
    u'Ø': u'O',
    u'Þ': u'P',
    u'ß': u'B',
    u'æ': u'a',
    u'ð': u'd',
    u'ø': u'o',
    u'þ': u'p',
    # Various punctuation/quotation characters
    # Hyphens, dashes and the minus sign
    u'‐': u'-',
    u'‒': u'-',
    u'–': u'-',
    u'−': u'-',
    u'—': u'-',
    u'―': u'-',
    # Single quotation marks and prime
    u'‘': u"'",
    u'’': u"'",
    u'′': u"'",
    # Double quotation marks, double prime and ditto mark
    u'“': u'"',
    u'”': u'"',
    u'″': u'"',
    u'〃': u'"',
    # Ellipsis collapses to a single full stop
    u'…': u'.',
}
def parse_unidata(f):
    """Parse UnicodeData.txt-style records into a dict keyed by code point.

    Every non-blank line is split on ';'. Field 0 is read as the
    hexadecimal code point, field 1 as the character name and field 5 as
    the decomposition mapping, which may be prefixed with a ``<tag>``.

    Args:
        f: Iterable of lines. Lines may be text or, on Python 3, bytes
            (as produced by a file opened in binary mode or a urlopen()
            response); bytes are decoded as UTF-8.

    Returns:
        A dict mapping each code point (int) to a dict with the keys
        'name' and 'decomp' (list of int code points, empty when the
        record has no decomposition). Records that do have a
        decomposition additionally carry 'compat' (bool), True when the
        mapping was prefixed with a ``<tag>``.

    Raises:
        ValueError: If a code point is not valid hexadecimal.
        IndexError: If a non-blank line has fewer than six fields.
    """
    # Compiled once instead of being looked up for every record.
    tagged = re.compile(r'<.*> (.*)')
    decode_bytes = bytes is not str  # only true on Python 3

    unidata = {}
    for line in f:
        if decode_bytes and isinstance(line, bytes):
            line = line.decode('utf-8')
        line = line.rstrip('\r\n')
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing on int('', 16).
            continue
        fields = line.split(';')
        cp = int(fields[0], 16)
        entry = {'name': fields[1]}
        decomp = fields[5]
        if decomp:
            m = tagged.match(decomp)
            entry['compat'] = bool(m)
            if m:
                # Drop the "<tag> " prefix, keep only the code points.
                decomp = m.group(1)
            entry['decomp'] = [int(x, 16) for x in decomp.split(' ')]
        else:
            entry['decomp'] = []
        unidata[cp] = entry
    return unidata
def unidata_expand_decomp(unidata):
    """Expand every decomposition in unidata to its leaf code points, in place.

    A code point is a leaf when it is absent from unidata or its 'decomp'
    list is empty. Each entry whose expansion differs from the code point
    itself gets its 'decomp' replaced by the flattened list, with the
    original left-to-right order preserved.

    Args:
        unidata: Dict mapping code points to entry dicts that contain a
            'decomp' list of code points.
    """
    def flatten(cp):
        # Leaves stand for themselves; everything else is the
        # concatenation of its parts' expansions.
        parts = unidata[cp]['decomp'] if cp in unidata else None
        if not parts:
            return [cp]
        return [leaf for part in parts for leaf in flatten(part)]

    for cp, entry in unidata.items():
        expanded = flatten(cp)
        if expanded != [cp]:
            entry['decomp'] = expanded
def unidata_add_mapping(unidata, mapping):
    """Overwrite decompositions with single-character replacements, in place.

    Args:
        unidata: Dict mapping code points to entry dicts; an entry must
            already exist for every key of mapping.
        mapping: Dict mapping a one-character string to the
            one-character string it should decompose to.

    Raises:
        KeyError: If a mapped character has no entry in unidata.
    """
    for source, replacement in mapping.items():
        new_decomp = [ord(replacement)]
        unidata[ord(source)]['decomp'] = new_decomp
def is_diacritical_mark(c):
    """Return True if code point c lies in the range U+0300..U+036F (inclusive)."""
    return 0x0300 <= c <= 0x036F
def filter_unidata(unidata, include):
    """Drop, in place, every entry that is not a base plus diacritical marks.

    Entries whose code point is listed in include are always kept. Any
    other entry is deleted when one of the following holds:

    * its 'decomp' list is empty;
    * the first decomposition element is a space (U+0020) or is itself a
      diacritical mark;
    * none of the remaining decomposition elements is a diacritical mark.

    Args:
        unidata: Dict mapping code points to entry dicts that contain a
            'decomp' list of code points. Modified in place.
        include: Iterable of code points that must never be removed.
    """
    keep = set(include)  # O(1) membership instead of a list scan per entry

    # Iterate over a snapshot: entries are deleted while scanning, which
    # raises RuntimeError on a live dict view under Python 3.
    for cp, entry in list(unidata.items()):
        if cp in keep:
            continue
        decomp = entry['decomp']
        if not decomp:
            del unidata[cp]
            continue
        base = decomp[0]
        # Compare the code point directly; converting it to a character
        # first fails for code points above 0xFFFF on narrow Python 2
        # builds.
        if base == 0x20 or is_diacritical_mark(base):
            del unidata[cp]
            continue
        if not any(is_diacritical_mark(d) for d in decomp[1:]):
            del unidata[cp]
def output(unidata, f):
    """Write the decomposition table to f as a C array definition.

    The output starts with a fixed header naming the generating script
    (the basename of sys.argv[0]), followed by one row per code point in
    ascending order and a closing '};' line. Each row holds the composed
    code point and the first element of its decomposition, plus a
    trailing comment showing both characters and any further
    decomposition elements.

    Args:
        unidata: Dict mapping code points to entry dicts whose 'decomp'
            list is non-empty.
        f: Object with a write() method accepting the native ``str``
            type; it receives the whole text in a single call.

    Raises:
        IndexError: If an entry has an empty 'decomp' list.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    def glyph(cp):
        # Return the character as a native ``str`` so that it can be mixed
        # into the format strings below: UTF-8 encoded bytes on Python 2,
        # text on Python 3.
        text = to_char(cp)
        if isinstance(text, str):
            return text
        return text.encode('utf-8')

    # Collect the pieces and join once instead of growing a string with +=.
    chunks = ['''/* This file is automatically generated. DO NOT EDIT!
Instead, edit %s and re-run. */
static struct {
uchar composed;
uchar base;
} unidecomp_map[] = {
''' % os.path.basename(sys.argv[0])]
    for cp in sorted(unidata):
        decomp = unidata[cp]['decomp']
        base = decomp[0]
        marks = ', '.join([' %s (%x)' % (glyph(d), d) for d in decomp[1:]])
        row = '\t{ %#6x, %#6x },\t// %s -> %s,\t%s' % (
            cp, base, glyph(cp), glyph(base), marks)
        # rstrip() removes the dangling tab when there are no extra marks.
        chunks.append(row.rstrip() + '\n')
    chunks.append('};')
    f.write(''.join(chunks) + '\n')
def main(argv=None):
    """Generate the decomposition table from UnicodeData.txt.

    Reads UnicodeData.txt from the current directory, or downloads it
    when --wget is given, applies the special decompositions, expands
    and filters the table, and writes the result to the file named by
    --output or to standard output.

    Args:
        argv: Full argument vector including the program name; sys.argv
            is used when it is None or empty.

    Returns:
        None.
    """
    if not argv:
        argv = sys.argv

    parser = OptionParser(usage='usage: %prog [-w] [-o unidecomp.h]')
    parser.add_option('-w', '--wget', action='store_true',
                      help='get unicode data from unicode.org')
    parser.add_option('-o', '--output',
                      help='output file, default stdout')
    options, _ = parser.parse_args(argv[1:])

    urlbase = 'http://unicode.org/Public/UNIDATA/'
    unidata_filename = 'UnicodeData.txt'
    if not os.path.exists(unidata_filename) and not options.wget:
        parser.error('''need %s in the current directory, download
from unicode.org or use `--wget' option.''' % unidata_filename)

    if options.wget:
        unidata_file = urllib2.urlopen(urlbase + unidata_filename)
    else:
        unidata_file = open(unidata_filename, 'rb')
    try:
        unidata = parse_unidata(unidata_file)
    finally:
        # Release the file or connection even when parsing fails.
        unidata_file.close()

    unidata_add_mapping(unidata, special_decompositions)
    unidata_expand_decomp(unidata)
    # The special mappings carry no diacritical marks, so they have to be
    # exempted from filtering explicitly.
    filter_unidata(unidata, [ord(x) for x in special_decompositions])

    if options.output:
        outfile = open(options.output, 'wb')
        try:
            output(unidata, outfile)
        finally:
            outfile.close()
    else:
        output(unidata, sys.stdout)
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == '__main__':
    raise SystemExit(main())