diff options
Diffstat (limited to 'lib/dodai/tools/himo.py')
-rw-r--r-- | lib/dodai/tools/himo.py | 174 |
1 files changed, 174 insertions, 0 deletions
diff --git a/lib/dodai/tools/himo.py b/lib/dodai/tools/himo.py new file mode 100644 index 0000000..f56eaf8 --- /dev/null +++ b/lib/dodai/tools/himo.py | |||
@@ -0,0 +1,174 @@ | |||
1 | # Copyright (C) 2010 Leonard Thomas | ||
2 | # | ||
3 | # This file is part of Dodai. | ||
4 | # | ||
5 | # Dodai is free software: you can redistribute it and/or modify | ||
6 | # it under the terms of the GNU General Public License as published by | ||
7 | # the Free Software Foundation, either version 3 of the License, or | ||
8 | # (at your option) any later version. | ||
9 | # | ||
10 | # Dodai is distributed in the hope that it will be useful, | ||
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | # GNU General Public License for more details. | ||
14 | # | ||
15 | # You should have received a copy of the GNU General Public License | ||
16 | # along with Dodai. If not, see <http://www.gnu.org/licenses/>. | ||
17 | |||
18 | import chardet | ||
19 | import re | ||
20 | import sys | ||
21 | import unicodedata | ||
22 | from htmlentitydefs import name2codepoint | ||
23 | from htmlentitydefs import codepoint2name | ||
24 | from decimal import Decimal as D | ||
25 | from dodai.exception import HimoAsciiError | ||
26 | from dodai.tools import normalize_unicode | ||
27 | |||
class String2Himo(object):
    """
    This is an object that is used for converting python string objects
    into Himo (unicode) objects.

    """

    def __init__(self, default_encoding=None):
        """
        default_encoding is the encoding value to be used as the
        classwide default.  If default_encoding is not set then
        the encoding of the system will be used.

        """
        self.default_encoding = default_encoding or self._system_encoding()
        # Groups: 1 is '#' for numeric references, 2 is the optional hex
        # marker and 3 is the digits or the entity name.
        self._expression = re.compile(r'&(#?)([xX]?)(\w+);')

    def __call__(self, data, encoding=None):
        """
        Converts the input (data) string object into a Himo object
        using the passed in (encoding).  If encoding is omitted then
        the default_encoding will be used.

        The data is decoded to unicode, html entities are replaced by
        the characters they stand for and the result is passed through
        normalize_unicode.

        returns a Himo object

        """
        encoding = encoding or self.default_encoding
        data = self._as_unicode(data, encoding)
        data = self._decode_html(data)
        data = normalize_unicode(data)
        return Himo(data)

    def _as_unicode(self, data, encoding):
        # Returns data as an NFC normalized unicode string.  Objects that
        # are not strings are converted with str() first.  When the byte
        # string can not be decoded with encoding, chardet is asked to
        # guess the encoding.  The original UnicodeDecodeError is raised
        # again when chardet has no guess.

        if not isinstance(data, unicode):
            if not isinstance(data, str):
                data = str(data)
            try:
                data = data.decode(encoding)
            except UnicodeDecodeError:
                detected = chardet.detect(data)['encoding']
                if not detected:
                    # Decoding with None would raise an unrelated
                    # TypeError, so report the real problem instead.
                    raise
                data = data.decode(detected)
        return unicodedata.normalize('NFC', data)

    def _decode_html(self, data):
        # Returns a unicode string.  If data contains any html encoded
        # characters, the characters will be converted to their unicode
        # equivalent

        return unicode(self._expression.sub(self._html_decode, data))

    def _html_decode(self, values):
        # Returns the replacement text for one match of self._expression.
        # References that can not be resolved (unknown names, malformed
        # or out of range numbers) are returned unchanged.

        numeric, hex_marker, value = values.groups()
        if not numeric:
            # The optional hex marker group swallows the leading 'x' of
            # names such as 'xi', so it is glued back on for the lookup.
            try:
                return unichr(name2codepoint[hex_marker + value])
            except KeyError:
                return values.group()
        try:
            if hex_marker:
                return unichr(int('0x{0}'.format(value), 16))
            return unichr(int(value))
        except (ValueError, OverflowError):
            return values.group()

    def _system_encoding(self):
        # Returns the character encoding of the operating system.  The
        # file system encoding is preferred and the interpreter default
        # encoding is used when the file system encoding is not set.

        return sys.getfilesystemencoding() or sys.getdefaultencoding()
105 | |||
106 | |||
class Himo(unicode):
    """
    A unicode-string object with some added features to help with
    output formatting.  Himo means rope or string in Japanese, hence
    the string to Himo connection.

    """

    # Code points that have no decomposition but do have a customary
    # ascii spelling: copyright, registered and sound recording signs.
    MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}

    def html(self):
        """
        Returns a unicode string containing this object's value
        html entity encoded.

        """
        return unicode(''.join(self._html_char_encode(char) for char in self))

    def _html_char_encode(self, char):
        # Returns an html version of the char.  Named entities are used
        # where codepoint2name has one, other characters above 127 become
        # numeric references and everything else is returned unchanged.

        number = ord(char)
        try:
            char = "&{0};".format(codepoint2name[number])
        except KeyError:
            if number > 127:
                char = "&#{0};".format(number)
        return char

    def decimal(self):
        """
        Returns a decimal object with the value of this object

        """

        return D(self)

    def ascii(self):
        """
        Returns an ascii representation of this object value.
        Throws HimoAsciiError if this method was unable to
        convert a unicode character down to its root character.
        For example if in your string you have a character
        like the letter 'e' but it has an accent mark over it,
        this method will convert that character to its root
        character.  Thus 'e' with an accent mark over it will be
        replaced with the regular letter 'e'.

        """
        return str(''.join(self._ascii_char(char) for char in self))

    def _ascii_char(self, char):
        # Returns the ascii text for a single character.  Canonical
        # decompositions are followed until an ascii character or a MAP
        # entry is reached, because one step may still yield a non ascii
        # character.  HimoAsciiError is raised with the original char
        # when no ascii root can be found.

        current = char
        while True:
            number = ord(current)
            if number < 128:
                return current
            if number in self.MAP:
                return self.MAP[number]
            parts = unicodedata.decomposition(current).split()
            # An empty mapping has no root.  A mapping that starts with
            # a tag such as '<compat>' is a compatibility mapping whose
            # first code point is not a root character.
            if not parts or parts[0].startswith('<'):
                raise HimoAsciiError(char)
            current = unichr(int(parts[0], 16))
172 | |||
class HimoAsciiError(Exception):
    """
    Raised by Himo.ascii when a character can not be reduced to an
    ascii character.

    """
    # NOTE(review): this definition rebinds the HimoAsciiError name that
    # is imported from dodai.exception at the top of this file; confirm
    # which of the two classes callers are expected to catch.