aboutsummaryrefslogtreecommitdiff
path: root/lib/dodai/tools/himo.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/dodai/tools/himo.py')
-rw-r--r--lib/dodai/tools/himo.py174
1 files changed, 174 insertions, 0 deletions
diff --git a/lib/dodai/tools/himo.py b/lib/dodai/tools/himo.py
new file mode 100644
index 0000000..f56eaf8
--- /dev/null
+++ b/lib/dodai/tools/himo.py
@@ -0,0 +1,174 @@
1# Copyright (C) 2010 Leonard Thomas
2#
3# This file is part of Dodai.
4#
5# Dodai is free software: you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation, either version 3 of the License, or
8# (at your option) any later version.
9#
10# Dodai is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with Dodai. If not, see <http://www.gnu.org/licenses/>.
17
18import chardet
19import re
20import sys
21import unicodedata
22from htmlentitydefs import name2codepoint
23from htmlentitydefs import codepoint2name
24from decimal import Decimal as D
25from dodai.exception import HimoAsciiError
26from dodai.tools import normalize_unicode
27
class String2Himo(object):
    """
    This is an object that is used for converting python string objects
    into Himo (unicode) objects.

    Calling an instance decodes the input to unicode, replaces html
    entities with the characters they stand for, passes the result
    through normalize_unicode and wraps it in a Himo object.

    """

    def __init__(self, default_encoding=None):
        """
        default_encoding is the encoding value to be used as the
        classwide default.  If default encoding is not set then
        the encoding of the system will be used.

        """
        self.default_encoding = default_encoding or self._system_encoding()
        # Matches "&name;", "&#123;" and "&#x7b;".  Group 1 is the '#'
        # marker, group 2 an optional leading 'x', group 3 the remainder.
        self._expression = re.compile(r'&(#?)(x?)(\w+);')

    def __call__(self, data, encoding=None):
        """
        Converts the input (data) string object into a Himo object
        using the passed in (encoding).  If encoding is omitted then
        the default_encoding will be used.

        returns a Himo object

        """
        encoding = encoding or self.default_encoding
        data = self._as_unicode(data, encoding)
        data = self._decode_html(data)
        data = normalize_unicode(data)
        return Himo(data)

    def _as_unicode(self, data, encoding):
        # Returns data as an NFC normalized unicode string.  Anything
        # that is not already a string is passed through str() first.
        # Byte strings are decoded with the given encoding; when that
        # fails chardet is asked to guess the encoding instead.
        #
        # Raises UnicodeDecodeError when the bytes do not match the
        # given encoding and chardet is unable to name another one.

        if not isinstance(data, unicode):
            if not isinstance(data, str):
                data = str(data)
            try:
                data = data.decode(encoding)
            except UnicodeDecodeError as error:
                detected = chardet.detect(data)['encoding']
                if not detected:
                    # chardet reports None when it can not decide;
                    # surface the real decoding problem instead of
                    # the TypeError that decode(None) would raise.
                    raise error
                data = data.decode(detected)
        return unicodedata.normalize('NFC', data)

    def _decode_html(self, data):
        # Returns a unicode string.  If data contains any html encoded
        # characters, the characters will be converted to their unicode
        # equivalent

        return unicode(self._expression.sub(self._html_decode, data))

    def _html_decode(self, values):
        # Replacement callback for the entity expression.  Returns the
        # unicode character for the matched entity, or the matched text
        # unchanged when the entity is unknown, malformed or outside
        # the range of characters this interpreter can represent.

        numeric, hex_marker, value = values.group(1, 2, 3)
        if numeric:
            try:
                if hex_marker:
                    return unichr(int(value, 16))
                return unichr(int(value))
            except (ValueError, OverflowError):
                # e.g. "&#xZZ;", "&#X41;" or "&#99999999999;"
                return values.group()
        try:
            # For a named entity the optional 'x' group belongs to the
            # name itself; without it "&xi;" would be looked up as "i".
            return unichr(name2codepoint[hex_marker + value])
        except KeyError:
            return values.group()

    def _system_encoding(self):
        # Returns the character encoding of the operating system: the
        # filesystem encoding when python reports one, otherwise the
        # interpreter's default encoding.

        return sys.getfilesystemencoding() or sys.getdefaultencoding()
105
106
class Himo(unicode):
    """
    A unicode-string object with some added features to help with
    output formatting.  Himo means rope or string in Japanese, hence
    the string to Himo connection.

    """

    # Ascii stand-ins, keyed by code point, for symbols that have no
    # unicode decomposition: copyright, registered and sound recording
    # copyright signs.
    MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}

    def html(self):
        """
        Returns a unicode string containing this object's value
        html entity encoded.

        """
        return unicode(''.join([self._html_char_encode(char)
                                for char in self]))

    def _html_char_encode(self, char):
        # Returns an html version of the char: a named entity when one
        # exists for the code point, a numeric reference for any other
        # non-ascii character and the character itself otherwise.

        number = ord(char)
        try:
            char = "&{0};".format(codepoint2name[number])
        except KeyError:
            if number > 127:
                char = "&#{0};".format(number)
        return char

    def decimal(self):
        """
        Returns a decimal object with the value of this object

        """

        return D(self)

    def ascii(self):
        """
        Returns an ascii representation of this object value.
        Throws HimoAsciiError if this method was unable to
        convert a unicode character down to its root character.
        For example if in your string you have a character
        like the letter 'e' but it has an accent mark over it,
        this method will convert that character to its root
        character.  Thus 'e' with an accent mark over it will
        be replaced with the regular letter 'e'.

        """
        return str(''.join([self._ascii_char(char) for char in self]))

    def _ascii_char(self, char):
        # Returns the ascii replacement for a single character.
        # Raises HimoAsciiError, carrying the original character, when
        # no ascii root can be found for it.

        original = char
        # Every code point below 128 is ascii, DEL (127) included.
        while ord(char) >= 128:
            if ord(char) in self.MAP:
                return self.MAP[ord(char)]
            base = unicodedata.decomposition(char).split(' ')[0]
            # An empty result means the character does not decompose.
            # A leading "<tag>" marks a compatibility mapping, which
            # has no root character in its first field.
            if not base or base.startswith('<'):
                raise HimoAsciiError(original)
            # Keep reducing: the root of a decomposition may itself be
            # an accented character (u'\u01d6' -> u'\xfc' -> u'u').
            char = unichr(int(base, 16))
        return char
172
class HimoAsciiError(Exception):
    """
    Raised by Himo.ascii() when a character can not be converted down
    to an ascii character.  The offending character is passed as the
    exception argument.

    NOTE(review): this module also imports HimoAsciiError from
    dodai.exception; this definition rebinds that name at module
    level -- confirm which of the two callers are expected to catch.

    """