about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Six <unknown> 2010-04-10 22:31:18 -0400
committer Six <unknown> 2010-04-10 22:31:18 -0400
commit c5003706aca7bf32031e848ef57146362bf7e3de (patch)
tree 7aaf2cdba6e80eb94496901ef48e17a2e7aaf8cd
parent 65302b6621bc44b49682efde4522e131951bfd5f (diff)
download dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.tar.bz2
dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.tar.xz
dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.zip
changed the himo object and added an object used to create himo objects
-rw-r--r-- .hgignore 1
-rw-r--r-- dodai/tools/__init__.py 1
-rw-r--r-- dodai/tools/himo.py 365
-rw-r--r-- test/test_tools/test_himo.py 107
4 files changed, 197 insertions, 277 deletions
diff --git a/.hgignore b/.hgignore
index 592f378..3c866fc 100644
--- a/.hgignore
+++ b/.hgignore
@@ -5,3 +5,4 @@ syntax: glob
5bin 5bin
6include 6include
7lib 7lib
8.coverage
diff --git a/dodai/tools/__init__.py b/dodai/tools/__init__.py
index c54f7f2..9d2fad7 100644
--- a/dodai/tools/__init__.py
+++ b/dodai/tools/__init__.py
@@ -14,3 +14,4 @@
14# 14#
15# You should have received a copy of the GNU General Public License 15# You should have received a copy of the GNU General Public License
16# along with Dodai. If not, see <http://www.gnu.org/licenses/>. 16# along with Dodai. If not, see <http://www.gnu.org/licenses/>.
17
diff --git a/dodai/tools/himo.py b/dodai/tools/himo.py
index 5a96f91..60051a7 100644
--- a/dodai/tools/himo.py
+++ b/dodai/tools/himo.py
@@ -23,115 +23,56 @@ from htmlentitydefs import name2codepoint
23from htmlentitydefs import codepoint2name 23from htmlentitydefs import codepoint2name
24from decimal import Decimal as D 24from decimal import Decimal as D
25 25
26class Himo(object): 26class String2Himo(object):
27 """ 27 """
28 A unicode-string object with some added features to help with 28 This in an object that is used for converting python string objects
29 unicode decoding and output conversions. 29 into Himo (unicode) objects.
30 """
31
32 MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
33
34 def __init__(self, data, encoding=None):
35 """
36 data: Accepts any type of string object (int, float,
37 string, unicode)
38
39 encoding: Character encoding to help with converting the input
40 into unicode
41 30
42 The input data will be converted into an unicode object, unless 31 """
43 the input data is already an unicode object. If the param
44 'encoding' is set, the input data will be converted to unicode
45 using that value. If no 'encoding' is given this object will
46 attempt to figure out the encoding. First the encoding of the
47 operating system will be used. If there are any errors, the
48 chardet module will be used. This object makes no guarantees
49 that the correct encoding will be detected.
50 32
33 def __init__(self, default_encoding=None):
51 """ 34 """
35 default_encoding is the encoding value to be used as the
36 classwide default. If default encoding is not set then
37 the encoding of the system will be used.
52 38
53 self._encoding = encoding or self._system_encoding()
54 self.data = self._decode(data)
55
56 def ascii(self):
57 """ 39 """
58 Returns an ascii representation of this object value. 40 self.default_encoding = default_encoding or self._system_encoding()
59 Throws HimoAsciiError if this method was unable to 41 self._expression = re.compile(r'&(#?)(x?)(\w+);')
60 convert a unicode character down to it's root character.
61 For example if in your string you have a character
62 like the letter 'e' but it has an accent mark over it,
63 this method will convert that character to it's root
64 character. Thus 'e' with an accent mark over it will
65 replaced with the regular letter 'e'.
66 42
43 def __call__(self, data, encoding=None):
67 """ 44 """
68 out = [] 45 Converts the input (data) string object into a Himo object
69 for char in self.data: 46 using the passed in (encoding). If encoding is omitted then
70 if ord(char) < 127: 47 the default_encoding will be used.
71 out.append(char)
72 elif ord(char) in self.MAP:
73 out.append(self.MAP[ord(char)])
74 else:
75 num = unicodedata.decomposition(char).split(' ')[0]
76 if num:
77 out.append(unichr(int(num, 16)))
78 else:
79 print char
80 raise HimoAsciiError("Unable to convert 'u{0}' "\
81 "character to ascii".format(ord(char)))
82 return str(''.join(out))
83
84 def html(self):
85 """
86 Returns a unicode string containing this object's value
87 html enetity encoded.
88 """
89 out = []
90 for char in self.data:
91 out.append(self._html_char_encode(char))
92 return ''.join(out)
93 48
94 def decimal(self): 49 returns a Himo object
95 """
96 Returns a decimal object with the value of this object
97 50
98 """ 51 """
52 encoding = encoding or self.default_encoding
53 data = self._as_unicode(data, encoding)
54 data = self._decode_html(data)
55 return Himo(data)
99 56
100 return D(self.data) 57 def _as_unicode(self, data, encoding):
101 58 # Returns string as a unicode string.
102 def _decode(self, data):
103 # Returns a unicode string. If data contains any html encoded
104 # characters, the characters will be converted to their unicode
105 # equivalent
106
107 data = self._as_unicode(data)
108 expression = re.compile(r'&(#?)(x?)(\w+);')
109 return expression.subn(self._html_decode, data)[0]
110
111 def _as_unicode(self, data):
112 # Returns string as a unicode string
113 59
114 if not isinstance(data, unicode): 60 if not isinstance(data, unicode):
115 if not isinstance(data, str): 61 if not isinstance(data, str):
116 data = str(data) 62 data = str(data)
117 try: 63 try:
118 data = data.decode(self._encoding) 64 data = data.decode(encoding)
119 except UnicodeDecodeError: 65 except UnicodeDecodeError:
120 info = chardet.detect(data) 66 info = chardet.detect(data)
121 self.encoding = info['encoding']
122 data = data.decode(info['encoding']) 67 data = data.decode(info['encoding'])
123 return unicodedata.normalize('NFC', data) 68 return unicodedata.normalize('NFC', data)
124 69
125 def _html_char_encode(self, char): 70 def _decode_html(self, data):
126 # Returns an html version of the char 71 # Returns a unicode string. If data contains any html encoded
72 # characters, the characters will be converted to their unicode
73 # equivalent
127 74
128 number = ord(char) 75 return unicode(self._expression.subn(self._html_decode, data)[0])
129 try:
130 char = "&{0};".format(codepoint2name[number])
131 except KeyError:
132 if number > 127:
133 char = "&#{0};".format(number)
134 return char
135 76
136 def _html_decode(self, values): 77 def _html_decode(self, values):
137 # Returns the unicode character from the re.subn 78 # Returns the unicode character from the re.subn
@@ -151,212 +92,82 @@ class Himo(object):
151 return unichr(char) 92 return unichr(char)
152 93
153 def _system_encoding(self): 94 def _system_encoding(self):
154 # Returns the character encoding of the system 95 # Returns the character encoding of the operating system
155 96
156 encoding = sys.getfilesystemencoding() 97 encoding = sys.getdefaultencoding()
157 if not encoding: 98 filesystem_encoding = sys.getfilesystemencoding()
158 encoding = sys.getdefaultencoding() 99 if filesystem_encoding:
100 encoding = filesystem_encoding
159 return encoding 101 return encoding
160 102
161 #def __cmp__(self, other):
162 # if self.__eq__(other):
163 # return 1
164 # else:
165 # pool = [str(self.data), str(other)]
166 # pool.sort()
167 # if pool[0] == self.data:
168 # return -1
169 # else:
170 # return 1
171
172
173 def _is_himo(self, other):
174 if hasattr(other, '_is_himo'):
175 return True
176 return False
177
178 def __len__(self):
179 return len(self.data)
180
181 def __repr__(self):
182 return repr(self.data)
183
184 def __str__(self):
185 return self.data.encode(self._encoding)
186
187 def __iter__(self):
188 for char in self.data:
189 yield char
190
191 def __int__(self):
192 return int(self.data)
193
194 def __float__(self):
195 return float(self.data)
196
197 def __eq__(self, other):
198 if self._is_himo(other):
199 other = other.data
200 return self.data.__eq__(other)
201
202 def __ne__(self, other):
203 if self._is_himo(other):
204 other = other.data
205 return self.data.__ne__(other)
206
207 def __gt__(self, other):
208 if self._is_himo(other):
209 other = other.data
210 lines = [self.data, other]
211 lines.sort()
212 if lines[0] == self.data:
213 return True
214 else:
215 return False
216
217 def __lt__(self, other):
218 if self._is_himo(other):
219 other = other.data
220 lines = [self.data, other]
221 lines.sort()
222 if lines[0] != self.data:
223 return True
224 else:
225 return False
226
227 def __cmp__(self, other):
228 if self.__eq__(other):
229 return 0
230 elif self.__lt__(other):
231 return -1
232 else:
233 return 1
234
235 def __unicode__(self):
236 return self.data
237 103
238 def capitalize(self, *args, **kargs): 104class Himo(unicode):
239 return self.data.capitalize(*args, **kargs) 105 """
240 106 A unicode-string object with some added features to help with
241 def center(self, *args, **kargs): 107 output formatting. Himo means rope or string in Japanese, hence
242 return self.data.center(*args, **kargs) 108 the string to Himo connection.
243
244 def count(self, *args, **kargs):
245 return self.data.count(*args, **kargs)
246
247 def decode(self, *args, **kargs):
248 return self.data.decode(*args, **kargs)
249
250 def encode(self, *args, **kargs):
251 return self.data.encode(*args, **kargs)
252
253 def encode(self, *args, **kargs):
254 return self.data.encode(*args, **kargs)
255
256 def endswith(self, *args, **kargs):
257 return self.data.endswith(*args, **kargs)
258
259 def expandtabs(self, *args, **kargs):
260 return self.data.expandtabs(*args, **kargs)
261
262 def find(self, *args, **kargs):
263 return self.data.find(*args, **kargs)
264
265 def format(self, *args, **kargs):
266 return self.data.format(*args, **kargs)
267
268 def index(self, *args, **kargs):
269 return self.data.index(*args, **kargs)
270
271 def isalnum(self, *args, **kargs):
272 return self.data.isalnum(*args, **kargs)
273
274 def isalpha(self, *args, **kargs):
275 return self.data.isalpha(*args, **kargs)
276
277 def isdecimal(self, *args, **kargs):
278 return self.data.isdecimal(*args, **kargs)
279
280 def isdigit(self, *args, **kargs):
281 return self.data.isdigit(*args, **kargs)
282
283 def islower(self, *args, **kargs):
284 return self.data.islower(*args, **kargs)
285
286 def isnumeric(self, *args, **kargs):
287 return self.data.isnumeric(*args, **kargs)
288
289 def isspace(self, *args, **kargs):
290 return self.data.isspace(*args, **kargs)
291
292 def istitle(self, *args, **kargs):
293 return self.data.istitle(*args, **kargs)
294
295 def isupper(self, *args, **kargs):
296 return self.data.isupper(*args, **kargs)
297
298 def join(self, *args, **kargs):
299 return self.data.join(*args, **kargs)
300
301 def ljust(self, *args, **kargs):
302 return self.data.ljust(*args, **kargs)
303
304 def lower(self, *args, **kargs):
305 return self.data.lower(*args, **kargs)
306
307 def lstrip(self, *args, **kargs):
308 return self.data.lstrip(*args, **kargs)
309
310 def partition(self, *args, **kargs):
311 return self.data.partition(*args, **kargs)
312
313 def replace(self, *args, **kargs):
314 return self.data.replace(*args, **kargs)
315
316 def rfind(self, *args, **kargs):
317 return self.data.rfind(*args, **kargs)
318
319 def rindex(self, *args, **kargs):
320 return self.data.rindex(*args, **kargs)
321
322 def rjust(self, *args, **kargs):
323 return self.data.rjust(*args, **kargs)
324
325 def rpartition(self, *args, **kargs):
326 return self.data.rpartition(*args, **kargs)
327 109
328 def rsplit(self, *args, **kargs): 110 """
329 return self.data.rsplit(*args, **kargs)
330 111
331 def rstrip(self, *args, **kargs): 112 MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
332 return self.data.rstrip(*args, **kargs)
333 113
334 def split(self, *args, **kargs): 114 def html(self):
335 return self.data.split(*args, **kargs) 115 """
116 Returns a unicode string containing this object's value
117 html enetity encoded.
336 118
337 def splitlines(self, *args, **kargs): 119 """
338 return self.data.splitlines(*args, **kargs) 120 out = []
121 for char in self:
122 out.append(self._html_char_encode(char))
123 return unicode(''.join(out))
339 124
340 def startswith(self, *args, **kargs): 125 def _html_char_encode(self, char):
341 return self.data.startswith(*args, **kargs) 126 # Returns an html version of the char
342 127
343 def strip(self, *args, **kargs): 128 number = ord(char)
344 return self.data.strip(*args, **kargs) 129 try:
130 char = "&{0};".format(codepoint2name[number])
131 except KeyError:
132 if number > 127:
133 char = "&#{0};".format(number)
134 return char
345 135
346 def swapcase(self, *args, **kargs): 136 def decimal(self):
347 return self.data.swapcase(*args, **kargs) 137 """
138 Returns a decimal object with the value of this object
348 139
349 def title(self, *args, **kargs): 140 """
350 return self.data.title(*args, **kargs)
351 141
352 def translate(self, *args, **kargs): 142 return D(self)
353 return self.data.translate(*args, **kargs)
354 143
355 def upper(self, *args, **kargs): 144 def ascii(self):
356 return self.data.upper(*args, **kargs) 145 """
146 Returns an ascii representation of this object value.
147 Throws HimoAsciiError if this method was unable to
148 convert a unicode character down to it's root character.
149 For example if in your string you have a character
150 like the letter 'e' but it has an accent mark over it,
151 this method will convert that character to it's root
152 character. Thus 'e' with an accent mark over it will
153 replaced with the regular letter 'e'.
357 154
358 def zfill(self, *args, **kargs): 155 """
359 return self.data.zfill(*args, **kargs) 156 out = []
157 for char in self:
158 if ord(char) < 127:
159 out.append(char)
160 elif ord(char) in self.MAP:
161 out.append(self.MAP[ord(char)])
162 else:
163 num = unicodedata.decomposition(char).split(' ')[0]
164 if num:
165 out.append(unichr(int(num, 16)))
166 else:
167 print char
168 raise HimoAsciiError("Unable to convert 'u{0}' "\
169 "character to ascii".format(ord(char)))
170 return str(''.join(out))
360 171
361class HimoAsciiError(Exception): 172class HimoAsciiError(Exception):
362 pass 173 pass
diff --git a/test/test_tools/test_himo.py b/test/test_tools/test_himo.py
new file mode 100644
index 0000000..cb58ca5
--- /dev/null
+++ b/test/test_tools/test_himo.py
@@ -0,0 +1,107 @@
1import sys
2import os
3import unittest
4path = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', '..'))
5sys.path.append(path)
6from dodai.tools.himo import String2Himo
7from dodai.tools.himo import Himo
8from dodai.tools.himo import HimoAsciiError
9from decimal import Decimal as D
10
11class TestString2Himo(unittest.TestCase):
12
13 def setUp(self):
14 self.string_to_himo_one = String2Himo()
15 self.string_to_himo_two = String2Himo('unicode_escape')
16
17 def test_regular_string_one(self):
18 obj = self.string_to_himo_one(str('abcd'))
19 self.assertTrue(isinstance(obj, Himo))
20
21 def test_regular_string_two(self):
22 test = str('abcd')
23 obj = self.string_to_himo_one(test)
24 self.assertTrue(obj == test)
25
26 def test_regular_string_three(self):
27 obj = self.string_to_himo_two(str('abcd'))
28 self.assertTrue(isinstance(obj, Himo))
29
30 def test_regular_string_four(self):
31 test = str('abcd')
32 obj = self.string_to_himo_two(test)
33 self.assertTrue(obj == test)
34
35 def test_regular_string_five(self):
36 test = 1
37 obj = self.string_to_himo_two(test)
38 self.assertEqual(int(obj), test)
39
40 def test_decode_html(self):
41 test = u'\u4e5d\xf2\xe5\xe9'
42 obj = self.string_to_himo_one(u'&#20061;&ograve;&aring;&eacute;')
43 self.assertEqual(test, obj)
44
45 def test_decode_chardet(self):
46 # Korean Text in the EUC-KR character set
47 kor = '\xb4\xe7\xbd\xc5\xc0\xcc \xc3\xa3\xb4\xc2 \xb8\xf0\xb5\xe7 '\
48 '\xbd\xba\xc5\xb8\xc0\xcf, \xbf\xc1\xbc\xc7'
49 test = '\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\
50 '\uc2a4\ud0c0\uc77c, \uc625\uc158'.decode('unicode_escape')
51 obj = self.string_to_himo_one(kor)
52 self.assertEqual(obj, test)
53
54 def test_html_decode_one(self):
55 # Korean Text
56 kor = '&#xb2f9;&#49888;&#51060; &#52286;&#45716; &#47784;&#46304; '\
57 '&#49828;&#53440;&#51068;, &#50725;&#49496;&amp; foo'
58 test = u'\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\
59 u'\uc2a4\ud0c0\uc77c, \uc625\uc158& foo'
60 obj = self.string_to_himo_one(kor)
61 self.assertEqual(obj, test)
62
63 def test_html_decode_two(self):
64 kor = '&ampppe;'
65 obj = self.string_to_himo_one(kor)
66 self.assertEqual(obj, kor)
67
68
69class TestHimo(unittest.TestCase):
70
71 def setUp(self):
72 # "Elephants are our brothers" in Japanese. This line comes from
73 # an episode of a old Japanese TV show called "Koko Ga Hen Da Yo,
74 # Nihonjin" http://en.wikipedia.org/wiki/Koko_ga_hen_da_yo,_nihonjin
75 # This line came from an African native who spoke Japanese fluently
76 self.jp = u'\u8c61\u306f\u79c1\u305f\u3061\u306e\u5144\u5f1f'\
77 u'\u3067\u3059'
78 self.jp_html = u'&#35937;&#12399;&#31169;&#12383;&#12385;&#12398;'\
79 u'&#20804;&#24351;&#12391;&#12377;'
80
81 def test_html(self):
82 obj = Himo(self.jp)
83 test = obj.html()
84 self.assertEqual(test, self.jp_html)
85
86 def test_decimal(self):
87 obj = Himo('23.55')
88 obj = obj.decimal()
89 self.assertTrue(isinstance(obj, D))
90 self.assertEqual(obj, D('23.55'))
91
92 def test_ascii_one(self):
93 # "Elephants live in Africa" in spanish
94 obj = Himo(u'Los elefantes viven en \xc1frica')
95 test = u'Los elefantes viven en Africa'
96 self.assertEqual(obj.ascii(), test)
97
98 def test_ascii_two(self):
99 # "Elephants are our brothers" in French
100 obj = Himo(u'Les \xe9l\xe9phants sont nos fr\xe8res \xa9')
101 test = u'Les elephants sont nos freres (C)'
102 self.assertEqual(obj.ascii(), test)
103
104 def test_ascii_three(self):
105
106 obj = Himo(self.jp)
107 self.failUnlessRaises(HimoAsciiError, obj.ascii)