diff options
author | Six <unknown> | 2010-04-10 22:31:18 -0400 |
---|---|---|
committer | Six <unknown> | 2010-04-10 22:31:18 -0400 |
commit | c5003706aca7bf32031e848ef57146362bf7e3de (patch) | |
tree | 7aaf2cdba6e80eb94496901ef48e17a2e7aaf8cd | |
parent | 65302b6621bc44b49682efde4522e131951bfd5f (diff) | |
download | dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.tar.bz2 dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.tar.xz dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.zip |
changed the himo object and added an object used to create himo objects
-rw-r--r-- | .hgignore | 1 | ||||
-rw-r--r-- | dodai/tools/__init__.py | 1 | ||||
-rw-r--r-- | dodai/tools/himo.py | 365 | ||||
-rw-r--r-- | test/test_tools/test_himo.py | 107 |
4 files changed, 197 insertions, 277 deletions
@@ -5,3 +5,4 @@ syntax: glob | |||
5 | bin | 5 | bin |
6 | include | 6 | include |
7 | lib | 7 | lib |
8 | .coverage | ||
diff --git a/dodai/tools/__init__.py b/dodai/tools/__init__.py index c54f7f2..9d2fad7 100644 --- a/dodai/tools/__init__.py +++ b/dodai/tools/__init__.py | |||
@@ -14,3 +14,4 @@ | |||
14 | # | 14 | # |
15 | # You should have received a copy of the GNU General Public License | 15 | # You should have received a copy of the GNU General Public License |
16 | # along with Dodai. If not, see <http://www.gnu.org/licenses/>. | 16 | # along with Dodai. If not, see <http://www.gnu.org/licenses/>. |
17 | |||
diff --git a/dodai/tools/himo.py b/dodai/tools/himo.py index 5a96f91..60051a7 100644 --- a/dodai/tools/himo.py +++ b/dodai/tools/himo.py | |||
@@ -23,115 +23,56 @@ from htmlentitydefs import name2codepoint | |||
23 | from htmlentitydefs import codepoint2name | 23 | from htmlentitydefs import codepoint2name |
24 | from decimal import Decimal as D | 24 | from decimal import Decimal as D |
25 | 25 | ||
26 | class Himo(object): | 26 | class String2Himo(object): |
27 | """ | 27 | """ |
28 | A unicode-string object with some added features to help with | 28 | This in an object that is used for converting python string objects |
29 | unicode decoding and output conversions. | 29 | into Himo (unicode) objects. |
30 | """ | ||
31 | |||
32 | MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'} | ||
33 | |||
34 | def __init__(self, data, encoding=None): | ||
35 | """ | ||
36 | data: Accepts any type of string object (int, float, | ||
37 | string, unicode) | ||
38 | |||
39 | encoding: Character encoding to help with converting the input | ||
40 | into unicode | ||
41 | 30 | ||
42 | The input data will be converted into an unicode object, unless | 31 | """ |
43 | the input data is already an unicode object. If the param | ||
44 | 'encoding' is set, the input data will be converted to unicode | ||
45 | using that value. If no 'encoding' is given this object will | ||
46 | attempt to figure out the encoding. First the encoding of the | ||
47 | operating system will be used. If there are any errors, the | ||
48 | chardet module will be used. This object makes no guarantees | ||
49 | that the correct encoding will be detected. | ||
50 | 32 | ||
33 | def __init__(self, default_encoding=None): | ||
51 | """ | 34 | """ |
35 | default_encoding is the encoding value to be used as the | ||
36 | classwide default. If default encoding is not set then | ||
37 | the encoding of the system will be used. | ||
52 | 38 | ||
53 | self._encoding = encoding or self._system_encoding() | ||
54 | self.data = self._decode(data) | ||
55 | |||
56 | def ascii(self): | ||
57 | """ | 39 | """ |
58 | Returns an ascii representation of this object value. | 40 | self.default_encoding = default_encoding or self._system_encoding() |
59 | Throws HimoAsciiError if this method was unable to | 41 | self._expression = re.compile(r'&(#?)(x?)(\w+);') |
60 | convert a unicode character down to it's root character. | ||
61 | For example if in your string you have a character | ||
62 | like the letter 'e' but it has an accent mark over it, | ||
63 | this method will convert that character to it's root | ||
64 | character. Thus 'e' with an accent mark over it will | ||
65 | replaced with the regular letter 'e'. | ||
66 | 42 | ||
43 | def __call__(self, data, encoding=None): | ||
67 | """ | 44 | """ |
68 | out = [] | 45 | Converts the input (data) string object into a Himo object |
69 | for char in self.data: | 46 | using the passed in (encoding). If encoding is omitted then |
70 | if ord(char) < 127: | 47 | the default_encoding will be used. |
71 | out.append(char) | ||
72 | elif ord(char) in self.MAP: | ||
73 | out.append(self.MAP[ord(char)]) | ||
74 | else: | ||
75 | num = unicodedata.decomposition(char).split(' ')[0] | ||
76 | if num: | ||
77 | out.append(unichr(int(num, 16))) | ||
78 | else: | ||
79 | print char | ||
80 | raise HimoAsciiError("Unable to convert 'u{0}' "\ | ||
81 | "character to ascii".format(ord(char))) | ||
82 | return str(''.join(out)) | ||
83 | |||
84 | def html(self): | ||
85 | """ | ||
86 | Returns a unicode string containing this object's value | ||
87 | html enetity encoded. | ||
88 | """ | ||
89 | out = [] | ||
90 | for char in self.data: | ||
91 | out.append(self._html_char_encode(char)) | ||
92 | return ''.join(out) | ||
93 | 48 | ||
94 | def decimal(self): | 49 | returns a Himo object |
95 | """ | ||
96 | Returns a decimal object with the value of this object | ||
97 | 50 | ||
98 | """ | 51 | """ |
52 | encoding = encoding or self.default_encoding | ||
53 | data = self._as_unicode(data, encoding) | ||
54 | data = self._decode_html(data) | ||
55 | return Himo(data) | ||
99 | 56 | ||
100 | return D(self.data) | 57 | def _as_unicode(self, data, encoding): |
101 | 58 | # Returns string as a unicode string. | |
102 | def _decode(self, data): | ||
103 | # Returns a unicode string. If data contains any html encoded | ||
104 | # characters, the characters will be converted to their unicode | ||
105 | # equivalent | ||
106 | |||
107 | data = self._as_unicode(data) | ||
108 | expression = re.compile(r'&(#?)(x?)(\w+);') | ||
109 | return expression.subn(self._html_decode, data)[0] | ||
110 | |||
111 | def _as_unicode(self, data): | ||
112 | # Returns string as a unicode string | ||
113 | 59 | ||
114 | if not isinstance(data, unicode): | 60 | if not isinstance(data, unicode): |
115 | if not isinstance(data, str): | 61 | if not isinstance(data, str): |
116 | data = str(data) | 62 | data = str(data) |
117 | try: | 63 | try: |
118 | data = data.decode(self._encoding) | 64 | data = data.decode(encoding) |
119 | except UnicodeDecodeError: | 65 | except UnicodeDecodeError: |
120 | info = chardet.detect(data) | 66 | info = chardet.detect(data) |
121 | self.encoding = info['encoding'] | ||
122 | data = data.decode(info['encoding']) | 67 | data = data.decode(info['encoding']) |
123 | return unicodedata.normalize('NFC', data) | 68 | return unicodedata.normalize('NFC', data) |
124 | 69 | ||
125 | def _html_char_encode(self, char): | 70 | def _decode_html(self, data): |
126 | # Returns an html version of the char | 71 | # Returns a unicode string. If data contains any html encoded |
72 | # characters, the characters will be converted to their unicode | ||
73 | # equivalent | ||
127 | 74 | ||
128 | number = ord(char) | 75 | return unicode(self._expression.subn(self._html_decode, data)[0]) |
129 | try: | ||
130 | char = "&{0};".format(codepoint2name[number]) | ||
131 | except KeyError: | ||
132 | if number > 127: | ||
133 | char = "&#{0};".format(number) | ||
134 | return char | ||
135 | 76 | ||
136 | def _html_decode(self, values): | 77 | def _html_decode(self, values): |
137 | # Returns the unicode character from the re.subn | 78 | # Returns the unicode character from the re.subn |
@@ -151,212 +92,82 @@ class Himo(object): | |||
151 | return unichr(char) | 92 | return unichr(char) |
152 | 93 | ||
153 | def _system_encoding(self): | 94 | def _system_encoding(self): |
154 | # Returns the character encoding of the system | 95 | # Returns the character encoding of the operating system |
155 | 96 | ||
156 | encoding = sys.getfilesystemencoding() | 97 | encoding = sys.getdefaultencoding() |
157 | if not encoding: | 98 | filesystem_encoding = sys.getfilesystemencoding() |
158 | encoding = sys.getdefaultencoding() | 99 | if filesystem_encoding: |
100 | encoding = filesystem_encoding | ||
159 | return encoding | 101 | return encoding |
160 | 102 | ||
161 | #def __cmp__(self, other): | ||
162 | # if self.__eq__(other): | ||
163 | # return 1 | ||
164 | # else: | ||
165 | # pool = [str(self.data), str(other)] | ||
166 | # pool.sort() | ||
167 | # if pool[0] == self.data: | ||
168 | # return -1 | ||
169 | # else: | ||
170 | # return 1 | ||
171 | |||
172 | |||
173 | def _is_himo(self, other): | ||
174 | if hasattr(other, '_is_himo'): | ||
175 | return True | ||
176 | return False | ||
177 | |||
178 | def __len__(self): | ||
179 | return len(self.data) | ||
180 | |||
181 | def __repr__(self): | ||
182 | return repr(self.data) | ||
183 | |||
184 | def __str__(self): | ||
185 | return self.data.encode(self._encoding) | ||
186 | |||
187 | def __iter__(self): | ||
188 | for char in self.data: | ||
189 | yield char | ||
190 | |||
191 | def __int__(self): | ||
192 | return int(self.data) | ||
193 | |||
194 | def __float__(self): | ||
195 | return float(self.data) | ||
196 | |||
197 | def __eq__(self, other): | ||
198 | if self._is_himo(other): | ||
199 | other = other.data | ||
200 | return self.data.__eq__(other) | ||
201 | |||
202 | def __ne__(self, other): | ||
203 | if self._is_himo(other): | ||
204 | other = other.data | ||
205 | return self.data.__ne__(other) | ||
206 | |||
207 | def __gt__(self, other): | ||
208 | if self._is_himo(other): | ||
209 | other = other.data | ||
210 | lines = [self.data, other] | ||
211 | lines.sort() | ||
212 | if lines[0] == self.data: | ||
213 | return True | ||
214 | else: | ||
215 | return False | ||
216 | |||
217 | def __lt__(self, other): | ||
218 | if self._is_himo(other): | ||
219 | other = other.data | ||
220 | lines = [self.data, other] | ||
221 | lines.sort() | ||
222 | if lines[0] != self.data: | ||
223 | return True | ||
224 | else: | ||
225 | return False | ||
226 | |||
227 | def __cmp__(self, other): | ||
228 | if self.__eq__(other): | ||
229 | return 0 | ||
230 | elif self.__lt__(other): | ||
231 | return -1 | ||
232 | else: | ||
233 | return 1 | ||
234 | |||
235 | def __unicode__(self): | ||
236 | return self.data | ||
237 | 103 | ||
238 | def capitalize(self, *args, **kargs): | 104 | class Himo(unicode): |
239 | return self.data.capitalize(*args, **kargs) | 105 | """ |
240 | 106 | A unicode-string object with some added features to help with | |
241 | def center(self, *args, **kargs): | 107 | output formatting. Himo means rope or string in Japanese, hence |
242 | return self.data.center(*args, **kargs) | 108 | the string to Himo connection. |
243 | |||
244 | def count(self, *args, **kargs): | ||
245 | return self.data.count(*args, **kargs) | ||
246 | |||
247 | def decode(self, *args, **kargs): | ||
248 | return self.data.decode(*args, **kargs) | ||
249 | |||
250 | def encode(self, *args, **kargs): | ||
251 | return self.data.encode(*args, **kargs) | ||
252 | |||
253 | def encode(self, *args, **kargs): | ||
254 | return self.data.encode(*args, **kargs) | ||
255 | |||
256 | def endswith(self, *args, **kargs): | ||
257 | return self.data.endswith(*args, **kargs) | ||
258 | |||
259 | def expandtabs(self, *args, **kargs): | ||
260 | return self.data.expandtabs(*args, **kargs) | ||
261 | |||
262 | def find(self, *args, **kargs): | ||
263 | return self.data.find(*args, **kargs) | ||
264 | |||
265 | def format(self, *args, **kargs): | ||
266 | return self.data.format(*args, **kargs) | ||
267 | |||
268 | def index(self, *args, **kargs): | ||
269 | return self.data.index(*args, **kargs) | ||
270 | |||
271 | def isalnum(self, *args, **kargs): | ||
272 | return self.data.isalnum(*args, **kargs) | ||
273 | |||
274 | def isalpha(self, *args, **kargs): | ||
275 | return self.data.isalpha(*args, **kargs) | ||
276 | |||
277 | def isdecimal(self, *args, **kargs): | ||
278 | return self.data.isdecimal(*args, **kargs) | ||
279 | |||
280 | def isdigit(self, *args, **kargs): | ||
281 | return self.data.isdigit(*args, **kargs) | ||
282 | |||
283 | def islower(self, *args, **kargs): | ||
284 | return self.data.islower(*args, **kargs) | ||
285 | |||
286 | def isnumeric(self, *args, **kargs): | ||
287 | return self.data.isnumeric(*args, **kargs) | ||
288 | |||
289 | def isspace(self, *args, **kargs): | ||
290 | return self.data.isspace(*args, **kargs) | ||
291 | |||
292 | def istitle(self, *args, **kargs): | ||
293 | return self.data.istitle(*args, **kargs) | ||
294 | |||
295 | def isupper(self, *args, **kargs): | ||
296 | return self.data.isupper(*args, **kargs) | ||
297 | |||
298 | def join(self, *args, **kargs): | ||
299 | return self.data.join(*args, **kargs) | ||
300 | |||
301 | def ljust(self, *args, **kargs): | ||
302 | return self.data.ljust(*args, **kargs) | ||
303 | |||
304 | def lower(self, *args, **kargs): | ||
305 | return self.data.lower(*args, **kargs) | ||
306 | |||
307 | def lstrip(self, *args, **kargs): | ||
308 | return self.data.lstrip(*args, **kargs) | ||
309 | |||
310 | def partition(self, *args, **kargs): | ||
311 | return self.data.partition(*args, **kargs) | ||
312 | |||
313 | def replace(self, *args, **kargs): | ||
314 | return self.data.replace(*args, **kargs) | ||
315 | |||
316 | def rfind(self, *args, **kargs): | ||
317 | return self.data.rfind(*args, **kargs) | ||
318 | |||
319 | def rindex(self, *args, **kargs): | ||
320 | return self.data.rindex(*args, **kargs) | ||
321 | |||
322 | def rjust(self, *args, **kargs): | ||
323 | return self.data.rjust(*args, **kargs) | ||
324 | |||
325 | def rpartition(self, *args, **kargs): | ||
326 | return self.data.rpartition(*args, **kargs) | ||
327 | 109 | ||
328 | def rsplit(self, *args, **kargs): | 110 | """ |
329 | return self.data.rsplit(*args, **kargs) | ||
330 | 111 | ||
331 | def rstrip(self, *args, **kargs): | 112 | MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'} |
332 | return self.data.rstrip(*args, **kargs) | ||
333 | 113 | ||
334 | def split(self, *args, **kargs): | 114 | def html(self): |
335 | return self.data.split(*args, **kargs) | 115 | """ |
116 | Returns a unicode string containing this object's value | ||
117 | html enetity encoded. | ||
336 | 118 | ||
337 | def splitlines(self, *args, **kargs): | 119 | """ |
338 | return self.data.splitlines(*args, **kargs) | 120 | out = [] |
121 | for char in self: | ||
122 | out.append(self._html_char_encode(char)) | ||
123 | return unicode(''.join(out)) | ||
339 | 124 | ||
340 | def startswith(self, *args, **kargs): | 125 | def _html_char_encode(self, char): |
341 | return self.data.startswith(*args, **kargs) | 126 | # Returns an html version of the char |
342 | 127 | ||
343 | def strip(self, *args, **kargs): | 128 | number = ord(char) |
344 | return self.data.strip(*args, **kargs) | 129 | try: |
130 | char = "&{0};".format(codepoint2name[number]) | ||
131 | except KeyError: | ||
132 | if number > 127: | ||
133 | char = "&#{0};".format(number) | ||
134 | return char | ||
345 | 135 | ||
346 | def swapcase(self, *args, **kargs): | 136 | def decimal(self): |
347 | return self.data.swapcase(*args, **kargs) | 137 | """ |
138 | Returns a decimal object with the value of this object | ||
348 | 139 | ||
349 | def title(self, *args, **kargs): | 140 | """ |
350 | return self.data.title(*args, **kargs) | ||
351 | 141 | ||
352 | def translate(self, *args, **kargs): | 142 | return D(self) |
353 | return self.data.translate(*args, **kargs) | ||
354 | 143 | ||
355 | def upper(self, *args, **kargs): | 144 | def ascii(self): |
356 | return self.data.upper(*args, **kargs) | 145 | """ |
146 | Returns an ascii representation of this object value. | ||
147 | Throws HimoAsciiError if this method was unable to | ||
148 | convert a unicode character down to it's root character. | ||
149 | For example if in your string you have a character | ||
150 | like the letter 'e' but it has an accent mark over it, | ||
151 | this method will convert that character to it's root | ||
152 | character. Thus 'e' with an accent mark over it will | ||
153 | replaced with the regular letter 'e'. | ||
357 | 154 | ||
358 | def zfill(self, *args, **kargs): | 155 | """ |
359 | return self.data.zfill(*args, **kargs) | 156 | out = [] |
157 | for char in self: | ||
158 | if ord(char) < 127: | ||
159 | out.append(char) | ||
160 | elif ord(char) in self.MAP: | ||
161 | out.append(self.MAP[ord(char)]) | ||
162 | else: | ||
163 | num = unicodedata.decomposition(char).split(' ')[0] | ||
164 | if num: | ||
165 | out.append(unichr(int(num, 16))) | ||
166 | else: | ||
167 | print char | ||
168 | raise HimoAsciiError("Unable to convert 'u{0}' "\ | ||
169 | "character to ascii".format(ord(char))) | ||
170 | return str(''.join(out)) | ||
360 | 171 | ||
361 | class HimoAsciiError(Exception): | 172 | class HimoAsciiError(Exception): |
362 | pass | 173 | pass |
diff --git a/test/test_tools/test_himo.py b/test/test_tools/test_himo.py new file mode 100644 index 0000000..cb58ca5 --- /dev/null +++ b/test/test_tools/test_himo.py | |||
@@ -0,0 +1,107 @@ | |||
1 | import sys | ||
2 | import os | ||
3 | import unittest | ||
4 | path = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', '..')) | ||
5 | sys.path.append(path) | ||
6 | from dodai.tools.himo import String2Himo | ||
7 | from dodai.tools.himo import Himo | ||
8 | from dodai.tools.himo import HimoAsciiError | ||
9 | from decimal import Decimal as D | ||
10 | |||
11 | class TestString2Himo(unittest.TestCase): | ||
12 | |||
13 | def setUp(self): | ||
14 | self.string_to_himo_one = String2Himo() | ||
15 | self.string_to_himo_two = String2Himo('unicode_escape') | ||
16 | |||
17 | def test_regular_string_one(self): | ||
18 | obj = self.string_to_himo_one(str('abcd')) | ||
19 | self.assertTrue(isinstance(obj, Himo)) | ||
20 | |||
21 | def test_regular_string_two(self): | ||
22 | test = str('abcd') | ||
23 | obj = self.string_to_himo_one(test) | ||
24 | self.assertTrue(obj == test) | ||
25 | |||
26 | def test_regular_string_three(self): | ||
27 | obj = self.string_to_himo_two(str('abcd')) | ||
28 | self.assertTrue(isinstance(obj, Himo)) | ||
29 | |||
30 | def test_regular_string_four(self): | ||
31 | test = str('abcd') | ||
32 | obj = self.string_to_himo_two(test) | ||
33 | self.assertTrue(obj == test) | ||
34 | |||
35 | def test_regular_string_five(self): | ||
36 | test = 1 | ||
37 | obj = self.string_to_himo_two(test) | ||
38 | self.assertEqual(int(obj), test) | ||
39 | |||
40 | def test_decode_html(self): | ||
41 | test = u'\u4e5d\xf2\xe5\xe9' | ||
42 | obj = self.string_to_himo_one(u'&#20061;&#242;&#229;&#233;') | ||
43 | self.assertEqual(test, obj) | ||
44 | |||
45 | def test_decode_chardet(self): | ||
46 | # Korean Text in the EUC-KR character set | ||
47 | kor = '\xb4\xe7\xbd\xc5\xc0\xcc \xc3\xa3\xb4\xc2 \xb8\xf0\xb5\xe7 '\ | ||
48 | '\xbd\xba\xc5\xb8\xc0\xcf, \xbf\xc1\xbc\xc7' | ||
49 | test = '\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\ | ||
50 | '\uc2a4\ud0c0\uc77c, \uc625\uc158'.decode('unicode_escape') | ||
51 | obj = self.string_to_himo_one(kor) | ||
52 | self.assertEqual(obj, test) | ||
53 | |||
54 | def test_html_decode_one(self): | ||
55 | # Korean Text | ||
56 | kor = '&#45817;&#49888;&#51060; &#52286;&#45716; &#47784;&#46304; '\ | ||
57 | '&#49828;&#53440;&#51068;, &#50725;&#49496;& foo' | ||
58 | test = u'\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\ | ||
59 | u'\uc2a4\ud0c0\uc77c, \uc625\uc158& foo' | ||
60 | obj = self.string_to_himo_one(kor) | ||
61 | self.assertEqual(obj, test) | ||
62 | |||
63 | def test_html_decode_two(self): | ||
64 | kor = '&ppe;' | ||
65 | obj = self.string_to_himo_one(kor) | ||
66 | self.assertEqual(obj, kor) | ||
67 | |||
68 | |||
69 | class TestHimo(unittest.TestCase): | ||
70 | |||
71 | def setUp(self): | ||
72 | # "Elephants are our brothers" in Japanese. This line comes from | ||
73 | # an episode of a old Japanese TV show called "Koko Ga Hen Da Yo, | ||
74 | # Nihonjin" http://en.wikipedia.org/wiki/Koko_ga_hen_da_yo,_nihonjin | ||
75 | # This line came from an African native who spoke Japanese fluently | ||
76 | self.jp = u'\u8c61\u306f\u79c1\u305f\u3061\u306e\u5144\u5f1f'\ | ||
77 | u'\u3067\u3059' | ||
78 | self.jp_html = u'&#35937;&#12399;&#31169;&#12383;&#12385;&#12398;'\ | ||
79 | u'&#20804;&#24351;&#12391;&#12377;' | ||
80 | |||
81 | def test_html(self): | ||
82 | obj = Himo(self.jp) | ||
83 | test = obj.html() | ||
84 | self.assertEqual(test, self.jp_html) | ||
85 | |||
86 | def test_decimal(self): | ||
87 | obj = Himo('23.55') | ||
88 | obj = obj.decimal() | ||
89 | self.assertTrue(isinstance(obj, D)) | ||
90 | self.assertEqual(obj, D('23.55')) | ||
91 | |||
92 | def test_ascii_one(self): | ||
93 | # "Elephants live in Africa" in spanish | ||
94 | obj = Himo(u'Los elefantes viven en \xc1frica') | ||
95 | test = u'Los elefantes viven en Africa' | ||
96 | self.assertEqual(obj.ascii(), test) | ||
97 | |||
98 | def test_ascii_two(self): | ||
99 | # "Elephants are our brothers" in French | ||
100 | obj = Himo(u'Les \xe9l\xe9phants sont nos fr\xe8res \xa9') | ||
101 | test = u'Les elephants sont nos freres (C)' | ||
102 | self.assertEqual(obj.ascii(), test) | ||
103 | |||
104 | def test_ascii_three(self): | ||
105 | |||
106 | obj = Himo(self.jp) | ||
107 | self.failUnlessRaises(HimoAsciiError, obj.ascii) | ||