From c5003706aca7bf32031e848ef57146362bf7e3de Mon Sep 17 00:00:00 2001 From: Six Date: Sat, 10 Apr 2010 22:31:18 -0400 Subject: changed the himo object and added an object used to create himo objects --- .hgignore | 1 + dodai/tools/__init__.py | 1 + dodai/tools/himo.py | 365 +++++++++++-------------------------------- test/test_tools/test_himo.py | 107 +++++++++++++ 4 files changed, 197 insertions(+), 277 deletions(-) create mode 100644 test/test_tools/test_himo.py diff --git a/.hgignore b/.hgignore index 592f378..3c866fc 100644 --- a/.hgignore +++ b/.hgignore @@ -5,3 +5,4 @@ syntax: glob bin include lib +.coverage diff --git a/dodai/tools/__init__.py b/dodai/tools/__init__.py index c54f7f2..9d2fad7 100644 --- a/dodai/tools/__init__.py +++ b/dodai/tools/__init__.py @@ -14,3 +14,4 @@ # # You should have received a copy of the GNU General Public License # along with Dodai. If not, see . + diff --git a/dodai/tools/himo.py b/dodai/tools/himo.py index 5a96f91..60051a7 100644 --- a/dodai/tools/himo.py +++ b/dodai/tools/himo.py @@ -23,115 +23,56 @@ from htmlentitydefs import name2codepoint from htmlentitydefs import codepoint2name from decimal import Decimal as D -class Himo(object): +class String2Himo(object): """ - A unicode-string object with some added features to help with - unicode decoding and output conversions. - """ - - MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'} - - def __init__(self, data, encoding=None): - """ - data: Accepts any type of string object (int, float, - string, unicode) - - encoding: Character encoding to help with converting the input - into unicode + This in an object that is used for converting python string objects + into Himo (unicode) objects. - The input data will be converted into an unicode object, unless - the input data is already an unicode object. If the param - 'encoding' is set, the input data will be converted to unicode - using that value. If no 'encoding' is given this object will - attempt to figure out the encoding. First the encoding of the - operating system will be used. If there are any errors, the - chardet module will be used. This object makes no guarantees - that the correct encoding will be detected. + """ + def __init__(self, default_encoding=None): """ + default_encoding is the encoding value to be used as the + classwide default. If default encoding is not set then + the encoding of the system will be used. - self._encoding = encoding or self._system_encoding() - self.data = self._decode(data) - - def ascii(self): """ - Returns an ascii representation of this object value. - Throws HimoAsciiError if this method was unable to - convert a unicode character down to it's root character. - For example if in your string you have a character - like the letter 'e' but it has an accent mark over it, - this method will convert that character to it's root - character. Thus 'e' with an accent mark over it will - replaced with the regular letter 'e'. + self.default_encoding = default_encoding or self._system_encoding() + self._expression = re.compile(r'&(#?)(x?)(\w+);') + def __call__(self, data, encoding=None): """ - out = [] - for char in self.data: - if ord(char) < 127: - out.append(char) - elif ord(char) in self.MAP: - out.append(self.MAP[ord(char)]) - else: - num = unicodedata.decomposition(char).split(' ')[0] - if num: - out.append(unichr(int(num, 16))) - else: - print char - raise HimoAsciiError("Unable to convert 'u{0}' "\ - "character to ascii".format(ord(char))) - return str(''.join(out)) - - def html(self): - """ - Returns a unicode string containing this object's value - html enetity encoded. - """ - out = [] - for char in self.data: - out.append(self._html_char_encode(char)) - return ''.join(out) + Converts the input (data) string object into a Himo object + using the passed in (encoding). If encoding is omitted then + the default_encoding will be used. - def decimal(self): - """ - Returns a decimal object with the value of this object + returns a Himo object """ + encoding = encoding or self.default_encoding + data = self._as_unicode(data, encoding) + data = self._decode_html(data) + return Himo(data) - return D(self.data) - - def _decode(self, data): - # Returns a unicode string. If data contains any html encoded - # characters, the characters will be converted to their unicode - # equivalent - - data = self._as_unicode(data) - expression = re.compile(r'&(#?)(x?)(\w+);') - return expression.subn(self._html_decode, data)[0] - - def _as_unicode(self, data): - # Returns string as a unicode string + def _as_unicode(self, data, encoding): + # Returns string as a unicode string. if not isinstance(data, unicode): if not isinstance(data, str): data = str(data) try: - data = data.decode(self._encoding) + data = data.decode(encoding) except UnicodeDecodeError: info = chardet.detect(data) - self.encoding = info['encoding'] data = data.decode(info['encoding']) return unicodedata.normalize('NFC', data) - def _html_char_encode(self, char): - # Returns an html version of the char + def _decode_html(self, data): + # Returns a unicode string. If data contains any html encoded + # characters, the characters will be converted to their unicode + # equivalent - number = ord(char) - try: - char = "&{0};".format(codepoint2name[number]) - except KeyError: - if number > 127: - char = "&#{0};".format(number) - return char + return unicode(self._expression.subn(self._html_decode, data)[0]) def _html_decode(self, values): # Returns the unicode character from the re.subn @@ -151,212 +92,82 @@ class Himo(object): return unichr(char) def _system_encoding(self): - # Returns the character encoding of the system + # Returns the character encoding of the operating system - encoding = sys.getfilesystemencoding() - if not encoding: - encoding = sys.getdefaultencoding() + encoding = sys.getdefaultencoding() + filesystem_encoding = sys.getfilesystemencoding() + if filesystem_encoding: + encoding = filesystem_encoding return encoding - #def __cmp__(self, other): - # if self.__eq__(other): - # return 1 - # else: - # pool = [str(self.data), str(other)] - # pool.sort() - # if pool[0] == self.data: - # return -1 - # else: - # return 1 - - - def _is_himo(self, other): - if hasattr(other, '_is_himo'): - return True - return False - - def __len__(self): - return len(self.data) - - def __repr__(self): - return repr(self.data) - - def __str__(self): - return self.data.encode(self._encoding) - - def __iter__(self): - for char in self.data: - yield char - - def __int__(self): - return int(self.data) - - def __float__(self): - return float(self.data) - - def __eq__(self, other): - if self._is_himo(other): - other = other.data - return self.data.__eq__(other) - - def __ne__(self, other): - if self._is_himo(other): - other = other.data - return self.data.__ne__(other) - - def __gt__(self, other): - if self._is_himo(other): - other = other.data - lines = [self.data, other] - lines.sort() - if lines[0] == self.data: - return True - else: - return False - - def __lt__(self, other): - if self._is_himo(other): - other = other.data - lines = [self.data, other] - lines.sort() - if lines[0] != self.data: - return True - else: - return False - - def __cmp__(self, other): - if self.__eq__(other): - return 0 - elif self.__lt__(other): - return -1 - else: - return 1 - - def __unicode__(self): - return self.data - def capitalize(self, *args, **kargs): - return self.data.capitalize(*args, **kargs) - - def center(self, *args, **kargs): - return self.data.center(*args, **kargs) - - def count(self, *args, **kargs): - return self.data.count(*args, **kargs) - - def decode(self, *args, **kargs): - return self.data.decode(*args, **kargs) - - def encode(self, *args, **kargs): - return self.data.encode(*args, **kargs) - - def encode(self, *args, **kargs): - return self.data.encode(*args, **kargs) - - def endswith(self, *args, **kargs): - return self.data.endswith(*args, **kargs) - - def expandtabs(self, *args, **kargs): - return self.data.expandtabs(*args, **kargs) - - def find(self, *args, **kargs): - return self.data.find(*args, **kargs) - - def format(self, *args, **kargs): - return self.data.format(*args, **kargs) - - def index(self, *args, **kargs): - return self.data.index(*args, **kargs) - - def isalnum(self, *args, **kargs): - return self.data.isalnum(*args, **kargs) - - def isalpha(self, *args, **kargs): - return self.data.isalpha(*args, **kargs) - - def isdecimal(self, *args, **kargs): - return self.data.isdecimal(*args, **kargs) - - def isdigit(self, *args, **kargs): - return self.data.isdigit(*args, **kargs) - - def islower(self, *args, **kargs): - return self.data.islower(*args, **kargs) - - def isnumeric(self, *args, **kargs): - return self.data.isnumeric(*args, **kargs) - - def isspace(self, *args, **kargs): - return self.data.isspace(*args, **kargs) - - def istitle(self, *args, **kargs): - return self.data.istitle(*args, **kargs) - - def isupper(self, *args, **kargs): - return self.data.isupper(*args, **kargs) - - def join(self, *args, **kargs): - return self.data.join(*args, **kargs) - - def ljust(self, *args, **kargs): - return self.data.ljust(*args, **kargs) - - def lower(self, *args, **kargs): - return self.data.lower(*args, **kargs) - - def lstrip(self, *args, **kargs): - return self.data.lstrip(*args, **kargs) - - def partition(self, *args, **kargs): - return self.data.partition(*args, **kargs) - - def replace(self, *args, **kargs): - return self.data.replace(*args, **kargs) - - def rfind(self, *args, **kargs): - return self.data.rfind(*args, **kargs) - - def rindex(self, *args, **kargs): - return self.data.rindex(*args, **kargs) - - def rjust(self, *args, **kargs): - return self.data.rjust(*args, **kargs) - - def rpartition(self, *args, **kargs): - return self.data.rpartition(*args, **kargs) +class Himo(unicode): + """ + A unicode-string object with some added features to help with + output formatting. Himo means rope or string in Japanese, hence + the string to Himo connection. - def rsplit(self, *args, **kargs): - return self.data.rsplit(*args, **kargs) + """ - def rstrip(self, *args, **kargs): - return self.data.rstrip(*args, **kargs) + MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'} - def split(self, *args, **kargs): - return self.data.split(*args, **kargs) + def html(self): + """ + Returns a unicode string containing this object's value + html enetity encoded. - def splitlines(self, *args, **kargs): - return self.data.splitlines(*args, **kargs) + """ + out = [] + for char in self: + out.append(self._html_char_encode(char)) + return unicode(''.join(out)) - def startswith(self, *args, **kargs): - return self.data.startswith(*args, **kargs) + def _html_char_encode(self, char): + # Returns an html version of the char - def strip(self, *args, **kargs): - return self.data.strip(*args, **kargs) + number = ord(char) + try: + char = "&{0};".format(codepoint2name[number]) + except KeyError: + if number > 127: + char = "&#{0};".format(number) + return char - def swapcase(self, *args, **kargs): - return self.data.swapcase(*args, **kargs) + def decimal(self): + """ + Returns a decimal object with the value of this object - def title(self, *args, **kargs): - return self.data.title(*args, **kargs) + """ - def translate(self, *args, **kargs): - return self.data.translate(*args, **kargs) + return D(self) - def upper(self, *args, **kargs): - return self.data.upper(*args, **kargs) + def ascii(self): + """ + Returns an ascii representation of this object value. + Throws HimoAsciiError if this method was unable to + convert a unicode character down to it's root character. + For example if in your string you have a character + like the letter 'e' but it has an accent mark over it, + this method will convert that character to it's root + character. Thus 'e' with an accent mark over it will + replaced with the regular letter 'e'. - def zfill(self, *args, **kargs): - return self.data.zfill(*args, **kargs) + """ + out = [] + for char in self: + if ord(char) < 127: + out.append(char) + elif ord(char) in self.MAP: + out.append(self.MAP[ord(char)]) + else: + num = unicodedata.decomposition(char).split(' ')[0] + if num: + out.append(unichr(int(num, 16))) + else: + print char + raise HimoAsciiError("Unable to convert 'u{0}' "\ + "character to ascii".format(ord(char))) + return str(''.join(out)) class HimoAsciiError(Exception): pass diff --git a/test/test_tools/test_himo.py b/test/test_tools/test_himo.py new file mode 100644 index 0000000..cb58ca5 --- /dev/null +++ b/test/test_tools/test_himo.py @@ -0,0 +1,107 @@ +import sys +import os +import unittest +path = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', '..')) +sys.path.append(path) +from dodai.tools.himo import String2Himo +from dodai.tools.himo import Himo +from dodai.tools.himo import HimoAsciiError +from decimal import Decimal as D + +class TestString2Himo(unittest.TestCase): + + def setUp(self): + self.string_to_himo_one = String2Himo() + self.string_to_himo_two = String2Himo('unicode_escape') + + def test_regular_string_one(self): + obj = self.string_to_himo_one(str('abcd')) + self.assertTrue(isinstance(obj, Himo)) + + def test_regular_string_two(self): + test = str('abcd') + obj = self.string_to_himo_one(test) + self.assertTrue(obj == test) + + def test_regular_string_three(self): + obj = self.string_to_himo_two(str('abcd')) + self.assertTrue(isinstance(obj, Himo)) + + def test_regular_string_four(self): + test = str('abcd') + obj = self.string_to_himo_two(test) + self.assertTrue(obj == test) + + def test_regular_string_five(self): + test = 1 + obj = self.string_to_himo_two(test) + self.assertEqual(int(obj), test) + + def test_decode_html(self): + test = u'\u4e5d\xf2\xe5\xe9' + obj = self.string_to_himo_one(u'九òåé') + self.assertEqual(test, obj) + + def test_decode_chardet(self): + # Korean Text in the EUC-KR character set + kor = '\xb4\xe7\xbd\xc5\xc0\xcc \xc3\xa3\xb4\xc2 \xb8\xf0\xb5\xe7 '\ + '\xbd\xba\xc5\xb8\xc0\xcf, \xbf\xc1\xbc\xc7' + test = '\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\ + '\uc2a4\ud0c0\uc77c, \uc625\uc158'.decode('unicode_escape') + obj = self.string_to_himo_one(kor) + self.assertEqual(obj, test) + + def test_html_decode_one(self): + # Korean Text + kor = '당신이 찾는 모든 '\ + '스타일, 옥션& foo' + test = u'\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\ + u'\uc2a4\ud0c0\uc77c, \uc625\uc158& foo' + obj = self.string_to_himo_one(kor) + self.assertEqual(obj, test) + + def test_html_decode_two(self): + kor = '&ppe;' + obj = self.string_to_himo_one(kor) + self.assertEqual(obj, kor) + + +class TestHimo(unittest.TestCase): + + def setUp(self): + # "Elephants are our brothers" in Japanese. This line comes from + # an episode of a old Japanese TV show called "Koko Ga Hen Da Yo, + # Nihonjin" http://en.wikipedia.org/wiki/Koko_ga_hen_da_yo,_nihonjin + # This line came from an African native who spoke Japanese fluently + self.jp = u'\u8c61\u306f\u79c1\u305f\u3061\u306e\u5144\u5f1f'\ + u'\u3067\u3059' + self.jp_html = u'象は私たちの'\ + u'兄弟です' + + def test_html(self): + obj = Himo(self.jp) + test = obj.html() + self.assertEqual(test, self.jp_html) + + def test_decimal(self): + obj = Himo('23.55') + obj = obj.decimal() + self.assertTrue(isinstance(obj, D)) + self.assertEqual(obj, D('23.55')) + + def test_ascii_one(self): + # "Elephants live in Africa" in spanish + obj = Himo(u'Los elefantes viven en \xc1frica') + test = u'Los elefantes viven en Africa' + self.assertEqual(obj.ascii(), test) + + def test_ascii_two(self): + # "Elephants are our brothers" in French + obj = Himo(u'Les \xe9l\xe9phants sont nos fr\xe8res \xa9') + test = u'Les elephants sont nos freres (C)' + self.assertEqual(obj.ascii(), test) + + def test_ascii_three(self): + + obj = Himo(self.jp) + self.failUnlessRaises(HimoAsciiError, obj.ascii) -- cgit v1.2.3