changed the himo object and added an object used to create himo objects

author: Six <unknown> 2010-04-10 22:31:18 -0400
committer: Six <unknown> 2010-04-10 22:31:18 -0400
commit: c5003706aca7bf32031e848ef57146362bf7e3de (patch)
tree: 7aaf2cdba6e80eb94496901ef48e17a2e7aaf8cd
parent: 65302b6621bc44b49682efde4522e131951bfd5f (diff)
download: dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.tar.bz2
dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.tar.xz
dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.zip
4 files changed, 197 insertions, 277 deletions
diff --git a/.hgignore b/.hgignore
index 592f378..3c866fc 100644
--- a/.hgignore
+++ b/.hgignore
@@ -5,3 +5,4 @@ syntax: glob
 bin
 include
 lib
+.coverage
diff --git a/dodai/tools/__init__.py b/dodai/tools/__init__.py
index c54f7f2..9d2fad7 100644
--- a/dodai/tools/__init__.py
+++ b/dodai/tools/__init__.py
@@ -14,3 +14,4 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with Dodai.  If not, see <http://www.gnu.org/licenses/>.
diff --git a/dodai/tools/himo.py b/dodai/tools/himo.py
index 5a96f91..60051a7 100644
--- a/dodai/tools/himo.py
+++ b/dodai/tools/himo.py
@@ -23,115 +23,56 @@ from htmlentitydefs import name2codepoint
 from htmlentitydefs import codepoint2name
 from decimal import Decimal as D
-class Himo(object):
+class String2Himo(object):
    """
-    A unicode-string object with some added features to help with
+    This is an object that is used for converting python string objects
-    unicode decoding and output conversions.
+    into Himo (unicode) objects.
-    """
-    MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
-    def __init__(self, data, encoding=None):
-        """
-        data:       Accepts any type of string object (int, float,
-                    string, unicode)
-        encoding:   Character encoding to help with converting the input
-                    into unicode
-        The input data will be converted into an unicode object, unless
+    """
-        the input data is already an unicode object.  If the param
-        'encoding' is set, the input data will be converted to unicode
-        using that value.  If no 'encoding' is given this object will
-        attempt to figure out the encoding.  First the encoding of the
-        operating system will be used.  If there are any errors, the
-        chardet module will be used.  This object makes no guarantees
-        that the correct encoding will be detected.
+    def __init__(self, default_encoding=None):
        """
+        default_encoding is the encoding value to be used as the
+        classwide default.  If default encoding is not set then
+        the encoding of the system will be used.
-        self._encoding = encoding or self._system_encoding()
-        self.data = self._decode(data)
-    def ascii(self):
        """
-        Returns an ascii representation of this object value.
+        self.default_encoding = default_encoding or self._system_encoding()
-        Throws HimoAsciiError if this method was unable to
+        self._expression = re.compile(r'&(#?)(x?)(\w+);')
-        convert a unicode character down to it's root character.
-        For example if in your string you have a character
-        like the letter 'e' but it has an accent mark over it,
-        this method will convert that character to it's root
-        character.  Thus 'e' with an accent mark over it will
-        replaced with the regular letter 'e'.
+    def __call__(self, data, encoding=None):
        """
-        out = []
+        Converts the input (data) string object into a Himo object
-        for char in self.data:
+        using the passed in (encoding).  If encoding is omitted then
-            if ord(char) < 127:
+        the default_encoding will be used.
-                out.append(char)
-            elif ord(char) in self.MAP:
-                out.append(self.MAP[ord(char)])
-            else:
-                num = unicodedata.decomposition(char).split(' ')[0]
-                if num:
-                    out.append(unichr(int(num, 16)))
-                else:
-                    print char
-                    raise HimoAsciiError("Unable to convert 'u{0}' "\
-                                    "character to ascii".format(ord(char)))
-        return str(''.join(out))
-    def html(self):
-        """
-        Returns a unicode string containing this object's value
-        html enetity encoded.
-        """
-        out = []
-        for char in self.data:
-            out.append(self._html_char_encode(char))
-        return ''.join(out)
-    def decimal(self):
+        returns a Himo object
-        """
-        Returns a decimal object with the value of this object
        """
+        encoding = encoding or self.default_encoding
+        data = self._as_unicode(data, encoding)
+        data = self._decode_html(data)
+        return Himo(data)
-        return D(self.data)
+    def _as_unicode(self, data, encoding):
+        # Returns string as a unicode string.
-    def _decode(self, data):
-        # Returns a unicode string.  If data contains any html encoded
-        # characters, the characters will be converted to their unicode
-        # equivalent
-        data = self._as_unicode(data)
-        expression = re.compile(r'&(#?)(x?)(\w+);')
-        return expression.subn(self._html_decode, data)[0]
-    def _as_unicode(self, data):
-        # Returns string as a unicode string
        if not isinstance(data, unicode):
            if not isinstance(data, str):
                data = str(data)
            try:
-                data = data.decode(self._encoding)
+                data = data.decode(encoding)
            except UnicodeDecodeError:
                info = chardet.detect(data)
-                self.encoding = info['encoding']
                data = data.decode(info['encoding'])
        return unicodedata.normalize('NFC', data)
-    def _html_char_encode(self, char):
+    def _decode_html(self, data):
-        # Returns an html version of the char
+        # Returns a unicode string.  If data contains any html encoded
+        # characters, the characters will be converted to their unicode
+        # equivalent
-        number = ord(char)
+        return unicode(self._expression.subn(self._html_decode, data)[0])
-        try:
-            char = "&{0};".format(codepoint2name[number])
-        except KeyError:
-            if number > 127:
-                char = "&#{0};".format(number)
-        return char
    def _html_decode(self, values):
        # Returns the unicode character from the re.subn
@@ -151,212 +92,82 @@ class Himo(object):
                return unichr(char)
    def _system_encoding(self):
-        # Returns the character encoding of the system
+        # Returns the character encoding of the operating system
-        encoding = sys.getfilesystemencoding()
+        encoding = sys.getdefaultencoding()
-        if not encoding:
+        filesystem_encoding = sys.getfilesystemencoding()
-            encoding = sys.getdefaultencoding()
+        if filesystem_encoding:
+            encoding = filesystem_encoding
        return encoding
-    #def __cmp__(self, other):
-    #    if self.__eq__(other):
-    #        return 1
-    #    else:
-    #        pool = [str(self.data), str(other)]
-    #        pool.sort()
-    #        if pool[0] == self.data:
-    #            return -1
-    #        else:
-    #            return 1
-    def _is_himo(self, other):
-        if hasattr(other, '_is_himo'):
-            return True
-        return False
-    def __len__(self):
-        return len(self.data)
-    def __repr__(self):
-        return repr(self.data)
-    def __str__(self):
-        return self.data.encode(self._encoding)
-    def __iter__(self):
-        for char in self.data:
-            yield char
-    def __int__(self):
-        return int(self.data)
-    def __float__(self):
-        return float(self.data)
-    def __eq__(self, other):
-        if self._is_himo(other):
-            other = other.data
-        return self.data.__eq__(other)
-    def __ne__(self, other):
-        if self._is_himo(other):
-            other = other.data
-        return self.data.__ne__(other)
-    def __gt__(self, other):
-        if self._is_himo(other):
-            other = other.data
-        lines = [self.data, other]
-        lines.sort()
-        if lines[0] == self.data:
-            return True
-        else:
-            return False
-    def __lt__(self, other):
-        if self._is_himo(other):
-            other = other.data
-        lines = [self.data, other]
-        lines.sort()
-        if lines[0] != self.data:
-            return True
-        else:
-            return False
-    def __cmp__(self, other):
-        if self.__eq__(other):
-            return 0
-        elif self.__lt__(other):
-            return -1
-        else:
-            return 1
-    def __unicode__(self):
-        return self.data
-    def capitalize(self, *args, **kargs):
+class Himo(unicode):
-        return self.data.capitalize(*args, **kargs)
+    """
+    A unicode-string object with some added features to help with
-    def center(self, *args, **kargs):
+    output formatting.  Himo means rope or string in Japanese, hence
-        return self.data.center(*args, **kargs)
+    the string to Himo connection.
-    def count(self, *args, **kargs):
-        return self.data.count(*args, **kargs)
-    def decode(self, *args, **kargs):
-        return self.data.decode(*args, **kargs)
-    def encode(self, *args, **kargs):
-        return self.data.encode(*args, **kargs)
-    def encode(self, *args, **kargs):
-        return self.data.encode(*args, **kargs)
-    def endswith(self, *args, **kargs):
-        return self.data.endswith(*args, **kargs)
-    def expandtabs(self, *args, **kargs):
-        return self.data.expandtabs(*args, **kargs)
-    def find(self, *args, **kargs):
-        return self.data.find(*args, **kargs)
-    def format(self, *args, **kargs):
-        return self.data.format(*args, **kargs)
-    def index(self, *args, **kargs):
-        return self.data.index(*args, **kargs)
-    def isalnum(self, *args, **kargs):
-        return self.data.isalnum(*args, **kargs)
-    def isalpha(self, *args, **kargs):
-        return self.data.isalpha(*args, **kargs)
-    def isdecimal(self, *args, **kargs):
-        return self.data.isdecimal(*args, **kargs)
-    def isdigit(self, *args, **kargs):
-        return self.data.isdigit(*args, **kargs)
-    def islower(self, *args, **kargs):
-        return self.data.islower(*args, **kargs)
-    def isnumeric(self, *args, **kargs):
-        return self.data.isnumeric(*args, **kargs)
-    def isspace(self, *args, **kargs):
-        return self.data.isspace(*args, **kargs)
-    def istitle(self, *args, **kargs):
-        return self.data.istitle(*args, **kargs)
-    def isupper(self, *args, **kargs):
-        return self.data.isupper(*args, **kargs)
-    def join(self, *args, **kargs):
-        return self.data.join(*args, **kargs)
-    def ljust(self, *args, **kargs):
-        return self.data.ljust(*args, **kargs)
-    def lower(self, *args, **kargs):
-        return self.data.lower(*args, **kargs)
-    def lstrip(self, *args, **kargs):
-        return self.data.lstrip(*args, **kargs)
-    def partition(self, *args, **kargs):
-        return self.data.partition(*args, **kargs)
-    def replace(self, *args, **kargs):
-        return self.data.replace(*args, **kargs)
-    def rfind(self, *args, **kargs):
-        return self.data.rfind(*args, **kargs)
-    def rindex(self, *args, **kargs):
-        return self.data.rindex(*args, **kargs)
-    def rjust(self, *args, **kargs):
-        return self.data.rjust(*args, **kargs)
-    def rpartition(self, *args, **kargs):
-        return self.data.rpartition(*args, **kargs)
-    def rsplit(self, *args, **kargs):
+    """
-        return self.data.rsplit(*args, **kargs)
-    def rstrip(self, *args, **kargs):
+    MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
-        return self.data.rstrip(*args, **kargs)
-    def split(self, *args, **kargs):
+    def html(self):
-        return self.data.split(*args, **kargs)
+        """
+        Returns a unicode string containing this object's value
+        html entity encoded.
-    def splitlines(self, *args, **kargs):
+        """
-        return self.data.splitlines(*args, **kargs)
+        out = []
+        for char in self:
+            out.append(self._html_char_encode(char))
+        return unicode(''.join(out))
-    def startswith(self, *args, **kargs):
+    def _html_char_encode(self, char):
-        return self.data.startswith(*args, **kargs)
+        # Returns an html version of the char
-    def strip(self, *args, **kargs):
+        number = ord(char)
-        return self.data.strip(*args, **kargs)
+        try:
+            char = "&{0};".format(codepoint2name[number])
+        except KeyError:
+            if number > 127:
+                char = "&#{0};".format(number)
+        return char
-    def swapcase(self, *args, **kargs):
+    def decimal(self):
-        return self.data.swapcase(*args, **kargs)
+        """
+        Returns a decimal object with the value of this object
-    def title(self, *args, **kargs):
+        """
-        return self.data.title(*args, **kargs)
-    def translate(self, *args, **kargs):
+        return D(self)
-        return self.data.translate(*args, **kargs)
-    def upper(self, *args, **kargs):
+    def ascii(self):
-        return self.data.upper(*args, **kargs)
+        """
+        Returns an ascii representation of this object value.
+        Throws HimoAsciiError if this method was unable to
+        convert a unicode character down to its root character.
+        For example if in your string you have a character
+        like the letter 'e' but it has an accent mark over it,
+        this method will convert that character to its root
+        character.  Thus 'e' with an accent mark over it will
+        be replaced with the regular letter 'e'.
-    def zfill(self, *args, **kargs):
+        """
-        return self.data.zfill(*args, **kargs)
+        out = []
+        for char in self:
+            if ord(char) < 127:
+                out.append(char)
+            elif ord(char) in self.MAP:
+                out.append(self.MAP[ord(char)])
+            else:
+                num = unicodedata.decomposition(char).split(' ')[0]
+                if num:
+                    out.append(unichr(int(num, 16)))
+                else:
+                    print char
+                    raise HimoAsciiError("Unable to convert 'u{0}' "\
+                                    "character to ascii".format(ord(char)))
+        return str(''.join(out))
 class HimoAsciiError(Exception):
    pass
diff --git a/test/test_tools/test_himo.py b/test/test_tools/test_himo.py
new file mode 100644
index 0000000..cb58ca5
--- /dev/null
+++ b/test/test_tools/test_himo.py
@@ -0,0 +1,107 @@
+import sys
+import os
+import unittest
+path = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', '..'))
+sys.path.append(path)
+from dodai.tools.himo import String2Himo
+from dodai.tools.himo import Himo
+from dodai.tools.himo import HimoAsciiError
+from decimal import Decimal as D
+class TestString2Himo(unittest.TestCase):
+    def setUp(self):
+        self.string_to_himo_one = String2Himo()
+        self.string_to_himo_two = String2Himo('unicode_escape')
+    def test_regular_string_one(self):
+        obj = self.string_to_himo_one(str('abcd'))
+        self.assertTrue(isinstance(obj, Himo))
+    def test_regular_string_two(self):
+        test = str('abcd')
+        obj = self.string_to_himo_one(test)
+        self.assertTrue(obj == test)
+    def test_regular_string_three(self):
+        obj = self.string_to_himo_two(str('abcd'))
+        self.assertTrue(isinstance(obj, Himo))
+    def test_regular_string_four(self):
+        test = str('abcd')
+        obj = self.string_to_himo_two(test)
+        self.assertTrue(obj == test)
+    def test_regular_string_five(self):
+        test = 1
+        obj = self.string_to_himo_two(test)
+        self.assertEqual(int(obj), test)
+    def test_decode_html(self):
+        test = u'\u4e5d\xf2\xe5\xe9'
+        obj = self.string_to_himo_one(u'&#20061;&ograve;&aring;&eacute;')
+        self.assertEqual(test, obj)
+    def test_decode_chardet(self):
+        # Korean Text in the EUC-KR character set
+        kor = '\xb4\xe7\xbd\xc5\xc0\xcc \xc3\xa3\xb4\xc2 \xb8\xf0\xb5\xe7 '\
+               '\xbd\xba\xc5\xb8\xc0\xcf, \xbf\xc1\xbc\xc7'
+        test = '\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\
+               '\uc2a4\ud0c0\uc77c, \uc625\uc158'.decode('unicode_escape')
+        obj = self.string_to_himo_one(kor)
+        self.assertEqual(obj, test)
+    def test_html_decode_one(self):
+        # Korean Text
+        kor = '&#xb2f9;&#49888;&#51060; &#52286;&#45716; &#47784;&#46304; '\
+              '&#49828;&#53440;&#51068;, &#50725;&#49496;&amp; foo'
+        test = u'\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\
+               u'\uc2a4\ud0c0\uc77c, \uc625\uc158& foo'
+        obj = self.string_to_himo_one(kor)
+        self.assertEqual(obj, test)
+    def test_html_decode_two(self):
+        kor = '&ampppe;'
+        obj = self.string_to_himo_one(kor)
+        self.assertEqual(obj, kor)
+class TestHimo(unittest.TestCase):
+    def setUp(self):
+        # "Elephants are our brothers" in Japanese.  This line comes from
+        # an episode of an old Japanese TV show called "Koko Ga Hen Da Yo,
+        # Nihonjin" http://en.wikipedia.org/wiki/Koko_ga_hen_da_yo,_nihonjin
+        # This line came from an African native who spoke Japanese fluently
+        self.jp = u'\u8c61\u306f\u79c1\u305f\u3061\u306e\u5144\u5f1f'\
+                  u'\u3067\u3059'
+        self.jp_html = u'&#35937;&#12399;&#31169;&#12383;&#12385;&#12398;'\
+                       u'&#20804;&#24351;&#12391;&#12377;'
+    def test_html(self):
+        obj = Himo(self.jp)
+        test = obj.html()
+        self.assertEqual(test, self.jp_html)
+    def test_decimal(self):
+        obj = Himo('23.55')
+        obj = obj.decimal()
+        self.assertTrue(isinstance(obj, D))
+        self.assertEqual(obj, D('23.55'))
+    def test_ascii_one(self):
+        # "Elephants live in Africa" in Spanish
+        obj = Himo(u'Los elefantes viven en \xc1frica')
+        test = u'Los elefantes viven en Africa'
+        self.assertEqual(obj.ascii(), test)
+    def test_ascii_two(self):
+        # "Elephants are our brothers" in French
+        obj = Himo(u'Les \xe9l\xe9phants sont nos fr\xe8res \xa9')
+        test = u'Les elephants sont nos freres (C)'
+        self.assertEqual(obj.ascii(), test)
+    def test_ascii_three(self):
+        obj = Himo(self.jp)
+        self.failUnlessRaises(HimoAsciiError, obj.ascii)
author	Six <unknown>	2010-04-10 22:31:18 -0400
committer	Six <unknown>	2010-04-10 22:31:18 -0400
commit	c5003706aca7bf32031e848ef57146362bf7e3de (patch)
tree	7aaf2cdba6e80eb94496901ef48e17a2e7aaf8cd
parent	65302b6621bc44b49682efde4522e131951bfd5f (diff)
download	dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.tar.bz2 dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.tar.xz dodai-macsupport-c5003706aca7bf32031e848ef57146362bf7e3de.zip

diff --git a/.hgignore b/.hgignore index 592f378..3c866fc 100644 --- a/.hgignore +++ b/.hgignore
@@ -5,3 +5,4 @@ syntax: glob
5	bin	5	bin
6	include	6	include
7	lib	7	lib
		8	.coverage


diff --git a/dodai/tools/__init__.py b/dodai/tools/__init__.py index c54f7f2..9d2fad7 100644 --- a/dodai/tools/__init__.py +++ b/dodai/tools/__init__.py
@@ -14,3 +14,4 @@
14	#	14	#
15	# You should have received a copy of the GNU General Public License	15	# You should have received a copy of the GNU General Public License
16	# along with Dodai. If not, see <http://www.gnu.org/licenses/>.	16	# along with Dodai. If not, see <http://www.gnu.org/licenses/>.
		17


diff --git a/dodai/tools/himo.py b/dodai/tools/himo.py index 5a96f91..60051a7 100644 --- a/dodai/tools/himo.py +++ b/dodai/tools/himo.py
@@ -23,115 +23,56 @@ from htmlentitydefs import name2codepoint
23	from htmlentitydefs import codepoint2name	23	from htmlentitydefs import codepoint2name
24	from decimal import Decimal as D	24	from decimal import Decimal as D
25		25
26	class Himo(object):	26	class String2Himo(object):
27	"""	27	"""
28	A unicode-string object with some added features to help with	28	This is an object that is used for converting python string objects
29	unicode decoding and output conversions.	29	into Himo (unicode) objects.
30	"""
31
32	MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
33
34	def __init__(self, data, encoding=None):
35	"""
36	data: Accepts any type of string object (int, float,
37	string, unicode)
38
39	encoding: Character encoding to help with converting the input
40	into unicode
41		30
42	The input data will be converted into an unicode object, unless	31	"""
43	the input data is already an unicode object. If the param
44	'encoding' is set, the input data will be converted to unicode
45	using that value. If no 'encoding' is given this object will
46	attempt to figure out the encoding. First the encoding of the
47	operating system will be used. If there are any errors, the
48	chardet module will be used. This object makes no guarantees
49	that the correct encoding will be detected.
50		32
		33	def __init__(self, default_encoding=None):
51	"""	34	"""
		35	default_encoding is the encoding value to be used as the
		36	classwide default. If default encoding is not set then
		37	the encoding of the system will be used.
52		38
53	self._encoding = encoding or self._system_encoding()
54	self.data = self._decode(data)
55
56	def ascii(self):
57	"""	39	"""
58	Returns an ascii representation of this object value.	40	self.default_encoding = default_encoding or self._system_encoding()
59	Throws HimoAsciiError if this method was unable to	41	self._expression = re.compile(r'&(#?)(x?)(\w+);')
60	convert a unicode character down to it's root character.
61	For example if in your string you have a character
62	like the letter 'e' but it has an accent mark over it,
63	this method will convert that character to it's root
64	character. Thus 'e' with an accent mark over it will
65	replaced with the regular letter 'e'.
66		42
		43	def __call__(self, data, encoding=None):
67	"""	44	"""
68	out = []	45	Converts the input (data) string object into a Himo object
69	for char in self.data:	46	using the passed in (encoding). If encoding is omitted then
70	if ord(char) < 127:	47	the default_encoding will be used.
71	out.append(char)
72	elif ord(char) in self.MAP:
73	out.append(self.MAP[ord(char)])
74	else:
75	num = unicodedata.decomposition(char).split(' ')[0]
76	if num:
77	out.append(unichr(int(num, 16)))
78	else:
79	print char
80	raise HimoAsciiError("Unable to convert 'u{0}' "\
81	"character to ascii".format(ord(char)))
82	return str(''.join(out))
83
84	def html(self):
85	"""
86	Returns a unicode string containing this object's value
87	html enetity encoded.
88	"""
89	out = []
90	for char in self.data:
91	out.append(self._html_char_encode(char))
92	return ''.join(out)
93		48
94	def decimal(self):	49	returns a Himo object
95	"""
96	Returns a decimal object with the value of this object
97		50
98	"""	51	"""
		52	encoding = encoding or self.default_encoding
		53	data = self._as_unicode(data, encoding)
		54	data = self._decode_html(data)
		55	return Himo(data)
99		56
100	return D(self.data)	57	def _as_unicode(self, data, encoding):
101		58	# Returns string as a unicode string.
102	def _decode(self, data):
103	# Returns a unicode string. If data contains any html encoded
104	# characters, the characters will be converted to their unicode
105	# equivalent
106
107	data = self._as_unicode(data)
108	expression = re.compile(r'&(#?)(x?)(\w+);')
109	return expression.subn(self._html_decode, data)[0]
110
111	def _as_unicode(self, data):
112	# Returns string as a unicode string
113		59
114	if not isinstance(data, unicode):	60	if not isinstance(data, unicode):
115	if not isinstance(data, str):	61	if not isinstance(data, str):
116	data = str(data)	62	data = str(data)
117	try:	63	try:
118	data = data.decode(self._encoding)	64	data = data.decode(encoding)
119	except UnicodeDecodeError:	65	except UnicodeDecodeError:
120	info = chardet.detect(data)	66	info = chardet.detect(data)
121	self.encoding = info['encoding']
122	data = data.decode(info['encoding'])	67	data = data.decode(info['encoding'])
123	return unicodedata.normalize('NFC', data)	68	return unicodedata.normalize('NFC', data)
124		69
125	def _html_char_encode(self, char):	70	def _decode_html(self, data):
126	# Returns an html version of the char	71	# Returns a unicode string. If data contains any html encoded
		72	# characters, the characters will be converted to their unicode
		73	# equivalent
127		74
128	number = ord(char)	75	return unicode(self._expression.subn(self._html_decode, data)[0])
129	try:
130	char = "&{0};".format(codepoint2name[number])
131	except KeyError:
132	if number > 127:
133	char = "&#{0};".format(number)
134	return char
135		76
136	def _html_decode(self, values):	77	def _html_decode(self, values):
137	# Returns the unicode character from the re.subn	78	# Returns the unicode character from the re.subn
@@ -151,212 +92,82 @@ class Himo(object):
151	return unichr(char)	92	return unichr(char)
152		93
153	def _system_encoding(self):	94	def _system_encoding(self):
154	# Returns the character encoding of the system	95	# Returns the character encoding of the operating system
155		96
156	encoding = sys.getfilesystemencoding()	97	encoding = sys.getdefaultencoding()
157	if not encoding:	98	filesystem_encoding = sys.getfilesystemencoding()
158	encoding = sys.getdefaultencoding()	99	if filesystem_encoding:
		100	encoding = filesystem_encoding
159	return encoding	101	return encoding
160		102
161	#def __cmp__(self, other):
162	# if self.__eq__(other):
163	# return 1
164	# else:
165	# pool = [str(self.data), str(other)]
166	# pool.sort()
167	# if pool[0] == self.data:
168	# return -1
169	# else:
170	# return 1
171
172
173	def _is_himo(self, other):
174	if hasattr(other, '_is_himo'):
175	return True
176	return False
177
178	def __len__(self):
179	return len(self.data)
180
181	def __repr__(self):
182	return repr(self.data)
183
184	def __str__(self):
185	return self.data.encode(self._encoding)
186
187	def __iter__(self):
188	for char in self.data:
189	yield char
190
191	def __int__(self):
192	return int(self.data)
193
194	def __float__(self):
195	return float(self.data)
196
197	def __eq__(self, other):
198	if self._is_himo(other):
199	other = other.data
200	return self.data.__eq__(other)
201
202	def __ne__(self, other):
203	if self._is_himo(other):
204	other = other.data
205	return self.data.__ne__(other)
206
207	def __gt__(self, other):
208	if self._is_himo(other):
209	other = other.data
210	lines = [self.data, other]
211	lines.sort()
212	if lines[0] == self.data:
213	return True
214	else:
215	return False
216
217	def __lt__(self, other):
218	if self._is_himo(other):
219	other = other.data
220	lines = [self.data, other]
221	lines.sort()
222	if lines[0] != self.data:
223	return True
224	else:
225	return False
226
227	def __cmp__(self, other):
228	if self.__eq__(other):
229	return 0
230	elif self.__lt__(other):
231	return -1
232	else:
233	return 1
234
235	def __unicode__(self):
236	return self.data
237		103
238	def capitalize(self, args, *kargs):	104	class Himo(unicode):
239	return self.data.capitalize(args, *kargs)	105	"""
240		106	A unicode-string object with some added features to help with
241	def center(self, args, *kargs):	107	output formatting. Himo means rope or string in Japanese, hence
242	return self.data.center(args, *kargs)	108	the string to Himo connection.
243
244	def count(self, args, *kargs):
245	return self.data.count(args, *kargs)
246
247	def decode(self, args, *kargs):
248	return self.data.decode(args, *kargs)
249
250	def encode(self, args, *kargs):
251	return self.data.encode(args, *kargs)
252
253	def encode(self, args, *kargs):
254	return self.data.encode(args, *kargs)
255
256	def endswith(self, args, *kargs):
257	return self.data.endswith(args, *kargs)
258
259	def expandtabs(self, args, *kargs):
260	return self.data.expandtabs(args, *kargs)
261
262	def find(self, args, *kargs):
263	return self.data.find(args, *kargs)
264
265	def format(self, args, *kargs):
266	return self.data.format(args, *kargs)
267
268	def index(self, args, *kargs):
269	return self.data.index(args, *kargs)
270
271	def isalnum(self, args, *kargs):
272	return self.data.isalnum(args, *kargs)
273
274	def isalpha(self, args, *kargs):
275	return self.data.isalpha(args, *kargs)
276
277	def isdecimal(self, args, *kargs):
278	return self.data.isdecimal(args, *kargs)
279
280	def isdigit(self, args, *kargs):
281	return self.data.isdigit(args, *kargs)
282
283	def islower(self, args, *kargs):
284	return self.data.islower(args, *kargs)
285
286	def isnumeric(self, args, *kargs):
287	return self.data.isnumeric(args, *kargs)
288
289	def isspace(self, args, *kargs):
290	return self.data.isspace(args, *kargs)
291
292	def istitle(self, args, *kargs):
293	return self.data.istitle(args, *kargs)
294
295	def isupper(self, args, *kargs):
296	return self.data.isupper(args, *kargs)
297
298	def join(self, args, *kargs):
299	return self.data.join(args, *kargs)
300
301	def ljust(self, args, *kargs):
302	return self.data.ljust(args, *kargs)
303
304	def lower(self, args, *kargs):
305	return self.data.lower(args, *kargs)
306
307	def lstrip(self, args, *kargs):
308	return self.data.lstrip(args, *kargs)
309
310	def partition(self, args, *kargs):
311	return self.data.partition(args, *kargs)
312
313	def replace(self, args, *kargs):
314	return self.data.replace(args, *kargs)
315
316	def rfind(self, args, *kargs):
317	return self.data.rfind(args, *kargs)
318
319	def rindex(self, args, *kargs):
320	return self.data.rindex(args, *kargs)
321
322	def rjust(self, args, *kargs):
323	return self.data.rjust(args, *kargs)
324
325	def rpartition(self, args, *kargs):
326	return self.data.rpartition(args, *kargs)
327		109
328	def rsplit(self, args, *kargs):	110	"""
329	return self.data.rsplit(args, *kargs)
330		111
331	def rstrip(self, args, *kargs):	112	MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
332	return self.data.rstrip(args, *kargs)
333		113
334	def split(self, args, *kargs):	114	def html(self):
335	return self.data.split(args, *kargs)	115	"""
		116	Returns a unicode string containing this object's value
		117	html entity encoded.
336		118
337	def splitlines(self, args, *kargs):	119	"""
338	return self.data.splitlines(args, *kargs)	120	out = []
		121	for char in self:
		122	out.append(self._html_char_encode(char))
		123	return unicode(''.join(out))
339		124
340	def startswith(self, args, *kargs):	125	def _html_char_encode(self, char):
341	return self.data.startswith(args, *kargs)	126	# Returns an html version of the char
342		127
343	def strip(self, args, *kargs):	128	number = ord(char)
344	return self.data.strip(args, *kargs)	129	try:
		130	char = "&{0};".format(codepoint2name[number])
		131	except KeyError:
		132	if number > 127:
		133	char = "&#{0};".format(number)
		134	return char
345		135
346	def swapcase(self, args, *kargs):	136	def decimal(self):
347	return self.data.swapcase(args, *kargs)	137	"""
		138	Returns a decimal object with the value of this object
348		139
349	def title(self, args, *kargs):	140	"""
350	return self.data.title(args, *kargs)
351		141
352	def translate(self, args, *kargs):	142	return D(self)
353	return self.data.translate(args, *kargs)
354		143
355	def upper(self, args, *kargs):	144	def ascii(self):
356	return self.data.upper(args, *kargs)	145	"""
		146	Returns an ascii representation of this object value.
		147	Throws HimoAsciiError if this method was unable to
		148	convert a unicode character down to its root character.
		149	For example if in your string you have a character
		150	like the letter 'e' but it has an accent mark over it,
		151	this method will convert that character to its root
		152	character. Thus 'e' with an accent mark over it will
		153	be replaced with the regular letter 'e'.
357		154
358	def zfill(self, args, *kargs):	155	"""
359	return self.data.zfill(args, *kargs)	156	out = []
		157	for char in self:
		158	if ord(char) < 127:
		159	out.append(char)
		160	elif ord(char) in self.MAP:
		161	out.append(self.MAP[ord(char)])
		162	else:
		163	num = unicodedata.decomposition(char).split(' ')[0]
		164	if num:
		165	out.append(unichr(int(num, 16)))
		166	else:
		167	print char
		168	raise HimoAsciiError("Unable to convert 'u{0}' "\
		169	"character to ascii".format(ord(char)))
		170	return str(''.join(out))
360		171
361	class HimoAsciiError(Exception):	172	class HimoAsciiError(Exception):
362	pass	173	pass


diff --git a/test/test_tools/test_himo.py b/test/test_tools/test_himo.py new file mode 100644 index 0000000..cb58ca5 --- /dev/null +++ b/test/test_tools/test_himo.py
@@ -0,0 +1,107 @@
		1	import sys
		2	import os
		3	import unittest
		4	path = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', '..'))
		5	sys.path.append(path)
		6	from dodai.tools.himo import String2Himo
		7	from dodai.tools.himo import Himo
		8	from dodai.tools.himo import HimoAsciiError
		9	from decimal import Decimal as D
		10
		11	class TestString2Himo(unittest.TestCase):
		12
		13	def setUp(self):
		14	self.string_to_himo_one = String2Himo()
		15	self.string_to_himo_two = String2Himo('unicode_escape')
		16
		17	def test_regular_string_one(self):
		18	obj = self.string_to_himo_one(str('abcd'))
		19	self.assertTrue(isinstance(obj, Himo))
		20
		21	def test_regular_string_two(self):
		22	test = str('abcd')
		23	obj = self.string_to_himo_one(test)
		24	self.assertTrue(obj == test)
		25
		26	def test_regular_string_three(self):
		27	obj = self.string_to_himo_two(str('abcd'))
		28	self.assertTrue(isinstance(obj, Himo))
		29
		30	def test_regular_string_four(self):
		31	test = str('abcd')
		32	obj = self.string_to_himo_two(test)
		33	self.assertTrue(obj == test)
		34
		35	def test_regular_string_five(self):
		36	test = 1
		37	obj = self.string_to_himo_two(test)
		38	self.assertEqual(int(obj), test)
		39
		40	def test_decode_html(self):
		41	test = u'\u4e5d\xf2\xe5\xe9'
		42	obj = self.string_to_himo_one(u'九òåé')
		43	self.assertEqual(test, obj)
		44
		45	def test_decode_chardet(self):
		46	# Korean Text in the EUC-KR character set
		47	kor = '\xb4\xe7\xbd\xc5\xc0\xcc \xc3\xa3\xb4\xc2 \xb8\xf0\xb5\xe7 '\
		48	'\xbd\xba\xc5\xb8\xc0\xcf, \xbf\xc1\xbc\xc7'
		49	test = '\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\
		50	'\uc2a4\ud0c0\uc77c, \uc625\uc158'.decode('unicode_escape')
		51	obj = self.string_to_himo_one(kor)
		52	self.assertEqual(obj, test)
		53
		54	def test_html_decode_one(self):
		55	# Korean Text
		56	kor = '당신이 찾는 모든 '\
		57	'스타일, 옥션& foo'
		58	test = u'\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\
		59	u'\uc2a4\ud0c0\uc77c, \uc625\uc158& foo'
		60	obj = self.string_to_himo_one(kor)
		61	self.assertEqual(obj, test)
		62
		63	def test_html_decode_two(self):
		64	kor = '&ampppe;'
		65	obj = self.string_to_himo_one(kor)
		66	self.assertEqual(obj, kor)
		67
		68
		69	class TestHimo(unittest.TestCase):
		70
		71	def setUp(self):
		72	# "Elephants are our brothers" in Japanese. This line comes from
		73	# an episode of a old Japanese TV show called "Koko Ga Hen Da Yo,
		74	# Nihonjin" http://en.wikipedia.org/wiki/Koko_ga_hen_da_yo,_nihonjin
		75	# This line came from an African native who spoke Japanese fluently
		76	self.jp = u'\u8c61\u306f\u79c1\u305f\u3061\u306e\u5144\u5f1f'\
		77	u'\u3067\u3059'
		78	self.jp_html = u'象は私たちの'\
		79	u'兄弟です'
		80
		81	def test_html(self):
		82	obj = Himo(self.jp)
		83	test = obj.html()
		84	self.assertEqual(test, self.jp_html)
		85
		86	def test_decimal(self):
		87	obj = Himo('23.55')
		88	obj = obj.decimal()
		89	self.assertTrue(isinstance(obj, D))
		90	self.assertEqual(obj, D('23.55'))
		91
		92	def test_ascii_one(self):
		93	# "Elephants live in Africa" in spanish
		94	obj = Himo(u'Los elefantes viven en \xc1frica')
		95	test = u'Los elefantes viven en Africa'
		96	self.assertEqual(obj.ascii(), test)
		97
		98	def test_ascii_two(self):
		99	# "Elephants are our brothers" in French
		100	obj = Himo(u'Les \xe9l\xe9phants sont nos fr\xe8res \xa9')
		101	test = u'Les elephants sont nos freres (C)'
		102	self.assertEqual(obj.ascii(), test)
		103
		104	def test_ascii_three(self):
		105
		106	obj = Himo(self.jp)
		107	self.failUnlessRaises(HimoAsciiError, obj.ascii)