From c5003706aca7bf32031e848ef57146362bf7e3de Mon Sep 17 00:00:00 2001
From: Six <unknown>
Date: Sat, 10 Apr 2010 22:31:18 -0400
Subject: changed the himo object and added an object used to create himo
 objects

---
 .hgignore                    |   1 +
 dodai/tools/__init__.py      |   1 +
 dodai/tools/himo.py          | 365 +++++++++++--------------------------------
 test/test_tools/test_himo.py | 107 +++++++++++++
 4 files changed, 197 insertions(+), 277 deletions(-)
 create mode 100644 test/test_tools/test_himo.py
diff --git a/.hgignore b/.hgignore
index 592f378..3c866fc 100644
--- a/.hgignore
+++ b/.hgignore
@@ -5,3 +5,4 @@ syntax: glob
 bin
 include
 lib
+.coverage
diff --git a/dodai/tools/__init__.py b/dodai/tools/__init__.py
index c54f7f2..9d2fad7 100644
--- a/dodai/tools/__init__.py
+++ b/dodai/tools/__init__.py
@@ -14,3 +14,4 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with Dodai.  If not, see <http://www.gnu.org/licenses/>.
+
diff --git a/dodai/tools/himo.py b/dodai/tools/himo.py
index 5a96f91..60051a7 100644
--- a/dodai/tools/himo.py
+++ b/dodai/tools/himo.py
@@ -23,115 +23,56 @@ from htmlentitydefs import name2codepoint
 from htmlentitydefs import codepoint2name
 from decimal import Decimal as D
 
-class Himo(object):
+class String2Himo(object):
     """
-    A unicode-string object with some added features to help with
-    unicode decoding and output conversions.
-    """
-
-    MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
-
-    def __init__(self, data, encoding=None):
-        """
-        data:       Accepts any type of string object (int, float,
-                    string, unicode)
-
-        encoding:   Character encoding to help with converting the input
-                    into unicode
+    This is an object that is used for converting python string objects
+    into Himo (unicode) objects.
 
-        The input data will be converted into an unicode object, unless
-        the input data is already an unicode object.  If the param
-        'encoding' is set, the input data will be converted to unicode
-        using that value.  If no 'encoding' is given this object will
-        attempt to figure out the encoding.  First the encoding of the
-        operating system will be used.  If there are any errors, the
-        chardet module will be used.  This object makes no guarantees
-        that the correct encoding will be detected.
+    """
 
+    def __init__(self, default_encoding=None):
         """
+        default_encoding is the encoding value to be used as the
+        classwide default.  If default encoding is not set then
+        the encoding of the system will be used.
 
-        self._encoding = encoding or self._system_encoding()
-        self.data = self._decode(data)
-
-    def ascii(self):
         """
-        Returns an ascii representation of this object value.
-        Throws HimoAsciiError if this method was unable to
-        convert a unicode character down to it's root character.
-        For example if in your string you have a character
-        like the letter 'e' but it has an accent mark over it,
-        this method will convert that character to it's root
-        character.  Thus 'e' with an accent mark over it will
-        replaced with the regular letter 'e'.
+        self.default_encoding = default_encoding or self._system_encoding()
+        self._expression = re.compile(r'&(#?)(x?)(\w+);')
 
+    def __call__(self, data, encoding=None):
         """
-        out = []
-        for char in self.data:
-            if ord(char) < 127:
-                out.append(char)
-            elif ord(char) in self.MAP:
-                out.append(self.MAP[ord(char)])
-            else:
-                num = unicodedata.decomposition(char).split(' ')[0]
-                if num:
-                    out.append(unichr(int(num, 16)))
-                else:
-                    print char
-                    raise HimoAsciiError("Unable to convert 'u{0}' "\
-                                    "character to ascii".format(ord(char)))
-        return str(''.join(out))
-
-    def html(self):
-        """
-        Returns a unicode string containing this object's value
-        html enetity encoded.
-        """
-        out = []
-        for char in self.data:
-            out.append(self._html_char_encode(char))
-        return ''.join(out)
+        Converts the input (data) string object into a Himo object
+        using the passed in (encoding).  If encoding is omitted then
+        the default_encoding will be used.
 
-    def decimal(self):
-        """
-        Returns a decimal object with the value of this object
+        returns a Himo object
 
         """
+        encoding = encoding or self.default_encoding
+        data = self._as_unicode(data, encoding)
+        data = self._decode_html(data)
+        return Himo(data)
 
-        return D(self.data)
-
-    def _decode(self, data):
-        # Returns a unicode string.  If data contains any html encoded
-        # characters, the characters will be converted to their unicode
-        # equivalent
-
-        data = self._as_unicode(data)
-        expression = re.compile(r'&(#?)(x?)(\w+);')
-        return expression.subn(self._html_decode, data)[0]
-
-    def _as_unicode(self, data):
-        # Returns string as a unicode string
+    def _as_unicode(self, data, encoding):
+        # Returns string as a unicode string.
 
         if not isinstance(data, unicode):
             if not isinstance(data, str):
                 data = str(data)
             try:
-                data = data.decode(self._encoding)
+                data = data.decode(encoding)
             except UnicodeDecodeError:
                 info = chardet.detect(data)
-                self.encoding = info['encoding']
                 data = data.decode(info['encoding'])
         return unicodedata.normalize('NFC', data)
 
-    def _html_char_encode(self, char):
-        # Returns an html version of the char
+    def _decode_html(self, data):
+        # Returns a unicode string.  If data contains any html encoded
+        # characters, the characters will be converted to their unicode
+        # equivalent
 
-        number = ord(char)
-        try:
-            char = "&{0};".format(codepoint2name[number])
-        except KeyError:
-            if number > 127:
-                char = "&#{0};".format(number)
-        return char
+        return unicode(self._expression.subn(self._html_decode, data)[0])
 
     def _html_decode(self, values):
         # Returns the unicode character from the re.subn
@@ -151,212 +92,82 @@ class Himo(object):
                 return unichr(char)
 
     def _system_encoding(self):
-        # Returns the character encoding of the system
+        # Returns the character encoding of the operating system
 
-        encoding = sys.getfilesystemencoding()
-        if not encoding:
-            encoding = sys.getdefaultencoding()
+        encoding = sys.getdefaultencoding()
+        filesystem_encoding = sys.getfilesystemencoding()
+        if filesystem_encoding:
+            encoding = filesystem_encoding
         return encoding
 
-    #def __cmp__(self, other):
-    #    if self.__eq__(other):
-    #        return 1
-    #    else:
-    #        pool = [str(self.data), str(other)]
-    #        pool.sort()
-    #        if pool[0] == self.data:
-    #            return -1
-    #        else:
-    #            return 1
-
-
-    def _is_himo(self, other):
-        if hasattr(other, '_is_himo'):
-            return True
-        return False
-
-    def __len__(self):
-        return len(self.data)
-
-    def __repr__(self):
-        return repr(self.data)
-
-    def __str__(self):
-        return self.data.encode(self._encoding)
-
-    def __iter__(self):
-        for char in self.data:
-            yield char
-
-    def __int__(self):
-        return int(self.data)
-
-    def __float__(self):
-        return float(self.data)
-
-    def __eq__(self, other):
-        if self._is_himo(other):
-            other = other.data
-        return self.data.__eq__(other)
-
-    def __ne__(self, other):
-        if self._is_himo(other):
-            other = other.data
-        return self.data.__ne__(other)
-
-    def __gt__(self, other):
-        if self._is_himo(other):
-            other = other.data
-        lines = [self.data, other]
-        lines.sort()
-        if lines[0] == self.data:
-            return True
-        else:
-            return False
-
-    def __lt__(self, other):
-        if self._is_himo(other):
-            other = other.data
-        lines = [self.data, other]
-        lines.sort()
-        if lines[0] != self.data:
-            return True
-        else:
-            return False
-
-    def __cmp__(self, other):
-        if self.__eq__(other):
-            return 0
-        elif self.__lt__(other):
-            return -1
-        else:
-            return 1
-
-    def __unicode__(self):
-        return self.data
 
-    def capitalize(self, *args, **kargs):
-        return self.data.capitalize(*args, **kargs)
-
-    def center(self, *args, **kargs):
-        return self.data.center(*args, **kargs)
-
-    def count(self, *args, **kargs):
-        return self.data.count(*args, **kargs)
-
-    def decode(self, *args, **kargs):
-        return self.data.decode(*args, **kargs)
-
-    def encode(self, *args, **kargs):
-        return self.data.encode(*args, **kargs)
-
-    def encode(self, *args, **kargs):
-        return self.data.encode(*args, **kargs)
-
-    def endswith(self, *args, **kargs):
-        return self.data.endswith(*args, **kargs)
-
-    def expandtabs(self, *args, **kargs):
-        return self.data.expandtabs(*args, **kargs)
-
-    def find(self, *args, **kargs):
-        return self.data.find(*args, **kargs)
-
-    def format(self, *args, **kargs):
-        return self.data.format(*args, **kargs)
-
-    def index(self, *args, **kargs):
-        return self.data.index(*args, **kargs)
-
-    def isalnum(self, *args, **kargs):
-        return self.data.isalnum(*args, **kargs)
-
-    def isalpha(self, *args, **kargs):
-        return self.data.isalpha(*args, **kargs)
-
-    def isdecimal(self, *args, **kargs):
-        return self.data.isdecimal(*args, **kargs)
-
-    def isdigit(self, *args, **kargs):
-        return self.data.isdigit(*args, **kargs)
-
-    def islower(self, *args, **kargs):
-        return self.data.islower(*args, **kargs)
-
-    def isnumeric(self, *args, **kargs):
-        return self.data.isnumeric(*args, **kargs)
-
-    def isspace(self, *args, **kargs):
-        return self.data.isspace(*args, **kargs)
-
-    def istitle(self, *args, **kargs):
-        return self.data.istitle(*args, **kargs)
-
-    def isupper(self, *args, **kargs):
-        return self.data.isupper(*args, **kargs)
-
-    def join(self, *args, **kargs):
-        return self.data.join(*args, **kargs)
-
-    def ljust(self, *args, **kargs):
-        return self.data.ljust(*args, **kargs)
-
-    def lower(self, *args, **kargs):
-        return self.data.lower(*args, **kargs)
-
-    def lstrip(self, *args, **kargs):
-        return self.data.lstrip(*args, **kargs)
-
-    def partition(self, *args, **kargs):
-        return self.data.partition(*args, **kargs)
-
-    def replace(self, *args, **kargs):
-        return self.data.replace(*args, **kargs)
-
-    def rfind(self, *args, **kargs):
-        return self.data.rfind(*args, **kargs)
-
-    def rindex(self, *args, **kargs):
-        return self.data.rindex(*args, **kargs)
-
-    def rjust(self, *args, **kargs):
-        return self.data.rjust(*args, **kargs)
-
-    def rpartition(self, *args, **kargs):
-        return self.data.rpartition(*args, **kargs)
+class Himo(unicode):
+    """
+    A unicode-string object with some added features to help with
+    output formatting.  Himo means rope or string in Japanese, hence
+    the string to Himo connection.
 
-    def rsplit(self, *args, **kargs):
-        return self.data.rsplit(*args, **kargs)
+    """
 
-    def rstrip(self, *args, **kargs):
-        return self.data.rstrip(*args, **kargs)
+    MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
 
-    def split(self, *args, **kargs):
-        return self.data.split(*args, **kargs)
+    def html(self):
+        """
+        Returns a unicode string containing this object's value
+        html entity encoded.
 
-    def splitlines(self, *args, **kargs):
-        return self.data.splitlines(*args, **kargs)
+        """
+        out = []
+        for char in self:
+            out.append(self._html_char_encode(char))
+        return unicode(''.join(out))
 
-    def startswith(self, *args, **kargs):
-        return self.data.startswith(*args, **kargs)
+    def _html_char_encode(self, char):
+        # Returns an html version of the char
 
-    def strip(self, *args, **kargs):
-        return self.data.strip(*args, **kargs)
+        number = ord(char)
+        try:
+            char = "&{0};".format(codepoint2name[number])
+        except KeyError:
+            if number > 127:
+                char = "&#{0};".format(number)
+        return char
 
-    def swapcase(self, *args, **kargs):
-        return self.data.swapcase(*args, **kargs)
+    def decimal(self):
+        """
+        Returns a decimal object with the value of this object
 
-    def title(self, *args, **kargs):
-        return self.data.title(*args, **kargs)
+        """
 
-    def translate(self, *args, **kargs):
-        return self.data.translate(*args, **kargs)
+        return D(self)
 
-    def upper(self, *args, **kargs):
-        return self.data.upper(*args, **kargs)
+    def ascii(self):
+        """
+        Returns an ascii representation of this object's value.
+        Throws HimoAsciiError if this method was unable to
+        convert a unicode character down to its root character.
+        For example if in your string you have a character
+        like the letter 'e' but it has an accent mark over it,
+        this method will convert that character to its root
+        character.  Thus 'e' with an accent mark over it will
+        be replaced with the regular letter 'e'.
 
-    def zfill(self, *args, **kargs):
-        return self.data.zfill(*args, **kargs)
+        """
+        out = []
+        for char in self:
+            if ord(char) < 127:
+                out.append(char)
+            elif ord(char) in self.MAP:
+                out.append(self.MAP[ord(char)])
+            else:
+                num = unicodedata.decomposition(char).split(' ')[0]
+                if num:
+                    out.append(unichr(int(num, 16)))
+                else:
+                    print char
+                    raise HimoAsciiError("Unable to convert 'u{0}' "\
+                                    "character to ascii".format(ord(char)))
+        return str(''.join(out))
 
 class HimoAsciiError(Exception):
     pass
diff --git a/test/test_tools/test_himo.py b/test/test_tools/test_himo.py
new file mode 100644
index 0000000..cb58ca5
--- /dev/null
+++ b/test/test_tools/test_himo.py
@@ -0,0 +1,107 @@
+import sys
+import os
+import unittest
+path = os.path.realpath(os.path.join(os.path.dirname(__file__), '..', '..'))
+sys.path.append(path)
+from dodai.tools.himo import String2Himo
+from dodai.tools.himo import Himo
+from dodai.tools.himo import HimoAsciiError
+from decimal import Decimal as D
+
+class TestString2Himo(unittest.TestCase):
+
+    def setUp(self):
+        self.string_to_himo_one = String2Himo()
+        self.string_to_himo_two = String2Himo('unicode_escape')
+
+    def test_regular_string_one(self):
+        obj = self.string_to_himo_one(str('abcd'))
+        self.assertTrue(isinstance(obj, Himo))
+
+    def test_regular_string_two(self):
+        test = str('abcd')
+        obj = self.string_to_himo_one(test)
+        self.assertTrue(obj == test)
+
+    def test_regular_string_three(self):
+        obj = self.string_to_himo_two(str('abcd'))
+        self.assertTrue(isinstance(obj, Himo))
+
+    def test_regular_string_four(self):
+        test = str('abcd')
+        obj = self.string_to_himo_two(test)
+        self.assertTrue(obj == test)
+
+    def test_regular_string_five(self):
+        test = 1
+        obj = self.string_to_himo_two(test)
+        self.assertEqual(int(obj), test)
+
+    def test_decode_html(self):
+        test = u'\u4e5d\xf2\xe5\xe9'
+        obj = self.string_to_himo_one(u'&#20061;&ograve;&aring;&eacute;')
+        self.assertEqual(test, obj)
+
+    def test_decode_chardet(self):
+        # Korean Text in the EUC-KR character set
+        kor = '\xb4\xe7\xbd\xc5\xc0\xcc \xc3\xa3\xb4\xc2 \xb8\xf0\xb5\xe7 '\
+               '\xbd\xba\xc5\xb8\xc0\xcf, \xbf\xc1\xbc\xc7'
+        test = '\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\
+               '\uc2a4\ud0c0\uc77c, \uc625\uc158'.decode('unicode_escape')
+        obj = self.string_to_himo_one(kor)
+        self.assertEqual(obj, test)
+
+    def test_html_decode_one(self):
+        # Korean Text
+        kor = '&#xb2f9;&#49888;&#51060; &#52286;&#45716; &#47784;&#46304; '\
+              '&#49828;&#53440;&#51068;, &#50725;&#49496;&amp; foo'
+        test = u'\ub2f9\uc2e0\uc774 \ucc3e\ub294 \ubaa8\ub4e0 '\
+               u'\uc2a4\ud0c0\uc77c, \uc625\uc158& foo'
+        obj = self.string_to_himo_one(kor)
+        self.assertEqual(obj, test)
+
+    def test_html_decode_two(self):
+        kor = '&ampppe;'
+        obj = self.string_to_himo_one(kor)
+        self.assertEqual(obj, kor)
+
+
+class TestHimo(unittest.TestCase):
+
+    def setUp(self):
+        # "Elephants are our brothers" in Japanese.  This line comes from
+        # an episode of an old Japanese TV show called "Koko Ga Hen Da Yo,
+        # Nihonjin" http://en.wikipedia.org/wiki/Koko_ga_hen_da_yo,_nihonjin
+        # This line came from an African native who spoke Japanese fluently
+        self.jp = u'\u8c61\u306f\u79c1\u305f\u3061\u306e\u5144\u5f1f'\
+                  u'\u3067\u3059'
+        self.jp_html = u'&#35937;&#12399;&#31169;&#12383;&#12385;&#12398;'\
+                       u'&#20804;&#24351;&#12391;&#12377;'
+
+    def test_html(self):
+        obj = Himo(self.jp)
+        test = obj.html()
+        self.assertEqual(test, self.jp_html)
+
+    def test_decimal(self):
+        obj = Himo('23.55')
+        obj = obj.decimal()
+        self.assertTrue(isinstance(obj, D))
+        self.assertEqual(obj, D('23.55'))
+
+    def test_ascii_one(self):
+        # "Elephants live in Africa" in Spanish
+        obj = Himo(u'Los elefantes viven en \xc1frica')
+        test = u'Los elefantes viven en Africa'
+        self.assertEqual(obj.ascii(), test)
+
+    def test_ascii_two(self):
+        # "Elephants are our brothers" in French
+        obj = Himo(u'Les \xe9l\xe9phants sont nos fr\xe8res \xa9')
+        test = u'Les elephants sont nos freres (C)'
+        self.assertEqual(obj.ascii(), test)
+
+    def test_ascii_three(self):
+
+        obj = Himo(self.jp)
+        self.failUnlessRaises(HimoAsciiError, obj.ascii)
-- 
cgit v1.2.3