1 files changed, 174 insertions, 0 deletions
diff --git a/lib/dodai/tools/himo.py b/lib/dodai/tools/himo.py
new file mode 100644
index 0000000..f56eaf8
--- /dev/null
+++ b/lib/dodai/tools/himo.py
@@ -0,0 +1,174 @@
+# Copyright (C) 2010  Leonard Thomas
+#
+# This file is part of Dodai.
+#
+# Dodai is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Dodai is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Dodai.  If not, see <http://www.gnu.org/licenses/>.
+import chardet
+import re
+import sys
+import unicodedata
+from htmlentitydefs import name2codepoint
+from htmlentitydefs import codepoint2name
+from decimal import Decimal as D
+from dodai.exception import HimoAsciiError
+from dodai.tools import normalize_unicode
class String2Himo(object):
    """
    This is an object that is used for converting python string objects
    into Himo (unicode) objects.

    Calling an instance decodes the input to unicode, replaces html
    character references with the characters they stand for, passes the
    result through normalize_unicode and wraps it in a Himo object.
    """

    def __init__(self, default_encoding=None):
        """
        default_encoding is the encoding value to be used as the
        classwide default.  If default encoding is not set then
        the encoding of the system will be used.
        """
        self.default_encoding = default_encoding or self._system_encoding()
        # Groups: 1 is the '#' of a numeric reference, 2 is an 'x' that
        # directly follows it (or the leading 'x' of an entity name) and
        # 3 is the remaining text up to the ';'.
        self._expression = re.compile(r'&(#?)(x?)(\w+);')

    def __call__(self, data, encoding=None):
        """
        Converts the input (data) string object into a Himo object
        using the passed in (encoding).  If encoding is omitted then
        the default_encoding will be used.

        Objects that are neither str nor unicode are converted with
        str() first.

        returns a Himo object
        """
        encoding = encoding or self.default_encoding
        data = self._as_unicode(data, encoding)
        data = self._decode_html(data)
        data = normalize_unicode(data)
        return Himo(data)

    def _as_unicode(self, data, encoding):
        # Returns data as an NFC normalized unicode string.  Byte strings
        # are decoded with encoding; when that fails chardet is asked to
        # guess the encoding instead.
        if not isinstance(data, unicode):
            if not isinstance(data, str):
                data = str(data)
            try:
                data = data.decode(encoding)
            except UnicodeDecodeError as error:
                guess = chardet.detect(data).get('encoding')
                if not guess:
                    # chardet reports None when it can not guess; raise
                    # the real decoding failure rather than the TypeError
                    # that decode(None) would produce.
                    raise error
                data = data.decode(guess)
        return unicodedata.normalize('NFC', data)

    def _decode_html(self, data):
        # Returns a unicode string.  If data contains any html encoded
        # characters, the characters will be converted to their unicode
        # equivalent
        return unicode(self._expression.sub(self._html_decode, data))

    def _html_decode(self, values):
        # Returns the unicode character for one match of self._expression.
        # A reference that can not be resolved (unknown name, malformed
        # or out of range number) is returned unchanged.
        original = values.group()
        if values.group(1):
            digits = values.group(3)
            base = 10
            if values.group(2):
                base = 16
            elif digits[:1] in ('x', 'X'):
                # group 2 only captures a lowercase 'x', so '&#X41;'
                # arrives here with the marker still on the digits
                digits = digits[1:]
                base = 16
            try:
                return unichr(int(digits, base))
            except (ValueError, OverflowError):
                return original
        # group 2 swallows the first letter of names such as 'xi', so the
        # name has to be put back together before the lookup
        name = values.group(2) + values.group(3)
        try:
            return unichr(name2codepoint[name])
        except KeyError:
            return original

    def _system_encoding(self):
        # Returns the character encoding of the operating system: the
        # file system encoding when python reports one, otherwise
        # python's default encoding.
        return sys.getfilesystemencoding() or sys.getdefaultencoding()
class Himo(unicode):
    """
    A unicode-string object with some added features to help with
    output formatting.  Himo means rope or string in Japanese, hence
    the string to Himo connection.
    """

    # Code points that ascii() replaces with fixed text: the copyright,
    # registered and sound recording copyright signs.
    MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}

    def html(self):
        """
        Returns a unicode string containing this object's value
        html entity encoded.
        """
        encoded = [self._html_char_encode(char) for char in self]
        return unicode(''.join(encoded))

    def _html_char_encode(self, char):
        # Returns an html version of the char: the named entity when
        # codepoint2name has one, a numeric reference for any other
        # character above 127 and the character itself otherwise.
        number = ord(char)
        name = codepoint2name.get(number)
        if name is not None:
            return "&{0};".format(name)
        if number > 127:
            return "&#{0};".format(number)
        return char

    def decimal(self):
        """
        Returns a decimal object with the value of this object
        """
        return D(self)

    def ascii(self):
        """
        Returns an ascii representation of this object's value.

        Ascii characters are kept, the characters listed in MAP are
        replaced by their text and every other character is converted
        down to its root character.  For example if in your string you
        have a character like the letter 'e' but it has an accent mark
        over it, it will be replaced with the regular letter 'e'.

        Throws HimoAsciiError if this method was unable to convert a
        unicode character down to ascii.
        """
        return str(''.join([self._ascii_char(char) for char in self]))

    def _ascii_char(self, char):
        # Returns the ascii text that replaces a single character.
        # Raises HimoAsciiError, carrying the character, when it can not
        # be reduced to ascii.
        number = ord(char)
        if number < 128:
            return char
        if number in self.MAP:
            return self.MAP[number]
        tokens = unicodedata.decomposition(char).split()
        if not tokens:
            raise HimoAsciiError(char)
        if tokens[0].startswith('<'):
            # compatibility mapping: a '<tag>' followed by the code
            # points of the replacement text, all of which are kept
            points = tokens[1:]
        else:
            # plain mapping: only the first code point, the root
            # character, is kept
            points = tokens[:1]
        if not points:
            raise HimoAsciiError(char)
        try:
            # the replacement may itself be non-ascii, so every code
            # point is reduced again
            return ''.join([self._ascii_char(unichr(int(point, 16)))
                            for point in points])
        except (HimoAsciiError, ValueError):
            raise HimoAsciiError(char)
class HimoAsciiError(Exception):
    """
    Raised by Himo.ascii() when a character can not be converted to
    ascii.
    """
    # NOTE(review): this module also imports HimoAsciiError from
    # dodai.exception; this definition rebinds that name -- confirm
    # which of the two is meant to be used.

diff --git a/lib/dodai/tools/himo.py b/lib/dodai/tools/himo.py new file mode 100644 index 0000000..f56eaf8 --- /dev/null +++ b/lib/dodai/tools/himo.py
@@ -0,0 +1,174 @@
	1	# Copyright (C) 2010 Leonard Thomas
	2	#
	3	# This file is part of Dodai.
	4	#
	5	# Dodai is free software: you can redistribute it and/or modify
	6	# it under the terms of the GNU General Public License as published by
	7	# the Free Software Foundation, either version 3 of the License, or
	8	# (at your option) any later version.
	9	#
	10	# Dodai is distributed in the hope that it will be useful,
	11	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	# GNU General Public License for more details.
	14	#
	15	# You should have received a copy of the GNU General Public License
	16	# along with Dodai. If not, see <http://www.gnu.org/licenses/>.
	17
	18	import chardet
	19	import re
	20	import sys
	21	import unicodedata
	22	from htmlentitydefs import name2codepoint
	23	from htmlentitydefs import codepoint2name
	24	from decimal import Decimal as D
	25	from dodai.exception import HimoAsciiError
	26	from dodai.tools import normalize_unicode
	27
	28	class String2Himo(object):
	29	"""
	30	This in an object that is used for converting python string objects
	31	into Himo (unicode) objects.
	32
	33	"""
	34
	35	def __init__(self, default_encoding=None):
	36	"""
	37	default_encoding is the encoding value to be used as the
	38	classwide default. If default encoding is not set then
	39	the encoding of the system will be used.
	40
	41	"""
	42	self.default_encoding = default_encoding or self._system_encoding()
	43	self._expression = re.compile(r'&(#?)(x?)(\w+);')
	44
	45	def __call__(self, data, encoding=None):
	46	"""
	47	Converts the input (data) string object into a Himo object
	48	using the passed in (encoding). If encoding is omitted then
	49	the default_encoding will be used.
	50
	51	returns a Himo object
	52
	53	"""
	54	encoding = encoding or self.default_encoding
	55	data = self._as_unicode(data, encoding)
	56	data = self._decode_html(data)
	57	data = normalize_unicode(data)
	58	return Himo(data)
	59
	60	def _as_unicode(self, data, encoding):
	61	# Returns string as a unicode string.
	62
	63	if not isinstance(data, unicode):
	64	if not isinstance(data, str):
	65	data = str(data)
	66	try:
	67	data = data.decode(encoding)
	68	except UnicodeDecodeError:
	69	info = chardet.detect(data)
	70	data = data.decode(info['encoding'])
	71	return unicodedata.normalize('NFC', data)
	72
	73	def _decode_html(self, data):
	74	# Returns a unicode string. If data contains any html encoded
	75	# characters, the characters will be converted to their unicode
	76	# equivalent
	77
	78	return unicode(self._expression.subn(self._html_decode, data)[0])
	79
	80	def _html_decode(self, values):
	81	# Returns the unicode character from the re.subn
	82
	83	value = values.group(3)
	84	if values.group(1):
	85	if values.group(2):
	86	return unichr(int('0x{0}'.format(value), 16))
	87	else:
	88	return unichr(int(value))
	89	else:
	90	try:
	91	char = name2codepoint[value]
	92	except KeyError:
	93	return values.group()
	94	else:
	95	return unichr(char)
	96
	97	def _system_encoding(self):
	98	# Returns the character encoding of the operating system
	99
	100	encoding = sys.getdefaultencoding()
	101	filesystem_encoding = sys.getfilesystemencoding()
	102	if filesystem_encoding:
	103	encoding = filesystem_encoding
	104	return encoding
	105
	106
	107	class Himo(unicode):
	108	"""
	109	A unicode-string object with some added features to help with
	110	output formatting. Himo means rope or string in Japanese, hence
	111	the string to Himo connection.
	112
	113	"""
	114
	115	MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}
	116
	117	def html(self):
	118	"""
	119	Returns a unicode string containing this object's value
	120	html enetity encoded.
	121
	122	"""
	123	out = []
	124	for char in self:
	125	out.append(self._html_char_encode(char))
	126	return unicode(''.join(out))
	127
	128	def _html_char_encode(self, char):
	129	# Returns an html version of the char
	130
	131	number = ord(char)
	132	try:
	133	char = "&{0};".format(codepoint2name[number])
	134	except KeyError:
	135	if number > 127:
	136	char = "&#{0};".format(number)
	137	return char
	138
	139	def decimal(self):
	140	"""
	141	Returns a decimal object with the value of this object
	142
	143	"""
	144
	145	return D(self)
	146
	147	def ascii(self):
	148	"""
	149	Returns an ascii representation of this object value.
	150	Throws HimoAsciiError if this method was unable to
	151	convert a unicode character down to it's root character.
	152	For example if in your string you have a character
	153	like the letter 'e' but it has an accent mark over it,
	154	this method will convert that character to it's root
	155	character. Thus 'e' with an accent mark over it will
	156	replaced with the regular letter 'e'.
	157
	158	"""
	159	out = []
	160	for char in self:
	161	if ord(char) < 127:
	162	out.append(char)
	163	elif ord(char) in self.MAP:
	164	out.append(self.MAP[ord(char)])
	165	else:
	166	num = unicodedata.decomposition(char).split(' ')[0]
	167	if num:
	168	out.append(unichr(int(num, 16)))
	169	else:
	170	raise HimoAsciiError(char)
	171	return str(''.join(out))
	172
	173	class HimoAsciiError(Exception):
	174	pass