dodai/tools/himo.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

# Copyright (C) 2010  Leonard Thomas
#
# This file is part of Dodai.
#
# Dodai is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Dodai is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Dodai.  If not, see <http://www.gnu.org/licenses/>.

import chardet
import re
import sys
import unicodedata
from htmlentitydefs import name2codepoint
from htmlentitydefs import codepoint2name
from decimal import Decimal as D
from dodai.exception import HimoAsciiError

class String2Himo(object):
    """
    This is an object that is used for converting python string objects
    into Himo (unicode) objects.

    The input is decoded to unicode, normalized to NFC and every html
    character reference that can be resolved is replaced by the
    character it stands for.

    """

    def __init__(self, default_encoding=None):
        """
        default_encoding is the encoding value to be used as the
        classwide default.  If default encoding is not set then
        the encoding of the system will be used.

        """
        self.default_encoding = default_encoding or self._system_encoding()
        # Groups: (1) '#' when the reference is numeric, (2) the optional
        # hex marker, (3) the digits or the entity name.
        self._expression = re.compile(r'&(#?)([xX]?)(\w+);')

    def __call__(self, data, encoding=None):
        """
        Converts the input (data) string object into a Himo object
        using the passed in (encoding).  If encoding is omitted then
        the default_encoding will be used.

        returns a Himo object

        """
        encoding = encoding or self.default_encoding
        text = self._as_unicode(data, encoding)
        return Himo(self._decode_html(text))

    def _as_unicode(self, data, encoding):
        # Returns data as an NFC normalized unicode string.  Anything
        # that is not already a string is passed through str() first.
        # When decoding with (encoding) fails, chardet is asked to guess
        # the encoding; if it has no guess the original
        # UnicodeDecodeError is raised.

        if not isinstance(data, unicode):
            if not isinstance(data, str):
                data = str(data)
            try:
                data = data.decode(encoding)
            except UnicodeDecodeError as error:
                guess = chardet.detect(data)['encoding']
                if not guess:
                    # Decoding with None would only raise a confusing
                    # TypeError; report the real decoding problem.
                    raise error
                data = data.decode(guess)
        return unicodedata.normalize('NFC', data)

    def _decode_html(self, data):
        # Returns a unicode string.  If data contains any html encoded
        # characters, the characters will be converted to their unicode
        # equivalent.  References that can not be resolved are kept
        # exactly as they were written.

        return unicode(self._expression.sub(self._html_decode, data))

    def _html_decode(self, values):
        # Returns the replacement text for one match of self._expression:
        # the referenced character, or the matched text unchanged when
        # the reference is unknown, malformed or out of range.

        numeric, hex_marker, value = values.groups()
        if not numeric:
            # Without a '#' the hex marker group is really the first
            # letter of an entity name (as in "&xi;"), so put it back.
            try:
                return unichr(name2codepoint[hex_marker + value])
            except KeyError:
                return values.group()
        try:
            return unichr(int(value, 16 if hex_marker else 10))
        except (ValueError, OverflowError):
            # Not a number in the expected base, or not a valid code point
            return values.group()

    def _system_encoding(self):
        # Returns the character encoding of the operating system: the
        # filesystem encoding when python reports one, otherwise the
        # interpreter's default encoding.

        return sys.getfilesystemencoding() or sys.getdefaultencoding()


class Himo(unicode):
    """
    A unicode-string object with some added features to help with
    output formatting.  Himo means rope or string in Japanese, hence
    the string to Himo connection.

    """

    # Code points that ascii() spells out explicitly instead of trying
    # to decompose them (copyright, registered and sound recording
    # copyright signs).
    MAP = {169:u'(C)', 174:u'(R)', 8471:u'(P)'}

    def html(self):
        """
        Returns a unicode string containing this object's value
        html entity encoded.

        """
        encoded = [self._html_char_encode(char) for char in self]
        return unicode(''.join(encoded))

    def _html_char_encode(self, char):
        # Returns an html version of the char: a named entity when html
        # defines one, a decimal character reference for any other
        # non-ascii character, and the char itself otherwise.

        number = ord(char)
        try:
            char = "&{0};".format(codepoint2name[number])
        except KeyError:
            if number > 127:
                char = "&#{0};".format(number)
        return char

    def decimal(self):
        """
        Returns a decimal object with the value of this object

        """

        return D(self)

    def ascii(self):
        """
        Returns an ascii representation of this object value.
        Throws HimoAsciiError if this method was unable to
        convert a unicode character down to its root character.
        For example if in your string you have a character
        like the letter 'e' but it has an accent mark over it,
        this method will convert that character to its root
        character.  Thus 'e' with an accent mark over it will be
        replaced with the regular letter 'e'.

        Ascii characters are copied unchanged and the characters
        listed in MAP are replaced by their mapped text.

        """
        out = []
        for char in self:
            number = ord(char)
            if number < 128:
                # The whole ascii range, DEL (127) included
                out.append(char)
            elif number in self.MAP:
                out.append(self.MAP[number])
            else:
                out.append(self._ascii_root(char))
        return str(''.join(out))

    def _ascii_root(self, char):
        # Returns the ascii character that char is built upon by
        # following the first code point of its canonical decomposition
        # until an ascii character is reached.  Raises HimoAsciiError
        # when the chain ends on a non-ascii character or when only a
        # tagged (compatibility) decomposition such as "<noBreak> 0020"
        # is available.

        root = char
        while ord(root) > 127:
            fields = unicodedata.decomposition(root).split()
            if not fields or fields[0].startswith('<'):
                raise HimoAsciiError(char)
            root = unichr(int(fields[0], 16))
        return root

# NOTE(review): a HimoAsciiError is also imported from dodai.exception at
# the top of this file; this definition rebinds that name for the module.
# Confirm which of the two is meant to be used.
class HimoAsciiError(Exception):
    """
    Raised by Himo.ascii() for a character that it could not reduce to
    an ascii character.  The character is passed as the exception's
    argument.

    """