# Copyright (C) 2010  Leonard Thomas
#
# This file is part of Dodai.
#
# Dodai is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Dodai is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Dodai.  If not, see <http://www.gnu.org/licenses/>.

import re
import sys
import unicodedata
from decimal import Decimal as D
from htmlentitydefs import name2codepoint
from htmlentitydefs import codepoint2name

import chardet

from dodai.exception import HimoAsciiError
from dodai.tools import normalize_unicode


class String2Himo(object):
    """
    Callable object that converts python string objects into Himo
    (unicode) objects.

    """

    def __init__(self, default_encoding=None):
        """
        default_encoding is the encoding value to be used as the
        classwide default.  If default_encoding is not set then the
        encoding of the system will be used.

        """
        self.default_encoding = default_encoding or self._system_encoding()
        # Matches named (&amp;), decimal (&#38;) and hex (&#x26;) entities.
        # Group 1 is the '#', group 2 an optional leading 'x', group 3
        # the remaining word characters.
        self._expression = re.compile(r'&(#?)(x?)(\w+);')

    def __call__(self, data, encoding=None):
        """
        Converts the input (data) string object into a Himo object using
        the passed in (encoding).  If encoding is omitted then the
        default_encoding will be used.

        The data is decoded to unicode, html entities are replaced by
        the characters they stand for, and the result is passed through
        normalize_unicode.

        returns a Himo object

        """
        encoding = encoding or self.default_encoding
        data = self._as_unicode(data, encoding)
        data = self._decode_html(data)
        data = normalize_unicode(data)
        return Himo(data)

    def _as_unicode(self, data, encoding):
        # Returns data as an NFC normalized unicode string.
        #
        # Non string objects are passed through str() first.  Byte
        # strings are decoded with the given encoding; when that fails
        # the encoding guessed by chardet is tried instead.
        #
        # Raises UnicodeDecodeError when the data can not be decoded and
        # chardet is unable to suggest an encoding.
        if not isinstance(data, unicode):
            if not isinstance(data, str):
                data = str(data)
            try:
                data = data.decode(encoding)
            except UnicodeDecodeError:
                detected = chardet.detect(data)['encoding']
                if not detected:
                    # chardet had no guess; decode(None) would raise an
                    # unrelated TypeError, so surface the real failure.
                    raise
                data = data.decode(detected)
        return unicodedata.normalize('NFC', data)

    def _decode_html(self, data):
        # Returns a unicode string.  If data contains any html encoded
        # characters, the characters will be converted to their unicode
        # equivalent
        return unicode(self._expression.sub(self._html_decode, data))

    def _html_decode(self, values):
        # Returns the unicode character for the entity held by the
        # regular expression match (values).  Anything that does not
        # resolve to a character (unknown name, malformed or out of
        # range number) is returned exactly as it appeared in the input.
        is_numeric, hex_flag, value = values.groups()
        try:
            if not is_numeric:
                # The optional 'x' group swallows the first letter of
                # names such as 'xi', so put it back before the lookup.
                return unichr(name2codepoint[hex_flag + value])
            if hex_flag:
                return unichr(int('0x' + value, 16))
            if value[:1] in u'xX':
                # '&#X41;' -- the expression only recognises a lower
                # case 'x', so an upper case marker ends up in value.
                return unichr(int('0x' + value[1:], 16))
            return unichr(int(value))
        except (KeyError, ValueError, OverflowError):
            return values.group()

    def _system_encoding(self):
        # Returns the character encoding of the operating system: the
        # filesystem encoding when python reports one, otherwise
        # python's default encoding.
        return sys.getfilesystemencoding() or sys.getdefaultencoding()


class Himo(unicode):
    """
    A unicode-string object with some added features to help with
    output formatting.  Himo means rope or string in Japanese, hence
    the string to Himo connection.

    """

    # Code points that have no unicode decomposition but do have a
    # conventional ascii spelling; used by ascii().
    MAP = {169: u'(C)', 174: u'(R)', 8471: u'(P)'}

    def html(self):
        """
        Returns a unicode string containing this object's value html
        entity encoded.

        """
        return unicode(''.join(self._html_char_encode(char) for char in self))

    def _html_char_encode(self, char):
        # Returns an html version of the char: the named entity when one
        # exists, a decimal character reference for any other character
        # above 127, and the character itself otherwise.
        number = ord(char)
        try:
            char = "&{0};".format(codepoint2name[number])
        except KeyError:
            if number > 127:
                char = "&#{0};".format(number)
        return char

    def decimal(self):
        """
        Returns a decimal object with the value of this object

        """
        return D(self)

    def ascii(self):
        """
        Returns an ascii representation of this object's value.  Throws
        HimoAsciiError if this method was unable to convert a unicode
        character down to its root character.

        For example if in your string you have a character like the
        letter 'e' but it has an accent mark over it, this method will
        convert that character to its root character.  Thus 'e' with an
        accent mark over it will be replaced with the regular letter
        'e'.

        """
        return str(''.join(self._ascii_char(char) for char in self))

    def _ascii_char(self, char):
        # Returns the ascii replacement for a single character.
        #
        # Ascii characters (0-127) are returned untouched and the
        # characters listed in MAP are replaced by their spelling.  Any
        # other character is reduced to the first code point of its
        # canonical decomposition, repeatedly, because a root character
        # may itself carry an accent (U+01D8 -> U+00FC -> 'u').
        #
        # Raises HimoAsciiError when the character has no decomposition
        # or only a tagged (compatibility) one such as '<noBreak> 0020',
        # whose first field is a tag rather than a code point.
        original = char
        while True:
            number = ord(char)
            if number < 128:
                return char
            if number in self.MAP:
                return self.MAP[number]
            decomposition = unicodedata.decomposition(char)
            if not decomposition or decomposition.startswith('<'):
                raise HimoAsciiError(original)
            char = unichr(int(decomposition.split(' ')[0], 16))


# NOTE(review): this definition replaces the HimoAsciiError imported from
# dodai.exception above, so Himo.ascii() raises this local class -- confirm
# which of the two callers are expected to catch.
class HimoAsciiError(Exception):
    pass