Viewing file: i18n.py (19.37 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
#!/usr/bin/python -tt # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Library General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
def dummy_wrapper(str): ''' Dummy Translation wrapper, just returning the same string. ''' return to_unicode(str)
def dummyP_wrapper(str1, str2, n): ''' Dummy Plural Translation wrapper, just returning the singular or plural string. ''' if n == 1: return str1 else: return str2
# This is ported from ustr_utf8_* which I got from: # http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c # I've tried to leave it close to the original C (same names etc.) so that # it is easy to read/compare both versions...
# ----------------------------- BEG utf8 ----------------------------- # This is an implementation of wcwidth() and wcswidth() (defined in # IEEE Std 1002.1-2001) for Unicode. # # http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html # http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html # # In fixed-width output devices, Latin characters all occupy a single # "cell" position of equal width, whereas ideographic CJK characters # occupy two such cells. Interoperability between terminal-line # applications and (teletype-style) character terminals using the # UTF-8 encoding requires agreement on which character should advance # the cursor by how many cell positions. No established formal # standards exist at present on which Unicode character shall occupy # how many cell positions on character terminals. These routines are # a first attempt of defining such behavior based on simple rules # applied to data provided by the Unicode Consortium. # # [...] # # Markus Kuhn -- 2007-05-26 (Unicode 5.0) # # Permission to use, copy, modify, and distribute this software # for any purpose and without fee is hereby granted. The author # disclaims all warranties with regard to this software. # # Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
def __utf8_bisearch(ucs, table): """ auxiliary function for binary search in interval table. """
min = 0 max = len(table) - 1 if ucs < table[min][0] or ucs > table[max][1]: return False
while max >= min: mid = (min + max) / 2 if ucs > table[mid][1]: min = mid + 1 elif ucs < table[mid][0]: max = mid - 1 else: return True
return False
def __utf8_ucp_width(ucs): """ Get the textual width of a ucs character. """ # sorted list of non-overlapping intervals of non-spacing characters # generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" combining = [ ( 0x0300, 0x036F ), ( 0x0483, 0x0486 ), ( 0x0488, 0x0489 ), ( 0x0591, 0x05BD ), ( 0x05BF, 0x05BF ), ( 0x05C1, 0x05C2 ), ( 0x05C4, 0x05C5 ), ( 0x05C7, 0x05C7 ), ( 0x0600, 0x0603 ), ( 0x0610, 0x0615 ), ( 0x064B, 0x065E ), ( 0x0670, 0x0670 ), ( 0x06D6, 0x06E4 ), ( 0x06E7, 0x06E8 ), ( 0x06EA, 0x06ED ), ( 0x070F, 0x070F ), ( 0x0711, 0x0711 ), ( 0x0730, 0x074A ), ( 0x07A6, 0x07B0 ), ( 0x07EB, 0x07F3 ), ( 0x0901, 0x0902 ), ( 0x093C, 0x093C ), ( 0x0941, 0x0948 ), ( 0x094D, 0x094D ), ( 0x0951, 0x0954 ), ( 0x0962, 0x0963 ), ( 0x0981, 0x0981 ), ( 0x09BC, 0x09BC ), ( 0x09C1, 0x09C4 ), ( 0x09CD, 0x09CD ), ( 0x09E2, 0x09E3 ), ( 0x0A01, 0x0A02 ), ( 0x0A3C, 0x0A3C ), ( 0x0A41, 0x0A42 ), ( 0x0A47, 0x0A48 ), ( 0x0A4B, 0x0A4D ), ( 0x0A70, 0x0A71 ), ( 0x0A81, 0x0A82 ), ( 0x0ABC, 0x0ABC ), ( 0x0AC1, 0x0AC5 ), ( 0x0AC7, 0x0AC8 ), ( 0x0ACD, 0x0ACD ), ( 0x0AE2, 0x0AE3 ), ( 0x0B01, 0x0B01 ), ( 0x0B3C, 0x0B3C ), ( 0x0B3F, 0x0B3F ), ( 0x0B41, 0x0B43 ), ( 0x0B4D, 0x0B4D ), ( 0x0B56, 0x0B56 ), ( 0x0B82, 0x0B82 ), ( 0x0BC0, 0x0BC0 ), ( 0x0BCD, 0x0BCD ), ( 0x0C3E, 0x0C40 ), ( 0x0C46, 0x0C48 ), ( 0x0C4A, 0x0C4D ), ( 0x0C55, 0x0C56 ), ( 0x0CBC, 0x0CBC ), ( 0x0CBF, 0x0CBF ), ( 0x0CC6, 0x0CC6 ), ( 0x0CCC, 0x0CCD ), ( 0x0CE2, 0x0CE3 ), ( 0x0D41, 0x0D43 ), ( 0x0D4D, 0x0D4D ), ( 0x0DCA, 0x0DCA ), ( 0x0DD2, 0x0DD4 ), ( 0x0DD6, 0x0DD6 ), ( 0x0E31, 0x0E31 ), ( 0x0E34, 0x0E3A ), ( 0x0E47, 0x0E4E ), ( 0x0EB1, 0x0EB1 ), ( 0x0EB4, 0x0EB9 ), ( 0x0EBB, 0x0EBC ), ( 0x0EC8, 0x0ECD ), ( 0x0F18, 0x0F19 ), ( 0x0F35, 0x0F35 ), ( 0x0F37, 0x0F37 ), ( 0x0F39, 0x0F39 ), ( 0x0F71, 0x0F7E ), ( 0x0F80, 0x0F84 ), ( 0x0F86, 0x0F87 ), ( 0x0F90, 0x0F97 ), ( 0x0F99, 0x0FBC ), ( 0x0FC6, 0x0FC6 ), ( 0x102D, 0x1030 ), ( 0x1032, 0x1032 ), ( 0x1036, 0x1037 ), ( 0x1039, 0x1039 ), ( 0x1058, 0x1059 ), ( 0x1160, 0x11FF ), ( 0x135F, 0x135F ), ( 0x1712, 0x1714 ), ( 0x1732, 0x1734 ), ( 0x1752, 0x1753 ), ( 0x1772, 0x1773 ), ( 0x17B4, 0x17B5 ), ( 0x17B7, 0x17BD ), ( 0x17C6, 0x17C6 ), ( 0x17C9, 0x17D3 ), ( 0x17DD, 0x17DD ), ( 0x180B, 0x180D ), ( 0x18A9, 0x18A9 ), ( 0x1920, 0x1922 ), ( 0x1927, 0x1928 ), ( 0x1932, 0x1932 ), ( 0x1939, 0x193B ), ( 0x1A17, 0x1A18 ), ( 0x1B00, 0x1B03 ), ( 0x1B34, 0x1B34 ), ( 0x1B36, 0x1B3A ), ( 0x1B3C, 0x1B3C ), ( 0x1B42, 0x1B42 ), ( 0x1B6B, 0x1B73 ), ( 0x1DC0, 0x1DCA ), ( 0x1DFE, 0x1DFF ), ( 0x200B, 0x200F ), ( 0x202A, 0x202E ), ( 0x2060, 0x2063 ), ( 0x206A, 0x206F ), ( 0x20D0, 0x20EF ), ( 0x302A, 0x302F ), ( 0x3099, 0x309A ), ( 0xA806, 0xA806 ), ( 0xA80B, 0xA80B ), ( 0xA825, 0xA826 ), ( 0xFB1E, 0xFB1E ), ( 0xFE00, 0xFE0F ), ( 0xFE20, 0xFE23 ), ( 0xFEFF, 0xFEFF ), ( 0xFFF9, 0xFFFB ), ( 0x10A01, 0x10A03 ), ( 0x10A05, 0x10A06 ), ( 0x10A0C, 0x10A0F ), ( 0x10A38, 0x10A3A ), ( 0x10A3F, 0x10A3F ), ( 0x1D167, 0x1D169 ), ( 0x1D173, 0x1D182 ), ( 0x1D185, 0x1D18B ), ( 0x1D1AA, 0x1D1AD ), ( 0x1D242, 0x1D244 ), ( 0xE0001, 0xE0001 ), ( 0xE0020, 0xE007F ), ( 0xE0100, 0xE01EF )]
# test for 8-bit control characters if ucs == 0: return 0
if ucs < 32 or (ucs >= 0x7f and ucs < 0xa0): return (-1)
if __utf8_bisearch(ucs, combining): return 0
# if we arrive here, ucs is not a combining or C0/C1 control character
return (1 + (ucs >= 0x1100 and (ucs <= 0x115f or # Hangul Jamo init. consonants ucs == 0x2329 or ucs == 0x232a or (ucs >= 0x2e80 and ucs <= 0xa4cf and ucs != 0x303f) or # CJK ... Yi (ucs >= 0xac00 and ucs <= 0xd7a3) or # Hangul Syllables (ucs >= 0xf900 and ucs <= 0xfaff) or # CJK Compatibility Ideographs (ucs >= 0xfe10 and ucs <= 0xfe19) or # Vertical forms (ucs >= 0xfe30 and ucs <= 0xfe6f) or # CJK Compatibility Forms (ucs >= 0xff00 and ucs <= 0xff60) or # Fullwidth Forms (ucs >= 0xffe0 and ucs <= 0xffe6) or (ucs >= 0x20000 and ucs <= 0x2fffd) or (ucs >= 0x30000 and ucs <= 0x3fffd))))
def __utf8_iter_ints(msg): for byte in to_utf8(msg): yield ord(byte) def __utf8_iter_ucs(msg): uiter = __utf8_iter_ints(msg) for byte0 in uiter: if byte0 < 0x80: # 0xxxxxxx yield (byte0, 1) elif (byte0 & 0xe0) == 0xc0: # 110XXXXx 10xxxxxx byte1 = uiter.next() if (((byte1 & 0xc0) != 0x80) or ((byte0 & 0xfe) == 0xc0)): # overlong? yield (None, 2) return yield ((((byte0 & 0x1f) << 6) | (byte1 & 0x3f)), 2) elif (byte0 & 0xf0) == 0xe0: # 1110XXXX 10Xxxxxx 10xxxxxx byte1 = uiter.next() byte2 = uiter.next() if (((byte1 & 0xc0) != 0x80) or ((byte2 & 0xc0) != 0x80) or ((byte0 == 0xe0) and ((byte1 & 0xe0) == 0x80)) or # overlong? ((byte0 == 0xed) and ((byte1 & 0xe0) == 0xa0)) or # surrogate? ((byte0 == 0xef) and (byte1 == 0xbf) and ((byte2 & 0xfe) == 0xbe))): # U+FFFE or U+FFFF? yield (None, 3) return yield ((((byte0 & 0x0f) << 12) | ((byte1 & 0x3f) << 6) | (byte2 & 0x3f)), 3) elif (byte0 & 0xf8) == 0xf0: # 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx byte1 = uiter.next() byte2 = uiter.next() byte3 = uiter.next() if (((byte1 & 0xc0) != 0x80) or ((byte2 & 0xc0) != 0x80) or ((byte3 & 0xc0) != 0x80) or ((byte0 == 0xf0) and ((byte1 & 0xf0) == 0x80)) or # overlong? ((byte0 == 0xf4) and (byte1 > 0x8f)) or # > U+10FFFF? (byte0 > 0xf4)): # > U+10FFFF? yield (None, 4) return
yield ((((byte0 & 0x07) << 18) | ((byte1 & 0x3f) << 12) | ((byte2 & 0x3f) << 6) | (byte3 & 0x3f)), 4) else: yield (None, 1) return
def utf8_width(msg): """ Get the textual width of a utf8 string. """ ret = 0 for (ucs, bytes) in __utf8_iter_ucs(msg): if ucs is None: ret += bytes # Ugly ... should not feed bad utf8 else: ret += __utf8_ucp_width(ucs) return ret
def utf8_width_chop(msg, chop=None): """ Return the textual width of a utf8 string, chopping it to a specified value. This is what you want to use instead of %.*s, as it does the "right" thing with regard to utf-8 sequences. Eg. "%.*s" % (10, msg) <= becomes => "%s" % (utf8_width_chop(msg, 10)) """
if chop is None or utf8_width(msg) <= chop: return utf8_width(msg), msg
ret = 0 passed_unicode = isinstance(msg, unicode) msg_bytes = 0 msg = to_utf8(msg) for (ucs, bytes) in __utf8_iter_ucs(msg): if ucs is None: width = bytes # Ugly ... should not feed bad utf8 else: width = __utf8_ucp_width(ucs)
if chop is not None and (ret + width) > chop: msg = msg[:msg_bytes] break ret += width msg_bytes += bytes
if passed_unicode: msg = to_unicode(msg)
return ret, msg
def utf8_width_fill(msg, fill, chop=None, left=True, prefix='', suffix=''): """ Expand a utf8 msg to a specified "width" or chop to same. Expansion can be left or right. This is what you want to use instead of %*.*s, as it does the "right" thing with regard to utf-8 sequences. prefix and suffix should be used for "invisible" bytes, like highlighting. Eg. "%-*.*s" % (10, 20, msg) <= becomes => "%s" % (utf8_width_fill(msg, 10, 20)).
"%20.10s" % (msg) <= becomes => "%s" % (utf8_width_fill(msg, 20, 10, left=False)).
"%s%.10s%s" % (prefix, msg, suffix) <= becomes => "%s" % (utf8_width_fill(msg, 0, 10, prefix=prefix, suffix=suffix)). """ passed_msg = msg width, msg = utf8_width_chop(msg, chop)
if width >= fill: if prefix or suffix: msg = ''.join([prefix, msg, suffix]) else: extra = " " * (fill - width) if left: msg = ''.join([prefix, msg, suffix, extra]) else: msg = ''.join([extra, prefix, msg, suffix])
if isinstance(passed_msg, unicode): return to_unicode(msg)
return msg
def utf8_valid(msg): """ Return True/False is the text is valid utf8. """ for (ucs, bytes) in __utf8_iter_ucs(msg): if ucs is None: return False return True
def _utf8_width_le(width, *args): """ Minor speed hack, we often want to know "does X fit in Y". It takes "a while" to work out a utf8_width() (see above), and we know that a utf8 character is always <= byte. So given:
assert bytes >= characters characters <= width?
...we can change to:
bytes <= width or characters <= width
...and bytes are much faster. """ # This assumes that all args. are utf8. ret = 0 for arg in args: ret += len(arg) if ret <= width: return True ret = 0 for arg in args: ret += utf8_width(arg) return ret <= width
def utf8_text_wrap(text, width=70, initial_indent='', subsequent_indent=''): """ Works like we want textwrap.wrap() to work, uses utf-8 data and doesn't screw up lists/blocks/etc. """ # Tested with: # yum info robodoc gpicview php-pear-Net-Socket wmctrl ustr moreutils # mediawiki-HNP ocspd insight yum mousepad # ...at 120, 80 and 40 chars. # Also, notable among lots of others, searching for "\n ": # exim-clamav, jpackage-utils, tcldom, synaptics, "quake3", # perl-Class-Container, ez-ipupdate, perl-Net-XMPP, "kipi-plugins", # perl-Apache-DBI, netcdf, python-configobj, "translate-toolkit", alpine, # "udunits", "conntrack-tools" # # Note that, we "fail" on: # alsa-plugins-jack, setools*, dblatex, uisp, "perl-Getopt-GUI-Long", # suitesparse, "synce-serial", writer2latex, xenwatch, ltsp-utils
passed_unicode = isinstance(text, unicode)
def _indent_at_beg(line): count = 0 byte = 'X' for byte in line: if byte != ' ': break count += 1 if byte not in ("-", "*", ".", "o", '\xe2'): return count, 0 list_chr = utf8_width_chop(line[count:], 1)[1] if list_chr in ("-", "*", ".", "o", "\xe2\x80\xa2", "\xe2\x80\xa3", "\xe2\x88\x98"): nxt = _indent_at_beg(line[count+len(list_chr):]) nxt = nxt[1] or nxt[0] if nxt: return count, count + 1 + nxt return count, 0
initial_indent = to_utf8(initial_indent) subsequent_indent = to_utf8(subsequent_indent)
text = to_utf8(text).rstrip('\n') lines = to_utf8(text).replace('\t', ' ' * 8).split('\n')
ret = [] indent = initial_indent wrap_last = False csab = 0 cspc_indent = 0 for line in lines: line = line.rstrip(' ') (lsab, lspc_indent) = (csab, cspc_indent) (csab, cspc_indent) = _indent_at_beg(line) force_nl = False # We want to stop wrapping under "certain" conditions: if wrap_last and cspc_indent: # if line starts a list or force_nl = True if wrap_last and csab == len(line):# is empty line force_nl = True if wrap_last and not lspc_indent: # if line doesn't continue a list and if csab >= 4 and csab != lsab: # is "block indented" force_nl = True if force_nl: ret.append(indent.rstrip(' ')) indent = subsequent_indent wrap_last = False if csab == len(line): # empty line, remove spaces to make it easier. line = '' if wrap_last: line = line.lstrip(' ') cspc_indent = lspc_indent
if _utf8_width_le(width, indent, line): wrap_last = False ret.append(indent + line) indent = subsequent_indent continue
wrap_last = True words = line.split(' ') line = indent spcs = cspc_indent if not spcs and csab >= 4: spcs = csab for word in words: if (not _utf8_width_le(width, line, word) and utf8_width(line) > utf8_width(subsequent_indent)): ret.append(line.rstrip(' ')) line = subsequent_indent + ' ' * spcs line += word line += ' ' indent = line.rstrip(' ') + ' ' if wrap_last: ret.append(indent.rstrip(' '))
if passed_unicode: return map(to_unicode, ret) return ret
def utf8_text_fill(text, *args, **kwargs): """ Works like we want textwrap.fill() to work, uses utf-8 data and doesn't screw up lists/blocks/etc. """ return '\n'.join(utf8_text_wrap(text, *args, **kwargs)) # ----------------------------- END utf8 -----------------------------
def to_unicode(obj, encoding='utf-8', errors='replace'): ''' convert a 'str' to 'unicode' ''' if isinstance(obj, basestring): if not isinstance(obj, unicode): obj = unicode(obj, encoding, errors) return obj
def to_utf8(obj, errors='replace'): '''convert 'unicode' to an encoded utf-8 byte string ''' if isinstance(obj, unicode): obj = obj.encode('utf-8', errors) return obj
# Don't use this, to_unicode should just work now def to_unicode_maybe(obj, encoding='utf-8', errors='replace'): ''' Don't ask don't tell, only use when you must ''' try: return to_unicode(obj, encoding, errors) except UnicodeEncodeError: return obj
def to_str(obj): """ Convert something to a string, if it isn't one. """ # NOTE: unicode counts as a string just fine. We just want objects to call # their __str__ methods. if not isinstance(obj, basestring): obj = str(obj) return obj
if True: # Python-2.* sucks for i18n/UnicodeErrors ... so we turn it off. We've # got most of the major stuff beaten in Fedora, so if you really want it # change the above to False and have fun (just don't log bugs). _ = dummy_wrapper P_ = dummyP_wrapper else: try: ''' Setup the yum translation domain and make _() and P_() translation wrappers available. using ugettext to make sure translated strings are in Unicode. ''' import gettext t = gettext.translation('yum', fallback=True) _ = t.ugettext P_ = t.ungettext except: ''' Something went wrong so we make a dummy _() wrapper there is just returning the same text ''' _ = dummy_wrapper P_ = dummyP_wrapper
if __name__ == "__main__": import sys
def out(arg): arg = to_utf8(arg) print "UTF8 :", arg print "len :", len(arg) arg = to_unicode(arg) print "USC :", arg print "len :", len(arg) print "valid:", utf8_valid(arg) print "width:", utf8_width(arg) print "4.8 :", "%s%s%s" % ('<', utf8_width_fill(arg, 4, 8), '>') print "4.3 :", "%s%s%s" % ('<', utf8_width_fill(arg, 4, 3), '>') print "4.2 :", "%s%s%s" % ('<', utf8_width_fill(arg, 4, 2), '>') print "4.1 :", "%s%s%s" % ('<', utf8_width_fill(arg, 4, 1), '>') print "3.3 :", "%s%s%s" % ('<', utf8_width_fill(arg, 3, 3), '>') print "3.2 :", "%s%s%s" % ('<', utf8_width_fill(arg, 3, 2), '>') print "3.1 :", "%s%s%s" % ('<', utf8_width_fill(arg, 3, 1), '>') print "40.79:", "%s%s%s" % ('<', utf8_width_fill(arg, 40, 79), '>') print "40.20:", "%s%s%s" % ('<', utf8_width_fill(arg, 40, 20), '>') print ''
print " ---- Arguments/str ---- " for arg in sys.argv[1:]: out(arg)
print " ---- Arguments/gettext ---- " for arg in sys.argv[1:]: try: arg = _(arg) except UnicodeDecodeError: continue out(arg)
if len(sys.argv) > 2: print " ---- Arguments/str/all ---- " out(sys.argv[1] % sys.argv[2:])
print " ---- Arguments/gettext/all ---- " try: arg = _(sys.argv[1]) % map(_, sys.argv[2:]) except UnicodeDecodeError: sys.exit(0) out(arg)
|