Source Code: python_unicode.xml
>>> print "@ symbol as ASCII Hex: \x40" @ symbol as ASCII Hex: @ >>> print "@ symbol as ASCII Octal: \100" @ symbol as ASCII Octal: @ >>> print "null char \x00 is ok" null char is ok>>> print u"hello kitty".encode('utf-8') hello kitty >>> print u"hello \u0040 unicode \u5b57".encode('utf-8') hello @ unicode 字 >>> u'\u5b57'.encode('utf-8') '\xe5\xad\x97'>>> print u"\u5b57" 字>>> type(u"\u5b57") is unicode True >>> isinstance(u"\u5b57", unicode) True >>> isinstance(u"\u5b57", str) False >>> isinstance(u"\u5b57", basestring) True >>> isinstance('\xe5\xad\x97', basestring) True >>> isinstance(u"\u5b57".encode('utf-8'), str) True>>> u'@'.encode('ascii') == '@' True >>> u'@'.encode('utf-8') == '@' True >>> hex(ord("@")) == '0x40' True >>> print '\x40' @ >>> print u'\x40' @>>> u'@'.encode('utf-16') == '@' False>>> len(u'\u0040'.encode('utf-8')) 1 >>> len(u"\u5b57".encode('utf-8')) 3 >>> u'\u5b57'.encode('utf-8') '\xe5\xad\x97' >>> u'\u0040'.encode('utf-8') '@' (or '\x40')>>> len(u'\u5b57') 1 >>> raw_utf8_data = '\xe5\xad\x97' >>> len(raw_utf8_data) 3 >>> print raw_utf8_data 字 >>> raw_utf8_data.decode('utf-8') u'\u5b57' >>> len(raw_utf8_data.decode('utf-8')) 1>>> u'\u5b57'.encode('big5') '\xa6\x72' >>> len(u'\u5b57'.encode('big5')) 2>>> u'\u00D8'.encode('latin-1') '\xD8' >>> u'\xD8'.encode('latin-1') '\xD8' >>> print u'\u00D8' Ø >>> '\xD8'.decode('latin-1') u'\xD8' # The first 256 characters in the Unicode standard are the same as latin-1! 
>>> print '\xD8'.decode('latin-1').encode('utf-8') Ø>>> print '\xD8' �>>> print '\xD8' Ø>>> happy_utf8_data = u"hello \u0040 unicode \u5b57".encode('utf-8') >>> evil_utf8_data = happy_utf8_data + '\xff\xff\xff' >>> print evil_utf8_data.decode('utf-8') Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/lib/python2.6/encodings/utf_8.py", line 16, in decode return codecs.utf_8_decode(input, errors, True) UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 19: unexpected code byte >>> print evil_utf8_data.decode('utf-8', 'replace') hello @ unicode 字��� >>> print evil_utf8_data.decode('utf-8', 'ignore') hello @ unicode 字>>> f = open('/tmp/lawl', "wb") >>> f.write(u'\u5b57') Traceback (most recent call last): File "<stdin>", line 1, in ? UnicodeEncodeError: 'ascii' codec can't encode character u'\u5b57' in position 0: ordinal not in range(128) >>> f.close()>>> import sys >>> sys.getdefaultencoding() 'ascii'>>> f = open('/tmp/lawl', "wb") >>> f.write(u'\u5b57'.encode('utf-8')) >>> f.close() >>> f = open('/tmp/lawl', "rb") >>> raw_utf8_data = f.read() >>> f.close() >>> raw_utf8_data '\xe5\xad\x97' >>> raw_utf8_data.decode('utf-8') u'\u5b57' >>> f.close()>>> import codecs >>> f = codecs.open('/tmp/lawl', 'wb', 'utf-8') >>> f.write(u'\u5b57') >>> f.close() >>> f = codecs.open('/tmp/lawl', 'rb', 'utf-8') >>> f.read() u'\u5b57' >>> f.close()>>> import codecs >>> f = codecs.open('/tmp/lawl', 'wb', 'utf-8') >>> f.write('\xe5\xad\x97') Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/lib/python2.6/codecs.py", line 686, in write return self.writer.write(data) File "/usr/lib/python2.6/codecs.py", line 351, in write data, consumed = self.encode(object, self.errors) UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)happy_unicode_data = open('tps_report.txt').read().decode('latin-1')>>> 'hello ' + 'there' 'hello there' >>> 'hello ' + u'there' u'hello there' 
>>> 'hello ' + u'André' u'hello Andr\xe9'>>> 'hello ' + 'André' 'hello Andr\xc3\xa9' # I copy/pasted 'é' as a UTF-8 character >>> 'hello'.encode('utf-8') # this is incorrect, but python lets it slide because 'hello' is ASCII 'hello' >>> ('hello ' + 'André').encode('utf-8') # but don't get spoiled! Traceback (most recent call last): File "<stdin>", line 1, in <module> UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 10: ordinal not in range(128)#!/usr/bin/env python # -*- coding: utf-8 -*- import logging def say_hello(name): return "hello %s" % (name) logging.warn(say_hello('Jill')) logging.warn(say_hello(" ".join(['John', u'Doe']))) logging.warn(say_hello(u'André')) logging.warn(say_hello('André')) # stop being spoiled!jart@compy:~$ python test.py WARNING:root:hello Jill WARNING:root:hello John Doe WARNING:root:hello André Traceback (most recent call last): File "/usr/lib/python2.6/logging/__init__.py", line 773, in emit stream.write(fs % msg.encode("UTF-8")) UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 23: ordinal not in range(128)#!/usr/bin/env python # -*- coding: utf-8 -*- # this will crash if Python isn't able to figure out what encodings # your terminal supports print u"hello @ unicode 字" # this is safer if you KNOW your terminal will support UTF-8, or would # rather just have it not crash and print gibberish print u"hello @ unicode 字".encode('utf-8') # same concept applies, but you lose the benefits of unicode strings # most importantly Python will assume byte strings are ASCII the moment # they hit any standard I/O print "hello @ unicode 字"jart@compy:~$ python test.py hello @ unicode 字 hello @ unicode 字 hello @ unicode 字 jart@compy:~$ python test.py >/dev/null Traceback (most recent call last): File "test.py", line 6, in <module> print u"hello @ unicode 字" UnicodeEncodeError: 'ascii' codec can't encode character u'\u5b57' in position 16: ordinal not in range(128)<meta http-equiv="Content-Type" 
content="text/html;charset=UTF-8" />>>> print u"Ö in HTML should be \u00D6 in Python" Ö in HTML should be Ö in Python >>> hex(0214) '0xD6'# Copyright (c) 2009 Lobstertech, Inc. # Licensed MIT import types def smart_unicode(s, encoding='utf-8', errors='strict'): if type(s) in (unicode, int, long, float, types.NoneType): return unicode(s) elif type(s) is str or hasattr(s, '__unicode__'): return unicode(s, encoding, errors) else: return unicode(str(s), encoding, errors) def smart_str(s, encoding='utf-8', errors='strict', from_encoding='utf-8'): if type(s) in (int, long, float, types.NoneType): return str(s) elif type(s) is str: if encoding != from_encoding: return s.decode(from_encoding, errors).encode(encoding, errors) else: return s elif type(s) is unicode: return s.encode(encoding, errors) elif hasattr(s, '__str__'): return smart_str(str(s), encoding, errors, from_encoding) elif hasattr(s, '__unicode__'): return smart_str(unicode(s), encoding, errors, from_encoding) else: return smart_str(str(s), encoding, errors, from_encoding)# -*- coding: utf-8 -*- # Copyright (c) 2009 Lobstertech, Inc. # Licensed MIT # # py.test -xl test_smart_encoding.py import py.test from smart_encoding import smart_unicode, smart_str def test_smart_str(): assert type(smart_str('hello world')) is str assert smart_str('hello world') == 'hello world' assert smart_str(u'hello world') == u'hello world' assert type(smart_str(u'hello world')) is str assert type(smart_str(u'hello world')) is str assert smart_str(u"\u96c6") == '\xe9\x9b\x86' assert smart_str(u"\u96c6", "big5") == '\xb6\xb0' py.test.raises(UnicodeDecodeError, lambda: smart_str('\xb6\xb0', "big5")) py.test.raises(UnicodeDecodeError, lambda: smart_str('\xb6\xb0', "ascii")) assert smart_str('\xb6\xb0', "big5", errors="ignore") == '' assert smart_str('\xb6\xb0', "big5", errors="replace") == '??' assert smart_str('hello \xb6\xb0', "ascii", errors="replace") == 'hello ??' 
assert smart_str('\xe9\x9b\x86 \xb6\xb0', "big5", errors="replace") == '\xb6\xb0 ??' assert smart_str('\xb6\xb0', "big5", from_encoding="big5") == '\xb6\xb0' assert smart_str('\xb6\xb0', "utf-8", from_encoding="big5") == '\xe9\x9b\x86' def test_smart_unicode(): assert type(smart_unicode('hello world')) is unicode assert smart_unicode('hello world') == u'hello world' assert smart_unicode(u'hello world') == u'hello world' assert type(smart_unicode(u'hello world')) is unicode assert type(smart_unicode(u'hello world')) is unicode assert smart_unicode(u"\u96c6") == u"\u96c6" assert smart_unicode(u"\u96c6", "big5") == u"\u96c6" assert smart_unicode("\xb6\xb0", "big5") == u"\u96c6" assert smart_unicode("hi \xa3", "latin-1") == u'hi \xa3' assert smart_unicode("hi \xa3", "latin-1") == u'hi \u00a3' py.test.raises(UnicodeDecodeError, lambda: smart_unicode("hi \xa3", "ascii")) assert smart_unicode("hi \xa3", "ascii", errors="ignore") == u"hi " assert smart_unicode("hi \xa3", "ascii", errors="replace") == u"hi \ufffd" # unicode question mark def test_object(): class Lawl(object): pass class Mog: pass assert type(smart_unicode(Lawl())) is unicode assert smart_unicode(Lawl()).startswith('<') assert type(smart_unicode(Mog())) is unicode assert smart_unicode(Mog()).startswith('<') class Hurt: def __str__(self): return '\xe9\x9b\x86' assert smart_unicode(Hurt()) == u"\u96c6" assert smart_str(Hurt()) == '\xe9\x9b\x86' class TheHurting: def __str__(self): return '\xb6\xb0' assert smart_unicode(TheHurting(), 'big5') == u"\u96c6"; /etc/my.cnf [mysqld] ; ... default-character-set=utf8 default-collation=utf8_general_ci ; ... [client] default-character-set=utf8$DB->Query("SET CHARACTER SET UTF8"); $DB->Query("SET NAMES UTF8");#!/usr/bin/env python # # convert_mixed_utf_latin1.py # # This script will read 'backup.sql' and decode it as utf8. # If it comes across any characters it can't decode, it # will assume they are latin1. 
# # It will then output a file that is fully utf8 named 'backup.new.sql' # data = open('backup.sql').read() final = [] while True: try: final.append(data.decode('utf8')) break except UnicodeDecodeError, exc: print "oh snap: %r -> %r" % ( data[exc.start], data[exc.start].decode('latin1').encode('utf8')) # everything up to crazy character should be good final.append(data[:exc.start].decode('utf8')) # crazy character is probably latin1 final.append(data[exc.start].decode('latin1')) # remove already encoded stuff data = data[exc.start+1:] f = open('backup.new.sql', 'wb') f.write("".join(final).encode('utf8')) f.close()jart@compy:~$ mysqldump --opt -u root happydb >backup.sql jart@compy:~$ echo 'create database happydb_new;' | mysql -uroot jart@compy:~$ convert_mixed_utf_latin1.py jart@compy:~$ sed -i -e 's/latin1/utf8/' backup.new.sql jart@compy:~$ mysql -uroot happydb_new <backup.new.sql>>> import email >>> msg = email.message_from_file(open('chinese_spam.txt')) >>> text = msg.get_payload() >>> msg.get_content_charset() 'gb2312' >>> unicode_text = text.decode('gb2312') >>> type(unicode_text) == unicode True >>> print unicode_text.encode('utf-8') [...] ◆工作经验: 10多年高科技企业产品研发和研发管理工作经历,先后担任过项目经理、研究管理部经 理、开发部经理等职位,在长期的产品研发管理实践中积累了丰富的技术和管理经验。在华 [...]>>> print msg['Subject'] =?GB2312?B?svrGt9HQt6K8sLy8yvXIy9SxusvQxLncwO28vMTc0bXBtw==?= >>> msg['Subject'].decode(msg.get_content_charset()).encode('utf-8') '=?GB2312?B?svrGt9HQt6K8sLy8yvXIy9SxusvQxLncwO28vMTc0bXBtw==?='>>> import email.Header >>> (text, encoding) = email.Header.decode_header(msg['Subject'])[0] >>> print text ��Ʒ�з���������Ա���Ĺ�����ѵ� >>> type(text) is unicode False >>> encoding 'gb2312' >>> print text.decode(encoding).encode('utf-8') 产品研发及技术人员核心管理技能训练