LOBSTERTECHNOLOGIES
CSSPROPAGANDA

Source Code: python_unicode.xml


        >>> print "@ symbol as ASCII Hex: \x40"
        @ symbol as ASCII Hex: @
        >>> print "@ symbol as ASCII Octal: \100"
        @ symbol as ASCII Octal: @
        >>> print "null char \x00 is ok"
        null char  is ok
      
        >>> print u"hello kitty".encode('utf-8')
        hello kitty
        >>> print u"hello \u0040 unicode \u5b57".encode('utf-8')
        hello @ unicode 字
        >>> u'\u5b57'.encode('utf-8')
        '\xe5\xad\x97'
      
        >>> print u"\u5b57"
        字
      
        >>> type(u"\u5b57") is unicode
        True
        >>> isinstance(u"\u5b57", unicode)
        True
        >>> isinstance(u"\u5b57", str)
        False
        >>> isinstance(u"\u5b57", basestring)
        True
        >>> isinstance('\xe5\xad\x97', basestring)
        True
        >>> isinstance(u"\u5b57".encode('utf-8'), str)
        True
      
        >>> u'@'.encode('ascii') == '@'
        True
        >>> u'@'.encode('utf-8') == '@'
        True
        >>> hex(ord("@")) == '0x40'
        True
        >>> print '\x40'
        @
        >>> print u'\x40'
        @
      
        >>> u'@'.encode('utf-16') == '@'
        False
      
        >>> len(u'\u0040'.encode('utf-8'))
        1
        >>> len(u"\u5b57".encode('utf-8'))
        3
        >>> u'\u5b57'.encode('utf-8')
        '\xe5\xad\x97'
        >>> u'\u0040'.encode('utf-8')
        '@' # the same single byte as '\x40'
      
        >>> len(u'\u5b57')
        1
        >>> raw_utf8_data = '\xe5\xad\x97'
        >>> len(raw_utf8_data)
        3
        >>> print raw_utf8_data
        字
        >>> raw_utf8_data.decode('utf-8')
        u'\u5b57'
        >>> len(raw_utf8_data.decode('utf-8'))
        1
      
        >>> u'\u5b57'.encode('big5')
        '\xa6\x72'
        >>> len(u'\u5b57'.encode('big5'))
        2
      
        >>> u'\u00D8'.encode('latin-1')
        '\xD8'
        >>> u'\xD8'.encode('latin-1')
        '\xD8'
        >>> print u'\u00D8'
        Ø
        >>> '\xD8'.decode('latin-1')
        u'\xD8' # The first 256 characters in the Unicode standard are the same as latin-1!
        >>> print '\xD8'.decode('latin-1').encode('utf-8')
        Ø
      
        >>> print '\xD8'
        �
      
        >>> print '\xD8'
        Ø
      
        >>> happy_utf8_data = u"hello \u0040 unicode \u5b57".encode('utf-8')
        >>> evil_utf8_data = happy_utf8_data + '\xff\xff\xff'
        >>> print evil_utf8_data.decode('utf-8')
        Traceback (most recent call last):
            File "<stdin>", line 1, in <module>
            File "/usr/lib/python2.6/encodings/utf_8.py", line 16, in decode
              return codecs.utf_8_decode(input, errors, True)
        UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 19: unexpected code byte
        >>> print evil_utf8_data.decode('utf-8', 'replace')
        hello @ unicode 字���
        >>> print evil_utf8_data.decode('utf-8', 'ignore')
        hello @ unicode 字
      
        >>> f = open('/tmp/lawl', "wb")
        >>> f.write(u'\u5b57')
        Traceback (most recent call last):
            File "<stdin>", line 1, in ?
        UnicodeEncodeError: 'ascii' codec can't encode character u'\u5b57' in position 0: ordinal not in range(128)
        >>> f.close()
      
        >>> import sys
        >>> sys.getdefaultencoding()
        'ascii'
      
        >>> f = open('/tmp/lawl', "wb")
        >>> f.write(u'\u5b57'.encode('utf-8'))
        >>> f.close()
        >>> f = open('/tmp/lawl', "rb")
        >>> raw_utf8_data = f.read()
        >>> f.close()
        >>> raw_utf8_data
        '\xe5\xad\x97'
        >>> raw_utf8_data.decode('utf-8')
        u'\u5b57'
        >>> f.close()
      
        >>> import codecs
        >>> f = codecs.open('/tmp/lawl', 'wb', 'utf-8')
        >>> f.write(u'\u5b57')
        >>> f.close()
        >>> f = codecs.open('/tmp/lawl', 'rb', 'utf-8')
        >>> f.read()
        u'\u5b57'
        >>> f.close()
      
        >>> import codecs
        >>> f = codecs.open('/tmp/lawl', 'wb', 'utf-8')
        >>> f.write('\xe5\xad\x97')
        Traceback (most recent call last):
            File "<stdin>", line 1, in <module>
            File "/usr/lib/python2.6/codecs.py", line 686, in write
              return self.writer.write(data)
            File "/usr/lib/python2.6/codecs.py", line 351, in write
              data, consumed = self.encode(object, self.errors)
        UnicodeDecodeError: 'ascii' codec can't decode byte 0xe5 in position 0: ordinal not in range(128)
      
      happy_unicode_data = open('tps_report.txt').read().decode('latin-1')
      
        >>> 'hello ' + 'there'
        'hello there'
        >>> 'hello ' + u'there'
        u'hello there'
        >>> 'hello ' + u'André'
        u'hello Andr\xe9'
      
        >>> 'hello ' + 'André'
        'hello Andr\xc3\xa9' # I copy/pasted 'é' as a UTF-8 character
        >>> 'hello'.encode('utf-8') # this is incorrect, but python lets it slide because 'hello' is ASCII
        'hello'
        >>> ('hello ' + 'André').encode('utf-8') # but don't get spoiled!
        Traceback (most recent call last):
            File "<stdin>", line 1, in <module>
        UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 10: ordinal not in range(128)
      
        #!/usr/bin/env python
        # -*- coding: utf-8 -*-
        import logging

        def say_hello(name):
            """Return the greeting ``"hello <name>"``.

            ``name`` is wrapped in a one-element tuple so that a tuple argument
            is formatted as a single value instead of being unpacked by ``%``.
            If ``name`` is a unicode string the result is unicode as well.
            """
            return "hello %s" % (name,)

        # plain ASCII byte string
        logging.warn(say_hello('Jill'))
        # joining a byte string with a unicode string gives a unicode name
        logging.warn(say_hello(" ".join(['John', u'Doe'])))
        # unicode string containing a non-ASCII character
        logging.warn(say_hello(u'André'))
        # byte string containing the raw UTF-8 bytes of 'é': this is the call
        # that makes logging raise UnicodeDecodeError
        logging.warn(say_hello('André')) # stop being spoiled!
      
        jart@compy:~$ python test.py
        WARNING:root:hello Jill
        WARNING:root:hello John Doe
        WARNING:root:hello André
        Traceback (most recent call last):
            File "/usr/lib/python2.6/logging/__init__.py", line 773, in emit
              stream.write(fs % msg.encode("UTF-8"))
        UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 23: ordinal not in range(128)
      
        #!/usr/bin/env python
        # -*- coding: utf-8 -*-

        # this will crash if Python isn't able to figure out what encoding
        # your terminal supports (e.g. when stdout is redirected)
        print u"hello @ unicode 字"

        # this is safer if you KNOW your terminal will support UTF-8, or would
        # rather just have it not crash and print gibberish
        print u"hello @ unicode 字".encode('utf-8')

        # same concept applies, but you lose the benefits of unicode strings
        # most importantly Python will assume byte strings are ASCII the moment
        # they hit any standard I/O
        print "hello @ unicode 字"
      
        jart@compy:~$ python test.py
        hello @ unicode 字
        hello @ unicode 字
        hello @ unicode 字
        jart@compy:~$ python test.py >/dev/null
        Traceback (most recent call last):
            File "test.py", line 6, in <module>
              print u"hello @ unicode 字"
        UnicodeEncodeError: 'ascii' codec can't encode character u'\u5b57' in position 16: ordinal not in range(128)
      
      <meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
      
        >>> print u"&#0214; in HTML should be \u00D6 in Python"
        &#0214; in HTML should be Ö in Python
        >>> hex(214)
        '0xd6'
      
        # Copyright (c) 2009 Lobstertech, Inc.
        # Licensed MIT
        import types

        def smart_unicode(s, encoding='utf-8', errors='strict'):
            """Coerce ``s`` to a ``unicode`` object.

            :param s: value to convert. Byte strings are decoded; objects that
                define ``__unicode__`` are asked for their own text; anything
                else is passed through ``str()`` and then decoded.
            :param encoding: codec used to decode byte strings.
            :param errors: codec error policy ('strict', 'ignore', 'replace').
            :returns: a ``unicode`` object.
            :raises UnicodeDecodeError: when the bytes are not valid in
                ``encoding`` and ``errors`` is 'strict'.
            """
            # type(None) is the portable spelling of types.NoneType
            if type(s) in (unicode, int, long, float, type(None)):
                return unicode(s)
            if type(s) is str:
                return unicode(s, encoding, errors)
            if hasattr(s, '__unicode__'):
                # unicode(obj, encoding, errors) only accepts strings/buffers and
                # raises TypeError for any other object, so let the object's own
                # __unicode__ produce the text instead.
                return unicode(s)
            return unicode(str(s), encoding, errors)

        def smart_str(s, encoding='utf-8', errors='strict', from_encoding='utf-8'):
            """Coerce ``s`` to a byte string (``str``) in ``encoding``.

            :param s: value to convert.
            :param encoding: codec of the returned byte string.
            :param errors: codec error policy, used for both decoding and
                encoding ('strict', 'ignore', 'replace').
            :param from_encoding: codec that byte-string input is assumed to
                be in; input is transcoded only when it differs from
                ``encoding``.
            :returns: a ``str`` object.
            :raises UnicodeDecodeError: when byte-string input is not valid in
                ``from_encoding`` and ``errors`` is 'strict'.
            :raises UnicodeEncodeError: when the text cannot be represented in
                ``encoding`` and ``errors`` is 'strict'.
            """
            # type(None) is the portable spelling of types.NoneType
            if type(s) in (int, long, float, type(None)):
                return str(s)
            if type(s) is str:
                if encoding != from_encoding:
                    return s.decode(from_encoding, errors).encode(encoding, errors)
                # already in the requested encoding: hand it back untouched
                return s
            if type(s) is unicode:
                return s.encode(encoding, errors)
            if hasattr(s, '__unicode__'):
                # Checked before the str() fallback: every new-style object has
                # __str__, so testing that first would make this unreachable.
                return smart_str(unicode(s), encoding, errors, from_encoding)
            return smart_str(str(s), encoding, errors, from_encoding)
      
        # -*- coding: utf-8 -*-
        # Copyright (c) 2009 Lobstertech, Inc.
        # Licensed MIT
        #
        # py.test -xl test_smart_encoding.py

        import py.test
        from smart_encoding import smart_unicode, smart_str

        def test_smart_str():
            """smart_str yields a plain str in the requested encoding."""
            ji_utf8 = '\xe9\x9b\x86'   # u"\u96c6" as UTF-8 bytes
            ji_big5 = '\xb6\xb0'       # u"\u96c6" as Big5 bytes

            # str and unicode input both come back as an equal-valued plain str
            for greeting in ('hello world', u'hello world'):
                converted = smart_str(greeting)
                assert type(converted) is str
                assert converted == greeting

            # unicode input is encoded with the target codec (UTF-8 by default)
            assert smart_str(u"\u96c6") == ji_utf8
            assert smart_str(u"\u96c6", "big5") == ji_big5

            # the Big5 bytes are not valid in the default from_encoding, so a
            # strict transcode fails for either target encoding
            for target in ("big5", "ascii"):
                py.test.raises(UnicodeDecodeError, smart_str, ji_big5, target)

            # lenient error policies drop or substitute the undecodable bytes
            lenient_cases = [
                (ji_big5, "big5", "ignore", ''),
                (ji_big5, "big5", "replace", '??'),
                ('hello ' + ji_big5, "ascii", "replace", 'hello ??'),
                (ji_utf8 + ' ' + ji_big5, "big5", "replace", ji_big5 + ' ??'),
            ]
            for raw, target, policy, expected in lenient_cases:
                assert smart_str(raw, target, errors=policy) == expected

            # an explicit from_encoding lets byte strings be transcoded
            assert smart_str(ji_big5, "big5", from_encoding="big5") == ji_big5
            assert smart_str(ji_big5, "utf-8", from_encoding="big5") == ji_utf8

        def test_smart_unicode():
            """smart_unicode yields a unicode object, decoding byte strings."""
            # str and unicode input both come back as the same unicode text
            for greeting in ('hello world', u'hello world'):
                converted = smart_unicode(greeting)
                assert type(converted) is unicode
                assert converted == u'hello world'

            ji = u"\u96c6"
            # unicode input is returned unchanged, whatever encoding is named
            assert smart_unicode(ji) == ji
            assert smart_unicode(ji, "big5") == ji
            # byte strings are decoded with the named codec
            assert smart_unicode("\xb6\xb0", "big5") == ji

            pound = "hi \xa3"
            decoded = smart_unicode(pound, "latin-1")
            assert decoded == u'hi \xa3'
            assert decoded == u'hi \u00a3'  # same code point, longer escape

            # 0xa3 is not ASCII: strict raises, the other policies degrade
            py.test.raises(UnicodeDecodeError, smart_unicode, pound, "ascii")
            assert smart_unicode(pound, "ascii", errors="ignore") == u"hi "
            assert smart_unicode(pound, "ascii", errors="replace") == u"hi \ufffd" # unicode question mark

        def test_object():
            """Arbitrary objects are stringified first, then converted."""
            class Lawl(object):
                pass

            class Mog:
                pass

            class Hurt:
                def __str__(self):
                    return '\xe9\x9b\x86'

            class TheHurting:
                def __str__(self):
                    return '\xb6\xb0'

            # objects without their own __str__ come out as unicode text
            # starting with '<'
            for cls in (Lawl, Mog):
                text = smart_unicode(cls())
                assert type(text) is unicode
                assert text.startswith('<')

            # the bytes returned by __str__ are decoded with the given encoding
            hurt = Hurt()
            assert smart_unicode(hurt) == u"\u96c6"
            assert smart_str(hurt) == '\xe9\x9b\x86'
            assert smart_unicode(TheHurting(), 'big5') == u"\u96c6"
      
        ; /etc/my.cnf
        [mysqld]
        ; ...
        default-character-set=utf8
        default-collation=utf8_general_ci
        ; ...
        [client]
        default-character-set=utf8
        
        $DB->Query("SET CHARACTER SET UTF8");
        $DB->Query("SET NAMES UTF8");
        
        #!/usr/bin/env python
        #
        # convert_mixed_utf_latin1.py
        #
        # This script will read 'backup.sql' and decode it as utf8.
        # If it comes across any characters it can't decode, it
        # will assume they are latin1.
        #
        # It will then output a file that is fully utf8 named 'backup.new.sql'
        #

        # Read raw bytes: the dump mixes encodings, so it must not go through
        # any text-mode translation. 'with' closes the file even on error.
        with open('backup.sql', 'rb') as src:
            data = src.read()

        final = []
        while True:
            try:
                final.append(data.decode('utf8'))
                break
            except UnicodeDecodeError as exc:
                # slice rather than index so the offending byte stays a
                # one-byte string that can be decoded
                bad = data[exc.start:exc.start + 1]
                print("oh snap: %r -> %r" % (
                    bad,
                    bad.decode('latin1').encode('utf8')))
                # everything up to crazy character should be good
                final.append(data[:exc.start].decode('utf8'))
                # crazy character is probably latin1
                final.append(bad.decode('latin1'))
                # remove already encoded stuff
                data = data[exc.start + 1:]

        # every piece in 'final' is unicode, so join with a unicode separator
        with open('backup.new.sql', 'wb') as dst:
            dst.write(u"".join(final).encode('utf8'))
      
        jart@compy:~$ mysqldump --opt -u root happydb >backup.sql
        jart@compy:~$ echo 'create database happydb_new;' | mysql -uroot
        jart@compy:~$ convert_mixed_utf_latin1.py
        jart@compy:~$ sed -i -e 's/latin1/utf8/' backup.new.sql
        jart@compy:~$ mysql -uroot happydb_new <backup.new.sql
      
        >>> import email
        >>> msg = email.message_from_file(open('chinese_spam.txt'))
        >>> text = msg.get_payload()
        >>> msg.get_content_charset()
        'gb2312'
        >>> unicode_text = text.decode('gb2312')
        >>> type(unicode_text) == unicode
        True
        >>> print unicode_text.encode('utf-8')
        [...]
        ◆工作经验:
        10多年高科技企业产品研发和研发管理工作经历,先后担任过项目经理、研究管理部经
        理、开发部经理等职位,在长期的产品研发管理实践中积累了丰富的技术和管理经验。在华
        [...]
      
        >>> print msg['Subject']
        =?GB2312?B?svrGt9HQt6K8sLy8yvXIy9SxusvQxLncwO28vMTc0bXBtw==?=
        >>> msg['Subject'].decode(msg.get_content_charset()).encode('utf-8')
        '=?GB2312?B?svrGt9HQt6K8sLy8yvXIy9SxusvQxLncwO28vMTc0bXBtw==?='
      
        >>> import email.Header
        >>> (text, encoding) = email.Header.decode_header(msg['Subject'])[0]
        >>> print text
        ��Ʒ�з���������Ա���Ĺ�����ѵ�
        >>> type(text) is unicode
        False
        >>> encoding
        'gb2312'
        >>> print text.decode(encoding).encode('utf-8')
        产品研发及技术人员核心管理技能训练