# example-code-2e/attic/strings-bytes/encodings_demo.py
# 2015-04-01 22:48:56 -03:00
#
# 55 lines
# 1.6 KiB
# Python

import unicodedata
# Codecs compared by the table, in left-to-right column order.
encodings = [
    'ascii',
    'latin1',
    'cp1252',
    'cp437',
    'gb2312',
    'utf-8',
    'utf-16le',
]

# Column width factor per codec: every table cell is left-justified to
# widths[encoding] * 2 characters. All codecs start at 1; the last three
# get wider columns.
widths = dict.fromkeys(encodings, 1)
widths.update({'gb2312': 2, 'utf-8': 4, 'utf-16le': 4})
# Sample characters shown in the table; sorted() orders them by code point.
# Every entry must be exactly one character, because ord() is applied to it.
# NOTE(review): five literals were empty strings in this copy of the file.
# The EURO SIGN and IDEOGRAPH-6C23 entries are restored from their own
# comments; the three entries marked "restored" below had no comment and are
# presumably the glyphs used by the upstream example-code-2e script --
# confirm against that repository.
chars = sorted([
    'A',  # \u0041 : LATIN CAPITAL LETTER A
    '¿',  # \u00bf : INVERTED QUESTION MARK
    'Ã',  # \u00c3 : LATIN CAPITAL LETTER A WITH TILDE
    'á',  # \u00e1 : LATIN SMALL LETTER A WITH ACUTE
    'Ω',  # \u03a9 : GREEK CAPITAL LETTER OMEGA
    'µ',  # \u00b5 : MICRO SIGN
    'Ц',  # \u0426 : CYRILLIC CAPITAL LETTER TSE
    '€',  # \u20ac : EURO SIGN
    '“',  # \u201c : LEFT DOUBLE QUOTATION MARK (restored -- TODO confirm)
    '┌',  # \u250c : BOX DRAWINGS LIGHT DOWN AND RIGHT (restored -- TODO confirm)
    '气',  # \u6c14 : CJK UNIFIED IDEOGRAPH-6C14 (restored -- TODO confirm)
    '氣',  # \u6c23 : CJK UNIFIED IDEOGRAPH-6C23
    '𝄞',  # \U0001d11e : MUSICAL SYMBOL G CLEF
])
# Code point of the first callout glyph:
# U+278A DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE.
callout1_code = ord('\u278a')

# Placeholder printed in a table cell when the codec cannot encode the
# character.
missing_mark = '*'
def list_chars(characters=None):
    r"""Print one annotated list entry per character.

    Each printed line has the form ``'<char>', # <escape> : <UNICODE NAME>``,
    i.e. the layout used by the entries of the ``chars`` list literal, so the
    output can be pasted back into the source.

    Args:
        characters: iterable of one-character strings to describe. Defaults
            to the module-level ``chars`` list.

    Raises:
        TypeError: if an item is not a single character (``ord`` rejects it).
        ValueError: if a character has no Unicode name.
    """
    if characters is None:
        characters = chars
    for char in characters:
        code = ord(char)
        # \uXXXX takes exactly four hex digits, so code points beyond the
        # BMP need the eight-digit \UXXXXXXXX form to be a valid escape.
        if code <= 0xFFFF:
            escape = '\\u%04x' % code
        else:
            escape = '\\U%08x' % code
        print('%r, # %s : %s' % (char, escape, unicodedata.name(char)))
def show_encodings():
    """Print a tab-separated table of each character's bytes per encoding.

    The header row lists every name in ``encodings``. Each following row
    describes one character from ``chars``: the character itself, its code
    point as ``U+XXXX``, one cell per encoding holding the upper-case hex
    dump of the encoded bytes (or ``missing_mark`` when the codec cannot
    encode the character), and finally the character's Unicode name.

    Raises:
        TypeError: if an entry of ``chars`` is not a single character.
        ValueError: if a character has no Unicode name.
    """
    # Two leading tabs skip over the character and code point columns.
    print(end='\t\t')
    for encoding in encodings:
        print(encoding.ljust(widths[encoding] * 2), end='\t')
    print()

    for char in chars:
        codepoint = 'U+{:04X}'.format(ord(char))
        print(char, codepoint, sep='\t', end='\t')
        for encoding in encodings:
            # Only the encode call can raise; keep the try body minimal.
            try:
                encoded = char.encode(encoding)
            except UnicodeEncodeError:
                dump = missing_mark
            else:
                dump = ' '.join('%02X' % byte for byte in encoded)
            print(dump.ljust(widths[encoding] * 2), end='\t')
        print(unicodedata.name(char))
# Uncomment to print every entry of `chars` in the annotated
# "'<char>', # <escape> : <name>" format used by the list literal:
#list_chars()
# Print the encoding comparison table.
show_encodings()