updated contents from Atlas repo
This commit is contained in:
1
strings-bytes/cafe-gr.txt
Normal file
1
strings-bytes/cafe-gr.txt
Normal file
@@ -0,0 +1 @@
|
||||
καφέ
|
||||
1
strings-bytes/cafe.txt
Normal file
1
strings-bytes/cafe.txt
Normal file
@@ -0,0 +1 @@
|
||||
café
|
||||
19
strings-bytes/casefold_demo.py
Normal file
19
strings-bytes/casefold_demo.py
Normal file
@@ -0,0 +1,19 @@
|
||||
"""Report every assigned code point whose casefold() differs from lower()."""

import sys
from unicodedata import name, normalize

changed = 0   # code points where casefold() != lower()
assigned = 0  # named (assigned) code points examined

for code in range(sys.maxunicode):
    char = chr(code)
    char_name = name(char, None)
    if char_name is None:
        continue  # unassigned code point
    folded = char.casefold()
    assigned += 1
    if folded != char.lower():
        fold_display = ' '.join(folded)
        fold_names = ';'.join(name(c) for c in folded)
        changed += 1
        print('%4d U+%04x' % (changed, code), char, fold_display, char_name + ' -> ' + fold_names, sep='\t')

print(changed, '/', assigned, '=', changed/assigned*100)
|
||||
15
strings-bytes/category_demo.py
Normal file
15
strings-bytes/category_demo.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""Print the first named code point found for each major Unicode category."""

import sys
import unicodedata

seen_major = set()  # major-category initials (L, N, P, S, ...) already shown

for code in range(sys.maxunicode):
    char = chr(code)
    char_name = unicodedata.name(char, None)
    if char_name is None:
        continue  # unassigned code point
    cat = unicodedata.category(char)
    if cat[0] not in seen_major:
        print('U+%04x' % code, char.center(6),
              cat, char_name, sep='\t')
        seen_major.add(cat[0])
|
||||
23
strings-bytes/charfinder.py
Executable file
23
strings-bytes/charfinder.py
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env python
"""Find Unicode characters whose names contain every word in the query."""

from unicodedata import name
import sys

# Query words come from the command line, or interactively if none given.
if len(sys.argv) > 1:
    query = sys.argv[1:]
else:
    query = input('search words: ').split()

query = [word.upper() for word in query]  # Unicode names are upper case

count = 0
for code in range(20, sys.maxunicode):
    char = chr(code)
    descr = name(char, None)
    if descr is None:
        continue  # unassigned code point
    words = descr.split()
    if all(word in words for word in query):
        # same layout as the original `.format(**locals())`, made explicit
        print('{i:5d} {i:04x} {car:^5} {descr}'.format(i=code, car=char,
                                                       descr=descr))
        count += 1

print('{0} character(s) found'.format(count))
|
||||
9
strings-bytes/currency_demo.py
Normal file
9
strings-bytes/currency_demo.py
Normal file
@@ -0,0 +1,9 @@
|
||||
"""List every Unicode currency symbol (general category 'Sc')."""

import sys
import unicodedata

for code in range(sys.maxunicode):
    char = chr(code)
    if unicodedata.category(char) == 'Sc':
        char_name = unicodedata.name(char, None)
        print('U+%04x' % code, char.center(6),
              char_name, sep='\t')
|
||||
21
strings-bytes/default_encodings.py
Normal file
21
strings-bytes/default_encodings.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Show the encoding defaults in effect for this interpreter and console."""

import sys, locale

# Each whitespace-separated entry is evaluated and printed with its result.
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

my_file = open('dummy', 'w')

for expression in expressions.split():
    value = eval(expression)  # expressions are fixed, trusted strings above
    print(expression.rjust(30), '->', repr(value))
|
||||
54
strings-bytes/encodings_demo.py
Normal file
54
strings-bytes/encodings_demo.py
Normal file
@@ -0,0 +1,54 @@
|
||||
"""Print a table of how sample characters encode in several codecs.

For each character in `chars`, show its code point and the hex bytes
produced by each encoding in `encodings`; `missing_mark` ('*') marks
characters the encoding cannot represent.
"""

import unicodedata

encodings = 'ascii latin1 cp1252 cp437 gb2312 utf-8 utf-16le'.split()

# column widths: 1 byte per char for the single-byte codecs,
# wider columns for gb2312 / utf-8 / utf-16le
widths = {encoding: 1 for encoding in encodings[:-3]}
widths.update(zip(encodings[-3:], (2, 4, 4)))

chars = sorted([
    'A',  # \u0041 : LATIN CAPITAL LETTER A
    '¿',  # \u00bf : INVERTED QUESTION MARK
    'Ã',  # \u00c3 : LATIN CAPITAL LETTER A WITH TILDE
    'á',  # \u00e1 : LATIN SMALL LETTER A WITH ACUTE
    'Ω',  # \u03a9 : GREEK CAPITAL LETTER OMEGA
    'µ',
    'Ц',
    '€',  # \u20ac : EURO SIGN
    '“',
    '┌',
    '气',
    '氣',  # \u6c23 : CJK UNIFIED IDEOGRAPH-6C23
    '𝄞',  # \u1d11e : MUSICAL SYMBOL G CLEF
])

callout1_code = 0x278a  # ➊ DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE

missing_mark = '*'  # shown when a char cannot be encoded


def list_chars():
    """Print each sample char as a ready-to-paste source line with its name."""
    for char in chars:
        print('%r, # \\u%04x : %s' % (char, ord(char), unicodedata.name(char)))


def show_encodings():
    """Print the encoding table for every char in `chars`."""
    # header row: one column per encoding
    print(end='\t\t')
    for encoding in encodings:
        print(encoding.ljust(widths[encoding] * 2), end='\t')
    print()

    for lineno, char in enumerate(chars):
        codepoint = 'U+{:04X}'.format(ord(char))
        print(char, codepoint, sep='\t', end='\t')
        for encoding in encodings:
            try:
                # FIX: renamed `bytes` -> `octets`; the original shadowed
                # the builtin `bytes` type inside this loop.
                octets = char.encode(encoding)
                dump = ' '.join('%02X' % byte for byte in octets)
            except UnicodeEncodeError:
                dump = missing_mark
            dump = dump.ljust(widths[encoding] * 2)
            print(dump, end='\t')
        # print(chr(callout1_code + lineno))
        print(unicodedata.name(char))
        # print()

#list_chars()
show_encodings()
|
||||
7
strings-bytes/identifier_norm.py
Normal file
7
strings-bytes/identifier_norm.py
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
"""Show that Python normalizes identifiers: both assignments bind one name."""

café = 1
café = 2  # same identifier after NFKC normalization of source identifiers

# Map (name, its character tuple) -> value for every non-dunder global.
names = {(ident, tuple(ident)): val
         for ident, val in globals().items()
         if not ident.startswith('__')}
print(names)
|
||||
12
strings-bytes/identifier_norm_writer.py
Normal file
12
strings-bytes/identifier_norm_writer.py
Normal file
@@ -0,0 +1,12 @@
|
||||
|
||||
"""Generate identifier_norm.py with two spellings of 'café' as identifiers.

One identifier uses the precomposed é (U+00E9); the other uses
'e' + COMBINING ACUTE ACCENT (U+0301). Python normalizes both to the
same name, so the generated script binds a single variable.
"""

src = """
café = 1
cafe\u0301 = 2
names = {(name, tuple(name)):value
for name, value in globals().items()
if not name.startswith('__')}
print(names)
"""

with open('identifier_norm.py', 'tw', encoding='utf8') as out:
    out.write(src)
|
||||
14
strings-bytes/nfc_demo.py
Normal file
14
strings-bytes/nfc_demo.py
Normal file
@@ -0,0 +1,14 @@
|
||||
"""List code points that NFC normalization expands to multiple characters."""

import sys
from unicodedata import name, normalize

for code in range(sys.maxunicode):
    char = chr(code)
    char_name = name(char, None)
    if char_name is None:
        continue  # unassigned code point
    nfc = normalize('NFC', char)
    if nfc == char:
        continue  # NFC leaves this one unchanged
    if len(nfc) > 1:
        nfc_display = ' '.join(nfc)
        print('U+%04x' % code, char, nfc_display, char_name, sep='\t')
|
||||
16
strings-bytes/nfk_demo.py
Normal file
16
strings-bytes/nfk_demo.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""List code points whose NFKC and NFKD normalizations differ from each other."""

import sys
from unicodedata import name, normalize

for code in range(sys.maxunicode):
    char = chr(code)
    char_name = name(char, None)
    if char_name is None:
        continue  # unassigned code point
    kc = normalize('NFKC', char)
    if kc == char:
        continue  # compatibility composition changes nothing
    kd = normalize('NFKD', char)
    if kc != kd:
        print('U+%04x' % code, char, ' '.join(kc), ' '.join(kd),
              char_name, sep='\t')
|
||||
39
strings-bytes/normeq.py
Normal file
39
strings-bytes/normeq.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""
Utility functions for normalized Unicode string comparison.

Using Normal Form C, case sensitive:

>>> s1 = 'café'
>>> s2 = 'cafe\u0301'
>>> s1 == s2
False
>>> nfc_equal(s1, s2)
True
>>> nfc_equal('A', 'a')
False

Using Normal Form C with case folding:

>>> s3 = 'Straße'
>>> s4 = 'strasse'
>>> s3 == s4
False
>>> nfc_equal(s3, s4)
False
>>> fold_equal(s3, s4)
True
>>> fold_equal(s1, s2)
True
>>> fold_equal('A', 'a')
True

"""

from unicodedata import normalize


def nfc_equal(str1, str2):
    """True if the strings are equal after NFC normalization."""
    return normalize('NFC', str1) == normalize('NFC', str2)


def fold_equal(str1, str2):
    """True if the strings are equal after NFC normalization plus casefolding."""
    folded1 = normalize('NFC', str1).casefold()
    folded2 = normalize('NFC', str2).casefold()
    return folded1 == folded2
|
||||
15
strings-bytes/numerics.py
Normal file
15
strings-bytes/numerics.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""Flag every named code point that is a digit ('D') and/or numeric ('N')."""

import sys
from unicodedata import name

for code in range(sys.maxunicode):
    char = chr(code)
    try:
        char_name = name(char)
    except ValueError:  # no such name: unassigned code point
        continue
    flags = []
    flags.append('D' if char.isdigit() else '')
    flags.append('N' if char.isnumeric() else '')
    if any(flags):
        flags = '\t'.join(flags)
        print('U+%04x' % code, char, flags, char_name, sep='\t')
|
||||
18
strings-bytes/numerics_demo.py
Normal file
18
strings-bytes/numerics_demo.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# BEGIN NUMERICS_DEMO
import unicodedata
import re

re_digit = re.compile(r'\d')  # regex notion of a "digit"

# mixed sample: ASCII digit, vulgar fraction, superscript, Devanagari,
# Ethiopic, Roman numeral, circled digit, parenthesized number,
# circled ideograph
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    row = [
        'U+%04x' % ord(char),                       # <1>
        char.center(6),                             # <2>
        're_dig' if re_digit.match(char) else '-',  # <3>
        'isdig' if char.isdigit() else '-',         # <4>
        'isnum' if char.isnumeric() else '-',       # <5>
        format(unicodedata.numeric(char), '5.2f'),  # <6>
        unicodedata.name(char),                     # <7>
    ]
    print(*row, sep='\t')
# END NUMERICS_DEMO
|
||||
9
strings-bytes/numerics_demo.txt
Normal file
9
strings-bytes/numerics_demo.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
U+0031 1 re_dig isdig isnum 1.00 DIGIT ONE
|
||||
U+00b2 ² - isdig isnum 2.00 SUPERSCRIPT TWO
|
||||
U+00bc ¼ - - isnum 0.25 VULGAR FRACTION ONE QUARTER
|
||||
U+0969 ३ re_dig isdig isnum 3.00 DEVANAGARI DIGIT THREE
|
||||
U+136b ፫ - isdig isnum 3.00 ETHIOPIC DIGIT THREE
|
||||
U+216b Ⅻ - - isnum 12.00 ROMAN NUMERAL TWELVE
|
||||
U+2466 ⑦ - isdig isnum 7.00 CIRCLED DIGIT SEVEN
|
||||
U+2480 ⒀ - - isnum 13.00 PARENTHESIZED NUMBER THIRTEEN
|
||||
U+3285 ㊅ - - isnum 6.00 CIRCLED IDEOGRAPH SIX
|
||||
3
strings-bytes/ola.py
Normal file
3
strings-bytes/ola.py
Normal file
@@ -0,0 +1,3 @@
|
||||
# coding: cp1252
# Hello-world demo: the line above declares a cp1252 source encoding.

print('Olá, Mundo!')
|
||||
16
strings-bytes/plane_count.py
Normal file
16
strings-bytes/plane_count.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""Count assigned code points and how many fall in the Basic Multilingual Plane."""

import sys
from unicodedata import name, normalize

total_count = 0  # all assigned (named) code points
bmp_count = 0    # those at or below U+FFFF (the BMP)

for code in range(sys.maxunicode):
    char = chr(code)
    char_name = name(char, None)
    if char_name is None:
        continue  # unassigned code point
    total_count += 1
    if code <= 0xffff:
        bmp_count += 1

print(total_count, bmp_count, bmp_count/total_count, bmp_count/total_count*100)
|
||||
21
strings-bytes/ramanujan.py
Normal file
21
strings-bytes/ramanujan.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# BEGIN RE_DEMO
"""Contrast regex matching on str vs bytes for the same text."""
import re

re_numbers_str = re.compile(r'\d+')     # <1>
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')  # <2>
re_words_bytes = re.compile(rb'\w+')

text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"  # <3>
            " as 1729 = 1³ + 12³ = 9³ + 10³.")        # <4>

text_bytes = text_str.encode('utf_8')  # <5>

print('Text', repr(text_str), sep='\n ')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))      # <6>
print(' bytes:', re_numbers_bytes.findall(text_bytes))  # <7>
print('Words')
print(' str :', re_words_str.findall(text_str))      # <8>
print(' bytes:', re_words_bytes.findall(text_bytes))  # <9>
# END RE_DEMO
|
||||
87
strings-bytes/sanitize.py
Normal file
87
strings-bytes/sanitize.py
Normal file
@@ -0,0 +1,87 @@
|
||||
|
||||
"""
Radical folding and text sanitizing.

Handling a string with `cp1252` symbols:

>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
>>> shave_marks(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> shave_marks_latin(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> dewinize(order)
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
>>> asciize(order)
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'

Handling a string with Greek and Latin accented characters:

>>> greek = 'Ζέφυρος, Zéfiro'
>>> shave_marks(greek)
'Ζεφυρος, Zefiro'
>>> shave_marks_latin(greek)
'Ζέφυρος, Zefiro'
>>> dewinize(greek)
'Ζέφυρος, Zéfiro'
>>> asciize(greek)
'Ζέφυρος, Zefiro'

"""

# BEGIN SHAVE_MARKS
import unicodedata
import string


def shave_marks(txt):
    """Remove all diacritic marks"""
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    shaved = ''.join(ch for ch in decomposed
                     if not unicodedata.combining(ch))  # <2>
    return unicodedata.normalize('NFC', shaved)  # <3>
# END SHAVE_MARKS


# BEGIN SHAVE_MARKS_LATIN
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    latin_base = False
    keepers = []
    for ch in decomposed:
        if unicodedata.combining(ch) and latin_base:  # <2>
            continue  # ignore diacritic on Latin base char
        keepers.append(ch)  # <3>
        # if it isn't a combining char, it's a new base char
        if not unicodedata.combining(ch):  # <4>
            latin_base = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(keepers))  # <5>
# END SHAVE_MARKS_LATIN


# BEGIN ASCIIZE
# one-to-one replacements for cp1252 punctuation
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",  # <1>
                           """'f"*^<''""---~>""")

# one-to-many replacements
multi_map = str.maketrans({  # <2>
    '€': '<euro>',
    '…': '...',
    'Œ': 'OE',
    '™': '(TM)',
    'œ': 'oe',
    '‰': '<per mille>',
    '‡': '**',
})

multi_map.update(single_map)  # <3>


def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""
    return txt.translate(multi_map)  # <4>


def asciize(txt):
    """Fold txt to ASCII-friendly text: dewinize, shave Latin marks, NFKC."""
    no_marks = shave_marks_latin(dewinize(txt))  # <5>
    no_marks = no_marks.replace('ß', 'ss')  # <6>
    return unicodedata.normalize('NFKC', no_marks)  # <7>
# END ASCIIZE
|
||||
26
strings-bytes/sorting.py
Normal file
26
strings-bytes/sorting.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""Compare plain and locale-aware sorting of Portuguese fruit names."""

import locale


def check(sorted_list):
    """Return 'CORRECT' when sorted_list matches the hand-sorted reference."""
    return 'CORRECT' if fruits == sorted_list else 'WRONG'


# reference list, already in correct Portuguese collation order
fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

print(locale.getlocale(locale.LC_COLLATE))

print('manual_sort ', fruits)

plain_sort = sorted(fruits)
print('plain_sort ', plain_sort, check(plain_sort))

locale_sort1 = sorted(fruits, key=locale.strxfrm)
print('locale_sort1', locale_sort1, check(locale_sort1))

locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
print('locale set to:', locale.getlocale(locale.LC_COLLATE))

locale_sort2 = sorted(fruits, key=locale.strxfrm)
print('locale_sort2', locale_sort2, check(locale_sort2))
|
||||
18
strings-bytes/sorting_uca.py
Normal file
18
strings-bytes/sorting_uca.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""Sort fruit names with the Unicode Collation Algorithm via pyuca."""

from pyuca import Collator


def check(sorted_list):
    """Return 'CORRECT' when sorted_list matches the hand-sorted reference."""
    return 'CORRECT' if fruits == sorted_list else 'WRONG'


# reference list, already in correct Portuguese collation order
fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

print('manual_sort', fruits)

plain_sort = sorted(fruits)
print('plain_sort ', plain_sort, check(plain_sort))

coll = Collator()
pyuca_sort = sorted(fruits, key=coll.sort_key)
print('pyuca_sort ', pyuca_sort, check(pyuca_sort))
|
||||
11
strings-bytes/sorting_uca.txt
Normal file
11
strings-bytes/sorting_uca.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
PS > pip install pyuca
|
||||
Downloading/unpacking pyuca
|
||||
Running setup.py (path:C:\Users\...) egg_info for package pyuca
|
||||
Installing collected packages: pyuca
|
||||
Running setup.py install for pyuca
|
||||
Successfully installed pyuca
|
||||
Cleaning up...
|
||||
PS > python .\sorting_uca.py
|
||||
manual_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju']
|
||||
plain_sort ['acaíba', 'acerola', 'açaí', 'caju', 'cajá'] WRONG
|
||||
pyuca_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju'] CORRECT
|
||||
14
strings-bytes/str_repr.py
Normal file
14
strings-bytes/str_repr.py
Normal file
@@ -0,0 +1,14 @@
|
||||
|
||||
"""Scan all code points, reporting where the length of repr(chr(i)) changes.

Whenever the bare repr (surrounding quotes stripped) changes length,
print the previous repr to close the line, then start a new line with
the current code point and its repr.
"""

last_len = 0    # length of the previous bare repr
last_repr = ''  # previous bare repr
lengths = set() # distinct repr lengths seen so far

for i in range(0x110000):
    r = repr(chr(i))[1:-1]  # bare repr: strip the surrounding quotes
    if len(r) != last_len:
        lengths.add(len(r))
        last_len = len(r)
        if i > 0:
            prev_repr = repr(chr(i-1))[1:-1]
            print('{}'.format(prev_repr))
        # FIX: the original used '{:39,242}', which is not a valid format
        # spec (raises ValueError) and ignored its max_len keyword; pad to
        # the widest repr length seen so far instead.
        print('U+{:04x} {:{max_len}} ...'.format(i, r, max_len=max(lengths)),
              end=' ')
        last_repr = r
|
||||
50
strings-bytes/str_repr2.py
Normal file
50
strings-bytes/str_repr2.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""Group consecutive code points by the length and escape shape of their repr."""

from itertools import groupby


def bare_repr(codepoint):
    """Return repr of the character without the surrounding quotes."""
    return repr(chr(codepoint))[1:-1]


def display(codepoint):
    """Print one code point and its bare repr.

    FIX: the original used the invalid format spec '{:39,242}' and an
    undefined global `lengths`; both made this helper crash when called.
    """
    repstr = repr(chr(codepoint))[1:-1]
    print('U+{:04x} {}'.format(codepoint, repstr))


def repr_shape(codepoint):
    """Classify a code point as (len of bare repr, escape shape).

    Shape is 'GLYPH' for one-character reprs, otherwise the first two
    chars of the escape (e.g. '\\x', '\\u'); shapes are recorded in
    the module-level `escapes` set as a side effect.
    """
    brepr = bare_repr(codepoint)
    if len(brepr) == 1:
        shape = 'GLYPH'
    else:
        shape = brepr[:2]
        escapes.add(shape)
    return len(brepr), shape


escapes = set()

# group consecutive code points that share the same (length, shape)
group_gen = groupby((codepoint for codepoint in range(0x110000)), repr_shape)

for len_shape, group in group_gen:
    len_brepr, shape = len_shape
    group = list(group)
    cp_first = group[0]
    cp_last = group[-1]
    cp_mid = group[len(group)//2]
    if len(group) == 1:
        glyph_sample = bare_repr(cp_first) if shape == 'GLYPH' else ''
        print('{:6d} U+{:04X} {:5} {}'.format(
            len(group), cp_first, shape, glyph_sample))
    else:
        if len(group) == 2:
            if shape == 'GLYPH':
                glyph_sample = bare_repr(cp_first) + ' ' + bare_repr(cp_last)
            else:
                glyph_sample = ''
            print('{:6d} U+{:04X} , U+{:04X} {:5} {}'.format(
                len(group), cp_first, cp_last, shape, glyph_sample))
        else:
            if shape == 'GLYPH':
                glyph_sample = ' '.join([bare_repr(cp_first),
                                         bare_repr(cp_mid),
                                         bare_repr(cp_last)])
            else:
                glyph_sample = ''
            print('{:6d} U+{:04X}...U+{:04X} {:5} {}'.format(
                len(group), cp_first, cp_last, shape, glyph_sample))

print('escapes:', ' '.join(sorted(escapes, key=str.upper)))
|
||||
22
strings-bytes/strings-bytes-test.txt
Normal file
22
strings-bytes/strings-bytes-test.txt
Normal file
@@ -0,0 +1,22 @@
|
||||
>>> s = 'naïve' <1>
|
||||
>>> b = b'naïve' <2>
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
SyntaxError: bytes can only contain ASCII literal characters.
|
||||
>>> b = bytes('naïve', 'iso8859-1') <3>
|
||||
>>> b <4>
|
||||
b'na\xefve'
|
||||
>>> s <5>
|
||||
'naïve'
|
||||
>>> b == s.encode('iso8859-1') <6>
|
||||
True
|
||||
>>> s[2] <7>
|
||||
'ï'
|
||||
>>> b[2] <8>
|
||||
239
|
||||
>>> ord(s[2]) <9>
|
||||
239
|
||||
>>> s.upper() <10>
|
||||
'NAÏVE'
|
||||
>>> b.upper() <11>
|
||||
b'NA\xefVE'
|
||||
Reference in New Issue
Block a user