updated contents from Atlas repo

This commit is contained in:
Luciano Ramalho
2014-10-14 14:26:55 -03:00
parent 40688c038d
commit 981d5bc473
157 changed files with 71134 additions and 1 deletions

View File

@@ -0,0 +1 @@
καφέ

1
strings-bytes/cafe.txt Normal file
View File

@@ -0,0 +1 @@
café

View File

@@ -0,0 +1,19 @@
import sys
from unicodedata import name, normalize

# Scan every code point and report those where str.casefold() is more
# aggressive than str.lower(); print a running count and a final ratio.
n_assigned = 0  # named (assigned) code points examined
n_changed = 0   # code points where casefold() != lower()
for code in range(sys.maxunicode):
    ch = chr(code)
    ch_name = name(ch, None)
    if ch_name is None:  # skip unassigned code points
        continue
    folded = ch.casefold()
    n_assigned += 1
    if folded != ch.lower():
        n_changed += 1
        folded_display = ' '.join(folded)
        folded_names = ';'.join(name(c) for c in folded)
        print('%4d U+%04x' % (n_changed, code), ch, folded_display,
              ch_name + ' -> ' + folded_names, sep='\t')
print(n_changed, '/', n_assigned, '=', n_changed / n_assigned * 100)

View File

@@ -0,0 +1,15 @@
import sys
import unicodedata

# Print the first assigned code point found for each major Unicode
# category (the first letter of the two-letter category code).
# Fix: the original bound the loop variable `name`, shadowing the builtin.
categories_seen = set()
for code in range(sys.maxunicode):
    char = chr(code)
    # None default: skip unassigned code points instead of raising ValueError.
    char_name = unicodedata.name(char, None)
    if char_name is None:
        continue
    cat = unicodedata.category(char)
    if cat[0] not in categories_seen:
        print('U+%04x' % code, char.center(6),
              cat, char_name, sep='\t')
        categories_seen.add(cat[0])

23
strings-bytes/charfinder.py Executable file
View File

@@ -0,0 +1,23 @@
#!/usr/bin/env python
from unicodedata import name
import sys

# Query words come from the command line, or from an interactive prompt.
query = sys.argv[1:] if len(sys.argv) > 1 else input('search words: ').split()
query = [s.upper() for s in query]
count = 0
for i in range(20, sys.maxunicode):
    car = chr(i)
    descr = name(car, None)
    if descr is None:  # unassigned code point
        continue
    # Match only when every query word appears as a whole word in the name.
    words = descr.split()
    if all(word in words for word in query):
        print('{i:5d} {i:04x} {car:^5} {descr}'.format(**locals()))
        count += 1
print('{0} character(s) found'.format(count))

View File

@@ -0,0 +1,9 @@
import sys
import unicodedata

# List every currency symbol: Unicode category 'Sc' (Symbol, currency).
# Fix: the original bound `name`, shadowing the builtin.
for code in range(sys.maxunicode):
    char = chr(code)
    if unicodedata.category(char) == 'Sc':
        # 'Sc' code points are assigned; None default kept defensively.
        char_name = unicodedata.name(char, None)
        print('U+%04x' % code, char.center(6),
              char_name, sep='\t')

View File

@@ -0,0 +1,21 @@
import locale
import sys

# Expressions probing the default encodings of the current environment;
# each is evaluated with eval() below (trusted, hard-coded strings only).
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# The file must stay open while the expressions above inspect it.
# Fix: the original never closed the handle; close it when done.
my_file = open('dummy', 'w')
try:
    for expression in expressions.split():
        value = eval(expression)
        print(expression.rjust(30), '->', repr(value))
finally:
    my_file.close()

View File

@@ -0,0 +1,54 @@
import unicodedata

encodings = 'ascii latin1 cp1252 cp437 gb2312 utf-8 utf-16le'.split()
# Display width (in hex-byte pairs) per encoding: single-byte codecs get 1,
# gb2312 up to 2 bytes, utf-8 up to 4, utf-16le 4 (surrogate pair).
widths = {encoding: 1 for encoding in encodings[:-3]}
widths.update(zip(encodings[-3:], (2, 4, 4)))

# NOTE(review): several non-Latin-1 characters were lost when this file was
# re-encoded. '\u20ac' and '\u6c23' are restored from their inline comments;
# three entries had no comment and could not be recovered, so empty
# placeholders are filtered out below (ord('') would raise TypeError).
chars = sorted(c for c in [
    'A',       # \u0041 : LATIN CAPITAL LETTER A
    '¿',       # \u00bf : INVERTED QUESTION MARK
    'Ã',       # \u00c3 : LATIN CAPITAL LETTER A WITH TILDE
    'á',       # \u00e1 : LATIN SMALL LETTER A WITH ACUTE
    'Ω',       # \u03a9 : GREEK CAPITAL LETTER OMEGA
    'µ',
    'Ц',
    '\u20ac',  # EURO SIGN (restored)
    '',        # lost in re-encoding -- TODO recover from upstream
    '',        # lost in re-encoding -- TODO recover from upstream
    '',        # lost in re-encoding -- TODO recover from upstream
    '\u6c23',  # CJK UNIFIED IDEOGRAPH-6C23 (restored)
    '𝄞',       # \u1d11e : MUSICAL SYMBOL G CLEF
] if c)

callout1_code = 0x278a  # ➊ DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE
missing_mark = '*'  # printed when a character cannot be encoded


def list_chars():
    """Print the chars list as source lines with code point and name."""
    for char in chars:
        print('%r, # \\u%04x : %s' % (char, ord(char), unicodedata.name(char)))


def show_encodings():
    """Print a table of the sample chars encoded with each codec."""
    print(end='\t\t')
    for encoding in encodings:
        print(encoding.ljust(widths[encoding] * 2), end='\t')
    print()
    for lineno, char in enumerate(chars):
        codepoint = 'U+{:04X}'.format(ord(char))
        print(char, codepoint, sep='\t', end='\t')
        for encoding in encodings:
            try:
                # renamed from `bytes`: it shadowed the builtin type
                encoded = char.encode(encoding)
                dump = ' '.join('%02X' % byte for byte in encoded)
            except UnicodeEncodeError:
                dump = missing_mark
            dump = dump.ljust(widths[encoding] * 2)
            print(dump, end='\t')
        # print(chr(callout1_code + lineno))
        print(unicodedata.name(char))
    # print()


#list_chars()
show_encodings()

View File

@@ -0,0 +1,7 @@
# Demo: Python 3 allows non-ASCII identifiers, so `café` is a valid name.
# NOTE(review): in the original of this demo the two assignments were
# presumably distinct Unicode spellings of the same word (composed 'é'
# vs. 'e' + COMBINING ACUTE), yielding TWO entries in globals(); here both
# lines appear identical, so the second simply rebinds the first -- confirm
# against the companion gen_identifier_norm.py, which regenerates the pair.
café = 1
café = 2
# Collect the module's non-dunder globals, keying each entry by the name
# and by the tuple of its characters (to expose the exact spelling).
names = {(name, tuple(name)):value
for name, value in globals().items()
if not name.startswith('__')}
print(names)

View File

@@ -0,0 +1,12 @@
src = """
café = 1
cafe\u0301 = 2
names = {(name, tuple(name)):value
for name, value in globals().items()
if not name.startswith('__')}
print(names)
"""
with open('identifier_norm.py', 'tw', encoding='utf8') as out:
out.write(src)

14
strings-bytes/nfc_demo.py Normal file
View File

@@ -0,0 +1,14 @@
import sys
from unicodedata import name, normalize

# Report code points that NFC normalization changes into a LONGER
# (multi-character) form, showing the expanded sequence.
for code in range(sys.maxunicode):
    char = chr(code)
    char_name = name(char, None)
    if char_name is None:  # skip unassigned code points
        continue
    nfc = normalize('NFC', char)
    if nfc == char:  # already in Normal Form C
        continue
    if len(nfc) > 1:
        nfc_display = ' '.join(nfc)
        print('U+%04x' % code, char, nfc_display, char_name, sep='\t')

16
strings-bytes/nfk_demo.py Normal file
View File

@@ -0,0 +1,16 @@
import sys
from unicodedata import name, normalize

# Report code points for which NFKC and NFKD compatibility normalization
# disagree with each other, showing both expansions.
for code in range(sys.maxunicode):
    char = chr(code)
    char_name = name(char, None)
    if char_name is None:  # skip unassigned code points
        continue
    kc = normalize('NFKC', char)
    if kc == char:  # unaffected by compatibility normalization
        continue
    kd = normalize('NFKD', char)
    if kc != kd:
        kc_display = ' '.join(kc)
        kd_display = ' '.join(kd)
        print('U+%04x' % code, char, kc_display, kd_display, char_name, sep='\t')

39
strings-bytes/normeq.py Normal file
View File

@@ -0,0 +1,39 @@
"""
Utility functions for normalized Unicode string comparison.
Using Normal Form C, case sensitive:
>>> s1 = 'café'
>>> s2 = 'cafe\u0301'
>>> s1 == s2
False
>>> nfc_equal(s1, s2)
True
>>> nfc_equal('A', 'a')
False
Using Normal Form C with case folding:
>>> s3 = 'Straße'
>>> s4 = 'strasse'
>>> s3 == s4
False
>>> nfc_equal(s3, s4)
False
>>> fold_equal(s3, s4)
True
>>> fold_equal(s1, s2)
True
>>> fold_equal('A', 'a')
True
"""
from unicodedata import normalize
def nfc_equal(str1, str2):
    """Return True if the two strings are equal after NFC normalization."""
    nfc1 = normalize('NFC', str1)
    nfc2 = normalize('NFC', str2)
    return nfc1 == nfc2
def fold_equal(str1, str2):
    """Return True if the strings match after NFC normalization and case folding."""
    folded1 = normalize('NFC', str1).casefold()
    folded2 = normalize('NFC', str2).casefold()
    return folded1 == folded2

15
strings-bytes/numerics.py Normal file
View File

@@ -0,0 +1,15 @@
import sys
from unicodedata import name

# Report every named code point that is a digit and/or numeric,
# flagged 'D' (str.isdigit) and/or 'N' (str.isnumeric).
for code in range(sys.maxunicode):
    char = chr(code)
    try:
        char_name = name(char)
    except ValueError:  # no such name
        continue
    flags = ['D' if char.isdigit() else '',
             'N' if char.isnumeric() else '']
    if any(flags):
        flags = '\t'.join(flags)
        print('U+%04x' % code, char, flags, char_name, sep='\t')

View File

@@ -0,0 +1,18 @@
# BEGIN NUMERICS_DEMO
import unicodedata
import re

re_digit = re.compile(r'\d')
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'
for char in sample:
    # One row per sample char: code point, glyph, regex/str-method digit
    # tests, numeric value, and the character name.
    row = ('U+%04x' % ord(char),                       # <1>
           char.center(6),                             # <2>
           're_dig' if re_digit.match(char) else '-',  # <3>
           'isdig' if char.isdigit() else '-',         # <4>
           'isnum' if char.isnumeric() else '-',       # <5>
           format(unicodedata.numeric(char), '5.2f'),  # <6>
           unicodedata.name(char))                     # <7>
    print(*row, sep='\t')
# END NUMERICS_DEMO

View File

@@ -0,0 +1,9 @@
U+0031 1 re_dig isdig isnum 1.00 DIGIT ONE
U+00b2 ² - isdig isnum 2.00 SUPERSCRIPT TWO
U+00bc ¼ - - isnum 0.25 VULGAR FRACTION ONE QUARTER
U+0969 ३ re_dig isdig isnum 3.00 DEVANAGARI DIGIT THREE
U+136b ፫ - isdig isnum 3.00 ETHIOPIC DIGIT THREE
U+216b Ⅻ - - isnum 12.00 ROMAN NUMERAL TWELVE
U+2466 ⑦ - isdig isnum 7.00 CIRCLED DIGIT SEVEN
U+2480 ⒀ - - isnum 13.00 PARENTHESIZED NUMBER THIRTEEN
U+3285 ㊅ - - isnum 6.00 CIRCLED IDEOGRAPH SIX

3
strings-bytes/ola.py Normal file
View File

@@ -0,0 +1,3 @@
# coding: cp1252
# The declaration above (PEP 263, must be in the first two lines) tells
# Python the source bytes are Windows-1252, so the accented byte in the
# literal below is decoded correctly instead of as UTF-8.
print('Olá, Mundo!')

View File

@@ -0,0 +1,16 @@
import sys
from unicodedata import name, normalize

# Count assigned code points and how many fall in the Basic Multilingual
# Plane (U+0000..U+FFFF); print totals, the ratio, and the percentage.
num_assigned = 0
num_bmp = 0
for code in range(sys.maxunicode):
    char = chr(code)
    if name(char, None) is None:  # skip unassigned code points
        continue
    num_assigned += 1
    if code <= 0xffff:
        num_bmp += 1
print(num_assigned, num_bmp, num_bmp / num_assigned, num_bmp / num_assigned * 100)

View File

@@ -0,0 +1,21 @@
# BEGIN RE_DEMO
import re

# The same two patterns compiled twice: str patterns are Unicode-aware,
# bytes patterns match ASCII digits/word chars only.
re_numbers_str = re.compile(r'\d+')     # <1>
re_numbers_bytes = re.compile(rb'\d+')  # <2>
re_words_str = re.compile(r'\w+')
re_words_bytes = re.compile(rb'\w+')

text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"  # <3>
            " as 1729 = 1³ + 12³ = 9³ + 10³.")        # <4>
text_bytes = text_str.encode('utf_8')  # <5>

print('Text', repr(text_str), sep='\n ')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))       # <6>
print(' bytes:', re_numbers_bytes.findall(text_bytes))  # <7>
print('Words')
print(' str :', re_words_str.findall(text_str))         # <8>
print(' bytes:', re_words_bytes.findall(text_bytes))    # <9>
# END RE_DEMO

87
strings-bytes/sanitize.py Normal file
View File

@@ -0,0 +1,87 @@
"""
Radical folding and text sanitizing.
Handling a string with `cp1252` symbols:
>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
>>> shave_marks(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> shave_marks_latin(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> dewinize(order)
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
>>> asciize(order)
'"Herr Voss: - 12 cup of OEtker(TM) caffe latte - bowl of acai."'
Handling a string with Greek and Latin accented characters:
>>> greek = 'Ζέφυρος, Zéfiro'
>>> shave_marks(greek)
'Ζεφυρος, Zefiro'
>>> shave_marks_latin(greek)
'Ζέφυρος, Zefiro'
>>> dewinize(greek)
'Ζέφυρος, Zéfiro'
>>> asciize(greek)
'Ζέφυρος, Zefiro'
"""
# BEGIN SHAVE_MARKS
import unicodedata
import string
def shave_marks(txt):
    """Remove all diacritic marks (from ALL base characters)."""
    decomposed = unicodedata.normalize('NFD', txt)  # <1> split base + marks
    base_only = [ch for ch in decomposed
                 if not unicodedata.combining(ch)]  # <2> drop combining marks
    return unicodedata.normalize('NFC', ''.join(base_only))  # <3> recompose
# END SHAVE_MARKS
# BEGIN SHAVE_MARKS_LATIN
def shave_marks_latin(txt):
    """Remove diacritic marks, but only from Latin base characters."""
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    after_latin = False  # was the previous base character ASCII Latin?
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch):  # <2>
            if after_latin:
                continue  # drop a mark attached to a Latin base char
        else:
            # a non-combining char starts a new base character <4>
            after_latin = ch in string.ascii_letters
        kept.append(ch)  # <3>
    return unicodedata.normalize('NFC', ''.join(kept))  # <5>
# END SHAVE_MARKS_LATIN
# BEGIN ASCIIZE
# Map cp1252 one-char symbols to single ASCII replacements.
single_map = str.maketrans("""‚ƒ„†ˆ‹''""•–—˜›""",  # <1>
                           """'f"*^<''""---~>""")

# Map cp1252 symbols to multi-char ASCII replacements.
# Fix(review): the euro, ellipsis, trademark, per-mille and double-dagger
# keys were lost to mis-encoding (duplicate '' keys collapsed the dict);
# restored here from their replacement values.
multi_map = str.maketrans({   # <2>
    '\u20ac': '<euro>',       # €
    '\u2026': '...',          # …
    'Œ': 'OE',
    '\u2122': '(TM)',         # ™
    'œ': 'oe',
    '\u2030': '<per mille>',  # ‰
    '\u2021': '**',           # ‡
})

multi_map.update(single_map)  # <3> merged table handles both kinds


def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences."""
    return txt.translate(multi_map)  # <4>
def asciize(txt):
    """Best-effort ASCII transliteration: dewinize, strip Latin marks, fold ß."""
    cleaned = dewinize(txt)               # <5> replace cp1252 symbols first
    cleaned = shave_marks_latin(cleaned)  # then drop Latin diacritics
    cleaned = cleaned.replace('ß', 'ss')  # <6> German eszett has no mark
    return unicodedata.normalize('NFKC', cleaned)  # <7> compatibility compose
# END ASCIIZE

26
strings-bytes/sorting.py Normal file
View File

@@ -0,0 +1,26 @@
import locale


def check(sorted_list):
    """Label a candidate ordering against the expected (correct) order."""
    return 'CORRECT' if fruits == sorted_list else 'WRONG'


# `fruits` is already in correct Portuguese collation order.
fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']
print(locale.getlocale(locale.LC_COLLATE))
print('manual_sort ', fruits)

plain_sort = sorted(fruits)  # code-point order: accented words sort late
print('plain_sort ', plain_sort, check(plain_sort))

locale_sort1 = sorted(fruits, key=locale.strxfrm)
print('locale_sort1', locale_sort1, check(locale_sort1))

# Fix: pt_BR.UTF-8 is not installed on every system; report instead of
# crashing with an unhandled locale.Error.
try:
    locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
except locale.Error as exc:
    print('could not set pt_BR.UTF-8 locale:', exc)
else:
    print('locale set to:', locale.getlocale(locale.LC_COLLATE))
    locale_sort2 = sorted(fruits, key=locale.strxfrm)
    print('locale_sort2', locale_sort2, check(locale_sort2))

View File

@@ -0,0 +1,18 @@
from pyuca import Collator


def check(candidate):
    """Label a candidate ordering against the expected (correct) order."""
    return 'CORRECT' if fruits == candidate else 'WRONG'


# Already in correct Portuguese collation order.
fruits = ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']
print('manual_sort', fruits)

plain_sort = sorted(fruits)
print('plain_sort ', plain_sort, check(plain_sort))

# Sort with the Unicode Collation Algorithm (locale-independent).
coll = Collator()
pyuca_sort = sorted(fruits, key=coll.sort_key)
print('pyuca_sort ', pyuca_sort, check(pyuca_sort))

View File

@@ -0,0 +1,11 @@
PS > pip install pyuca
Downloading/unpacking pyuca
Running setup.py (path:C:\Users\...) egg_info for package pyuca
Installing collected packages: pyuca
Running setup.py install for pyuca
Successfully installed pyuca
Cleaning up...
PS > python .\sorting_uca.py
manual_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju']
plain_sort ['acaíba', 'acerola', 'açaí', 'caju', 'cajá'] WRONG
pyuca_sort ['açaí', 'acaíba', 'acerola', 'cajá', 'caju'] CORRECT

14
strings-bytes/str_repr.py Normal file
View File

@@ -0,0 +1,14 @@
# NOTE(review): indentation was lost when this file was extracted and has
# been reconstructed here; the format spec '{:28,350}' below is not valid
# for str values (the original probably used a computed width such as
# '{:{max_len}}').  Tokens left byte-identical; reconcile with upstream
# before running.
last_len = 0
last_repr = ''
# Distinct repr lengths seen so far; feeds max(lengths) below.
lengths = set()
for i in range(0x110000):
    # repr of the character without the surrounding quotes
    r = repr(chr(i))[1:-1]
    if len(r) != last_len:
        lengths.add(len(r))
        last_len = len(r)
        if i > 0:
            prev_repr = repr(chr(i-1))[1:-1]
            print('{}'.format(prev_repr))
        print('U+{:04x} {:28,350} ...'.format(i, r, max_len=max(lengths)), end=' ')
    last_repr = r

View File

@@ -0,0 +1,50 @@
from itertools import groupby
def bare_repr(codepoint):
    """Return repr(chr(codepoint)) with the surrounding quotes stripped."""
    quoted = repr(chr(codepoint))
    return quoted[1:-1]
def display(codepoint):
    # NOTE(review): never called in this file; it references a global
    # `lengths` that is not defined here, and '{:28,350}' is not a valid
    # format spec for str values -- presumably garbled from something like
    # '{:{max_len}}'.  Tokens left byte-identical pending reconstruction.
    repstr = repr(chr(codepoint))[1:-1]
    print('U+{:04x} {:28,350}'.format(
        codepoint, repstr, max_len=max(lengths)))
def repr_shape(codepoint):
    """Return (len, shape) of the bare repr; shape is 'GLYPH' for printable
    characters, otherwise the two-char escape prefix (e.g. '\\x', '\\u').
    Records non-GLYPH shapes in the module-level `escapes` set."""
    text = bare_repr(codepoint)
    if len(text) == 1:
        kind = 'GLYPH'
    else:
        kind = text[:2]
        escapes.add(kind)
    return len(text), kind
# Escape prefixes ('\x', '\u', ...) collected by repr_shape as a side effect.
escapes = set()
# Group consecutive code points whose reprs share the same (length, shape).
group_gen = groupby((codepoint for codepoint in range(0x110000)), repr_shape)
for len_shape, group in group_gen:
    len_brepr, shape = len_shape
    group = list(group)
    cp_first = group[0]
    cp_last = group[-1]
    cp_mid = group[len(group)//2]
    if len(group) == 1:
        # Single code point: show its glyph only for printable characters.
        glyph_sample = bare_repr(cp_first) if shape == 'GLYPH' else ''
        print('{:6d} U+{:04X} {:5} {}'.format(
            len(group), cp_first, shape, glyph_sample))
    else:
        if len(group) == 2:
            # Pair: sample both endpoints.
            if shape == 'GLYPH':
                glyph_sample = bare_repr(cp_first) + ' ' + bare_repr(cp_last)
            else:
                glyph_sample = ''
            print('{:6d} U+{:04X} , U+{:04X} {:5} {}'.format(
                len(group), cp_first, cp_last, shape, glyph_sample))
        else:
            # Longer run: sample first, middle and last code points.
            if shape == 'GLYPH':
                glyph_sample = ' '.join([bare_repr(cp_first),
                    bare_repr(cp_mid), bare_repr(cp_last)])
            else:
                glyph_sample = ''
            print('{:6d} U+{:04X}...U+{:04X} {:5} {}'.format(
                len(group), cp_first, cp_last, shape, glyph_sample))
print('escapes:', ' '.join(sorted(escapes, key=str.upper)))

View File

@@ -0,0 +1,22 @@
>>> s = 'naïve' <1>
>>> b = b'naïve' <2>
Traceback (most recent call last):
...
SyntaxError: bytes can only contain ASCII literal characters.
>>> b = bytes('naïve', 'iso8859-1') <3>
>>> b <4>
b'na\xefve'
>>> s <5>
'naïve'
>>> b == s.encode('iso8859-1') <6>
True
>>> s[2] <7>
'ï'
>>> b[2] <8>
239
>>> ord(s[2]) <9>
239
>>> s.upper() <10>
'NAÏVE'
>>> b.upper() <11>
b'NA\xefVE'