updated from Atlas
This commit is contained in:
21
04-text-byte/default_encodings.py
Normal file
21
04-text-byte/default_encodings.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Show the default encodings in effect for this Python process.

Each expression below is kept as text so it can be printed verbatim
next to its evaluated value; results vary by platform and console.
"""

import locale
import sys

# eval() is safe here: the evaluated text is this fixed literal,
# never user-supplied input.
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# Kept open on purpose: my_file.encoding is one of the probed values.
my_file = open('dummy', 'w')

for expr in expressions.split():
    result = eval(expr)
    print(f'{expr.rjust(30)} -> {result!r}')
|
||||
39
04-text-byte/normeq.py
Normal file
39
04-text-byte/normeq.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""
|
||||
Utility functions for normalized Unicode string comparison.
|
||||
|
||||
Using Normal Form C, case sensitive:
|
||||
|
||||
>>> s1 = 'café'
|
||||
>>> s2 = 'cafe\u0301'
|
||||
>>> s1 == s2
|
||||
False
|
||||
>>> nfc_equal(s1, s2)
|
||||
True
|
||||
>>> nfc_equal('A', 'a')
|
||||
False
|
||||
|
||||
Using Normal Form C with case folding:
|
||||
|
||||
>>> s3 = 'Straße'
|
||||
>>> s4 = 'strasse'
|
||||
>>> s3 == s4
|
||||
False
|
||||
>>> nfc_equal(s3, s4)
|
||||
False
|
||||
>>> fold_equal(s3, s4)
|
||||
True
|
||||
>>> fold_equal(s1, s2)
|
||||
True
|
||||
>>> fold_equal('A', 'a')
|
||||
True
|
||||
|
||||
"""
|
||||
|
||||
from unicodedata import normalize
|
||||
|
||||
def nfc_equal(str1, str2):
    """Case-sensitive equality after Unicode Normal Form C normalization."""
    left = normalize('NFC', str1)
    right = normalize('NFC', str2)
    return left == right
|
||||
|
||||
def fold_equal(str1, str2):
    """Case-insensitive equality: NFC-normalize, then casefold, then compare."""
    def canon(s):
        # casefold() is stronger than lower() (handles e.g. 'ß' -> 'ss')
        return normalize('NFC', s).casefold()
    return canon(str1) == canon(str2)
|
||||
18
04-text-byte/numerics_demo.py
Normal file
18
04-text-byte/numerics_demo.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# BEGIN NUMERICS_DEMO
import unicodedata
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

# For every code point, show how the regex engine and the str
# predicates disagree about what counts as a "digit" or "number".
for char in sample:
    row = [
        'U+%04x' % ord(char),                        # <1> code point
        char.center(6),                              # <2> the character
        're_dig' if re_digit.match(char) else '-',   # <3> matches re \d?
        'isdig' if char.isdigit() else '-',          # <4> str.isdigit()
        'isnum' if char.isnumeric() else '-',        # <5> str.isnumeric()
        format(unicodedata.numeric(char), '5.2f'),   # <6> numeric value
        unicodedata.name(char),                      # <7> Unicode name
    ]
    print(*row, sep='\t')
# END NUMERICS_DEMO
|
||||
21
04-text-byte/ramanujan.py
Normal file
21
04-text-byte/ramanujan.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# BEGIN RE_DEMO
import re

# Equivalent patterns compiled twice: once for str, once for bytes. <1> <2>
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# Tamil digits for 1729 (Hardy-Ramanujan number), plus ASCII digits. <3> <4>
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")

text_bytes = text_str.encode('utf_8')  # <5>

print('Text', repr(text_str), sep='\n ')
# str patterns match Unicode digits/words; bytes patterns only ASCII. <6>-<9>
for label, pat_str, pat_bytes in [
        ('Numbers', re_numbers_str, re_numbers_bytes),
        ('Words', re_words_str, re_words_bytes)]:
    print(label)
    print(' str :', pat_str.findall(text_str))
    print(' bytes:', pat_bytes.findall(text_bytes))
# END RE_DEMO
|
||||
87
04-text-byte/sanitize.py
Normal file
87
04-text-byte/sanitize.py
Normal file
@@ -0,0 +1,87 @@
|
||||
|
||||
"""
|
||||
Radical folding and text sanitizing.
|
||||
|
||||
Handling a string with `cp1252` symbols:
|
||||
|
||||
>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
|
||||
>>> shave_marks(order)
|
||||
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
|
||||
>>> shave_marks_latin(order)
|
||||
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
|
||||
>>> dewinize(order)
|
||||
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
|
||||
>>> asciize(order)
|
||||
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'
|
||||
|
||||
Handling a string with Greek and Latin accented characters:
|
||||
|
||||
>>> greek = 'Ζέφυρος, Zéfiro'
|
||||
>>> shave_marks(greek)
|
||||
'Ζεφυρος, Zefiro'
|
||||
>>> shave_marks_latin(greek)
|
||||
'Ζέφυρος, Zefiro'
|
||||
>>> dewinize(greek)
|
||||
'Ζέφυρος, Zéfiro'
|
||||
>>> asciize(greek)
|
||||
'Ζέφυρος, Zefiro'
|
||||
|
||||
"""
|
||||
|
||||
# BEGIN SHAVE_MARKS
|
||||
import unicodedata
|
||||
import string
|
||||
|
||||
|
||||
def shave_marks(txt):
    """Remove all diacritic marks"""
    decomposed = unicodedata.normalize('NFD', txt)  # <1> split base + marks
    base_chars = [c for c in decomposed
                  if not unicodedata.combining(c)]  # <2> drop combining marks
    return unicodedata.normalize('NFC', ''.join(base_chars))  # <3> recompose
# END SHAVE_MARKS
|
||||
|
||||
# BEGIN SHAVE_MARKS_LATIN
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    decomposed = unicodedata.normalize('NFD', txt)  # <1>
    kept = []
    preceded_by_latin = False
    for ch in decomposed:
        is_mark = unicodedata.combining(ch)
        if is_mark and preceded_by_latin:  # <2>
            continue  # skip diacritic sitting on a Latin base char
        kept.append(ch)  # <3>
        if not is_mark:  # <4> a non-combining char starts a new base
            preceded_by_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))  # <5>
# END SHAVE_MARKS_LATIN
|
||||
|
||||
# BEGIN ASCIIZE
# One-to-one replacements: cp1252 "smart" punctuation -> plain ASCII. <1>
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",
                           """'f"*^<''""---~>""")

# One-to-many replacements: one symbol expands to several chars. <2>
multi_map = str.maketrans({
    '‡': '**',
    '…': '...',
    '™': '(TM)',
    '€': '<euro>',
    '‰': '<per mille>',
    'Œ': 'OE',
    'œ': 'oe',
})

# Merge both tables so translate() does everything in a single pass. <3>
multi_map.update(single_map)


def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""
    return txt.translate(multi_map)  # <4>
|
||||
|
||||
|
||||
def asciize(txt):
    """Best-effort transliteration of cp1252 symbols and Latin diacritics."""
    cleaned = dewinize(txt)                # <5> swap Win1252 symbols first
    cleaned = shave_marks_latin(cleaned)   #     then drop Latin diacritics
    cleaned = cleaned.replace('ß', 'ss')   # <6> eszett carries no mark to shave
    return unicodedata.normalize('NFKC', cleaned)  # <7> compatibility compose
# END ASCIIZE
|
||||
Reference in New Issue
Block a user