example-code-2e/04-text-byte/simplify.py
2021-06-26 13:42:28 -03:00

91 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Radical folding and diacritic mark removal.
Handling a string with `cp1252` symbols:
>>> order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
>>> shave_marks(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> shave_marks_latin(order)
'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'
>>> dewinize(order)
'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'
>>> asciize(order)
'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'

Handling a string with Greek and Latin accented characters:
>>> greek = 'Ζέφυρος, Zéfiro'
>>> shave_marks(greek)
'Ζεφυρος, Zefiro'
>>> shave_marks_latin(greek)
'Ζέφυρος, Zefiro'
>>> dewinize(greek)
'Ζέφυρος, Zéfiro'
>>> asciize(greek)
'Ζέφυρος, Zefiro'
"""
# tag::SHAVE_MARKS[]
import unicodedata
import string
def shave_marks(txt):
    """Return `txt` with every diacritic mark stripped.

    The text is decomposed (NFD) so that each diacritic becomes a separate
    combining character, the combining characters are dropped regardless of
    the script of their base character, and the remainder is recomposed
    (NFC).
    """
    base_chars = []
    for char in unicodedata.normalize('NFD', txt):  # <1>
        # combining() returns 0 for anything that is not a combining mark
        if unicodedata.combining(char) == 0:  # <2>
            base_chars.append(char)
    return unicodedata.normalize('NFC', ''.join(base_chars))  # <3>
# end::SHAVE_MARKS[]
# tag::SHAVE_MARKS_LATIN[]
def shave_marks_latin(txt):
    """Return `txt` with diacritics removed from Latin base characters only.

    After NFD decomposition, a combining mark is dropped when the most
    recent base character is an ASCII letter; marks that follow any other
    base character (Greek letters, for instance) are kept.  The result is
    recomposed with NFC.
    """
    kept = []
    after_latin = False  # is the current base character an ASCII letter?
    for char in unicodedata.normalize('NFD', txt):  # <1>
        if unicodedata.combining(char):
            # a diacritic: keep it only when its base is not Latin  # <2>
            if not after_latin:
                kept.append(char)
        else:
            # not a combining char, so it starts a new base character
            kept.append(char)  # <3>
            after_latin = char in string.ascii_letters  # <4>
    return unicodedata.normalize('NFC', ''.join(kept))  # <5>
# end::SHAVE_MARKS_LATIN[]
# tag::ASCIIZE[]
# Win1252 symbols that have a reasonable one-character ASCII stand-in.
# The two strings are paired position by position, so they must stay the
# same length.
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""",  # <1>
                           """'f"^<''""---~>""")

# Win1252 symbols that need a multi-character ASCII replacement.
# str.maketrans() requires every string key to be exactly one character,
# so a key that loses its character in transit makes the module fail at
# import time.  The symbols that are easily dropped or confused are
# therefore spelled with \N{...} escapes instead of literal characters.
# NOTE(review): TRADE MARK SIGN -> '(TM)' is confirmed by the module
# doctest; the other escaped keys were reconstructed from their
# replacement text and the cp1252 repertoire -- confirm against upstream.
multi_map = str.maketrans({  # <2>
    '\N{EURO SIGN}': 'EUR',
    '\N{HORIZONTAL ELLIPSIS}': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '\N{TRADE MARK SIGN}': '(TM)',
    '\N{PER MILLE SIGN}': '<per mille>',
    '\N{DAGGER}': '**',
    '\N{DOUBLE DAGGER}': '***',
})

# Merge the one-to-one table in, so a single translate() call handles both.
multi_map.update(single_map)  # <3>
def dewinize(txt):
    """Replace Win1252 symbols in `txt` with ASCII characters or sequences.

    Every character that has an entry in the module-level `multi_map`
    translation table is substituted; all other characters pass through
    unchanged.
    """
    translated = txt.translate(multi_map)  # <4>
    return translated
def asciize(txt):
    """Return `txt` folded towards plain ASCII.

    Applies, in order: `dewinize` to replace Win1252 symbols,
    `shave_marks_latin` to strip diacritics from Latin letters, an explicit
    'ß' -> 'ss' substitution, and NFKC normalization to replace
    compatibility characters with their equivalents.  Characters from
    non-Latin scripts are left as they are, so the result is not guaranteed
    to be pure ASCII.
    """
    dewinized = dewinize(txt)
    latin_shaved = shave_marks_latin(dewinized)  # <5>
    without_eszett = latin_shaved.replace('ß', 'ss')  # <6>
    return unicodedata.normalize('NFKC', without_eszett)  # <7>
# end::ASCIIZE[]